hermes-agent/tests/agent/test_gemini_native_adapter.py
Teknium ec46f5912e
fix(gemini): default native maxOutputTokens + strip OpenAI extra_body on Gemini endpoints (#39730)
* fix: respect disabled auto-compaction on context overflow

Port from anomalyco/opencode#30749.

When compression.enabled is false, NO automatic compaction trigger may
fire. The proactive token-threshold paths (preflight + post-response
should_compress gate) already honoured the setting, but the three
provider-overflow recovery paths in the agent loop — long-context-tier
429, 413 payload-too-large, and context-overflow — called
_compress_context() unconditionally, silently compressing and rotating
the session against the user's explicit choice.

Add a single guard at the top of the overflow-recovery dispatch: when
compression is disabled and the error is one of those three overflow
classes, surface a terminal error (compaction_disabled: True) telling the
user to /compress manually, /new, switch to a larger-context model, or
reduce attachments. Manual /compress (force=True) is unaffected — it never
enters this loop.

Tests: new TestOverflowWithCompactionDisabled (413 + 400 overflow don't
compress when disabled; control case still compresses when enabled).
Existing overflow-recovery tests updated to enable compaction explicitly
(they verify the recovery fires); fixture defaults flipped to True to
match production (compression.enabled defaults to True).

* fix(gemini): default native maxOutputTokens + strip OpenAI extra_body on Gemini endpoints

Two distinct failures hit users on the gemini provider with only Google
AI Studio keys set.

1. Truncation loop: build_gemini_request() only set maxOutputTokens when
   max_tokens was non-None. Hermes passes None to mean "unlimited", but
   Gemini's native generateContent does NOT treat an absent maxOutputTokens
   as full budget — it applies a low internal default and stops early with
   finishReason=MAX_TOKENS, truncating tool calls. The agent then retries
   3x and refuses the incomplete call. Now default to the published 65,535
   ceiling (shared by all current Gemini text models) when max_tokens=None.

2. HTTP 400 on Gemini endpoint: the chat_completions transport assembles
   profile extra_body (Nous portal 'tags', reasoning, provider prefs) and
   sends it via the OpenAI client to whatever base_url is resolved. When a
   profile that emits extra_body (e.g. Nous) is active but the endpoint is a
   native Gemini base_url — typical when only Google creds exist and a
   fallback/aux call lands on Gemini — Google rejects the unknown 'tags'
   field with a non-retryable 400. Strip all non-thinking_config extra_body
   keys when the resolved endpoint is native Gemini.

Verified E2E against real transport code: tags stripped on native Gemini,
preserved on Nous and the /openai compat endpoint; maxOutputTokens=65535
on None, explicit values respected.
2026-06-05 03:53:59 -07:00

352 lines
12 KiB
Python

"""Tests for the native Google AI Studio Gemini adapter."""
from __future__ import annotations
import json
from types import SimpleNamespace
import pytest
class DummyResponse:
def __init__(self, status_code=200, payload=None, headers=None, text=None):
self.status_code = status_code
self._payload = payload or {}
self.headers = headers or {}
self.text = text if text is not None else json.dumps(self._payload)
def json(self):
return self._payload
def test_build_native_request_preserves_thought_signature_on_tool_replay():
from agent.gemini_native_adapter import build_gemini_request
request = build_gemini_request(
messages=[
{"role": "system", "content": "Be helpful."},
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "call_1",
"type": "function",
"function": {
"name": "get_weather",
"arguments": '{"city": "Paris"}',
},
"extra_content": {
"google": {"thought_signature": "sig-123"}
},
}
],
},
],
tools=[],
tool_choice=None,
)
parts = request["contents"][0]["parts"]
assert parts[0]["functionCall"]["name"] == "get_weather"
assert parts[0]["thoughtSignature"] == "sig-123"
def test_build_native_request_uses_original_function_name_for_tool_result():
from agent.gemini_native_adapter import build_gemini_request
request = build_gemini_request(
messages=[
{
"role": "assistant",
"content": "",
"tool_calls": [
{
"id": "call_1",
"type": "function",
"function": {
"name": "get_weather",
"arguments": '{"city": "Paris"}',
},
}
],
},
{
"role": "tool",
"tool_call_id": "call_1",
"content": '{"forecast": "sunny"}',
},
],
tools=[],
tool_choice=None,
)
tool_response = request["contents"][1]["parts"][0]["functionResponse"]
assert tool_response["name"] == "get_weather"
def test_build_native_request_strips_json_schema_only_fields_from_tool_parameters():
from agent.gemini_native_adapter import build_gemini_request
request = build_gemini_request(
messages=[{"role": "user", "content": "Hello"}],
tools=[
{
"type": "function",
"function": {
"name": "lookup_weather",
"description": "Weather lookup",
"parameters": {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"additionalProperties": False,
"properties": {
"city": {
"type": "string",
"$schema": "ignored",
"description": "City name",
}
},
"required": ["city"],
},
},
}
],
tool_choice=None,
)
params = request["tools"][0]["functionDeclarations"][0]["parameters"]
assert "$schema" not in params
assert "additionalProperties" not in params
assert params["type"] == "object"
assert params["properties"]["city"] == {
"type": "string",
"description": "City name",
}
def test_translate_native_response_surfaces_reasoning_and_tool_calls():
from agent.gemini_native_adapter import translate_gemini_response
payload = {
"candidates": [
{
"content": {
"parts": [
{"thought": True, "text": "thinking..."},
{"functionCall": {"name": "search", "args": {"q": "hermes"}}},
]
},
"finishReason": "STOP",
}
],
"usageMetadata": {
"promptTokenCount": 10,
"candidatesTokenCount": 5,
"totalTokenCount": 15,
},
}
response = translate_gemini_response(payload, model="gemini-2.5-flash")
choice = response.choices[0]
assert choice.finish_reason == "tool_calls"
assert choice.message.reasoning == "thinking..."
assert choice.message.tool_calls[0].function.name == "search"
assert json.loads(choice.message.tool_calls[0].function.arguments) == {"q": "hermes"}
def test_native_client_uses_x_goog_api_key_and_native_models_endpoint(monkeypatch):
from agent.gemini_native_adapter import GeminiNativeClient
recorded = {}
class DummyHTTP:
def post(self, url, json=None, headers=None, timeout=None):
recorded["url"] = url
recorded["json"] = json
recorded["headers"] = headers
return DummyResponse(
payload={
"candidates": [
{
"content": {"parts": [{"text": "hello"}]},
"finishReason": "STOP",
}
],
"usageMetadata": {
"promptTokenCount": 1,
"candidatesTokenCount": 1,
"totalTokenCount": 2,
},
}
)
def close(self):
return None
monkeypatch.setattr("agent.gemini_native_adapter.httpx.Client", lambda *a, **k: DummyHTTP())
client = GeminiNativeClient(api_key="AIza-test", base_url="https://generativelanguage.googleapis.com/v1beta")
response = client.chat.completions.create(
model="gemini-2.5-flash",
messages=[{"role": "user", "content": "Hello"}],
)
assert recorded["url"] == "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent"
assert recorded["headers"]["x-goog-api-key"] == "AIza-test"
assert "Authorization" not in recorded["headers"]
assert response.choices[0].message.content == "hello"
def test_native_http_error_keeps_status_and_retry_after():
from agent.gemini_native_adapter import gemini_http_error
response = DummyResponse(
status_code=429,
headers={"Retry-After": "17"},
payload={
"error": {
"code": 429,
"message": "quota exhausted",
"status": "RESOURCE_EXHAUSTED",
"details": [
{
"@type": "type.googleapis.com/google.rpc.ErrorInfo",
"reason": "RESOURCE_EXHAUSTED",
"metadata": {"service": "generativelanguage.googleapis.com"},
}
],
}
},
)
err = gemini_http_error(response)
assert getattr(err, "status_code", None) == 429
assert getattr(err, "retry_after", None) == 17.0
assert "quota exhausted" in str(err)
def test_native_client_accepts_injected_http_client():
from agent.gemini_native_adapter import GeminiNativeClient
injected = SimpleNamespace(close=lambda: None)
client = GeminiNativeClient(api_key="AIza-test", http_client=injected)
assert client._http is injected
def test_native_client_rejects_empty_api_key_with_actionable_message():
"""Empty/whitespace api_key must raise at construction, not produce a cryptic
Google GFE 'Error 400 (Bad Request)!!1' HTML page on the first request."""
from agent.gemini_native_adapter import GeminiNativeClient
for bad in ("", " ", None):
with pytest.raises(RuntimeError) as excinfo:
GeminiNativeClient(api_key=bad) # type: ignore[arg-type]
msg = str(excinfo.value)
assert "GOOGLE_API_KEY" in msg and "GEMINI_API_KEY" in msg
assert "aistudio.google.com" in msg
@pytest.mark.asyncio
async def test_async_native_client_streams_without_requiring_async_iterator_from_sync_client():
from agent.gemini_native_adapter import AsyncGeminiNativeClient
chunk = SimpleNamespace(choices=[SimpleNamespace(delta=SimpleNamespace(content="hi"), finish_reason=None)])
sync_stream = iter([chunk])
def _advance(iterator):
try:
return False, next(iterator)
except StopIteration:
return True, None
sync_client = SimpleNamespace(
api_key="AIza-test",
base_url="https://generativelanguage.googleapis.com/v1beta",
chat=SimpleNamespace(completions=SimpleNamespace(create=lambda **kwargs: sync_stream)),
_advance_stream_iterator=_advance,
close=lambda: None,
)
async_client = AsyncGeminiNativeClient(sync_client)
stream = await async_client.chat.completions.create(stream=True)
collected = []
async for item in stream:
collected.append(item)
assert collected == [chunk]
def test_stream_event_translation_emits_tool_call_delta_with_stable_index():
from agent.gemini_native_adapter import translate_stream_event
tool_call_indices = {}
event = {
"candidates": [
{
"content": {
"parts": [
{"functionCall": {"name": "search", "args": {"q": "abc"}}}
]
},
"finishReason": "STOP",
}
]
}
first = translate_stream_event(event, model="gemini-2.5-flash", tool_call_indices=tool_call_indices)
second = translate_stream_event(event, model="gemini-2.5-flash", tool_call_indices=tool_call_indices)
assert first[0].choices[0].delta.tool_calls[0].index == 0
assert second[0].choices[0].delta.tool_calls[0].index == 0
assert first[0].choices[0].delta.tool_calls[0].id == second[0].choices[0].delta.tool_calls[0].id
assert first[0].choices[0].delta.tool_calls[0].function.arguments == '{"q": "abc"}'
assert second[0].choices[0].delta.tool_calls[0].function.arguments == ""
assert first[-1].choices[0].finish_reason == "tool_calls"
def test_stream_event_translation_keeps_identical_calls_in_distinct_parts():
from agent.gemini_native_adapter import translate_stream_event
event = {
"candidates": [
{
"content": {
"parts": [
{"functionCall": {"name": "search", "args": {"q": "abc"}}},
{"functionCall": {"name": "search", "args": {"q": "abc"}}},
]
},
"finishReason": "STOP",
}
]
}
chunks = translate_stream_event(event, model="gemini-2.5-flash", tool_call_indices={})
tool_chunks = [chunk for chunk in chunks if chunk.choices[0].delta.tool_calls]
assert tool_chunks[0].choices[0].delta.tool_calls[0].index == 0
assert tool_chunks[1].choices[0].delta.tool_calls[0].index == 1
assert tool_chunks[0].choices[0].delta.tool_calls[0].id != tool_chunks[1].choices[0].delta.tool_calls[0].id
def test_max_tokens_none_defaults_to_gemini_output_ceiling():
"""max_tokens=None must send the model's full output ceiling, not omit it.
Gemini's native generateContent applies a low internal default when
maxOutputTokens is absent, truncating tool calls mid-stream. Hermes passes
None to mean "unlimited", so the adapter must translate that to the
published 65,535 ceiling rather than leaving the field unset.
"""
from agent.gemini_native_adapter import (
build_gemini_request,
GEMINI_DEFAULT_MAX_OUTPUT_TOKENS,
)
req = build_gemini_request(messages=[{"role": "user", "content": "hi"}], max_tokens=None)
assert req["generationConfig"]["maxOutputTokens"] == GEMINI_DEFAULT_MAX_OUTPUT_TOKENS == 65535
def test_explicit_max_tokens_is_respected():
from agent.gemini_native_adapter import build_gemini_request
req = build_gemini_request(messages=[{"role": "user", "content": "hi"}], max_tokens=4096)
assert req["generationConfig"]["maxOutputTokens"] == 4096