mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(gemini): default native maxOutputTokens + strip OpenAI extra_body on Gemini endpoints (#39730)
* fix: respect disabled auto-compaction on context overflow

Port from anomalyco/opencode#30749. When compression.enabled is false, NO automatic compaction trigger may fire. The proactive token-threshold paths (preflight + post-response should_compress gate) already honoured the setting, but the three provider-overflow recovery paths in the agent loop — long-context-tier 429, 413 payload-too-large, and context-overflow — called _compress_context() unconditionally, silently compressing and rotating the session against the user's explicit choice.

Add a single guard at the top of the overflow-recovery dispatch: when compression is disabled and the error is one of those three overflow classes, surface a terminal error (compaction_disabled: True) telling the user to /compress manually, /new, switch to a larger-context model, or reduce attachments. Manual /compress (force=True) is unaffected — it never enters this loop.

Tests: new TestOverflowWithCompactionDisabled (413 + 400 overflow don't compress when disabled; control case still compresses when enabled). Existing overflow-recovery tests updated to enable compaction explicitly (they verify the recovery fires); fixture defaults flipped to True to match production (compression.enabled defaults to True).

* fix(gemini): default native maxOutputTokens + strip OpenAI extra_body on Gemini endpoints

Two distinct failures hit users on the gemini provider with only Google AI Studio keys set.

1. Truncation loop: build_gemini_request() only set maxOutputTokens when max_tokens was non-None. Hermes passes None to mean "unlimited", but Gemini's native generateContent does NOT treat an absent maxOutputTokens as full budget — it applies a low internal default and stops early with finishReason=MAX_TOKENS, truncating tool calls. The agent then retries 3x and refuses the incomplete call. Now default to the published 65,535 ceiling (shared by all current Gemini text models) when max_tokens=None.

2. HTTP 400 on Gemini endpoint: the chat_completions transport assembles profile extra_body (Nous portal 'tags', reasoning, provider prefs) and sends it via the OpenAI client to whatever base_url is resolved. When a profile that emits extra_body (e.g. Nous) is active but the endpoint is a native Gemini base_url — typical when only Google creds exist and a fallback/aux call lands on Gemini — Google rejects the unknown 'tags' field with a non-retryable 400. Strip all non-thinking_config extra_body keys when the resolved endpoint is native Gemini.

Verified E2E against real transport code: tags stripped on native Gemini, preserved on Nous and the /openai compat endpoint; maxOutputTokens=65535 on None, explicit values respected.
This commit is contained in:
parent
6bf55a473e
commit
ec46f5912e
7 changed files with 277 additions and 2 deletions
|
|
@ -2720,6 +2720,61 @@ def run_conversation(
|
|||
# compress history and retry, not abort immediately.
|
||||
status_code = getattr(api_error, "status_code", None)
|
||||
|
||||
# ── Respect disabled auto-compaction on overflow ──────
|
||||
# Ported from anomalyco/opencode#30749. When the user has
|
||||
# turned auto-compaction off (``compression.enabled: false``),
|
||||
# NO automatic compaction trigger may fire — including the
|
||||
# provider/request-size overflow recovery paths below
|
||||
# (long-context-tier 429, 413 payload-too-large, and
|
||||
# context-overflow). Without this guard the proactive
|
||||
# threshold path correctly honours the setting (see the
|
||||
# preflight check and the post-response ``should_compress``
|
||||
# gate) but a provider overflow error would still silently
|
||||
# compress + rotate the session, bypassing the user's
|
||||
# explicit choice. Surface a terminal error instead so the
|
||||
# user can compact manually (``/compress``), start fresh
|
||||
# (``/new``), switch to a larger-context model, or reduce
|
||||
# attachments. Forced compaction via ``/compress``
|
||||
# (``force=True``) is unaffected — it never reaches this loop.
|
||||
_overflow_reasons = {
|
||||
FailoverReason.long_context_tier,
|
||||
FailoverReason.payload_too_large,
|
||||
FailoverReason.context_overflow,
|
||||
}
|
||||
if (
|
||||
classified.reason in _overflow_reasons
|
||||
and not getattr(agent, "compression_enabled", True)
|
||||
):
|
||||
agent._flush_status_buffer()
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix}❌ Context overflow, but auto-compaction is disabled "
|
||||
f"(compression.enabled: false).",
|
||||
force=True,
|
||||
)
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix} 💡 Run /compress to compact manually, /new to start fresh, "
|
||||
f"switch to a larger-context model, or reduce attachments.",
|
||||
force=True,
|
||||
)
|
||||
logger.error(
|
||||
f"{agent.log_prefix}Context overflow ({classified.reason.value}) with "
|
||||
f"auto-compaction disabled — not compressing."
|
||||
)
|
||||
agent._persist_session(messages, conversation_history)
|
||||
return {
|
||||
"messages": messages,
|
||||
"completed": False,
|
||||
"api_calls": api_call_count,
|
||||
"error": (
|
||||
"Context overflow and auto-compaction is disabled "
|
||||
"(compression.enabled: false). Run /compress to compact manually, "
|
||||
"/new to start fresh, or switch to a larger-context model."
|
||||
),
|
||||
"partial": True,
|
||||
"failed": True,
|
||||
"compaction_disabled": True,
|
||||
}
|
||||
|
||||
# ── Anthropic Sonnet long-context tier gate ───────────
|
||||
# Anthropic returns HTTP 429 "Extra usage is required for
|
||||
# long context requests" when a Claude Max (or similar)
|
||||
|
|
|
|||
|
|
@ -33,6 +33,13 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
|
||||
|
||||
# Published max output-token ceiling shared by every current Gemini text model
|
||||
# (2.5 + 3.x: flash, flash-lite, pro). Used as the default when the caller
|
||||
# passes max_tokens=None, because Gemini's native API otherwise applies a low
|
||||
# internal default and truncates output (unlike OpenAI-compat endpoints where
|
||||
# an omitted limit means full budget).
|
||||
GEMINI_DEFAULT_MAX_OUTPUT_TOKENS = 65535
|
||||
|
||||
|
||||
def is_native_gemini_base_url(base_url: str) -> bool:
|
||||
"""Return True when the endpoint speaks Gemini's native REST API."""
|
||||
|
|
@ -414,6 +421,18 @@ def build_gemini_request(
|
|||
generation_config["temperature"] = temperature
|
||||
if max_tokens is not None:
|
||||
generation_config["maxOutputTokens"] = max_tokens
|
||||
else:
|
||||
# Gemini's native generateContent does NOT treat an omitted
|
||||
# maxOutputTokens as "use the model's full output budget" — it applies
|
||||
# a low internal default and the model stops early with
|
||||
# finishReason=MAX_TOKENS, truncating tool calls mid-stream (Hermes
|
||||
# then retries 3× and refuses the incomplete call). Every current
|
||||
# Gemini text model (2.5 + 3.x, flash / flash-lite / pro) caps at
|
||||
# 65,535 output tokens, so default to that ceiling when the caller
|
||||
# passes None ("unlimited"). See the OpenAI-compat path where omitting
|
||||
# the field genuinely means full budget — that assumption does not
|
||||
# hold on the native API.
|
||||
generation_config["maxOutputTokens"] = GEMINI_DEFAULT_MAX_OUTPUT_TOKENS
|
||||
if top_p is not None:
|
||||
generation_config["topP"] = top_p
|
||||
if stop:
|
||||
|
|
|
|||
|
|
@ -571,7 +571,28 @@ class ChatCompletionsTransport(ProviderTransport):
|
|||
api_kwargs[k] = v
|
||||
|
||||
if extra_body:
|
||||
api_kwargs["extra_body"] = extra_body
|
||||
# Native Gemini (generativelanguage.googleapis.com, non-/openai)
|
||||
# speaks Google's REST schema, not OpenAI's. OpenAI-style extra_body
|
||||
# keys (tags, reasoning, provider, plugins, …) are unknown fields
|
||||
# there and Gemini rejects the whole request with a non-retryable
|
||||
# HTTP 400 ("Invalid JSON payload received. Unknown name 'tags'").
|
||||
# This happens when a profile that emits extra_body (e.g. the Nous
|
||||
# profile's portal `tags`) is active but the resolved endpoint is a
|
||||
# Gemini base_url — typical when only Google credentials are set and
|
||||
# a fallback/aux call lands on Gemini. The native client only reads
|
||||
# thinking_config from extra_body, so drop everything else here.
|
||||
try:
|
||||
from agent.gemini_native_adapter import is_native_gemini_base_url
|
||||
_native_gemini = is_native_gemini_base_url(params.get("base_url"))
|
||||
except Exception:
|
||||
_native_gemini = False
|
||||
if _native_gemini:
|
||||
extra_body = {
|
||||
k: v for k, v in extra_body.items()
|
||||
if k in ("thinking_config", "thinkingConfig")
|
||||
}
|
||||
if extra_body:
|
||||
api_kwargs["extra_body"] = extra_body
|
||||
|
||||
return api_kwargs
|
||||
|
||||
|
|
|
|||
|
|
@ -326,3 +326,27 @@ def test_stream_event_translation_keeps_identical_calls_in_distinct_parts():
|
|||
assert tool_chunks[0].choices[0].delta.tool_calls[0].index == 0
|
||||
assert tool_chunks[1].choices[0].delta.tool_calls[0].index == 1
|
||||
assert tool_chunks[0].choices[0].delta.tool_calls[0].id != tool_chunks[1].choices[0].delta.tool_calls[0].id
|
||||
|
||||
|
||||
def test_max_tokens_none_defaults_to_gemini_output_ceiling():
    """An unset ``max_tokens`` is sent as the full output ceiling, never omitted.

    Hermes uses ``max_tokens=None`` to mean "unlimited". On Gemini's native
    generateContent an absent ``maxOutputTokens`` falls back to a low internal
    default and truncates tool calls mid-stream, so the adapter has to
    substitute the published 65,535 ceiling instead of leaving the field out.
    """
    from agent.gemini_native_adapter import (
        GEMINI_DEFAULT_MAX_OUTPUT_TOKENS,
        build_gemini_request,
    )

    user_turn = {"role": "user", "content": "hi"}
    request = build_gemini_request(messages=[user_turn], max_tokens=None)
    sent_limit = request["generationConfig"]["maxOutputTokens"]

    # The request carries the module default ...
    assert sent_limit == GEMINI_DEFAULT_MAX_OUTPUT_TOKENS
    # ... and that default is pinned to the documented ceiling.
    assert GEMINI_DEFAULT_MAX_OUTPUT_TOKENS == 65535
|
||||
|
||||
|
||||
def test_explicit_max_tokens_is_respected():
    """A caller-supplied ``max_tokens`` reaches ``maxOutputTokens`` unchanged."""
    from agent.gemini_native_adapter import build_gemini_request

    user_turn = {"role": "user", "content": "hi"}
    request = build_gemini_request(messages=[user_turn], max_tokens=4096)

    assert request["generationConfig"]["maxOutputTokens"] == 4096
|
||||
|
|
|
|||
|
|
@ -859,3 +859,53 @@ class TestChatCompletionsCacheStats:
|
|||
r = SimpleNamespace(usage=SimpleNamespace(prompt_tokens_details=details))
|
||||
result = transport.extract_cache_stats(r)
|
||||
assert result == {"cached_tokens": 500, "creation_tokens": 100}
|
||||
|
||||
|
||||
class TestChatCompletionsGeminiNativeExtraBodyStrip:
    """Profile ``extra_body`` keys must be kept away from native Gemini.

    The Nous profile contributes portal ``tags`` to ``extra_body``. Google's
    native REST API rejects unknown fields with HTTP 400, so those keys must
    not be sent there, while OpenAI-style endpoints keep receiving them.
    """

    def _nous_profile(self):
        # Imported lazily so collecting this module does not require the
        # provider registry to be importable.
        from providers import get_provider_profile

        return get_provider_profile("nous")

    def _extra_body_for(self, transport, model, base_url):
        """Build kwargs with the Nous profile and return their ``extra_body``.

        Returns ``None`` when the transport emitted no ``extra_body`` at all.
        """
        kwargs = transport.build_kwargs(
            model,
            [{"role": "user", "content": "hi"}],
            None,
            provider_profile=self._nous_profile(),
            base_url=base_url,
            session_id="s1",
            max_tokens=None,
        )
        return kwargs.get("extra_body")

    def test_tags_stripped_when_endpoint_is_native_gemini(self, transport):
        extra = self._extra_body_for(
            transport,
            "anthropic/claude-sonnet-4.6",
            "https://generativelanguage.googleapis.com/v1beta",
        )
        # Either nothing is sent, or what is sent carries no portal tags.
        assert not extra or "tags" not in extra

    def test_tags_preserved_on_nous_endpoint(self, transport):
        extra = self._extra_body_for(
            transport,
            "hermes-3-405b",
            "https://inference.nousresearch.com/v1",
        )
        assert extra and "tags" in extra

    def test_tags_pass_through_on_gemini_openai_compat(self, transport):
        # /openai compat endpoint is not "native" — unchanged behavior.
        extra = self._extra_body_for(
            transport,
            "anthropic/claude-sonnet-4.6",
            "https://generativelanguage.googleapis.com/v1beta/openai",
        )
        assert extra and "tags" in extra
|
||||
|
|
|
|||
|
|
@ -94,7 +94,11 @@ def agent():
|
|||
a._cached_system_prompt = "You are helpful."
|
||||
a._use_prompt_caching = False
|
||||
a.tool_delay = 0
|
||||
a.compression_enabled = False
|
||||
# Default matches production (`compression.enabled` defaults to True).
|
||||
# Overflow-recovery tests below verify that 413 / context-overflow
|
||||
# errors DO trigger compression; the disabled-path behavior is
|
||||
# covered explicitly by TestOverflowWithCompactionDisabled.
|
||||
a.compression_enabled = True
|
||||
a.save_trajectories = False
|
||||
return a
|
||||
|
||||
|
|
@ -415,6 +419,13 @@ class TestPreflightCompression:
|
|||
|
||||
def test_compress_context_emits_lifecycle_status_before_work(self, agent):
|
||||
"""Direct context compression should tell gateway users why the turn paused."""
|
||||
# This test calls _compress_context directly and asserts the FIRST
|
||||
# status event is the lifecycle "Compacting context" message. With
|
||||
# compaction enabled the lazy feasibility probe would emit an
|
||||
# aux-provider warning first (no aux key in the hermetic test env),
|
||||
# displacing events[0]. The flag value is irrelevant to what this
|
||||
# test asserts, so disable it to suppress the probe.
|
||||
agent.compression_enabled = False
|
||||
events = []
|
||||
agent.status_callback = lambda ev, msg: events.append((ev, msg))
|
||||
|
||||
|
|
@ -802,3 +813,95 @@ class TestToolResultPreflightCompression:
|
|||
|
||||
mock_compress.assert_called_once()
|
||||
assert result["completed"] is True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Disabled auto-compaction on overflow (port of anomalyco/opencode#30749)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestOverflowWithCompactionDisabled:
|
||||
"""When ``compression.enabled`` is False, NO automatic compaction may
|
||||
fire — including the provider/request-size overflow recovery paths.
|
||||
|
||||
Ported from anomalyco/opencode#30749: the proactive token-threshold
|
||||
path already honoured the setting, but provider overflow errors
|
||||
(413 payload-too-large, context-overflow, long-context-tier 429) still
|
||||
silently compressed + rotated the session. The fix surfaces a terminal
|
||||
error so the user can compact manually, start fresh, or switch models.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _prefill():
|
||||
return [
|
||||
{"role": "user", "content": "previous question"},
|
||||
{"role": "assistant", "content": "previous answer"},
|
||||
]
|
||||
|
||||
def test_413_does_not_compress_when_disabled(self, agent):
|
||||
"""413 must NOT call _compress_context when compaction is disabled."""
|
||||
agent.compression_enabled = False
|
||||
err_413 = _make_413_error()
|
||||
# If the guard fails, a second (success) response would be consumed.
|
||||
agent.client.chat.completions.create.side_effect = [err_413, _mock_response()]
|
||||
|
||||
with (
|
||||
patch.object(agent, "_compress_context") as mock_compress,
|
||||
patch.object(agent, "_persist_session") as mock_persist,
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
result = agent.run_conversation("hello", conversation_history=self._prefill())
|
||||
|
||||
mock_compress.assert_not_called()
|
||||
mock_persist.assert_called()
|
||||
assert result.get("failed") is True
|
||||
assert result.get("compaction_disabled") is True
|
||||
assert "auto-compaction is disabled" in result["error"]
|
||||
|
||||
def test_context_overflow_does_not_compress_when_disabled(self, agent):
|
||||
"""400 'prompt is too long' must NOT compress when compaction disabled."""
|
||||
agent.compression_enabled = False
|
||||
err_400 = Exception(
|
||||
"Error code: 400 - {'type': 'error', 'error': {'type': "
|
||||
"'invalid_request_error', 'message': 'prompt is too long: "
|
||||
"233153 tokens > 200000 maximum'}}"
|
||||
)
|
||||
err_400.status_code = 400
|
||||
agent.client.chat.completions.create.side_effect = [err_400, _mock_response()]
|
||||
|
||||
with (
|
||||
patch.object(agent, "_compress_context") as mock_compress,
|
||||
patch.object(agent, "_persist_session"),
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
result = agent.run_conversation("hello", conversation_history=self._prefill())
|
||||
|
||||
mock_compress.assert_not_called()
|
||||
assert result.get("compaction_disabled") is True
|
||||
|
||||
def test_413_still_compresses_when_enabled(self, agent):
|
||||
"""Control: with compaction enabled, 413 still triggers compression.
|
||||
|
||||
Guards against the disabled-path guard accidentally swallowing the
|
||||
enabled path.
|
||||
"""
|
||||
agent.compression_enabled = True
|
||||
err_413 = _make_413_error()
|
||||
ok_resp = _mock_response(content="Recovered", finish_reason="stop")
|
||||
agent.client.chat.completions.create.side_effect = [err_413, ok_resp]
|
||||
|
||||
with (
|
||||
patch.object(agent, "_compress_context") as mock_compress,
|
||||
patch.object(agent, "_persist_session"),
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
mock_compress.return_value = (
|
||||
[{"role": "user", "content": "hello"}], "compressed",
|
||||
)
|
||||
result = agent.run_conversation("hello", conversation_history=self._prefill())
|
||||
|
||||
mock_compress.assert_called_once()
|
||||
assert result["completed"] is True
|
||||
assert result.get("compaction_disabled") is not True
|
||||
|
|
|
|||
|
|
@ -3903,6 +3903,7 @@ class TestRunConversation:
|
|||
def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent):
|
||||
"""GLM/Z.AI uses 'Prompt exceeds max length' for context overflow."""
|
||||
self._setup_agent(agent)
|
||||
agent.compression_enabled = True # this test verifies overflow→compression fires
|
||||
err_400 = Exception(
|
||||
"Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}"
|
||||
)
|
||||
|
|
@ -3937,6 +3938,7 @@ class TestRunConversation:
|
|||
to the generic 128K fallback tier.
|
||||
"""
|
||||
self._setup_agent(agent)
|
||||
agent.compression_enabled = True # this test verifies overflow→compression fires
|
||||
agent.provider = "minimax"
|
||||
agent.model = "MiniMax-M2.7-highspeed"
|
||||
agent.base_url = "https://api.minimax.io/anthropic"
|
||||
|
|
@ -3982,6 +3984,7 @@ class TestRunConversation:
|
|||
rely on compression — see #33669 / PR #33826.
|
||||
"""
|
||||
self._setup_agent(agent)
|
||||
agent.compression_enabled = True # this test verifies overflow→compression fires
|
||||
agent.provider = "openrouter"
|
||||
agent.model = "some/unknown-model"
|
||||
agent.base_url = "https://openrouter.ai/api/v1"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue