diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 8be763513ff..d01d5d4a844 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -2720,6 +2720,61 @@ def run_conversation( # compress history and retry, not abort immediately. status_code = getattr(api_error, "status_code", None) + # ── Respect disabled auto-compaction on overflow ────── + # Ported from anomalyco/opencode#30749. When the user has + # turned auto-compaction off (``compression.enabled: false``), + # NO automatic compaction trigger may fire — including the + # provider/request-size overflow recovery paths below + # (long-context-tier 429, 413 payload-too-large, and + # context-overflow). Without this guard the proactive + # threshold path correctly honours the setting (see the + # preflight check and the post-response ``should_compress`` + # gate) but a provider overflow error would still silently + # compress + rotate the session, bypassing the user's + # explicit choice. Surface a terminal error instead so the + # user can compact manually (``/compress``), start fresh + # (``/new``), switch to a larger-context model, or reduce + # attachments. Forced compaction via ``/compress`` + # (``force=True``) is unaffected — it never reaches this loop. + _overflow_reasons = { + FailoverReason.long_context_tier, + FailoverReason.payload_too_large, + FailoverReason.context_overflow, + } + if ( + classified.reason in _overflow_reasons + and not getattr(agent, "compression_enabled", True) + ): + agent._flush_status_buffer() + agent._vprint( + f"{agent.log_prefix}❌ Context overflow, but auto-compaction is disabled " + f"(compression.enabled: false).", + force=True, + ) + agent._vprint( + f"{agent.log_prefix} 💡 Run /compress to compact manually, /new to start fresh, " + f"switch to a larger-context model, or reduce attachments.", + force=True, + ) + logger.error( + f"{agent.log_prefix}Context overflow ({classified.reason.value}) with " + f"auto-compaction disabled — not compressing." + ) + agent._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": ( + "Context overflow and auto-compaction is disabled " + "(compression.enabled: false). Run /compress to compact manually, " + "/new to start fresh, or switch to a larger-context model." + ), + "partial": True, + "failed": True, + "compaction_disabled": True, + } + # ── Anthropic Sonnet long-context tier gate ─────────── # Anthropic returns HTTP 429 "Extra usage is required for # long context requests" when a Claude Max (or similar) diff --git a/agent/gemini_native_adapter.py b/agent/gemini_native_adapter.py index b0d903372cd..a0f8e9df548 100644 --- a/agent/gemini_native_adapter.py +++ b/agent/gemini_native_adapter.py @@ -33,6 +33,13 @@ logger = logging.getLogger(__name__) DEFAULT_GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta" +# Published max output-token ceiling shared by every current Gemini text model +# (2.5 + 3.x: flash, flash-lite, pro). Used as the default when the caller +# passes max_tokens=None, because Gemini's native API otherwise applies a low +# internal default and truncates output (unlike OpenAI-compat endpoints where +# an omitted limit means full budget). +GEMINI_DEFAULT_MAX_OUTPUT_TOKENS = 65535 + def is_native_gemini_base_url(base_url: str) -> bool: """Return True when the endpoint speaks Gemini's native REST API.""" @@ -414,6 +421,18 @@ def build_gemini_request( generation_config["temperature"] = temperature if max_tokens is not None: generation_config["maxOutputTokens"] = max_tokens + else: + # Gemini's native generateContent does NOT treat an omitted + # maxOutputTokens as "use the model's full output budget" — it applies + # a low internal default and the model stops early with + # finishReason=MAX_TOKENS, truncating tool calls mid-stream (Hermes + # then retries 3× and refuses the incomplete call). Every current + # Gemini text model (2.5 + 3.x, flash / flash-lite / pro) caps at + # 65,535 output tokens, so default to that ceiling when the caller + # passes None ("unlimited"). See the OpenAI-compat path where omitting + # the field genuinely means full budget — that assumption does not + # hold on the native API. + generation_config["maxOutputTokens"] = GEMINI_DEFAULT_MAX_OUTPUT_TOKENS if top_p is not None: generation_config["topP"] = top_p if stop: diff --git a/agent/transports/chat_completions.py b/agent/transports/chat_completions.py index 7b1935528d9..0c17e309a8b 100644 --- a/agent/transports/chat_completions.py +++ b/agent/transports/chat_completions.py @@ -571,7 +571,28 @@ class ChatCompletionsTransport(ProviderTransport): api_kwargs[k] = v if extra_body: - api_kwargs["extra_body"] = extra_body + # Native Gemini (generativelanguage.googleapis.com, non-/openai) + # speaks Google's REST schema, not OpenAI's. OpenAI-style extra_body + # keys (tags, reasoning, provider, plugins, …) are unknown fields + # there and Gemini rejects the whole request with a non-retryable + # HTTP 400 ("Invalid JSON payload received. Unknown name 'tags'"). + # This happens when a profile that emits extra_body (e.g. the Nous + # profile's portal `tags`) is active but the resolved endpoint is a + # Gemini base_url — typical when only Google credentials are set and + # a fallback/aux call lands on Gemini. The native client only reads + # thinking_config from extra_body, so drop everything else here. + try: + from agent.gemini_native_adapter import is_native_gemini_base_url + _native_gemini = is_native_gemini_base_url(params.get("base_url")) + except Exception: + _native_gemini = False + if _native_gemini: + extra_body = { + k: v for k, v in extra_body.items() + if k in ("thinking_config", "thinkingConfig") + } + if extra_body: + api_kwargs["extra_body"] = extra_body return api_kwargs diff --git a/tests/agent/test_gemini_native_adapter.py b/tests/agent/test_gemini_native_adapter.py index 4b066b4f454..4f894c512a6 100644 --- a/tests/agent/test_gemini_native_adapter.py +++ b/tests/agent/test_gemini_native_adapter.py @@ -326,3 +326,27 @@ def test_stream_event_translation_keeps_identical_calls_in_distinct_parts(): assert tool_chunks[0].choices[0].delta.tool_calls[0].index == 0 assert tool_chunks[1].choices[0].delta.tool_calls[0].index == 1 assert tool_chunks[0].choices[0].delta.tool_calls[0].id != tool_chunks[1].choices[0].delta.tool_calls[0].id + + +def test_max_tokens_none_defaults_to_gemini_output_ceiling(): + """max_tokens=None must send the model's full output ceiling, not omit it. + + Gemini's native generateContent applies a low internal default when + maxOutputTokens is absent, truncating tool calls mid-stream. Hermes passes + None to mean "unlimited", so the adapter must translate that to the + published 65,535 ceiling rather than leaving the field unset. + """ + from agent.gemini_native_adapter import ( + build_gemini_request, + GEMINI_DEFAULT_MAX_OUTPUT_TOKENS, + ) + + req = build_gemini_request(messages=[{"role": "user", "content": "hi"}], max_tokens=None) + assert req["generationConfig"]["maxOutputTokens"] == GEMINI_DEFAULT_MAX_OUTPUT_TOKENS == 65535 + + +def test_explicit_max_tokens_is_respected(): + from agent.gemini_native_adapter import build_gemini_request + + req = build_gemini_request(messages=[{"role": "user", "content": "hi"}], max_tokens=4096) + assert req["generationConfig"]["maxOutputTokens"] == 4096 diff --git a/tests/agent/transports/test_chat_completions.py b/tests/agent/transports/test_chat_completions.py index 255d46f4381..0b54fe47059 100644 --- a/tests/agent/transports/test_chat_completions.py +++ b/tests/agent/transports/test_chat_completions.py @@ -859,3 +859,53 @@ class TestChatCompletionsCacheStats: r = SimpleNamespace(usage=SimpleNamespace(prompt_tokens_details=details)) result = transport.extract_cache_stats(r) assert result == {"cached_tokens": 500, "creation_tokens": 100} + + +class TestChatCompletionsGeminiNativeExtraBodyStrip: + """Profile extra_body (e.g. Nous portal tags) must not reach a native + Gemini endpoint — Google's REST API rejects unknown fields with HTTP 400. + """ + + def _nous_profile(self): + from providers import get_provider_profile + return get_provider_profile("nous") + + def test_tags_stripped_when_endpoint_is_native_gemini(self, transport): + kw = transport.build_kwargs( + "anthropic/claude-sonnet-4.6", + [{"role": "user", "content": "hi"}], + None, + provider_profile=self._nous_profile(), + base_url="https://generativelanguage.googleapis.com/v1beta", + session_id="s1", + max_tokens=None, + ) + eb = kw.get("extra_body") + assert not eb or "tags" not in eb + + def test_tags_preserved_on_nous_endpoint(self, transport): + kw = transport.build_kwargs( + "hermes-3-405b", + [{"role": "user", "content": "hi"}], + None, + provider_profile=self._nous_profile(), + base_url="https://inference.nousresearch.com/v1", + session_id="s1", + max_tokens=None, + ) + eb = kw.get("extra_body") + assert eb and "tags" in eb + + def test_tags_pass_through_on_gemini_openai_compat(self, transport): + # /openai compat endpoint is not "native" — unchanged behavior. + kw = transport.build_kwargs( + "anthropic/claude-sonnet-4.6", + [{"role": "user", "content": "hi"}], + None, + provider_profile=self._nous_profile(), + base_url="https://generativelanguage.googleapis.com/v1beta/openai", + session_id="s1", + max_tokens=None, + ) + eb = kw.get("extra_body") + assert eb and "tags" in eb diff --git a/tests/run_agent/test_413_compression.py b/tests/run_agent/test_413_compression.py index cadb26c449b..2b8c32e297b 100644 --- a/tests/run_agent/test_413_compression.py +++ b/tests/run_agent/test_413_compression.py @@ -94,7 +94,11 @@ def agent(): a._cached_system_prompt = "You are helpful." a._use_prompt_caching = False a.tool_delay = 0 - a.compression_enabled = False + # Default matches production (`compression.enabled` defaults to True). + # Overflow-recovery tests below verify that 413 / context-overflow + # errors DO trigger compression; the disabled-path behavior is + # covered explicitly by TestOverflowWithCompactionDisabled. + a.compression_enabled = True a.save_trajectories = False return a @@ -415,6 +419,13 @@ class TestPreflightCompression: def test_compress_context_emits_lifecycle_status_before_work(self, agent): """Direct context compression should tell gateway users why the turn paused.""" + # This test calls _compress_context directly and asserts the FIRST + # status event is the lifecycle "Compacting context" message. With + # compaction enabled the lazy feasibility probe would emit an + # aux-provider warning first (no aux key in the hermetic test env), + # displacing events[0]. The flag value is irrelevant to what this + # test asserts, so disable it to suppress the probe. + agent.compression_enabled = False events = [] agent.status_callback = lambda ev, msg: events.append((ev, msg)) @@ -802,3 +813,95 @@ class TestToolResultPreflightCompression: mock_compress.assert_called_once() assert result["completed"] is True + + +# --------------------------------------------------------------------------- +# Disabled auto-compaction on overflow (port of anomalyco/opencode#30749) +# --------------------------------------------------------------------------- + +class TestOverflowWithCompactionDisabled: + """When ``compression.enabled`` is False, NO automatic compaction may + fire — including the provider/request-size overflow recovery paths. + + Ported from anomalyco/opencode#30749: the proactive token-threshold + path already honoured the setting, but provider overflow errors + (413 payload-too-large, context-overflow, long-context-tier 429) still + silently compressed + rotated the session. The fix surfaces a terminal + error so the user can compact manually, start fresh, or switch models. + """ + + @staticmethod + def _prefill(): + return [ + {"role": "user", "content": "previous question"}, + {"role": "assistant", "content": "previous answer"}, + ] + + def test_413_does_not_compress_when_disabled(self, agent): + """413 must NOT call _compress_context when compaction is disabled.""" + agent.compression_enabled = False + err_413 = _make_413_error() + # If the guard fails, a second (success) response would be consumed. + agent.client.chat.completions.create.side_effect = [err_413, _mock_response()] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session") as mock_persist, + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=self._prefill()) + + mock_compress.assert_not_called() + mock_persist.assert_called() + assert result.get("failed") is True + assert result.get("compaction_disabled") is True + assert "auto-compaction is disabled" in result["error"] + + def test_context_overflow_does_not_compress_when_disabled(self, agent): + """400 'prompt is too long' must NOT compress when compaction disabled.""" + agent.compression_enabled = False + err_400 = Exception( + "Error code: 400 - {'type': 'error', 'error': {'type': " + "'invalid_request_error', 'message': 'prompt is too long: " + "233153 tokens > 200000 maximum'}}" + ) + err_400.status_code = 400 + agent.client.chat.completions.create.side_effect = [err_400, _mock_response()] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello", conversation_history=self._prefill()) + + mock_compress.assert_not_called() + assert result.get("compaction_disabled") is True + + def test_413_still_compresses_when_enabled(self, agent): + """Control: with compaction enabled, 413 still triggers compression. + + Guards against the disabled-path guard accidentally swallowing the + enabled path. + """ + agent.compression_enabled = True + err_413 = _make_413_error() + ok_resp = _mock_response(content="Recovered", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [err_413, ok_resp] + + with ( + patch.object(agent, "_compress_context") as mock_compress, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + mock_compress.return_value = ( + [{"role": "user", "content": "hello"}], "compressed", + ) + result = agent.run_conversation("hello", conversation_history=self._prefill()) + + mock_compress.assert_called_once() + assert result["completed"] is True + assert result.get("compaction_disabled") is not True diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index e9e7011dd1e..ea66789fdd3 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -3903,6 +3903,7 @@ class TestRunConversation: def test_glm_prompt_exceeds_max_length_triggers_compression(self, agent): """GLM/Z.AI uses 'Prompt exceeds max length' for context overflow.""" self._setup_agent(agent) + agent.compression_enabled = True # this test verifies overflow→compression fires err_400 = Exception( "Error code: 400 - {'error': {'code': '1261', 'message': 'Prompt exceeds max length'}}" ) @@ -3937,6 +3938,7 @@ class TestRunConversation: to the generic 128K fallback tier. """ self._setup_agent(agent) + agent.compression_enabled = True # this test verifies overflow→compression fires agent.provider = "minimax" agent.model = "MiniMax-M2.7-highspeed" agent.base_url = "https://api.minimax.io/anthropic" @@ -3982,6 +3984,7 @@ class TestRunConversation: rely on compression — see #33669 / PR #33826. """ self._setup_agent(agent) + agent.compression_enabled = True # this test verifies overflow→compression fires agent.provider = "openrouter" agent.model = "some/unknown-model" agent.base_url = "https://openrouter.ai/api/v1"