diff --git a/run_agent.py b/run_agent.py
index 444dc17819..7187499bb1 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -2425,8 +2425,24 @@ class AIAgent:
                 # compression actually works this session. The hard floor
                 # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
                 # so the new threshold is always >= 64K.
+                #
+                # Headroom: the threshold budgets RAW MESSAGES only, but the
+                # actual request auxiliary callers send also includes the
+                # system prompt and every tool schema. With 50+ tools that
+                # overhead can be 25-30K tokens; setting new_threshold =
+                # aux_context directly would let messages grow right to the
+                # aux limit and the first compression/flush request would
+                # overflow with HTTP 400. Subtract a dynamic headroom
+                # estimate so the full request still fits.
+                from agent.model_metadata import estimate_request_tokens_rough
+                tool_overhead = estimate_request_tokens_rough([], tools=self.tools)
+                # System prompt is not yet built at __init__ time; allow a
+                # conservative 10K budget (SOUL/AGENTS.md + memory snapshot +
+                # skills guidance) plus 2K for the flush instruction and a
+                # small safety margin.
+                headroom = tool_overhead + 12_000
                 old_threshold = threshold
-                new_threshold = aux_context
+                new_threshold = max(aux_context - headroom, MINIMUM_CONTEXT_LENGTH)
                 self.context_compressor.threshold_tokens = new_threshold
                 # Keep threshold_percent in sync so future main-model
                 # context_length changes (update_model) re-derive from a
diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py
index 25dc0c01ab..2050bee28e 100644
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@@ -41,6 +41,9 @@ def _make_agent(
     agent.tool_progress_callback = None
     agent._compression_warning = None
     agent._aux_compression_context_length_config = None
+    # Tools feed into the headroom calculation in _check_compression_model_feasibility.
+    # Tests that want to assert specific threshold values can override this.
+    agent.tools = []
 
     compressor = MagicMock(spec=ContextCompressor)
     compressor.context_length = main_context
@@ -82,8 +85,9 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
     assert "threshold:" in messages[0]
     # Warning stored for gateway replay
     assert agent._compression_warning is not None
-    # Threshold on the live compressor was actually lowered
-    assert agent.context_compressor.threshold_tokens == 80_000
+    # Threshold on the live compressor was actually lowered, accounting for
+    # the request-overhead headroom (empty tools list → ~12K headroom only).
+    assert agent.context_compressor.threshold_tokens == 68_000
 
 
 @patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@@ -339,7 +343,93 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
     assert len(messages) == 1
     assert "small-model" in messages[0]
     assert "Auto-lowered" in messages[0]
-    assert agent.context_compressor.threshold_tokens == 99_999
+    assert agent.context_compressor.threshold_tokens == 87_999
+
+
+# ── Headroom for system prompt + tool schemas ────────────────────────
+
+
+@patch("agent.model_metadata.get_model_context_length", return_value=128_000)
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_auto_lowered_threshold_reserves_headroom_for_tools_and_system(mock_get_client, mock_ctx_len):
+    """When aux context binds the threshold, new_threshold must leave room
+    for the system prompt and tool schemas that auxiliary callers
+    (compression summariser, flush_memories) prepend to the message list.
+
+    Without headroom, a full-budget message window + ~25K system/tool
+    overhead overflows the aux model with HTTP 400. Regression guard for
+    the flush_memories-on-busy-toolset overflow path.
+    """
+    # Main context 200K, threshold 70% = 140K. Aux pins at 128K (below
+    # threshold → triggers auto-correct).
+    agent = _make_agent(main_context=200_000, threshold_percent=0.70)
+
+    # Build a realistic tool schema load.
+    agent.tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": f"tool_{i}",
+                "description": "x" * 200,
+                "parameters": {"type": "object", "properties": {"arg": {"type": "string", "description": "y" * 120}}},
+            },
+        }
+        for i in range(50)
+    ]
+
+    mock_client = MagicMock()
+    mock_client.base_url = "https://openrouter.ai/api/v1"
+    mock_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_client, "model-with-128k")
+
+    agent._emit_status = lambda msg: None
+    agent._check_compression_model_feasibility()
+
+    new_threshold = agent.context_compressor.threshold_tokens
+
+    # Tool schemas must add headroom beyond the fixed 12K budget (empty tools → 116K).
+    assert new_threshold < 128_000 - 12_000, (
+        f"threshold {new_threshold} did not reserve headroom below aux=128,000 "
+        f"— system prompt + tools would overflow the aux model"
+    )
+    # Must respect the 64K hard floor.
+    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
+    assert new_threshold >= MINIMUM_CONTEXT_LENGTH
+
+
+@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_headroom_floors_at_minimum_context(mock_get_client, mock_ctx_len):
+    """If headroom subtraction would push below 64K floor, clamp to 64K
+    rather than refusing the session — the aux is still workable for a
+    smaller message window.
+    """
+    # Aux at 80K, with enough tools to push headroom > 16K → naive subtract
+    # would land at < 64K. The max(..., MINIMUM_CONTEXT_LENGTH) clamp must
+    # keep the session running.
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+    agent.tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": f"tool_{i}",
+                "description": "z" * 2_000,  # fat descriptions
+                "parameters": {},
+            },
+        }
+        for i in range(30)
+    ]
+
+    mock_client = MagicMock()
+    mock_client.base_url = "https://openrouter.ai/api/v1"
+    mock_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_client, "small-aux-model")
+
+    agent._emit_status = lambda msg: None
+    agent._check_compression_model_feasibility()
+
+    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
+    assert agent.context_compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH
 
 
 # ── Two-phase: __init__ + run_conversation replay ───────────────────