From f92006ce1cda1a40249fa4d5dd9c663f70a9de8d Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 25 Apr 2026 05:41:56 -0700 Subject: [PATCH] fix(compression): reserve system+tools headroom when aux binds threshold (#15631) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the auxiliary compression model's context is smaller than the main model's compression threshold, _check_compression_model_feasibility auto-lowers the session threshold. Previously it set: new_threshold = aux_context This let the raw message list grow to exactly aux_context tokens. But compression and flush_memories actually send system_prompt + tool_schemas + messages to the aux model. With 50+ tools that overhead is 25-30K tokens, so the full request overflowed aux with HTTP 400. Subtract a headroom estimate from aux_context before setting the new threshold: the actual tool-schema token count (from estimate_request_tokens_rough) plus a 12K allowance for the system prompt (not yet built at __init__ time) and flush-instruction overhead. Clamp to MINIMUM_CONTEXT_LENGTH so the session still starts even with an unusually heavy tool schema. This fixes the 'flush_memories overflow on busy toolsets' path that Teknium flagged — where main and aux can be nominally the same model but still fail with HTTP 400 because the threshold left no room for the request overhead. Same fix also protects the normal compression summarisation request on the same binding aux. Tests: two new regression tests cover the headroom reservation and the MINIMUM_CONTEXT_LENGTH floor. Two existing tests updated for the new (lower) threshold values now that empty-tools still produces a 12K static headroom deduction. 
--- run_agent.py | 18 +++- .../run_agent/test_compression_feasibility.py | 96 ++++++++++++++++++- 2 files changed, 110 insertions(+), 4 deletions(-) diff --git a/run_agent.py b/run_agent.py index 444dc17819..7187499bb1 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2425,8 +2425,24 @@ class AIAgent: # compression actually works this session. The hard floor # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH, # so the new threshold is always >= 64K. + # + # Headroom: the threshold budgets RAW MESSAGES only, but the + # actual request auxiliary callers send also includes the + # system prompt and every tool schema. With 50+ tools that + # overhead can be 25-30K tokens; setting new_threshold = + # aux_context directly would let messages grow right to the + # aux limit and the first compression/flush request would + # overflow with HTTP 400. Subtract a dynamic headroom + # estimate so the full request still fits. + from agent.model_metadata import estimate_request_tokens_rough + tool_overhead = estimate_request_tokens_rough([], tools=self.tools) + # System prompt is not yet built at __init__ time; allow a + # conservative 10K budget (SOUL/AGENTS.md + memory snapshot + + # skills guidance) plus 2K for the flush instruction and a + # small safety margin. 
+ headroom = tool_overhead + 12_000 old_threshold = threshold - new_threshold = aux_context + new_threshold = max(aux_context - headroom, MINIMUM_CONTEXT_LENGTH) self.context_compressor.threshold_tokens = new_threshold # Keep threshold_percent in sync so future main-model # context_length changes (update_model) re-derive from a diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py index 25dc0c01ab..2050bee28e 100644 --- a/tests/run_agent/test_compression_feasibility.py +++ b/tests/run_agent/test_compression_feasibility.py @@ -41,6 +41,9 @@ def _make_agent( agent.tool_progress_callback = None agent._compression_warning = None agent._aux_compression_context_length_config = None + # Tools feed into the headroom calculation in _check_compression_model_feasibility. + # Tests that want to assert specific threshold values can override this. + agent.tools = [] compressor = MagicMock(spec=ContextCompressor) compressor.context_length = main_context @@ -82,8 +85,9 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien assert "threshold:" in messages[0] # Warning stored for gateway replay assert agent._compression_warning is not None - # Threshold on the live compressor was actually lowered - assert agent.context_compressor.threshold_tokens == 80_000 + # Threshold on the live compressor was actually lowered, accounting for + # the request-overhead headroom (empty tools list → ~12K headroom only). 
+ assert agent.context_compressor.threshold_tokens == 68_000 @patch("agent.model_metadata.get_model_context_length", return_value=32_768) @@ -339,7 +343,93 @@ def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len): assert len(messages) == 1 assert "small-model" in messages[0] assert "Auto-lowered" in messages[0] - assert agent.context_compressor.threshold_tokens == 99_999 + assert agent.context_compressor.threshold_tokens == 87_999 + + +# ── Headroom for system prompt + tool schemas ──────────────────────── + + +@patch("agent.model_metadata.get_model_context_length", return_value=128_000) +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_auto_lowered_threshold_reserves_headroom_for_tools_and_system(mock_get_client, mock_ctx_len): + """When aux context binds the threshold, new_threshold must leave room + for the system prompt and tool schemas that auxiliary callers + (compression summariser, flush_memories) prepend to the message list. + + Without headroom, a full-budget message window + ~25K system/tool + overhead overflows the aux model with HTTP 400. Regression guard for + the flush_memories-on-busy-toolset overflow path. + """ + # Main context 200K, threshold 70% = 140K. Aux pins at 128K (below + # threshold → triggers auto-correct). + agent = _make_agent(main_context=200_000, threshold_percent=0.70) + + # Build a realistic tool schema load. 
+ agent.tools = [ + { + "type": "function", + "function": { + "name": f"tool_{i}", + "description": "x" * 200, + "parameters": {"type": "object", "properties": {"arg": {"type": "string", "description": "y" * 120}}}, + }, + } + for i in range(50) + ] + + mock_client = MagicMock() + mock_client.base_url = "https://openrouter.ai/api/v1" + mock_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_client, "model-with-128k") + + agent._emit_status = lambda msg: None + agent._check_compression_model_feasibility() + + new_threshold = agent.context_compressor.threshold_tokens + + # Must have strictly reserved headroom: new_threshold < aux_context. + assert new_threshold < 128_000, ( + f"threshold {new_threshold} did not reserve headroom below aux=128,000 " + f"— system prompt + tools would overflow the aux model" + ) + # Must respect the 64K hard floor. + from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + assert new_threshold >= MINIMUM_CONTEXT_LENGTH + + +@patch("agent.model_metadata.get_model_context_length", return_value=80_000) +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_headroom_floors_at_minimum_context(mock_get_client, mock_ctx_len): + """If headroom subtraction would push below 64K floor, clamp to 64K + rather than refusing the session — the aux is still workable for a + smaller message window. + """ + # Aux at 80K, with enough tools to push headroom > 16K → naive subtract + # would land at < 64K. The max(..., MINIMUM_CONTEXT_LENGTH) clamp must + # keep the session running. 
+ agent = _make_agent(main_context=200_000, threshold_percent=0.50) + agent.tools = [ + { + "type": "function", + "function": { + "name": f"tool_{i}", + "description": "z" * 2_000, # fat descriptions + "parameters": {}, + }, + } + for i in range(30) + ] + + mock_client = MagicMock() + mock_client.base_url = "https://openrouter.ai/api/v1" + mock_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_client, "small-aux-model") + + agent._emit_status = lambda msg: None + agent._check_compression_model_feasibility() + + from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + assert agent.context_compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH # ── Two-phase: __init__ + run_conversation replay ───────────────────