From 4f24db4258d686015f445096458eeaf3c4bc4bf8 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 20 Apr 2026 00:56:04 -0700
Subject: [PATCH] fix(compression): enforce 64k floor on aux model + auto-correct threshold (#12898)

Context compression silently failed when the auxiliary compression
model's context window was smaller than the main model's compression
threshold (e.g. GLM-4.5-air at 131k paired with a 150k threshold). The
feasibility check warned but the session kept running and compression
attempts errored out mid-conversation.

Two changes in _check_compression_model_feasibility():

1. Hard floor: if detected aux context < MINIMUM_CONTEXT_LENGTH (64k),
   raise ValueError so the session refuses to start. Mirrors the
   existing main-model rejection at AIAgent.__init__ line 1600. A
   compression model below 64k cannot summarise a full threshold-sized
   window.

2. Auto-correct: when aux context is >= 64k but below the computed
   threshold, lower the live compressor's threshold_tokens to
   aux_context (and update threshold_percent to match so later
   update_model() calls stay in sync). Warning reworded to say what was
   done and how to persist the fix in config.yaml.

Only ValueError re-raises; other exceptions in the check remain
swallowed as non-fatal.
--- run_agent.py | 71 ++++++++++++++----- .../run_agent/test_compression_feasibility.py | 59 +++++++++++---- 2 files changed, 99 insertions(+), 31 deletions(-) diff --git a/run_agent.py b/run_agent.py index d4118db38..f8b0423b9 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2051,7 +2051,10 @@ class AIAgent: return try: from agent.auxiliary_client import get_text_auxiliary_client - from agent.model_metadata import get_model_context_length + from agent.model_metadata import ( + MINIMUM_CONTEXT_LENGTH, + get_model_context_length, + ) client, aux_model = get_text_auxiliary_client( "compression", @@ -2081,25 +2084,54 @@ class AIAgent: config_context_length=getattr(self, "_aux_compression_context_length_config", None), ) + # Hard floor: the auxiliary compression model must have at least + # MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model + # is already required to meet this floor (checked earlier in + # __init__), so the compression model must too — otherwise it + # cannot summarise a full threshold-sized window of main-model + # content. Mirrors the main-model rejection pattern. + if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH: + raise ValueError( + f"Auxiliary compression model {aux_model} has a context " + f"window of {aux_context:,} tokens, which is below the " + f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes " + f"Agent. Choose a compression model with at least " + f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set " + f"auxiliary.compression.model in config.yaml), or set " + f"auxiliary.compression.context_length to override the " + f"detected value if it is wrong." + ) + threshold = self.context_compressor.threshold_tokens if aux_context < threshold: - # Suggest a threshold that would fit the aux model, - # rounded down to a clean percentage. - safe_pct = int((aux_context / self.context_compressor.context_length) * 100) + # Auto-correct: lower the live session threshold so + # compression actually works this session. 
The hard floor + # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH, + # so the new threshold is always >= 64K. + old_threshold = threshold + new_threshold = aux_context + self.context_compressor.threshold_tokens = new_threshold + # Keep threshold_percent in sync so future main-model + # context_length changes (update_model) re-derive from a + # sensible number rather than the original too-high value. + main_ctx = self.context_compressor.context_length + if main_ctx: + self.context_compressor.threshold_percent = ( + new_threshold / main_ctx + ) + safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50 msg = ( - f"⚠ Compression model ({aux_model}) context " - f"is {aux_context:,} tokens, but the main model's " - f"compression threshold is {threshold:,} tokens. " - f"Context compression will not be possible — the " - f"content to summarise will exceed the auxiliary " - f"model's context window.\n" - f" Fix options (config.yaml):\n" + f"⚠ Compression model ({aux_model}) context is " + f"{aux_context:,} tokens, but the main model's " + f"compression threshold was {old_threshold:,} tokens. " + f"Auto-lowered this session's threshold to " + f"{new_threshold:,} tokens so compression can run.\n" + f" To make this permanent, edit config.yaml — either:\n" f" 1. Use a larger compression model:\n" f" auxiliary:\n" f" compression:\n" - f" model: \n" - f" 2. Lower the compression threshold to fit " - f"the current model:\n" + f" model: \n" + f" 2. 
Lower the compression threshold:\n" f" compression:\n" f" threshold: 0.{safe_pct:02d}" ) @@ -2108,12 +2140,17 @@ class AIAgent: logger.warning( "Auxiliary compression model %s has %d token context, " "below the main model's compression threshold of %d " - "tokens — compression summaries will fail or be " - "severely truncated.", + "tokens — auto-lowered session threshold to %d to " + "keep compression working.", aux_model, aux_context, - threshold, + old_threshold, + new_threshold, ) + except ValueError: + # Hard rejections (aux below minimum context) must propagate + # so the session refuses to start. + raise except Exception as exc: logger.debug( "Compression feasibility check failed (non-fatal): %s", exc diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py index 451eeb2f7..25dc0c01a 100644 --- a/tests/run_agent/test_compression_feasibility.py +++ b/tests/run_agent/test_compression_feasibility.py @@ -10,6 +10,8 @@ Two-phase design: from unittest.mock import MagicMock, patch +import pytest + from run_agent import AIAgent from agent.context_compressor import ContextCompressor @@ -51,12 +53,13 @@ def _make_agent( # ── Core warning logic ────────────────────────────────────────────── -@patch("agent.model_metadata.get_model_context_length", return_value=32_768) +@patch("agent.model_metadata.get_model_context_length", return_value=80_000) @patch("agent.auxiliary_client.get_text_auxiliary_client") -def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len): - """Warning emitted when aux model context < main model threshold.""" +def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len): + """Auto-correction: aux >= 64K floor but < threshold → lower threshold + to aux_context so compression still works this session.""" agent = _make_agent(main_context=200_000, threshold_percent=0.50) - # threshold = 100,000 — aux has only 32,768 + # threshold = 100,000 — 
aux has 80,000 (above 64K floor, below threshold) mock_client = MagicMock() mock_client.base_url = "https://openrouter.ai/api/v1" mock_client.api_key = "sk-aux" @@ -69,16 +72,41 @@ def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len): assert len(messages) == 1 assert "Compression model" in messages[0] - assert "32,768" in messages[0] - assert "100,000" in messages[0] - assert "will not be possible" in messages[0] - # Actionable fix guidance included - assert "Fix options" in messages[0] + assert "80,000" in messages[0] # aux context + assert "100,000" in messages[0] # old threshold + assert "Auto-lowered" in messages[0] + # Actionable persistence guidance included + assert "config.yaml" in messages[0] assert "auxiliary:" in messages[0] assert "compression:" in messages[0] assert "threshold:" in messages[0] # Warning stored for gateway replay assert agent._compression_warning is not None + # Threshold on the live compressor was actually lowered + assert agent.context_compressor.threshold_tokens == 80_000 + + +@patch("agent.model_metadata.get_model_context_length", return_value=32_768) +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len): + """Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session + refuses to start (ValueError), mirroring the main-model rejection.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.50) + mock_client = MagicMock() + mock_client.base_url = "https://openrouter.ai/api/v1" + mock_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_client, "tiny-aux-model") + + agent._emit_status = lambda msg: None + + with pytest.raises(ValueError) as exc_info: + agent._check_compression_model_feasibility() + + err = str(exc_info.value) + assert "tiny-aux-model" in err + assert "32,768" in err + assert "64,000" in err + assert "below the minimum" in err @patch("agent.model_metadata.get_model_context_length", 
return_value=200_000) @@ -294,8 +322,9 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len): @patch("agent.model_metadata.get_model_context_length", return_value=99_999) @patch("agent.auxiliary_client.get_text_auxiliary_client") -def test_just_below_threshold_warns(mock_get_client, mock_ctx_len): - """Warning fires when aux context is one token below the threshold.""" +def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len): + """Auto-correct fires when aux context is one token below the threshold + (and above the 64K hard floor).""" agent = _make_agent(main_context=200_000, threshold_percent=0.50) mock_client = MagicMock() mock_client.base_url = "https://openrouter.ai/api/v1" @@ -309,12 +338,14 @@ def test_just_below_threshold_warns(mock_get_client, mock_ctx_len): assert len(messages) == 1 assert "small-model" in messages[0] + assert "Auto-lowered" in messages[0] + assert agent.context_compressor.threshold_tokens == 99_999 # ── Two-phase: __init__ + run_conversation replay ─────────────────── -@patch("agent.model_metadata.get_model_context_length", return_value=32_768) +@patch("agent.model_metadata.get_model_context_length", return_value=80_000) @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len): """__init__ stores the warning; _replay sends it through status_callback.""" @@ -338,7 +369,7 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len): agent._replay_compression_warning() assert any( - ev == "lifecycle" and "will not be possible" in msg + ev == "lifecycle" and "Auto-lowered" in msg for ev, msg in callback_events ) @@ -375,7 +406,7 @@ def test_replay_without_callback_is_noop(): agent._replay_compression_warning() -@patch("agent.model_metadata.get_model_context_length", return_value=32_768) +@patch("agent.model_metadata.get_model_context_length", return_value=80_000) 
@patch("agent.auxiliary_client.get_text_auxiliary_client") def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len): """After replay in run_conversation, _compression_warning is cleared