fix(compression): enforce 64k floor on aux model + auto-correct threshold (#12898)

Context compression silently failed when the auxiliary compression model's
context window was smaller than the main model's compression threshold
(e.g. GLM-4.5-air at 131k paired with a 150k threshold).  The feasibility
check warned but the session kept running and compression attempts errored
out mid-conversation.

Two changes in _check_compression_model_feasibility():

1. Hard floor: if detected aux context < MINIMUM_CONTEXT_LENGTH (64k),
   raise ValueError so the session refuses to start.  Mirrors the existing
   main-model rejection at AIAgent.__init__ line 1600.  A compression model
   below 64k cannot summarise a full threshold-sized window.

2. Auto-correct: when aux context is >= 64k but below the computed
   threshold, lower the live compressor's threshold_tokens to aux_context
   (and update threshold_percent to match so later update_model() calls
   stay in sync).  Warning reworded to say what was done and how to
   persist the fix in config.yaml.

Only ValueError is re-raised; any other exception raised during the check
is still swallowed and logged as non-fatal.
This commit is contained in:
Teknium 2026-04-20 00:56:04 -07:00 committed by GitHub
parent 03e3c22e86
commit 4f24db4258
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 99 additions and 31 deletions

View file

@ -2051,7 +2051,10 @@ class AIAgent:
return
try:
from agent.auxiliary_client import get_text_auxiliary_client
from agent.model_metadata import get_model_context_length
from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
get_model_context_length,
)
client, aux_model = get_text_auxiliary_client(
"compression",
@ -2081,25 +2084,54 @@ class AIAgent:
config_context_length=getattr(self, "_aux_compression_context_length_config", None),
)
# Hard floor: the auxiliary compression model must have at least
# MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model
# is already required to meet this floor (checked earlier in
# __init__), so the compression model must too — otherwise it
# cannot summarise a full threshold-sized window of main-model
# content. Mirrors the main-model rejection pattern.
if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
raise ValueError(
f"Auxiliary compression model {aux_model} has a context "
f"window of {aux_context:,} tokens, which is below the "
f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
f"Agent. Choose a compression model with at least "
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
f"auxiliary.compression.model in config.yaml), or set "
f"auxiliary.compression.context_length to override the "
f"detected value if it is wrong."
)
threshold = self.context_compressor.threshold_tokens
if aux_context < threshold:
# Suggest a threshold that would fit the aux model,
# rounded down to a clean percentage.
safe_pct = int((aux_context / self.context_compressor.context_length) * 100)
# Auto-correct: lower the live session threshold so
# compression actually works this session. The hard floor
# above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
# so the new threshold is always >= 64K.
old_threshold = threshold
new_threshold = aux_context
self.context_compressor.threshold_tokens = new_threshold
# Keep threshold_percent in sync so future main-model
# context_length changes (update_model) re-derive from a
# sensible number rather than the original too-high value.
main_ctx = self.context_compressor.context_length
if main_ctx:
self.context_compressor.threshold_percent = (
new_threshold / main_ctx
)
safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
msg = (
f"⚠ Compression model ({aux_model}) context "
f"is {aux_context:,} tokens, but the main model's "
f"compression threshold is {threshold:,} tokens. "
f"Context compression will not be possible — the "
f"content to summarise will exceed the auxiliary "
f"model's context window.\n"
f" Fix options (config.yaml):\n"
f"⚠ Compression model ({aux_model}) context is "
f"{aux_context:,} tokens, but the main model's "
f"compression threshold was {old_threshold:,} tokens. "
f"Auto-lowered this session's threshold to "
f"{new_threshold:,} tokens so compression can run.\n"
f" To make this permanent, edit config.yaml — either:\n"
f" 1. Use a larger compression model:\n"
f" auxiliary:\n"
f" compression:\n"
f" model: <model-with-{threshold:,}+-context>\n"
f" 2. Lower the compression threshold to fit "
f"the current model:\n"
f" model: <model-with-{old_threshold:,}+-context>\n"
f" 2. Lower the compression threshold:\n"
f" compression:\n"
f" threshold: 0.{safe_pct:02d}"
)
@ -2108,12 +2140,17 @@ class AIAgent:
logger.warning(
"Auxiliary compression model %s has %d token context, "
"below the main model's compression threshold of %d "
"tokens — compression summaries will fail or be "
"severely truncated.",
"tokens — auto-lowered session threshold to %d to "
"keep compression working.",
aux_model,
aux_context,
threshold,
old_threshold,
new_threshold,
)
except ValueError:
# Hard rejections (aux below minimum context) must propagate
# so the session refuses to start.
raise
except Exception as exc:
logger.debug(
"Compression feasibility check failed (non-fatal): %s", exc

View file

@ -10,6 +10,8 @@ Two-phase design:
from unittest.mock import MagicMock, patch
import pytest
from run_agent import AIAgent
from agent.context_compressor import ContextCompressor
@ -51,12 +53,13 @@ def _make_agent(
# ── Core warning logic ──────────────────────────────────────────────
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
"""Warning emitted when aux model context < main model threshold."""
def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
"""Auto-correction: aux >= 64K floor but < threshold → lower threshold
to aux_context so compression still works this session."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
# threshold = 100,000 — aux has only 32,768
# threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
@ -69,16 +72,41 @@ def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
assert len(messages) == 1
assert "Compression model" in messages[0]
assert "32,768" in messages[0]
assert "100,000" in messages[0]
assert "will not be possible" in messages[0]
# Actionable fix guidance included
assert "Fix options" in messages[0]
assert "80,000" in messages[0] # aux context
assert "100,000" in messages[0] # old threshold
assert "Auto-lowered" in messages[0]
# Actionable persistence guidance included
assert "config.yaml" in messages[0]
assert "auxiliary:" in messages[0]
assert "compression:" in messages[0]
assert "threshold:" in messages[0]
# Warning stored for gateway replay
assert agent._compression_warning is not None
# Threshold on the live compressor was actually lowered
assert agent.context_compressor.threshold_tokens == 80_000
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
"""Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
refuses to start (ValueError), mirroring the main-model rejection."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
mock_get_client.return_value = (mock_client, "tiny-aux-model")
agent._emit_status = lambda msg: None
with pytest.raises(ValueError) as exc_info:
agent._check_compression_model_feasibility()
err = str(exc_info.value)
assert "tiny-aux-model" in err
assert "32,768" in err
assert "64,000" in err
assert "below the minimum" in err
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
@ -294,8 +322,9 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len):
@patch("agent.model_metadata.get_model_context_length", return_value=99_999)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
"""Warning fires when aux context is one token below the threshold."""
def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
"""Auto-correct fires when aux context is one token below the threshold
(and above the 64K hard floor)."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
@ -309,12 +338,14 @@ def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
assert len(messages) == 1
assert "small-model" in messages[0]
assert "Auto-lowered" in messages[0]
assert agent.context_compressor.threshold_tokens == 99_999
# ── Two-phase: __init__ + run_conversation replay ───────────────────
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
"""__init__ stores the warning; _replay sends it through status_callback."""
@ -338,7 +369,7 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
agent._replay_compression_warning()
assert any(
ev == "lifecycle" and "will not be possible" in msg
ev == "lifecycle" and "Auto-lowered" in msg
for ev, msg in callback_events
)
@ -375,7 +406,7 @@ def test_replay_without_callback_is_noop():
agent._replay_compression_warning()
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
"""After replay in run_conversation, _compression_warning is cleared