From 4f24db4258d686015f445096458eeaf3c4bc4bf8 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 20 Apr 2026 00:56:04 -0700
Subject: [PATCH] fix(compression): enforce 64k floor on aux model +
 auto-correct threshold (#12898)

Context compression silently failed when the auxiliary compression model's
context window was smaller than the main model's compression threshold
(e.g. GLM-4.5-air at 131k paired with a 150k threshold).  The feasibility
check warned but the session kept running and compression attempts errored
out mid-conversation.

Two changes in _check_compression_model_feasibility():

1. Hard floor: if detected aux context < MINIMUM_CONTEXT_LENGTH (64k),
   raise ValueError so the session refuses to start.  Mirrors the existing
   main-model rejection at AIAgent.__init__ line 1600.  A compression model
   below 64k cannot summarise a full threshold-sized window.

2. Auto-correct: when aux context is >= 64k but below the computed
   threshold, lower the live compressor's threshold_tokens to aux_context
   (and update threshold_percent to match so later update_model() calls
   stay in sync).  Warning reworded to say what was done and how to
   persist the fix in config.yaml.

Only ValueError is re-raised; any other exception raised during the check
is still swallowed as non-fatal.
---
 run_agent.py                                  | 71 ++++++++++++++-----
 .../run_agent/test_compression_feasibility.py | 59 +++++++++++----
 2 files changed, 99 insertions(+), 31 deletions(-)

diff --git a/run_agent.py b/run_agent.py
index d4118db38..f8b0423b9 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -2051,7 +2051,10 @@ class AIAgent:
             return
         try:
             from agent.auxiliary_client import get_text_auxiliary_client
-            from agent.model_metadata import get_model_context_length
+            from agent.model_metadata import (
+                MINIMUM_CONTEXT_LENGTH,
+                get_model_context_length,
+            )
 
             client, aux_model = get_text_auxiliary_client(
                 "compression",
@@ -2081,25 +2084,54 @@ class AIAgent:
                 config_context_length=getattr(self, "_aux_compression_context_length_config", None),
             )
 
+            # Hard floor: the auxiliary compression model must have at least
+            # MINIMUM_CONTEXT_LENGTH (64K) tokens of context.  The main model
+            # is already required to meet this floor (checked earlier in
+            # __init__), so the compression model must too — otherwise it
+            # cannot summarise a full threshold-sized window of main-model
+            # content.  Mirrors the main-model rejection pattern.
+            if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
+                raise ValueError(
+                    f"Auxiliary compression model {aux_model} has a context "
+                    f"window of {aux_context:,} tokens, which is below the "
+                    f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
+                    f"Agent.  Choose a compression model with at least "
+                    f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
+                    f"auxiliary.compression.model in config.yaml), or set "
+                    f"auxiliary.compression.context_length to override the "
+                    f"detected value if it is wrong."
+                )
+
             threshold = self.context_compressor.threshold_tokens
             if aux_context < threshold:
-                # Suggest a threshold that would fit the aux model,
-                # rounded down to a clean percentage.
-                safe_pct = int((aux_context / self.context_compressor.context_length) * 100)
+                # Auto-correct: lower the live session threshold so
+                # compression actually works this session.  The hard floor
+                # above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
+                # so the new threshold is always >= 64K.
+                old_threshold = threshold
+                new_threshold = aux_context
+                self.context_compressor.threshold_tokens = new_threshold
+                # Keep threshold_percent in sync so future main-model
+                # context_length changes (update_model) re-derive from a
+                # sensible number rather than the original too-high value.
+                main_ctx = self.context_compressor.context_length
+                if main_ctx:
+                    self.context_compressor.threshold_percent = (
+                        new_threshold / main_ctx
+                    )
+                safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
                 msg = (
-                    f"⚠ Compression model ({aux_model}) context "
-                    f"is {aux_context:,} tokens, but the main model's "
-                    f"compression threshold is {threshold:,} tokens. "
-                    f"Context compression will not be possible — the "
-                    f"content to summarise will exceed the auxiliary "
-                    f"model's context window.\n"
-                    f"  Fix options (config.yaml):\n"
+                    f"⚠ Compression model ({aux_model}) context is "
+                    f"{aux_context:,} tokens, but the main model's "
+                    f"compression threshold was {old_threshold:,} tokens. "
+                    f"Auto-lowered this session's threshold to "
+                    f"{new_threshold:,} tokens so compression can run.\n"
+                    f"  To make this permanent, edit config.yaml — either:\n"
                     f"  1. Use a larger compression model:\n"
                     f"       auxiliary:\n"
                     f"         compression:\n"
-                    f"           model: <model-with-{threshold:,}+-context>\n"
-                    f"  2. Lower the compression threshold to fit "
-                    f"the current model:\n"
+                    f"           model: <model-with-{old_threshold:,}+-context>\n"
+                    f"  2. Lower the compression threshold:\n"
                     f"       compression:\n"
                     f"         threshold: 0.{safe_pct:02d}"
                 )
@@ -2108,12 +2140,17 @@ class AIAgent:
                 logger.warning(
                     "Auxiliary compression model %s has %d token context, "
                     "below the main model's compression threshold of %d "
-                    "tokens — compression summaries will fail or be "
-                    "severely truncated.",
+                    "tokens — auto-lowered session threshold to %d to "
+                    "keep compression working.",
                     aux_model,
                     aux_context,
-                    threshold,
+                    old_threshold,
+                    new_threshold,
                 )
+        except ValueError:
+            # Hard rejections (aux below minimum context) must propagate
+            # so the session refuses to start.
+            raise
         except Exception as exc:
             logger.debug(
                 "Compression feasibility check failed (non-fatal): %s", exc
diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py
index 451eeb2f7..25dc0c01a 100644
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@@ -10,6 +10,8 @@ Two-phase design:
 
 from unittest.mock import MagicMock, patch
 
+import pytest
+
 from run_agent import AIAgent
 from agent.context_compressor import ContextCompressor
 
@@ -51,12 +53,13 @@ def _make_agent(
 # ── Core warning logic ──────────────────────────────────────────────
 
 
-@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
+@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
-def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
-    """Warning emitted when aux model context < main model threshold."""
+def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
+    """Auto-correction: aux >= 64K floor but < threshold → lower threshold
+    to aux_context so compression still works this session."""
     agent = _make_agent(main_context=200_000, threshold_percent=0.50)
-    # threshold = 100,000 — aux has only 32,768
+    # threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold)
     mock_client = MagicMock()
     mock_client.base_url = "https://openrouter.ai/api/v1"
     mock_client.api_key = "sk-aux"
@@ -69,16 +72,41 @@ def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
 
     assert len(messages) == 1
     assert "Compression model" in messages[0]
-    assert "32,768" in messages[0]
-    assert "100,000" in messages[0]
-    assert "will not be possible" in messages[0]
-    # Actionable fix guidance included
-    assert "Fix options" in messages[0]
+    assert "80,000" in messages[0]        # aux context
+    assert "100,000" in messages[0]       # old threshold
+    assert "Auto-lowered" in messages[0]
+    # Actionable persistence guidance included
+    assert "config.yaml" in messages[0]
     assert "auxiliary:" in messages[0]
     assert "compression:" in messages[0]
     assert "threshold:" in messages[0]
     # Warning stored for gateway replay
     assert agent._compression_warning is not None
+    # Threshold on the live compressor was actually lowered
+    assert agent.context_compressor.threshold_tokens == 80_000
+
+
+@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
+    """Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
+    refuses to start (ValueError), mirroring the main-model rejection."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+    mock_client = MagicMock()
+    mock_client.base_url = "https://openrouter.ai/api/v1"
+    mock_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_client, "tiny-aux-model")
+
+    agent._emit_status = lambda msg: None
+
+    with pytest.raises(ValueError) as exc_info:
+        agent._check_compression_model_feasibility()
+
+    err = str(exc_info.value)
+    assert "tiny-aux-model" in err
+    assert "32,768" in err
+    assert "64,000" in err
+    assert "below the minimum" in err
 
 
 @patch("agent.model_metadata.get_model_context_length", return_value=200_000)
@@ -294,8 +322,9 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len):
 
 @patch("agent.model_metadata.get_model_context_length", return_value=99_999)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
-def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
-    """Warning fires when aux context is one token below the threshold."""
+def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
+    """Auto-correct fires when aux context is one token below the threshold
+    (and above the 64K hard floor)."""
     agent = _make_agent(main_context=200_000, threshold_percent=0.50)
     mock_client = MagicMock()
     mock_client.base_url = "https://openrouter.ai/api/v1"
@@ -309,12 +338,14 @@ def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
 
     assert len(messages) == 1
     assert "small-model" in messages[0]
+    assert "Auto-lowered" in messages[0]
+    assert agent.context_compressor.threshold_tokens == 99_999
 
 
 # ── Two-phase: __init__ + run_conversation replay ───────────────────
 
 
-@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
+@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
     """__init__ stores the warning; _replay sends it through status_callback."""
@@ -338,7 +369,7 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
     agent._replay_compression_warning()
 
     assert any(
-        ev == "lifecycle" and "will not be possible" in msg
+        ev == "lifecycle" and "Auto-lowered" in msg
         for ev, msg in callback_events
     )
 
@@ -375,7 +406,7 @@ def test_replay_without_callback_is_noop():
     agent._replay_compression_warning()
 
 
-@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
+@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
     """After replay in run_conversation, _compression_warning is cleared