fix(compression): enforce 64k floor on aux model + auto-correct threshold (#12898)

Context compression silently failed when the auxiliary compression model's
context window was smaller than the main model's compression threshold
(e.g. GLM-4.5-air at 131k paired with a 150k threshold).  The feasibility
check warned but the session kept running and compression attempts errored
out mid-conversation.

Two changes in _check_compression_model_feasibility():

1. Hard floor: if detected aux context < MINIMUM_CONTEXT_LENGTH (64k),
   raise ValueError so the session refuses to start.  Mirrors the existing
   main-model rejection at AIAgent.__init__ line 1600.  A compression model
   below 64k cannot summarise a full threshold-sized window.

2. Auto-correct: when aux context is >= 64k but below the computed
   threshold, lower the live compressor's threshold_tokens to aux_context
   (and update threshold_percent to match so later update_model() calls
   stay in sync).  Warning reworded to say what was done and how to
   persist the fix in config.yaml.

Only ValueError is re-raised; any other exception raised during the check
is still swallowed and logged as non-fatal.
This commit is contained in:
Teknium 2026-04-20 00:56:04 -07:00 committed by GitHub
parent 03e3c22e86
commit 4f24db4258
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 99 additions and 31 deletions

View file

@ -2051,7 +2051,10 @@ class AIAgent:
return
try:
from agent.auxiliary_client import get_text_auxiliary_client
from agent.model_metadata import get_model_context_length
from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
get_model_context_length,
)
client, aux_model = get_text_auxiliary_client(
"compression",
@ -2081,25 +2084,54 @@ class AIAgent:
config_context_length=getattr(self, "_aux_compression_context_length_config", None),
)
# Hard floor: the auxiliary compression model must have at least
# MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model
# is already required to meet this floor (checked earlier in
# __init__), so the compression model must too — otherwise it
# cannot summarise a full threshold-sized window of main-model
# content. Mirrors the main-model rejection pattern.
if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
raise ValueError(
f"Auxiliary compression model {aux_model} has a context "
f"window of {aux_context:,} tokens, which is below the "
f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
f"Agent. Choose a compression model with at least "
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
f"auxiliary.compression.model in config.yaml), or set "
f"auxiliary.compression.context_length to override the "
f"detected value if it is wrong."
)
threshold = self.context_compressor.threshold_tokens
if aux_context < threshold:
# Suggest a threshold that would fit the aux model,
# rounded down to a clean percentage.
safe_pct = int((aux_context / self.context_compressor.context_length) * 100)
# Auto-correct: lower the live session threshold so
# compression actually works this session. The hard floor
# above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
# so the new threshold is always >= 64K.
old_threshold = threshold
new_threshold = aux_context
self.context_compressor.threshold_tokens = new_threshold
# Keep threshold_percent in sync so future main-model
# context_length changes (update_model) re-derive from a
# sensible number rather than the original too-high value.
main_ctx = self.context_compressor.context_length
if main_ctx:
self.context_compressor.threshold_percent = (
new_threshold / main_ctx
)
safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
msg = (
f"⚠ Compression model ({aux_model}) context "
f"is {aux_context:,} tokens, but the main model's "
f"compression threshold is {threshold:,} tokens. "
f"Context compression will not be possible — the "
f"content to summarise will exceed the auxiliary "
f"model's context window.\n"
f" Fix options (config.yaml):\n"
f"⚠ Compression model ({aux_model}) context is "
f"{aux_context:,} tokens, but the main model's "
f"compression threshold was {old_threshold:,} tokens. "
f"Auto-lowered this session's threshold to "
f"{new_threshold:,} tokens so compression can run.\n"
f" To make this permanent, edit config.yaml — either:\n"
f" 1. Use a larger compression model:\n"
f" auxiliary:\n"
f" compression:\n"
f" model: <model-with-{threshold:,}+-context>\n"
f" 2. Lower the compression threshold to fit "
f"the current model:\n"
f" model: <model-with-{old_threshold:,}+-context>\n"
f" 2. Lower the compression threshold:\n"
f" compression:\n"
f" threshold: 0.{safe_pct:02d}"
)
@ -2108,12 +2140,17 @@ class AIAgent:
logger.warning(
"Auxiliary compression model %s has %d token context, "
"below the main model's compression threshold of %d "
"tokens — compression summaries will fail or be "
"severely truncated.",
"tokens — auto-lowered session threshold to %d to "
"keep compression working.",
aux_model,
aux_context,
threshold,
old_threshold,
new_threshold,
)
except ValueError:
# Hard rejections (aux below minimum context) must propagate
# so the session refuses to start.
raise
except Exception as exc:
logger.debug(
"Compression feasibility check failed (non-fatal): %s", exc

View file

@ -10,6 +10,8 @@ Two-phase design:
from unittest.mock import MagicMock, patch
import pytest
from run_agent import AIAgent
from agent.context_compressor import ContextCompressor
@ -51,12 +53,13 @@ def _make_agent(
# ── Core warning logic ──────────────────────────────────────────────
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
"""Warning emitted when aux model context < main model threshold."""
def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
"""Auto-correction: aux >= 64K floor but < threshold → lower threshold
to aux_context so compression still works this session."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
# threshold = 100,000 — aux has only 32,768
# threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
@ -69,16 +72,41 @@ def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
assert len(messages) == 1
assert "Compression model" in messages[0]
assert "32,768" in messages[0]
assert "100,000" in messages[0]
assert "will not be possible" in messages[0]
# Actionable fix guidance included
assert "Fix options" in messages[0]
assert "80,000" in messages[0] # aux context
assert "100,000" in messages[0] # old threshold
assert "Auto-lowered" in messages[0]
# Actionable persistence guidance included
assert "config.yaml" in messages[0]
assert "auxiliary:" in messages[0]
assert "compression:" in messages[0]
assert "threshold:" in messages[0]
# Warning stored for gateway replay
assert agent._compression_warning is not None
# Threshold on the live compressor was actually lowered
assert agent.context_compressor.threshold_tokens == 80_000
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
"""Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
refuses to start (ValueError), mirroring the main-model rejection."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
mock_client.api_key = "sk-aux"
mock_get_client.return_value = (mock_client, "tiny-aux-model")
agent._emit_status = lambda msg: None
with pytest.raises(ValueError) as exc_info:
agent._check_compression_model_feasibility()
err = str(exc_info.value)
assert "tiny-aux-model" in err
assert "32,768" in err
assert "64,000" in err
assert "below the minimum" in err
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
@ -294,8 +322,9 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len):
@patch("agent.model_metadata.get_model_context_length", return_value=99_999)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
"""Warning fires when aux context is one token below the threshold."""
def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
"""Auto-correct fires when aux context is one token below the threshold
(and above the 64K hard floor)."""
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
mock_client = MagicMock()
mock_client.base_url = "https://openrouter.ai/api/v1"
@ -309,12 +338,14 @@ def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
assert len(messages) == 1
assert "small-model" in messages[0]
assert "Auto-lowered" in messages[0]
assert agent.context_compressor.threshold_tokens == 99_999
# ── Two-phase: __init__ + run_conversation replay ───────────────────
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
"""__init__ stores the warning; _replay sends it through status_callback."""
@ -338,7 +369,7 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
agent._replay_compression_warning()
assert any(
ev == "lifecycle" and "will not be possible" in msg
ev == "lifecycle" and "Auto-lowered" in msg
for ev, msg in callback_events
)
@ -375,7 +406,7 @@ def test_replay_without_callback_is_noop():
agent._replay_compression_warning()
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
"""After replay in run_conversation, _compression_warning is cleared