mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(compression): enforce 64k floor on aux model + auto-correct threshold (#12898)
Context compression silently failed when the auxiliary compression model's context window was smaller than the main model's compression threshold (e.g. GLM-4.5-air at 131k paired with a 150k threshold). The feasibility check warned, but the session kept running and compression attempts errored out mid-conversation.

Two changes in _check_compression_model_feasibility():

1. Hard floor: if detected aux context < MINIMUM_CONTEXT_LENGTH (64k), raise ValueError so the session refuses to start. Mirrors the existing main-model rejection at AIAgent.__init__ line 1600. A compression model below 64k cannot summarise a full threshold-sized window.

2. Auto-correct: when aux context is >= 64k but below the computed threshold, lower the live compressor's threshold_tokens to aux_context (and update threshold_percent to match so later update_model() calls stay in sync). Warning reworded to say what was done and how to persist the fix in config.yaml.

Only ValueError is re-raised; other exceptions in the check remain swallowed as non-fatal.
This commit is contained in:
parent
03e3c22e86
commit
4f24db4258
2 changed files with 99 additions and 31 deletions
71
run_agent.py
71
run_agent.py
|
|
@ -2051,7 +2051,10 @@ class AIAgent:
|
|||
return
|
||||
try:
|
||||
from agent.auxiliary_client import get_text_auxiliary_client
|
||||
from agent.model_metadata import get_model_context_length
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
get_model_context_length,
|
||||
)
|
||||
|
||||
client, aux_model = get_text_auxiliary_client(
|
||||
"compression",
|
||||
|
|
@ -2081,25 +2084,54 @@ class AIAgent:
|
|||
config_context_length=getattr(self, "_aux_compression_context_length_config", None),
|
||||
)
|
||||
|
||||
# Hard floor: the auxiliary compression model must have at least
|
||||
# MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model
|
||||
# is already required to meet this floor (checked earlier in
|
||||
# __init__), so the compression model must too — otherwise it
|
||||
# cannot summarise a full threshold-sized window of main-model
|
||||
# content. Mirrors the main-model rejection pattern.
|
||||
if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
|
||||
raise ValueError(
|
||||
f"Auxiliary compression model {aux_model} has a context "
|
||||
f"window of {aux_context:,} tokens, which is below the "
|
||||
f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
|
||||
f"Agent. Choose a compression model with at least "
|
||||
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
|
||||
f"auxiliary.compression.model in config.yaml), or set "
|
||||
f"auxiliary.compression.context_length to override the "
|
||||
f"detected value if it is wrong."
|
||||
)
|
||||
|
||||
threshold = self.context_compressor.threshold_tokens
|
||||
if aux_context < threshold:
|
||||
# Suggest a threshold that would fit the aux model,
|
||||
# rounded down to a clean percentage.
|
||||
safe_pct = int((aux_context / self.context_compressor.context_length) * 100)
|
||||
# Auto-correct: lower the live session threshold so
|
||||
# compression actually works this session. The hard floor
|
||||
# above guarantees aux_context >= MINIMUM_CONTEXT_LENGTH,
|
||||
# so the new threshold is always >= 64K.
|
||||
old_threshold = threshold
|
||||
new_threshold = aux_context
|
||||
self.context_compressor.threshold_tokens = new_threshold
|
||||
# Keep threshold_percent in sync so future main-model
|
||||
# context_length changes (update_model) re-derive from a
|
||||
# sensible number rather than the original too-high value.
|
||||
main_ctx = self.context_compressor.context_length
|
||||
if main_ctx:
|
||||
self.context_compressor.threshold_percent = (
|
||||
new_threshold / main_ctx
|
||||
)
|
||||
safe_pct = int((aux_context / main_ctx) * 100) if main_ctx else 50
|
||||
msg = (
|
||||
f"⚠ Compression model ({aux_model}) context "
|
||||
f"is {aux_context:,} tokens, but the main model's "
|
||||
f"compression threshold is {threshold:,} tokens. "
|
||||
f"Context compression will not be possible — the "
|
||||
f"content to summarise will exceed the auxiliary "
|
||||
f"model's context window.\n"
|
||||
f" Fix options (config.yaml):\n"
|
||||
f"⚠ Compression model ({aux_model}) context is "
|
||||
f"{aux_context:,} tokens, but the main model's "
|
||||
f"compression threshold was {old_threshold:,} tokens. "
|
||||
f"Auto-lowered this session's threshold to "
|
||||
f"{new_threshold:,} tokens so compression can run.\n"
|
||||
f" To make this permanent, edit config.yaml — either:\n"
|
||||
f" 1. Use a larger compression model:\n"
|
||||
f" auxiliary:\n"
|
||||
f" compression:\n"
|
||||
f" model: <model-with-{threshold:,}+-context>\n"
|
||||
f" 2. Lower the compression threshold to fit "
|
||||
f"the current model:\n"
|
||||
f" model: <model-with-{old_threshold:,}+-context>\n"
|
||||
f" 2. Lower the compression threshold:\n"
|
||||
f" compression:\n"
|
||||
f" threshold: 0.{safe_pct:02d}"
|
||||
)
|
||||
|
|
@ -2108,12 +2140,17 @@ class AIAgent:
|
|||
logger.warning(
|
||||
"Auxiliary compression model %s has %d token context, "
|
||||
"below the main model's compression threshold of %d "
|
||||
"tokens — compression summaries will fail or be "
|
||||
"severely truncated.",
|
||||
"tokens — auto-lowered session threshold to %d to "
|
||||
"keep compression working.",
|
||||
aux_model,
|
||||
aux_context,
|
||||
threshold,
|
||||
old_threshold,
|
||||
new_threshold,
|
||||
)
|
||||
except ValueError:
|
||||
# Hard rejections (aux below minimum context) must propagate
|
||||
# so the session refuses to start.
|
||||
raise
|
||||
except Exception as exc:
|
||||
logger.debug(
|
||||
"Compression feasibility check failed (non-fatal): %s", exc
|
||||
|
|
|
|||
|
|
@ -10,6 +10,8 @@ Two-phase design:
|
|||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from run_agent import AIAgent
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
|
|
@ -51,12 +53,13 @@ def _make_agent(
|
|||
# ── Core warning logic ──────────────────────────────────────────────
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
|
||||
"""Warning emitted when aux model context < main model threshold."""
|
||||
def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
|
||||
"""Auto-correction: aux >= 64K floor but < threshold → lower threshold
|
||||
to aux_context so compression still works this session."""
|
||||
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||
# threshold = 100,000 — aux has only 32,768
|
||||
# threshold = 100,000 — aux has 80,000 (above 64K floor, below threshold)
|
||||
mock_client = MagicMock()
|
||||
mock_client.base_url = "https://openrouter.ai/api/v1"
|
||||
mock_client.api_key = "sk-aux"
|
||||
|
|
@ -69,16 +72,41 @@ def test_warns_when_aux_context_below_threshold(mock_get_client, mock_ctx_len):
|
|||
|
||||
assert len(messages) == 1
|
||||
assert "Compression model" in messages[0]
|
||||
assert "32,768" in messages[0]
|
||||
assert "100,000" in messages[0]
|
||||
assert "will not be possible" in messages[0]
|
||||
# Actionable fix guidance included
|
||||
assert "Fix options" in messages[0]
|
||||
assert "80,000" in messages[0] # aux context
|
||||
assert "100,000" in messages[0] # old threshold
|
||||
assert "Auto-lowered" in messages[0]
|
||||
# Actionable persistence guidance included
|
||||
assert "config.yaml" in messages[0]
|
||||
assert "auxiliary:" in messages[0]
|
||||
assert "compression:" in messages[0]
|
||||
assert "threshold:" in messages[0]
|
||||
# Warning stored for gateway replay
|
||||
assert agent._compression_warning is not None
|
||||
# Threshold on the live compressor was actually lowered
|
||||
assert agent.context_compressor.threshold_tokens == 80_000
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
|
||||
"""Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
|
||||
refuses to start (ValueError), mirroring the main-model rejection."""
|
||||
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||
mock_client = MagicMock()
|
||||
mock_client.base_url = "https://openrouter.ai/api/v1"
|
||||
mock_client.api_key = "sk-aux"
|
||||
mock_get_client.return_value = (mock_client, "tiny-aux-model")
|
||||
|
||||
agent._emit_status = lambda msg: None
|
||||
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
agent._check_compression_model_feasibility()
|
||||
|
||||
err = str(exc_info.value)
|
||||
assert "tiny-aux-model" in err
|
||||
assert "32,768" in err
|
||||
assert "64,000" in err
|
||||
assert "below the minimum" in err
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
|
||||
|
|
@ -294,8 +322,9 @@ def test_exact_threshold_boundary_no_warning(mock_get_client, mock_ctx_len):
|
|||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=99_999)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
|
||||
"""Warning fires when aux context is one token below the threshold."""
|
||||
def test_just_below_threshold_auto_corrects(mock_get_client, mock_ctx_len):
|
||||
"""Auto-correct fires when aux context is one token below the threshold
|
||||
(and above the 64K hard floor)."""
|
||||
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||
mock_client = MagicMock()
|
||||
mock_client.base_url = "https://openrouter.ai/api/v1"
|
||||
|
|
@ -309,12 +338,14 @@ def test_just_below_threshold_warns(mock_get_client, mock_ctx_len):
|
|||
|
||||
assert len(messages) == 1
|
||||
assert "small-model" in messages[0]
|
||||
assert "Auto-lowered" in messages[0]
|
||||
assert agent.context_compressor.threshold_tokens == 99_999
|
||||
|
||||
|
||||
# ── Two-phase: __init__ + run_conversation replay ───────────────────
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
|
||||
"""__init__ stores the warning; _replay sends it through status_callback."""
|
||||
|
|
@ -338,7 +369,7 @@ def test_warning_stored_for_gateway_replay(mock_get_client, mock_ctx_len):
|
|||
agent._replay_compression_warning()
|
||||
|
||||
assert any(
|
||||
ev == "lifecycle" and "will not be possible" in msg
|
||||
ev == "lifecycle" and "Auto-lowered" in msg
|
||||
for ev, msg in callback_events
|
||||
)
|
||||
|
||||
|
|
@ -375,7 +406,7 @@ def test_replay_without_callback_is_noop():
|
|||
agent._replay_compression_warning()
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=80_000)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_run_conversation_clears_warning_after_replay(mock_get_client, mock_ctx_len):
|
||||
"""After replay in run_conversation, _compression_warning is cleared
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue