From cae1ee44a7afb462a9fe11863d30feffa0736966 Mon Sep 17 00:00:00 2001 From: Tranquil-Flow Date: Wed, 24 Jun 2026 23:12:48 +0530 Subject: [PATCH] fix(compression): make minimum context floor configurable (#31600) Add compression.minimum_context_floor config key that allows users to lower the compression threshold floor below the hardcoded 64K default, preventing infinite tool-call loops on models whose structured output degrades well before 64K tokens. - agent/model_metadata.py: add get_configurable_minimum_context() helper with 16K hard safety limit - agent/context_compressor.py: accept minimum_context_floor param, thread it through _compute_threshold_tokens - agent/conversation_compression.py: use compressor's floor for aux model context validation - agent/agent_init.py: read compression.minimum_context_floor from config and pass to ContextCompressor - gateway/run.py: cache-busting includes new key Salvaged from #31686 by @Tranquil-Flow onto current main. Resolves conflicts with in-place compaction (#38763) and max_tokens threshold computation (#43547) that landed after the original PR. Closes #31600 --- agent/agent_init.py | 12 +++ agent/context_compressor.py | 27 +++++-- agent/conversation_compression.py | 16 ++-- agent/model_metadata.py | 19 +++++ gateway/run.py | 1 + tests/gateway/test_agent_cache.py | 23 +++++- .../run_agent/test_compression_feasibility.py | 26 +++++++ .../test_compression_minimum_floor.py | 75 +++++++++++++++++++ 8 files changed, 187 insertions(+), 12 deletions(-) create mode 100644 tests/run_agent/test_compression_minimum_floor.py diff --git a/agent/agent_init.py b/agent/agent_init.py index e7f2ed9eac3..9054c6f2528 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1358,6 +1358,17 @@ def init_agent( compression_in_place = is_truthy_value( _compression_cfg.get("in_place"), default=False ) + # Allow users to lower the compression floor for models whose + # structured output degrades well before the default 64K tokens. 
+ # Clamped to a hard-coded safety limit of 16K by the compressor. + _raw_floor = _compression_cfg.get("minimum_context_floor", None) + if _raw_floor is not None: + try: + compression_minimum_context_floor = int(_raw_floor) + except (TypeError, ValueError): + compression_minimum_context_floor = None + else: + compression_minimum_context_floor = None # Read optional explicit context_length override for the auxiliary # compression model. Custom endpoints often cannot report this via @@ -1576,6 +1587,7 @@ def init_agent( api_mode=agent.api_mode, abort_on_summary_failure=compression_abort_on_summary_failure, max_tokens=agent.max_tokens, + minimum_context_floor=compression_minimum_context_floor, ) agent.compression_enabled = compression_enabled agent.compression_in_place = compression_in_place diff --git a/agent/context_compressor.py b/agent/context_compressor.py index fbde99bda5f..5b446516939 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -27,6 +27,7 @@ from agent.auxiliary_client import call_llm, _is_connection_error, aux_interrupt from agent.context_engine import ContextEngine from agent.model_metadata import ( MINIMUM_CONTEXT_LENGTH, + get_configurable_minimum_context, get_model_context_length, estimate_messages_tokens_rough, ) @@ -683,6 +684,7 @@ class ContextCompressor(ContextEngine): self.max_tokens = self._coerce_max_tokens(max_tokens) self.threshold_tokens = self._compute_threshold_tokens( context_length, self.threshold_percent, self.max_tokens, + minimum_floor=self._minimum_context_floor, ) # Recalculate token budgets for the new context length so the # compressor stays calibrated after a model switch (e.g. 200K → 32K). @@ -741,6 +743,7 @@ class ContextCompressor(ContextEngine): @staticmethod def _compute_threshold_tokens( context_length: int, threshold_percent: float, max_tokens: int | None = None, + minimum_floor: int = MINIMUM_CONTEXT_LENGTH, ) -> int: """Compute the compaction trigger threshold in tokens. 
@@ -770,7 +773,7 @@ class ContextCompressor(ContextEngine): if effective_window <= 0: effective_window = context_length pct_value = int(effective_window * threshold_percent) - floored = max(pct_value, MINIMUM_CONTEXT_LENGTH) + floored = max(pct_value, minimum_floor) # If flooring pushed the threshold to/over the effective window it can # never be reached. Trigger at 85% of the effective input budget so a # minimum-context model rides most of its budget before compacting @@ -796,6 +799,7 @@ class ContextCompressor(ContextEngine): api_mode: str = "", abort_on_summary_failure: bool = False, max_tokens: int | None = None, + minimum_context_floor: int | None = None, ): self.model = model self.base_url = base_url @@ -820,19 +824,30 @@ class ContextCompressor(ContextEngine): # deterministic "summary unavailable" handoff and drop the middle window. self.abort_on_summary_failure = abort_on_summary_failure + # Configurable compression floor — allows users with large-context + # models that degrade before 64K tokens to lower the bar (never below + # the hard-coded safety limit of 16K). When None, the default + # MINIMUM_CONTEXT_LENGTH (64K) applies. + self._minimum_context_floor = get_configurable_minimum_context( + minimum_context_floor + ) + self.context_length = get_model_context_length( model, base_url=base_url, api_key=api_key, config_context_length=config_context_length, provider=provider, ) - # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if - # the percentage would suggest a lower value. This prevents premature - # compression on large-context models at 50% while keeping the % sane - # for models right at the minimum. _compute_threshold_tokens also - # guards the degenerate case where the floor would equal/exceed the + # Floor: never compress below the configured minimum context floor + # even if the percentage would suggest a lower value. 
This prevents + # premature compression on large-context models at 50% while keeping + # the % sane for models right at the minimum. _compute_threshold_tokens + # also guards the degenerate case where the floor would equal/exceed the # window (small models), so auto-compression can still fire (#14690). + # The floor is configurable via compression.minimum_context_floor for + # models whose structured output degrades well below 64K tokens. self.threshold_tokens = self._compute_threshold_tokens( self.context_length, threshold_percent, self.max_tokens, + minimum_floor=self._minimum_context_floor, ) self.compression_count = 0 diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index ba67f036954..24f3e1e691b 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -94,9 +94,15 @@ def check_compression_model_feasibility(agent: Any) -> None: ) from agent.model_metadata import ( MINIMUM_CONTEXT_LENGTH, + get_configurable_minimum_context, get_model_context_length, ) + # Configurable compression floor from the compressor instance + _compression_floor = getattr( + agent.context_compressor, "_minimum_context_floor", MINIMUM_CONTEXT_LENGTH + ) + client, aux_model = get_text_auxiliary_client( "compression", main_runtime=agent._current_main_runtime(), @@ -156,18 +162,18 @@ def check_compression_model_feasibility(agent: Any) -> None: ) # Hard floor: the auxiliary compression model must have at least - # MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model - # is already required to meet this floor (checked earlier in + # the configured compression floor's worth of context. The main model + # is already required to meet its floor (checked earlier in # __init__), so the compression model must too — otherwise it # cannot summarise a full threshold-sized window of main-model # content. Mirrors the main-model rejection pattern. 
- if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH: + if aux_context and aux_context < _compression_floor: raise ValueError( f"Auxiliary compression model {aux_model} has a context " f"window of {aux_context:,} tokens, which is below the " - f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes " + f"minimum {_compression_floor:,} required by Hermes " f"Agent. Choose a compression model with at least " - f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set " + f"{_compression_floor // 1000}K context (set " f"auxiliary.compression.model in config.yaml), or set " f"auxiliary.compression.context_length to override the " f"detected value if it is wrong." diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 4493eae5f1f..cba5b0ac423 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -184,6 +184,25 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0] # Sessions, model switches, and cron jobs should reject models below this. MINIMUM_CONTEXT_LENGTH = 64_000 +# Lower bound for user-configured compression floor overrides. +# Users with large-context models that degrade before 64K tokens +# (e.g. Gemini Flash via proxies) can use ``compression.minimum_context_floor`` +# in config.yaml to lower this, but never below this hard safety limit. +_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT = 16_000 + + +def get_configurable_minimum_context(config_floor: int | None = None) -> int: + """Return the effective minimum context floor for compression. + + When *config_floor* is provided (from ``compression.minimum_context_floor``), + it is clamped to the hard limit and returned, allowing users to lower the + default 64K compression floor for models that degrade earlier. When + *config_floor* is ``None`` the default ``MINIMUM_CONTEXT_LENGTH`` is used. + """ + if config_floor is None: + return MINIMUM_CONTEXT_LENGTH + return max(config_floor, _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT) + # Thin fallback defaults — only broad model family patterns. 
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic # all miss. Replaced the previous 80+ entry dict. diff --git a/gateway/run.py b/gateway/run.py index 4b285287b22..03695956b7c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -13490,6 +13490,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew ("model", "max_tokens"), ("compression", "enabled"), ("compression", "threshold"), + ("compression", "minimum_context_floor"), ("compression", "target_ratio"), ("compression", "protect_last_n"), ("agent", "disabled_toolsets"), diff --git a/tests/gateway/test_agent_cache.py b/tests/gateway/test_agent_cache.py index 559e1c0e96c..ce84d0a5945 100644 --- a/tests/gateway/test_agent_cache.py +++ b/tests/gateway/test_agent_cache.py @@ -226,6 +226,7 @@ class TestExtractCacheBustingConfig: "compression": { "enabled": False, "threshold": 0.6, + "minimum_context_floor": 32_000, "target_ratio": 0.3, "protect_last_n": 25, "some_other_key": "ignored", @@ -234,6 +235,7 @@ class TestExtractCacheBustingConfig: ) assert out["compression.enabled"] is False assert out["compression.threshold"] == 0.6 + assert out["compression.minimum_context_floor"] == 32_000 assert out["compression.target_ratio"] == 0.3 assert out["compression.protect_last_n"] == 25 @@ -386,7 +388,7 @@ class TestExtractCacheBustingConfig: extracted cache_keys change produces a new signature.""" from gateway.run import GatewayRunner - runtime = {"api_key": "k", "base_url": "u", "provider": "p"} + runtime = {"api_key": "***", "base_url": "u", "provider": "p"} cfg_before = { "model": {"context_length": 200_000}, "compression": {"threshold": 0.50, "enabled": True}, @@ -409,6 +411,25 @@ class TestExtractCacheBustingConfig: "gateway's cached agent so the new threshold takes effect." 
def test_minimum_context_floor_edit_busts_cache(self):
    """Editing ``compression.minimum_context_floor`` must change the agent signature."""
    from gateway.run import GatewayRunner

    runtime = {"api_key": "***", "base_url": "u", "provider": "p"}

    def signature_for(config):
        # Pull the cache-busting keys out of *config* and feed them into the
        # agent signature; every other signature input is held constant so
        # only the floor value can make the two signatures differ.
        busting_keys = GatewayRunner._extract_cache_busting_config(config)
        return GatewayRunner._agent_config_signature(
            "m", runtime, [], "", cache_keys=busting_keys,
        )

    signature_64k = signature_for({"compression": {"minimum_context_floor": 64_000}})
    signature_32k = signature_for({"compression": {"minimum_context_floor": 32_000}})

    assert signature_64k != signature_32k
"""Configurable minimum context compression floor (#31600)."""

from unittest.mock import patch

from agent.model_metadata import (
    MINIMUM_CONTEXT_LENGTH,
    _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT,
    get_configurable_minimum_context,
)


def _compressor_with_context(detected_context, **compressor_kwargs):
    """Build a quiet ``ContextCompressor`` whose context lookup is patched.

    ``get_model_context_length`` (as imported by ``agent.context_compressor``)
    is replaced for the duration of construction only, so it returns
    *detected_context*. Extra keyword arguments go straight to the constructor.
    """
    from agent.context_compressor import ContextCompressor

    with patch(
        "agent.context_compressor.get_model_context_length",
        return_value=detected_context,
    ):
        return ContextCompressor(model="test", quiet_mode=True, **compressor_kwargs)


class TestConfigurableMinimumContext:
    """Unit tests for ``get_configurable_minimum_context``."""

    def test_default_returns_minimum_context_length(self):
        # Explicit ``None`` and the omitted argument both mean "use the default".
        assert get_configurable_minimum_context(None) == MINIMUM_CONTEXT_LENGTH
        assert get_configurable_minimum_context() == MINIMUM_CONTEXT_LENGTH

    def test_config_floor_respected(self):
        # Values at or above the hard limit come back unchanged.
        for requested in (32_000, 65_536):
            assert get_configurable_minimum_context(requested) == requested

    def test_config_floor_clamped_to_hard_limit(self):
        # Anything below the hard limit is raised to it.
        for too_small in (1_000, 0, -1, 15_999):
            assert (
                get_configurable_minimum_context(too_small)
                == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
            )


class TestContextCompressorFloor:
    """Verify ContextCompressor uses the configurable floor in real paths."""

    def test_default_floor_in_threshold(self):
        compressor = _compressor_with_context(100_000)

        assert compressor._minimum_context_floor == MINIMUM_CONTEXT_LENGTH
        assert compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH

    def test_small_model_with_lowered_floor(self):
        compressor = _compressor_with_context(48_000, minimum_context_floor=24_000)

        assert compressor._minimum_context_floor == 24_000
        assert compressor.threshold_tokens == 24_000

    def test_floor_dominates_on_large_models_too(self):
        compressor = _compressor_with_context(
            1_000_000,
            threshold_percent=0.02,
            minimum_context_floor=32_000,
        )

        assert compressor.threshold_tokens == 32_000

    def test_update_model_uses_floor(self):
        compressor = _compressor_with_context(200_000, minimum_context_floor=40_000)

        # The model switch happens after the patch is released; an explicit
        # context_length is passed to ``update_model``.
        compressor.update_model("switched", context_length=64_000)
        assert compressor.threshold_tokens == 40_000