From e0272cfef28f8ef86904a5adf0e34b575b63b109 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Thu, 25 Jun 2026 01:04:44 +0530 Subject: [PATCH] Revert "fix(compression): make minimum context floor configurable (#31600)" This reverts commit cae1ee44a7afb462a9fe11863d30feffa0736966. --- agent/agent_init.py | 12 --- agent/context_compressor.py | 27 ++----- agent/conversation_compression.py | 16 ++-- agent/model_metadata.py | 19 ----- gateway/run.py | 1 - tests/gateway/test_agent_cache.py | 23 +----- .../run_agent/test_compression_feasibility.py | 26 ------- .../test_compression_minimum_floor.py | 75 ------------------- 8 files changed, 12 insertions(+), 187 deletions(-) delete mode 100644 tests/run_agent/test_compression_minimum_floor.py diff --git a/agent/agent_init.py b/agent/agent_init.py index 9054c6f2528..e7f2ed9eac3 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1358,17 +1358,6 @@ def init_agent( compression_in_place = is_truthy_value( _compression_cfg.get("in_place"), default=False ) - # Allow users to lower the compression floor for models whose - # structured output degrades well before the default 64K tokens. - # Clamped to a hard-coded safety limit of 16K by the compressor. - _raw_floor = _compression_cfg.get("minimum_context_floor", None) - if _raw_floor is not None: - try: - compression_minimum_context_floor = int(_raw_floor) - except (TypeError, ValueError): - compression_minimum_context_floor = None - else: - compression_minimum_context_floor = None # Read optional explicit context_length override for the auxiliary # compression model. Custom endpoints often cannot report this via @@ -1587,7 +1576,6 @@ def init_agent( api_mode=agent.api_mode, abort_on_summary_failure=compression_abort_on_summary_failure, max_tokens=agent.max_tokens, - minimum_context_floor=compression_minimum_context_floor, ) agent.compression_enabled = compression_enabled agent.compression_in_place = compression_in_place diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 5b446516939..fbde99bda5f 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -27,7 +27,6 @@ from agent.auxiliary_client import call_llm, _is_connection_error, aux_interrupt from agent.context_engine import ContextEngine from agent.model_metadata import ( MINIMUM_CONTEXT_LENGTH, - get_configurable_minimum_context, get_model_context_length, estimate_messages_tokens_rough, ) @@ -684,7 +683,6 @@ class ContextCompressor(ContextEngine): self.max_tokens = self._coerce_max_tokens(max_tokens) self.threshold_tokens = self._compute_threshold_tokens( context_length, self.threshold_percent, self.max_tokens, - minimum_floor=self._minimum_context_floor, ) # Recalculate token budgets for the new context length so the # compressor stays calibrated after a model switch (e.g. 200K → 32K). @@ -743,7 +741,6 @@ class ContextCompressor(ContextEngine): @staticmethod def _compute_threshold_tokens( context_length: int, threshold_percent: float, max_tokens: int | None = None, - minimum_floor: int = MINIMUM_CONTEXT_LENGTH, ) -> int: """Compute the compaction trigger threshold in tokens. @@ -773,7 +770,7 @@ class ContextCompressor(ContextEngine): if effective_window <= 0: effective_window = context_length pct_value = int(effective_window * threshold_percent) - floored = max(pct_value, minimum_floor) + floored = max(pct_value, MINIMUM_CONTEXT_LENGTH) # If flooring pushed the threshold to/over the effective window it can # never be reached. Trigger at 85% of the effective input budget so a # minimum-context model rides most of its budget before compacting @@ -799,7 +796,6 @@ class ContextCompressor(ContextEngine): api_mode: str = "", abort_on_summary_failure: bool = False, max_tokens: int | None = None, - minimum_context_floor: int | None = None, ): self.model = model self.base_url = base_url @@ -824,30 +820,19 @@ class ContextCompressor(ContextEngine): # deterministic "summary unavailable" handoff and drop the middle window. self.abort_on_summary_failure = abort_on_summary_failure - # Configurable compression floor — allows users with large-context - # models that degrade before 64K tokens to lower the bar (never below - # the hard-coded safety limit of 16K). When None, the default - # MINIMUM_CONTEXT_LENGTH (64K) applies. - self._minimum_context_floor = get_configurable_minimum_context( - minimum_context_floor - ) - self.context_length = get_model_context_length( model, base_url=base_url, api_key=api_key, config_context_length=config_context_length, provider=provider, ) - # Floor: never compress below the configured minimum context floor - # even if the percentage would suggest a lower value. This prevents - # premature compression on large-context models at 50% while keeping - # the % sane for models right at the minimum. _compute_threshold_tokens - # also guards the degenerate case where the floor would equal/exceed the + # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if + # the percentage would suggest a lower value. This prevents premature + # compression on large-context models at 50% while keeping the % sane + # for models right at the minimum. _compute_threshold_tokens also + # guards the degenerate case where the floor would equal/exceed the # window (small models), so auto-compression can still fire (#14690). - # The floor is configurable via compression.minimum_context_floor for - # models whose structured output degrades well below 64K tokens. self.threshold_tokens = self._compute_threshold_tokens( self.context_length, threshold_percent, self.max_tokens, - minimum_floor=self._minimum_context_floor, ) self.compression_count = 0 diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 24f3e1e691b..ba67f036954 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -94,15 +94,9 @@ def check_compression_model_feasibility(agent: Any) -> None: ) from agent.model_metadata import ( MINIMUM_CONTEXT_LENGTH, - get_configurable_minimum_context, get_model_context_length, ) - # Configurable compression floor from the compressor instance - _compression_floor = getattr( - agent.context_compressor, "_minimum_context_floor", MINIMUM_CONTEXT_LENGTH - ) - client, aux_model = get_text_auxiliary_client( "compression", main_runtime=agent._current_main_runtime(), @@ -162,18 +156,18 @@ def check_compression_model_feasibility(agent: Any) -> None: ) # Hard floor: the auxiliary compression model must have at least - # the configured compression floor's worth of context. The main model - # is already required to meet its floor (checked earlier in + # MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model + # is already required to meet this floor (checked earlier in # __init__), so the compression model must too — otherwise it # cannot summarise a full threshold-sized window of main-model # content. Mirrors the main-model rejection pattern. - if aux_context and aux_context < _compression_floor: + if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH: raise ValueError( f"Auxiliary compression model {aux_model} has a context " f"window of {aux_context:,} tokens, which is below the " - f"minimum {_compression_floor:,} required by Hermes " + f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes " f"Agent. Choose a compression model with at least " - f"{_compression_floor // 1000}K context (set " + f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set " f"auxiliary.compression.model in config.yaml), or set " f"auxiliary.compression.context_length to override the " f"detected value if it is wrong." diff --git a/agent/model_metadata.py b/agent/model_metadata.py index cba5b0ac423..4493eae5f1f 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -184,25 +184,6 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0] # Sessions, model switches, and cron jobs should reject models below this. MINIMUM_CONTEXT_LENGTH = 64_000 -# Lower bound for user-configured compression floor overrides. -# Users with large-context models that degrade before 64K tokens -# (e.g. Gemini Flash via proxies) can use ``compression.minimum_context_floor`` -# in config.yaml to lower this, but never below this hard safety limit. -_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT = 16_000 - - -def get_configurable_minimum_context(config_floor: int | None = None) -> int: - """Return the effective minimum context floor for compression. - - When *config_floor* is provided (from ``compression.minimum_context_floor``), - it is clamped to the hard limit and returned, allowing users to lower the - default 64K compression floor for models that degrade earlier. When - *config_floor* is ``None`` the default ``MINIMUM_CONTEXT_LENGTH`` is used. - """ - if config_floor is None: - return MINIMUM_CONTEXT_LENGTH - return max(config_floor, _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT) - # Thin fallback defaults — only broad model family patterns. # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic # all miss. Replaced the previous 80+ entry dict. diff --git a/gateway/run.py b/gateway/run.py index 03695956b7c..4b285287b22 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -13490,7 +13490,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew ("model", "max_tokens"), ("compression", "enabled"), ("compression", "threshold"), - ("compression", "minimum_context_floor"), ("compression", "target_ratio"), ("compression", "protect_last_n"), ("agent", "disabled_toolsets"), diff --git a/tests/gateway/test_agent_cache.py b/tests/gateway/test_agent_cache.py index ce84d0a5945..559e1c0e96c 100644 --- a/tests/gateway/test_agent_cache.py +++ b/tests/gateway/test_agent_cache.py @@ -226,7 +226,6 @@ class TestExtractCacheBustingConfig: "compression": { "enabled": False, "threshold": 0.6, - "minimum_context_floor": 32_000, "target_ratio": 0.3, "protect_last_n": 25, "some_other_key": "ignored", @@ -235,7 +234,6 @@ class TestExtractCacheBustingConfig: ) assert out["compression.enabled"] is False assert out["compression.threshold"] == 0.6 - assert out["compression.minimum_context_floor"] == 32_000 assert out["compression.target_ratio"] == 0.3 assert out["compression.protect_last_n"] == 25 @@ -388,7 +386,7 @@ class TestExtractCacheBustingConfig: extracted cache_keys change produces a new signature.""" from gateway.run import GatewayRunner - runtime = {"api_key": "***", "base_url": "u", "provider": "p"} + runtime = {"api_key": "k", "base_url": "u", "provider": "p"} cfg_before = { "model": {"context_length": 200_000}, "compression": {"threshold": 0.50, "enabled": True}, @@ -411,25 +409,6 @@ class TestExtractCacheBustingConfig: "gateway's cached agent so the new threshold takes effect." ) - def test_minimum_context_floor_edit_busts_cache(self): - """Gateway sessions must rebuild the agent when the new floor changes.""" - from gateway.run import GatewayRunner - - runtime = {"api_key": "***", "base_url": "u", "provider": "p"} - cfg_before = {"compression": {"minimum_context_floor": 64_000}} - cfg_after = {"compression": {"minimum_context_floor": 32_000}} - - sig_before = GatewayRunner._agent_config_signature( - "m", runtime, [], "", - cache_keys=GatewayRunner._extract_cache_busting_config(cfg_before), - ) - sig_after = GatewayRunner._agent_config_signature( - "m", runtime, [], "", - cache_keys=GatewayRunner._extract_cache_busting_config(cfg_after), - ) - - assert sig_before != sig_after - class TestAgentCacheLifecycle: """End-to-end cache behavior with real AIAgent construction.""" diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py index ba9dc8a3649..3be0f0235a3 100644 --- a/tests/run_agent/test_compression_feasibility.py +++ b/tests/run_agent/test_compression_feasibility.py @@ -31,7 +31,6 @@ def _make_agent( compression_enabled: bool = True, threshold_percent: float = 0.50, main_context: int = 200_000, - minimum_context_floor: int = 64_000, ) -> AIAgent: """Build a minimal AIAgent with a compressor, skipping __init__.""" agent = AIAgent.__new__(AIAgent) @@ -58,7 +57,6 @@ def _make_agent( compressor = MagicMock(spec=ContextCompressor) compressor.context_length = main_context compressor.threshold_tokens = int(main_context * threshold_percent) - compressor._minimum_context_floor = minimum_context_floor agent.context_compressor = compressor return agent @@ -123,30 +121,6 @@ def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len): assert "below the minimum" in err -@patch("agent.model_metadata.get_model_context_length", return_value=32_768) -@patch("agent.auxiliary_client.get_text_auxiliary_client") -def test_configured_floor_allows_smaller_aux_model(mock_get_client, mock_ctx_len): - """A lowered compression floor also lowers the aux-model hard floor.""" - agent = _make_agent( - main_context=96_000, - threshold_percent=0.50, - minimum_context_floor=32_000, - ) - mock_client = MagicMock() - mock_client.base_url = "https://openrouter.ai/api/v1" - mock_client.api_key = "sk-aux" - mock_get_client.return_value = (mock_client, "small-aux-model") - - messages = [] - agent._emit_status = lambda msg: messages.append(msg) - - agent._check_compression_model_feasibility() - - assert messages - assert "Auto-lowered" in messages[0] - assert agent.context_compressor.threshold_tokens == 32_768 - - @patch("agent.model_metadata.get_model_context_length", return_value=200_000) @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_no_warning_when_aux_context_sufficient(mock_get_client, mock_ctx_len): diff --git a/tests/run_agent/test_compression_minimum_floor.py b/tests/run_agent/test_compression_minimum_floor.py deleted file mode 100644 index b056a5a40ac..00000000000 --- a/tests/run_agent/test_compression_minimum_floor.py +++ /dev/null @@ -1,75 +0,0 @@ -"""Configurable minimum context compression floor (#31600).""" - -from unittest.mock import patch - -from agent.model_metadata import ( - MINIMUM_CONTEXT_LENGTH, - _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT, - get_configurable_minimum_context, -) - - -class TestConfigurableMinimumContext: - """Unit tests for ``get_configurable_minimum_context``.""" - - def test_default_returns_minimum_context_length(self): - assert get_configurable_minimum_context(None) == MINIMUM_CONTEXT_LENGTH - assert get_configurable_minimum_context() == MINIMUM_CONTEXT_LENGTH - - def test_config_floor_respected(self): - assert get_configurable_minimum_context(32_000) == 32_000 - assert get_configurable_minimum_context(65_536) == 65_536 - - def test_config_floor_clamped_to_hard_limit(self): - assert get_configurable_minimum_context(1_000) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT - assert get_configurable_minimum_context(0) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT - assert get_configurable_minimum_context(-1) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT - assert get_configurable_minimum_context(15_999) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT - - -class TestContextCompressorFloor: - """Verify ContextCompressor uses the configurable floor in real paths.""" - - def test_default_floor_in_threshold(self): - from agent.context_compressor import ContextCompressor - - with patch("agent.context_compressor.get_model_context_length", return_value=100_000): - cc = ContextCompressor(model="test", quiet_mode=True) - - assert cc._minimum_context_floor == MINIMUM_CONTEXT_LENGTH - assert cc.threshold_tokens == MINIMUM_CONTEXT_LENGTH - - def test_small_model_with_lowered_floor(self): - from agent.context_compressor import ContextCompressor - - with patch("agent.context_compressor.get_model_context_length", return_value=48_000): - cc = ContextCompressor( - model="test", quiet_mode=True, minimum_context_floor=24_000 - ) - - assert cc._minimum_context_floor == 24_000 - assert cc.threshold_tokens == 24_000 - - def test_floor_dominates_on_large_models_too(self): - from agent.context_compressor import ContextCompressor - - with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000): - cc = ContextCompressor( - model="test", - quiet_mode=True, - threshold_percent=0.02, - minimum_context_floor=32_000, - ) - - assert cc.threshold_tokens == 32_000 - - def test_update_model_uses_floor(self): - from agent.context_compressor import ContextCompressor - - with patch("agent.context_compressor.get_model_context_length", return_value=200_000): - cc = ContextCompressor( - model="test", quiet_mode=True, minimum_context_floor=40_000 - ) - - cc.update_model("switched", context_length=64_000) - assert cc.threshold_tokens == 40_000