Revert "fix(compression): make minimum context floor configurable (#31600)"

This reverts commit cae1ee44a7.
2026-06-27 11:22:03 +00:00 · 2026-06-25 01:04:44 +05:30 · 2026-06-25 01:04:44 +05:30 · e0272cfef2
commit e0272cfef2
parent 59acaa972f
8 changed files with 12 additions and 187 deletions
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -1358,17 +1358,6 @@ def init_agent(
    compression_in_place = is_truthy_value(
        _compression_cfg.get("in_place"), default=False
    )
-    # Allow users to lower the compression floor for models whose
-    # structured output degrades well before the default 64K tokens.
-    # Clamped to a hard-coded safety limit of 16K by the compressor.
-    _raw_floor = _compression_cfg.get("minimum_context_floor", None)
-    if _raw_floor is not None:
-        try:
-            compression_minimum_context_floor = int(_raw_floor)
-        except (TypeError, ValueError):
-            compression_minimum_context_floor = None
-    else:
-        compression_minimum_context_floor = None

    # Read optional explicit context_length override for the auxiliary
    # compression model. Custom endpoints often cannot report this via
@@ -1587,7 +1576,6 @@ def init_agent(
            api_mode=agent.api_mode,
            abort_on_summary_failure=compression_abort_on_summary_failure,
            max_tokens=agent.max_tokens,
-            minimum_context_floor=compression_minimum_context_floor,
        )
    agent.compression_enabled = compression_enabled
    agent.compression_in_place = compression_in_place
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -27,7 +27,6 @@ from agent.auxiliary_client import call_llm, _is_connection_error, aux_interrupt
 from agent.context_engine import ContextEngine
 from agent.model_metadata import (
    MINIMUM_CONTEXT_LENGTH,
-    get_configurable_minimum_context,
    get_model_context_length,
    estimate_messages_tokens_rough,
 )
@@ -684,7 +683,6 @@ class ContextCompressor(ContextEngine):
            self.max_tokens = self._coerce_max_tokens(max_tokens)
        self.threshold_tokens = self._compute_threshold_tokens(
            context_length, self.threshold_percent, self.max_tokens,
-            minimum_floor=self._minimum_context_floor,
        )
        # Recalculate token budgets for the new context length so the
        # compressor stays calibrated after a model switch (e.g. 200K → 32K).
@@ -743,7 +741,6 @@ class ContextCompressor(ContextEngine):
    @staticmethod
    def _compute_threshold_tokens(
        context_length: int, threshold_percent: float, max_tokens: int | None = None,
-        minimum_floor: int = MINIMUM_CONTEXT_LENGTH,
    ) -> int:
        """Compute the compaction trigger threshold in tokens.

@@ -773,7 +770,7 @@ class ContextCompressor(ContextEngine):
        if effective_window <= 0:
            effective_window = context_length
        pct_value = int(effective_window * threshold_percent)
-        floored = max(pct_value, minimum_floor)
+        floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
        # If flooring pushed the threshold to/over the effective window it can
        # never be reached. Trigger at 85% of the effective input budget so a
        # minimum-context model rides most of its budget before compacting
@@ -799,7 +796,6 @@ class ContextCompressor(ContextEngine):
        api_mode: str = "",
        abort_on_summary_failure: bool = False,
        max_tokens: int | None = None,
-        minimum_context_floor: int | None = None,
    ):
        self.model = model
        self.base_url = base_url
@@ -824,30 +820,19 @@ class ContextCompressor(ContextEngine):
        # deterministic "summary unavailable" handoff and drop the middle window.
        self.abort_on_summary_failure = abort_on_summary_failure

-        # Configurable compression floor — allows users with large-context
-        # models that degrade before 64K tokens to lower the bar (never below
-        # the hard-coded safety limit of 16K).  When None, the default
-        # MINIMUM_CONTEXT_LENGTH (64K) applies.
-        self._minimum_context_floor = get_configurable_minimum_context(
-            minimum_context_floor
-        )
-
        self.context_length = get_model_context_length(
            model, base_url=base_url, api_key=api_key,
            config_context_length=config_context_length,
            provider=provider,
        )
-        # Floor: never compress below the configured minimum context floor
-        # even if the percentage would suggest a lower value.  This prevents
-        # premature compression on large-context models at 50% while keeping
-        # the % sane for models right at the minimum. _compute_threshold_tokens
-        # also guards the degenerate case where the floor would equal/exceed the
+        # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
+        # the percentage would suggest a lower value.  This prevents premature
+        # compression on large-context models at 50% while keeping the % sane
+        # for models right at the minimum. _compute_threshold_tokens also
+        # guards the degenerate case where the floor would equal/exceed the
        # window (small models), so auto-compression can still fire (#14690).
-        # The floor is configurable via compression.minimum_context_floor for
-        # models whose structured output degrades well below 64K tokens.
        self.threshold_tokens = self._compute_threshold_tokens(
            self.context_length, threshold_percent, self.max_tokens,
-            minimum_floor=self._minimum_context_floor,
        )
        self.compression_count = 0

--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -94,15 +94,9 @@ def check_compression_model_feasibility(agent: Any) -> None:
        )
        from agent.model_metadata import (
            MINIMUM_CONTEXT_LENGTH,
-            get_configurable_minimum_context,
            get_model_context_length,
        )

-        # Configurable compression floor from the compressor instance
-        _compression_floor = getattr(
-            agent.context_compressor, "_minimum_context_floor", MINIMUM_CONTEXT_LENGTH
-        )
-
        client, aux_model = get_text_auxiliary_client(
            "compression",
            main_runtime=agent._current_main_runtime(),
@@ -162,18 +156,18 @@ def check_compression_model_feasibility(agent: Any) -> None:
        )

        # Hard floor: the auxiliary compression model must have at least
-        # the configured compression floor's worth of context.  The main model
-        # is already required to meet its floor (checked earlier in
+        # MINIMUM_CONTEXT_LENGTH (64K) tokens of context.  The main model
+        # is already required to meet this floor (checked earlier in
        # __init__), so the compression model must too — otherwise it
        # cannot summarise a full threshold-sized window of main-model
        # content.  Mirrors the main-model rejection pattern.
-        if aux_context and aux_context < _compression_floor:
+        if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
            raise ValueError(
                f"Auxiliary compression model {aux_model} has a context "
                f"window of {aux_context:,} tokens, which is below the "
-                f"minimum {_compression_floor:,} required by Hermes "
+                f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
                f"Agent.  Choose a compression model with at least "
-                f"{_compression_floor // 1000}K context (set "
+                f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
                f"auxiliary.compression.model in config.yaml), or set "
                f"auxiliary.compression.context_length to override the "
                f"detected value if it is wrong."
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -184,25 +184,6 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
 # Sessions, model switches, and cron jobs should reject models below this.
 MINIMUM_CONTEXT_LENGTH = 64_000

-# Lower bound for user-configured compression floor overrides.
-# Users with large-context models that degrade before 64K tokens
-# (e.g. Gemini Flash via proxies) can use ``compression.minimum_context_floor``
-# in config.yaml to lower this, but never below this hard safety limit.
-_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT = 16_000
-
-
-def get_configurable_minimum_context(config_floor: int | None = None) -> int:
-    """Return the effective minimum context floor for compression.
-
-    When *config_floor* is provided (from ``compression.minimum_context_floor``),
-    it is clamped to the hard limit and returned, allowing users to lower the
-    default 64K compression floor for models that degrade earlier.  When
-    *config_floor* is ``None`` the default ``MINIMUM_CONTEXT_LENGTH`` is used.
-    """
-    if config_floor is None:
-        return MINIMUM_CONTEXT_LENGTH
-    return max(config_floor, _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT)
-
 # Thin fallback defaults — only broad model family patterns.
 # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
 # all miss. Replaced the previous 80+ entry dict.
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -13490,7 +13490,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
        ("model", "max_tokens"),
        ("compression", "enabled"),
        ("compression", "threshold"),
-        ("compression", "minimum_context_floor"),
        ("compression", "target_ratio"),
        ("compression", "protect_last_n"),
        ("agent", "disabled_toolsets"),
--- a/tests/gateway/test_agent_cache.py
+++ b/tests/gateway/test_agent_cache.py
@@ -226,7 +226,6 @@ class TestExtractCacheBustingConfig:
                "compression": {
                    "enabled": False,
                    "threshold": 0.6,
-                    "minimum_context_floor": 32_000,
                    "target_ratio": 0.3,
                    "protect_last_n": 25,
                    "some_other_key": "ignored",
@@ -235,7 +234,6 @@ class TestExtractCacheBustingConfig:
        )
        assert out["compression.enabled"] is False
        assert out["compression.threshold"] == 0.6
-        assert out["compression.minimum_context_floor"] == 32_000
        assert out["compression.target_ratio"] == 0.3
        assert out["compression.protect_last_n"] == 25

@@ -388,7 +386,7 @@ class TestExtractCacheBustingConfig:
        extracted cache_keys change produces a new signature."""
        from gateway.run import GatewayRunner

-        runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
+        runtime = {"api_key": "k", "base_url": "u", "provider": "p"}
        cfg_before = {
            "model": {"context_length": 200_000},
            "compression": {"threshold": 0.50, "enabled": True},
@@ -411,25 +409,6 @@ class TestExtractCacheBustingConfig:
            "gateway's cached agent so the new threshold takes effect."
        )

-    def test_minimum_context_floor_edit_busts_cache(self):
-        """Gateway sessions must rebuild the agent when the new floor changes."""
-        from gateway.run import GatewayRunner
-
-        runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
-        cfg_before = {"compression": {"minimum_context_floor": 64_000}}
-        cfg_after = {"compression": {"minimum_context_floor": 32_000}}
-
-        sig_before = GatewayRunner._agent_config_signature(
-            "m", runtime, [], "",
-            cache_keys=GatewayRunner._extract_cache_busting_config(cfg_before),
-        )
-        sig_after = GatewayRunner._agent_config_signature(
-            "m", runtime, [], "",
-            cache_keys=GatewayRunner._extract_cache_busting_config(cfg_after),
-        )
-
-        assert sig_before != sig_after
-

 class TestAgentCacheLifecycle:
    """End-to-end cache behavior with real AIAgent construction."""
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@@ -31,7 +31,6 @@ def _make_agent(
    compression_enabled: bool = True,
    threshold_percent: float = 0.50,
    main_context: int = 200_000,
-    minimum_context_floor: int = 64_000,
 ) -> AIAgent:
    """Build a minimal AIAgent with a compressor, skipping __init__."""
    agent = AIAgent.__new__(AIAgent)
@@ -58,7 +57,6 @@ def _make_agent(
    compressor = MagicMock(spec=ContextCompressor)
    compressor.context_length = main_context
    compressor.threshold_tokens = int(main_context * threshold_percent)
-    compressor._minimum_context_floor = minimum_context_floor
    agent.context_compressor = compressor

    return agent
@@ -123,30 +121,6 @@ def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
    assert "below the minimum" in err


-@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
-@patch("agent.auxiliary_client.get_text_auxiliary_client")
-def test_configured_floor_allows_smaller_aux_model(mock_get_client, mock_ctx_len):
-    """A lowered compression floor also lowers the aux-model hard floor."""
-    agent = _make_agent(
-        main_context=96_000,
-        threshold_percent=0.50,
-        minimum_context_floor=32_000,
-    )
-    mock_client = MagicMock()
-    mock_client.base_url = "https://openrouter.ai/api/v1"
-    mock_client.api_key = "sk-aux"
-    mock_get_client.return_value = (mock_client, "small-aux-model")
-
-    messages = []
-    agent._emit_status = lambda msg: messages.append(msg)
-
-    agent._check_compression_model_feasibility()
-
-    assert messages
-    assert "Auto-lowered" in messages[0]
-    assert agent.context_compressor.threshold_tokens == 32_768
-
-
 @patch("agent.model_metadata.get_model_context_length", return_value=200_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_no_warning_when_aux_context_sufficient(mock_get_client, mock_ctx_len):
--- a/tests/run_agent/test_compression_minimum_floor.py
+++ b/tests/run_agent/test_compression_minimum_floor.py
@@ -1,75 +0,0 @@
-"""Configurable minimum context compression floor (#31600)."""
-
-from unittest.mock import patch
-
-from agent.model_metadata import (
-    MINIMUM_CONTEXT_LENGTH,
-    _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT,
-    get_configurable_minimum_context,
-)
-
-
-class TestConfigurableMinimumContext:
-    """Unit tests for ``get_configurable_minimum_context``."""
-
-    def test_default_returns_minimum_context_length(self):
-        assert get_configurable_minimum_context(None) == MINIMUM_CONTEXT_LENGTH
-        assert get_configurable_minimum_context() == MINIMUM_CONTEXT_LENGTH
-
-    def test_config_floor_respected(self):
-        assert get_configurable_minimum_context(32_000) == 32_000
-        assert get_configurable_minimum_context(65_536) == 65_536
-
-    def test_config_floor_clamped_to_hard_limit(self):
-        assert get_configurable_minimum_context(1_000) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
-        assert get_configurable_minimum_context(0) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
-        assert get_configurable_minimum_context(-1) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
-        assert get_configurable_minimum_context(15_999) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
-
-
-class TestContextCompressorFloor:
-    """Verify ContextCompressor uses the configurable floor in real paths."""
-
-    def test_default_floor_in_threshold(self):
-        from agent.context_compressor import ContextCompressor
-
-        with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
-            cc = ContextCompressor(model="test", quiet_mode=True)
-
-        assert cc._minimum_context_floor == MINIMUM_CONTEXT_LENGTH
-        assert cc.threshold_tokens == MINIMUM_CONTEXT_LENGTH
-
-    def test_small_model_with_lowered_floor(self):
-        from agent.context_compressor import ContextCompressor
-
-        with patch("agent.context_compressor.get_model_context_length", return_value=48_000):
-            cc = ContextCompressor(
-                model="test", quiet_mode=True, minimum_context_floor=24_000
-            )
-
-        assert cc._minimum_context_floor == 24_000
-        assert cc.threshold_tokens == 24_000
-
-    def test_floor_dominates_on_large_models_too(self):
-        from agent.context_compressor import ContextCompressor
-
-        with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
-            cc = ContextCompressor(
-                model="test",
-                quiet_mode=True,
-                threshold_percent=0.02,
-                minimum_context_floor=32_000,
-            )
-
-        assert cc.threshold_tokens == 32_000
-
-    def test_update_model_uses_floor(self):
-        from agent.context_compressor import ContextCompressor
-
-        with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
-            cc = ContextCompressor(
-                model="test", quiet_mode=True, minimum_context_floor=40_000
-            )
-
-        cc.update_model("switched", context_length=64_000)
-        assert cc.threshold_tokens == 40_000