From cae1ee44a7afb462a9fe11863d30feffa0736966 Mon Sep 17 00:00:00 2001
From: Tranquil-Flow <tranquil_flow@protonmail.com>
Date: Wed, 24 Jun 2026 23:12:48 +0530
Subject: [PATCH] fix(compression): make minimum context floor configurable
 (#31600)

Add compression.minimum_context_floor config key that allows users
to lower the compression threshold floor below the hardcoded 64K
default, preventing infinite tool-call loops on models whose
structured output degrades well before 64K tokens.

- agent/model_metadata.py: add get_configurable_minimum_context()
  helper with 16K hard safety limit
- agent/context_compressor.py: accept minimum_context_floor param,
  thread it through _compute_threshold_tokens
- agent/conversation_compression.py: use compressor's floor for
  aux model context validation
- agent/agent_init.py: read compression.minimum_context_floor from
  config and pass to ContextCompressor
- gateway/run.py: cache-busting includes new key

Salvaged from #31686 by @Tranquil-Flow onto current main.
Resolves conflicts with in-place compaction (#38763) and max_tokens
threshold computation (#43547) that landed after the original PR.

Closes #31600
---
 agent/agent_init.py                           | 12 +++
 agent/context_compressor.py                   | 27 +++++--
 agent/conversation_compression.py             | 16 ++--
 agent/model_metadata.py                       | 19 +++++
 gateway/run.py                                |  1 +
 tests/gateway/test_agent_cache.py             | 23 +++++-
 .../run_agent/test_compression_feasibility.py | 26 +++++++
 .../test_compression_minimum_floor.py         | 75 +++++++++++++++++++
 8 files changed, 187 insertions(+), 12 deletions(-)
 create mode 100644 tests/run_agent/test_compression_minimum_floor.py

diff --git a/agent/agent_init.py b/agent/agent_init.py
index e7f2ed9eac3..9054c6f2528 100644
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@@ -1358,6 +1358,17 @@ def init_agent(
     compression_in_place = is_truthy_value(
         _compression_cfg.get("in_place"), default=False
     )
+    # Optional override of the compression floor for models whose structured
+    # output degrades well before the default 64K tokens.  Values that cannot
+    # be parsed as int are ignored; the compressor clamps the rest to >= 16K.
+    _raw_floor = _compression_cfg.get("minimum_context_floor", None)
+    if _raw_floor is not None:
+        try:
+            compression_minimum_context_floor = int(_raw_floor)
+        except (TypeError, ValueError):
+            compression_minimum_context_floor = None
+    else:
+        compression_minimum_context_floor = None
 
     # Read optional explicit context_length override for the auxiliary
     # compression model. Custom endpoints often cannot report this via
@@ -1576,6 +1587,7 @@ def init_agent(
             api_mode=agent.api_mode,
             abort_on_summary_failure=compression_abort_on_summary_failure,
             max_tokens=agent.max_tokens,
+            minimum_context_floor=compression_minimum_context_floor,
         )
     agent.compression_enabled = compression_enabled
     agent.compression_in_place = compression_in_place
diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index fbde99bda5f..5b446516939 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -27,6 +27,7 @@ from agent.auxiliary_client import call_llm, _is_connection_error, aux_interrupt
 from agent.context_engine import ContextEngine
 from agent.model_metadata import (
     MINIMUM_CONTEXT_LENGTH,
+    get_configurable_minimum_context,
     get_model_context_length,
     estimate_messages_tokens_rough,
 )
@@ -683,6 +684,7 @@ class ContextCompressor(ContextEngine):
             self.max_tokens = self._coerce_max_tokens(max_tokens)
         self.threshold_tokens = self._compute_threshold_tokens(
             context_length, self.threshold_percent, self.max_tokens,
+            minimum_floor=self._minimum_context_floor,
         )
         # Recalculate token budgets for the new context length so the
         # compressor stays calibrated after a model switch (e.g. 200K → 32K).
@@ -741,6 +743,7 @@ class ContextCompressor(ContextEngine):
     @staticmethod
     def _compute_threshold_tokens(
         context_length: int, threshold_percent: float, max_tokens: int | None = None,
+        minimum_floor: int = MINIMUM_CONTEXT_LENGTH,
     ) -> int:
         """Compute the compaction trigger threshold in tokens.
 
@@ -770,7 +773,7 @@ class ContextCompressor(ContextEngine):
         if effective_window <= 0:
             effective_window = context_length
         pct_value = int(effective_window * threshold_percent)
-        floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
+        floored = max(pct_value, minimum_floor)
         # If flooring pushed the threshold to/over the effective window it can
         # never be reached. Trigger at 85% of the effective input budget so a
         # minimum-context model rides most of its budget before compacting
@@ -796,6 +799,7 @@ class ContextCompressor(ContextEngine):
         api_mode: str = "",
         abort_on_summary_failure: bool = False,
         max_tokens: int | None = None,
+        minimum_context_floor: int | None = None,
     ):
         self.model = model
         self.base_url = base_url
@@ -820,19 +824,30 @@ class ContextCompressor(ContextEngine):
         # deterministic "summary unavailable" handoff and drop the middle window.
         self.abort_on_summary_failure = abort_on_summary_failure
 
+        # Configurable compression floor — allows users with large-context
+        # models that degrade before 64K tokens to lower the bar (never below
+        # the hard-coded safety limit of 16K).  When None, the default
+        # MINIMUM_CONTEXT_LENGTH (64K) applies.
+        self._minimum_context_floor = get_configurable_minimum_context(
+            minimum_context_floor
+        )
+
         self.context_length = get_model_context_length(
             model, base_url=base_url, api_key=api_key,
             config_context_length=config_context_length,
             provider=provider,
         )
-        # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
-        # the percentage would suggest a lower value.  This prevents premature
-        # compression on large-context models at 50% while keeping the % sane
-        # for models right at the minimum. _compute_threshold_tokens also
-        # guards the degenerate case where the floor would equal/exceed the
+        # Floor: never compress below the configured minimum context floor
+        # even if the percentage would suggest a lower value.  This prevents
+        # premature compression on large-context models at 50% while keeping
+        # the % sane for models right at the minimum. _compute_threshold_tokens
+        # also guards the degenerate case where the floor would equal/exceed the
         # window (small models), so auto-compression can still fire (#14690).
+        # The floor is configurable via compression.minimum_context_floor for
+        # models whose structured output degrades well before 64K tokens.
         self.threshold_tokens = self._compute_threshold_tokens(
             self.context_length, threshold_percent, self.max_tokens,
+            minimum_floor=self._minimum_context_floor,
         )
         self.compression_count = 0
 
diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py
index ba67f036954..24f3e1e691b 100644
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -94,9 +94,15 @@ def check_compression_model_feasibility(agent: Any) -> None:
         )
         from agent.model_metadata import (
             MINIMUM_CONTEXT_LENGTH,
+            get_configurable_minimum_context,
             get_model_context_length,
         )
 
+        # Configurable compression floor from the compressor instance
+        _compression_floor = getattr(
+            agent.context_compressor, "_minimum_context_floor", MINIMUM_CONTEXT_LENGTH
+        )
+
         client, aux_model = get_text_auxiliary_client(
             "compression",
             main_runtime=agent._current_main_runtime(),
@@ -156,18 +162,18 @@ def check_compression_model_feasibility(agent: Any) -> None:
         )
 
         # Hard floor: the auxiliary compression model must have at least
-        # MINIMUM_CONTEXT_LENGTH (64K) tokens of context.  The main model
-        # is already required to meet this floor (checked earlier in
+        # the configured compression floor's worth of context.  The main model
+        # is already required to meet its floor (checked earlier in
         # __init__), so the compression model must too — otherwise it
         # cannot summarise a full threshold-sized window of main-model
         # content.  Mirrors the main-model rejection pattern.
-        if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
+        if aux_context and aux_context < _compression_floor:
             raise ValueError(
                 f"Auxiliary compression model {aux_model} has a context "
                 f"window of {aux_context:,} tokens, which is below the "
-                f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
+                f"minimum {_compression_floor:,} required by Hermes "
                 f"Agent.  Choose a compression model with at least "
-                f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
+                f"{_compression_floor // 1000}K context (set "
                 f"auxiliary.compression.model in config.yaml), or set "
                 f"auxiliary.compression.context_length to override the "
                 f"detected value if it is wrong."
diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 4493eae5f1f..cba5b0ac423 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -184,6 +184,25 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
 # Sessions, model switches, and cron jobs should reject models below this.
 MINIMUM_CONTEXT_LENGTH = 64_000
 
+# Lower bound for user-configured compression floor overrides.
+# Users with large-context models that degrade before 64K tokens
+# (e.g. Gemini Flash via proxies) can set ``compression.minimum_context_floor``
+# in config.yaml to lower the 64K default, but never below this hard limit.
+_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT = 16_000
+
+
+def get_configurable_minimum_context(config_floor: int | None = None) -> int:
+    """Return the effective minimum context floor for compression.
+
+    A *config_floor* (from ``compression.minimum_context_floor``) replaces the
+    64K default, typically lowering it for models that degrade earlier; values
+    below ``_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT`` are raised to that limit.
+    When *config_floor* is ``None`` the default ``MINIMUM_CONTEXT_LENGTH`` is used.
+    """
+    if config_floor is None:
+        return MINIMUM_CONTEXT_LENGTH
+    return max(config_floor, _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT)
+
 # Thin fallback defaults — only broad model family patterns.
 # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
 # all miss. Replaced the previous 80+ entry dict.
diff --git a/gateway/run.py b/gateway/run.py
index 4b285287b22..03695956b7c 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -13490,6 +13490,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
         ("model", "max_tokens"),
         ("compression", "enabled"),
         ("compression", "threshold"),
+        ("compression", "minimum_context_floor"),
         ("compression", "target_ratio"),
         ("compression", "protect_last_n"),
         ("agent", "disabled_toolsets"),
diff --git a/tests/gateway/test_agent_cache.py b/tests/gateway/test_agent_cache.py
index 559e1c0e96c..ce84d0a5945 100644
--- a/tests/gateway/test_agent_cache.py
+++ b/tests/gateway/test_agent_cache.py
@@ -226,6 +226,7 @@ class TestExtractCacheBustingConfig:
                 "compression": {
                     "enabled": False,
                     "threshold": 0.6,
+                    "minimum_context_floor": 32_000,
                     "target_ratio": 0.3,
                     "protect_last_n": 25,
                     "some_other_key": "ignored",
@@ -234,6 +235,7 @@ class TestExtractCacheBustingConfig:
         )
         assert out["compression.enabled"] is False
         assert out["compression.threshold"] == 0.6
+        assert out["compression.minimum_context_floor"] == 32_000
         assert out["compression.target_ratio"] == 0.3
         assert out["compression.protect_last_n"] == 25
 
@@ -386,7 +388,7 @@ class TestExtractCacheBustingConfig:
         extracted cache_keys change produces a new signature."""
         from gateway.run import GatewayRunner
 
-        runtime = {"api_key": "k", "base_url": "u", "provider": "p"}
+        runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
         cfg_before = {
             "model": {"context_length": 200_000},
             "compression": {"threshold": 0.50, "enabled": True},
@@ -409,6 +411,25 @@ class TestExtractCacheBustingConfig:
             "gateway's cached agent so the new threshold takes effect."
         )
 
+    def test_minimum_context_floor_edit_busts_cache(self):
+        """Gateway sessions must rebuild the agent when the configured floor changes."""
+        from gateway.run import GatewayRunner
+
+        runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
+        cfg_before = {"compression": {"minimum_context_floor": 64_000}}
+        cfg_after = {"compression": {"minimum_context_floor": 32_000}}
+
+        sig_before = GatewayRunner._agent_config_signature(
+            "m", runtime, [], "",
+            cache_keys=GatewayRunner._extract_cache_busting_config(cfg_before),
+        )
+        sig_after = GatewayRunner._agent_config_signature(
+            "m", runtime, [], "",
+            cache_keys=GatewayRunner._extract_cache_busting_config(cfg_after),
+        )
+
+        assert sig_before != sig_after
+
 
 class TestAgentCacheLifecycle:
     """End-to-end cache behavior with real AIAgent construction."""
diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py
index 3be0f0235a3..ba9dc8a3649 100644
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@@ -31,6 +31,7 @@ def _make_agent(
     compression_enabled: bool = True,
     threshold_percent: float = 0.50,
     main_context: int = 200_000,
+    minimum_context_floor: int = 64_000,
 ) -> AIAgent:
     """Build a minimal AIAgent with a compressor, skipping __init__."""
     agent = AIAgent.__new__(AIAgent)
@@ -57,6 +58,7 @@ def _make_agent(
     compressor = MagicMock(spec=ContextCompressor)
     compressor.context_length = main_context
     compressor.threshold_tokens = int(main_context * threshold_percent)
+    compressor._minimum_context_floor = minimum_context_floor
     agent.context_compressor = compressor
 
     return agent
@@ -121,6 +123,30 @@ def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
     assert "below the minimum" in err
 
 
+@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_configured_floor_allows_smaller_aux_model(mock_get_client, mock_ctx_len):
+    """A lowered compression floor also lowers the aux-model hard floor."""
+    agent = _make_agent(
+        main_context=96_000,
+        threshold_percent=0.50,
+        minimum_context_floor=32_000,
+    )
+    mock_client = MagicMock()
+    mock_client.base_url = "https://openrouter.ai/api/v1"
+    mock_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_client, "small-aux-model")
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    agent._check_compression_model_feasibility()
+
+    assert messages
+    assert "Auto-lowered" in messages[0]
+    assert agent.context_compressor.threshold_tokens == 32_768
+
+
 @patch("agent.model_metadata.get_model_context_length", return_value=200_000)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_no_warning_when_aux_context_sufficient(mock_get_client, mock_ctx_len):
diff --git a/tests/run_agent/test_compression_minimum_floor.py b/tests/run_agent/test_compression_minimum_floor.py
new file mode 100644
index 00000000000..b056a5a40ac
--- /dev/null
+++ b/tests/run_agent/test_compression_minimum_floor.py
@@ -0,0 +1,75 @@
+"""Configurable minimum context compression floor (#31600)."""
+
+from unittest.mock import patch
+
+from agent.model_metadata import (
+    MINIMUM_CONTEXT_LENGTH,
+    _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT,
+    get_configurable_minimum_context,
+)
+
+
+class TestConfigurableMinimumContext:
+    """Unit tests for ``get_configurable_minimum_context``."""
+
+    def test_default_returns_minimum_context_length(self):
+        assert get_configurable_minimum_context(None) == MINIMUM_CONTEXT_LENGTH
+        assert get_configurable_minimum_context() == MINIMUM_CONTEXT_LENGTH
+
+    def test_config_floor_respected(self):
+        assert get_configurable_minimum_context(32_000) == 32_000
+        assert get_configurable_minimum_context(65_536) == 65_536
+
+    def test_config_floor_clamped_to_hard_limit(self):
+        assert get_configurable_minimum_context(1_000) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
+        assert get_configurable_minimum_context(0) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
+        assert get_configurable_minimum_context(-1) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
+        assert get_configurable_minimum_context(15_999) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
+
+
+class TestContextCompressorFloor:
+    """Verify ContextCompressor uses the configurable floor in real paths."""
+
+    def test_default_floor_in_threshold(self):
+        from agent.context_compressor import ContextCompressor
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
+            cc = ContextCompressor(model="test", quiet_mode=True)
+
+        assert cc._minimum_context_floor == MINIMUM_CONTEXT_LENGTH
+        assert cc.threshold_tokens == MINIMUM_CONTEXT_LENGTH
+
+    def test_small_model_with_lowered_floor(self):
+        from agent.context_compressor import ContextCompressor
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=48_000):
+            cc = ContextCompressor(
+                model="test", quiet_mode=True, minimum_context_floor=24_000
+            )
+
+        assert cc._minimum_context_floor == 24_000
+        assert cc.threshold_tokens == 24_000
+
+    def test_floor_dominates_on_large_models_too(self):
+        from agent.context_compressor import ContextCompressor
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
+            cc = ContextCompressor(
+                model="test",
+                quiet_mode=True,
+                threshold_percent=0.02,
+                minimum_context_floor=32_000,
+            )
+
+        assert cc.threshold_tokens == 32_000
+
+    def test_update_model_uses_floor(self):
+        from agent.context_compressor import ContextCompressor
+
+        with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
+            cc = ContextCompressor(
+                model="test", quiet_mode=True, minimum_context_floor=40_000
+            )
+
+        cc.update_model("switched", context_length=64_000)
+        assert cc.threshold_tokens == 40_000