Revert "fix(compression): make minimum context floor configurable (#31600)"

This reverts commit cae1ee44a7.
This commit is contained in:
kshitijk4poor 2026-06-25 01:04:44 +05:30
parent 59acaa972f
commit e0272cfef2
8 changed files with 12 additions and 187 deletions

View file

@ -1358,17 +1358,6 @@ def init_agent(
compression_in_place = is_truthy_value(
_compression_cfg.get("in_place"), default=False
)
# Allow users to lower the compression floor for models whose
# structured output degrades well before the default 64K tokens.
# Clamped to a hard-coded safety limit of 16K by the compressor.
_raw_floor = _compression_cfg.get("minimum_context_floor", None)
if _raw_floor is not None:
try:
compression_minimum_context_floor = int(_raw_floor)
except (TypeError, ValueError):
compression_minimum_context_floor = None
else:
compression_minimum_context_floor = None
# Read optional explicit context_length override for the auxiliary
# compression model. Custom endpoints often cannot report this via
@ -1587,7 +1576,6 @@ def init_agent(
api_mode=agent.api_mode,
abort_on_summary_failure=compression_abort_on_summary_failure,
max_tokens=agent.max_tokens,
minimum_context_floor=compression_minimum_context_floor,
)
agent.compression_enabled = compression_enabled
agent.compression_in_place = compression_in_place

View file

@ -27,7 +27,6 @@ from agent.auxiliary_client import call_llm, _is_connection_error, aux_interrupt
from agent.context_engine import ContextEngine
from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
get_configurable_minimum_context,
get_model_context_length,
estimate_messages_tokens_rough,
)
@ -684,7 +683,6 @@ class ContextCompressor(ContextEngine):
self.max_tokens = self._coerce_max_tokens(max_tokens)
self.threshold_tokens = self._compute_threshold_tokens(
context_length, self.threshold_percent, self.max_tokens,
minimum_floor=self._minimum_context_floor,
)
# Recalculate token budgets for the new context length so the
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
@ -743,7 +741,6 @@ class ContextCompressor(ContextEngine):
@staticmethod
def _compute_threshold_tokens(
context_length: int, threshold_percent: float, max_tokens: int | None = None,
minimum_floor: int = MINIMUM_CONTEXT_LENGTH,
) -> int:
"""Compute the compaction trigger threshold in tokens.
@ -773,7 +770,7 @@ class ContextCompressor(ContextEngine):
if effective_window <= 0:
effective_window = context_length
pct_value = int(effective_window * threshold_percent)
floored = max(pct_value, minimum_floor)
floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
# If flooring pushed the threshold to/over the effective window it can
# never be reached. Trigger at 85% of the effective input budget so a
# minimum-context model rides most of its budget before compacting
@ -799,7 +796,6 @@ class ContextCompressor(ContextEngine):
api_mode: str = "",
abort_on_summary_failure: bool = False,
max_tokens: int | None = None,
minimum_context_floor: int | None = None,
):
self.model = model
self.base_url = base_url
@ -824,30 +820,19 @@ class ContextCompressor(ContextEngine):
# deterministic "summary unavailable" handoff and drop the middle window.
self.abort_on_summary_failure = abort_on_summary_failure
# Configurable compression floor — allows users with large-context
# models that degrade before 64K tokens to lower the bar (never below
# the hard-coded safety limit of 16K). When None, the default
# MINIMUM_CONTEXT_LENGTH (64K) applies.
self._minimum_context_floor = get_configurable_minimum_context(
minimum_context_floor
)
self.context_length = get_model_context_length(
model, base_url=base_url, api_key=api_key,
config_context_length=config_context_length,
provider=provider,
)
# Floor: never compress below the configured minimum context floor
# even if the percentage would suggest a lower value. This prevents
# premature compression on large-context models at 50% while keeping
# the % sane for models right at the minimum. _compute_threshold_tokens
# also guards the degenerate case where the floor would equal/exceed the
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
# the percentage would suggest a lower value. This prevents premature
# compression on large-context models at 50% while keeping the % sane
# for models right at the minimum. _compute_threshold_tokens also
# guards the degenerate case where the floor would equal/exceed the
# window (small models), so auto-compression can still fire (#14690).
# The floor is configurable via compression.minimum_context_floor for
# models whose structured output degrades well below 64K tokens.
self.threshold_tokens = self._compute_threshold_tokens(
self.context_length, threshold_percent, self.max_tokens,
minimum_floor=self._minimum_context_floor,
)
self.compression_count = 0

View file

@ -94,15 +94,9 @@ def check_compression_model_feasibility(agent: Any) -> None:
)
from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
get_configurable_minimum_context,
get_model_context_length,
)
# Configurable compression floor from the compressor instance
_compression_floor = getattr(
agent.context_compressor, "_minimum_context_floor", MINIMUM_CONTEXT_LENGTH
)
client, aux_model = get_text_auxiliary_client(
"compression",
main_runtime=agent._current_main_runtime(),
@ -162,18 +156,18 @@ def check_compression_model_feasibility(agent: Any) -> None:
)
# Hard floor: the auxiliary compression model must have at least
# the configured compression floor's worth of context. The main model
# is already required to meet its floor (checked earlier in
# MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model
# is already required to meet this floor (checked earlier in
# __init__), so the compression model must too — otherwise it
# cannot summarise a full threshold-sized window of main-model
# content. Mirrors the main-model rejection pattern.
if aux_context and aux_context < _compression_floor:
if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
raise ValueError(
f"Auxiliary compression model {aux_model} has a context "
f"window of {aux_context:,} tokens, which is below the "
f"minimum {_compression_floor:,} required by Hermes "
f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
f"Agent. Choose a compression model with at least "
f"{_compression_floor // 1000}K context (set "
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
f"auxiliary.compression.model in config.yaml), or set "
f"auxiliary.compression.context_length to override the "
f"detected value if it is wrong."

View file

@ -184,25 +184,6 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
# Sessions, model switches, and cron jobs should reject models below this.
MINIMUM_CONTEXT_LENGTH = 64_000
# Lower bound for user-configured compression floor overrides.
# Users with large-context models that degrade before 64K tokens
# (e.g. Gemini Flash via proxies) can use ``compression.minimum_context_floor``
# in config.yaml to lower this, but never below this hard safety limit.
_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT = 16_000
def get_configurable_minimum_context(config_floor: int | None = None) -> int:
"""Return the effective minimum context floor for compression.
When *config_floor* is provided (from ``compression.minimum_context_floor``),
it is clamped to the hard limit and returned, allowing users to lower the
default 64K compression floor for models that degrade earlier. When
*config_floor* is ``None`` the default ``MINIMUM_CONTEXT_LENGTH`` is used.
"""
if config_floor is None:
return MINIMUM_CONTEXT_LENGTH
return max(config_floor, _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT)
# Thin fallback defaults — only broad model family patterns.
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
# all miss. Replaced the previous 80+ entry dict.

View file

@ -13490,7 +13490,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
("model", "max_tokens"),
("compression", "enabled"),
("compression", "threshold"),
("compression", "minimum_context_floor"),
("compression", "target_ratio"),
("compression", "protect_last_n"),
("agent", "disabled_toolsets"),

View file

@ -226,7 +226,6 @@ class TestExtractCacheBustingConfig:
"compression": {
"enabled": False,
"threshold": 0.6,
"minimum_context_floor": 32_000,
"target_ratio": 0.3,
"protect_last_n": 25,
"some_other_key": "ignored",
@ -235,7 +234,6 @@ class TestExtractCacheBustingConfig:
)
assert out["compression.enabled"] is False
assert out["compression.threshold"] == 0.6
assert out["compression.minimum_context_floor"] == 32_000
assert out["compression.target_ratio"] == 0.3
assert out["compression.protect_last_n"] == 25
@ -388,7 +386,7 @@ class TestExtractCacheBustingConfig:
extracted cache_keys change produces a new signature."""
from gateway.run import GatewayRunner
runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
runtime = {"api_key": "k", "base_url": "u", "provider": "p"}
cfg_before = {
"model": {"context_length": 200_000},
"compression": {"threshold": 0.50, "enabled": True},
@ -411,25 +409,6 @@ class TestExtractCacheBustingConfig:
"gateway's cached agent so the new threshold takes effect."
)
def test_minimum_context_floor_edit_busts_cache(self):
    """A different ``compression.minimum_context_floor`` must change the agent signature."""
    from gateway.run import GatewayRunner

    runtime = {"api_key": "***", "base_url": "u", "provider": "p"}

    # Build one signature per floor value, in before/after order.
    signatures = [
        GatewayRunner._agent_config_signature(
            "m", runtime, [], "",
            cache_keys=GatewayRunner._extract_cache_busting_config(
                {"compression": {"minimum_context_floor": floor}}
            ),
        )
        for floor in (64_000, 32_000)
    ]

    sig_before, sig_after = signatures
    assert sig_before != sig_after
class TestAgentCacheLifecycle:
"""End-to-end cache behavior with real AIAgent construction."""

View file

@ -31,7 +31,6 @@ def _make_agent(
compression_enabled: bool = True,
threshold_percent: float = 0.50,
main_context: int = 200_000,
minimum_context_floor: int = 64_000,
) -> AIAgent:
"""Build a minimal AIAgent with a compressor, skipping __init__."""
agent = AIAgent.__new__(AIAgent)
@ -58,7 +57,6 @@ def _make_agent(
compressor = MagicMock(spec=ContextCompressor)
compressor.context_length = main_context
compressor.threshold_tokens = int(main_context * threshold_percent)
compressor._minimum_context_floor = minimum_context_floor
agent.context_compressor = compressor
return agent
@ -123,30 +121,6 @@ def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
assert "below the minimum" in err
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_configured_floor_allows_smaller_aux_model(mock_get_client, mock_ctx_len):
    """With a 32K floor, a 32,768-token aux model passes the feasibility check.

    The check is expected to emit an "Auto-lowered" status and leave the
    compressor threshold at the aux model's patched context length.
    """
    agent = _make_agent(
        minimum_context_floor=32_000,
        threshold_percent=0.50,
        main_context=96_000,
    )

    aux_client = MagicMock()
    aux_client.base_url = "https://openrouter.ai/api/v1"
    aux_client.api_key = "sk-aux"
    mock_get_client.return_value = (aux_client, "small-aux-model")

    statuses = []

    def _record(msg):
        # Capture every status message the agent emits.
        statuses.append(msg)

    agent._emit_status = _record

    agent._check_compression_model_feasibility()

    assert statuses
    assert "Auto-lowered" in statuses[0]
    assert agent.context_compressor.threshold_tokens == 32_768
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
def test_no_warning_when_aux_context_sufficient(mock_get_client, mock_ctx_len):

View file

@ -1,75 +0,0 @@
"""Configurable minimum context compression floor (#31600)."""
from unittest.mock import patch
from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT,
get_configurable_minimum_context,
)
class TestConfigurableMinimumContext:
    """Exercise ``get_configurable_minimum_context`` directly."""

    def test_default_returns_minimum_context_length(self):
        """An explicit ``None`` and an omitted argument both give the default."""
        for args in ((None,), ()):
            assert get_configurable_minimum_context(*args) == MINIMUM_CONTEXT_LENGTH

    def test_config_floor_respected(self):
        """Overrides at or above the hard limit are returned unchanged."""
        for floor in (32_000, 65_536):
            assert get_configurable_minimum_context(floor) == floor

    def test_config_floor_clamped_to_hard_limit(self):
        """Overrides below the hard limit are raised to it."""
        for too_low in (1_000, 0, -1, 15_999):
            assert (
                get_configurable_minimum_context(too_low)
                == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
            )
class TestContextCompressorFloor:
    """Check that ContextCompressor applies the configurable floor in real paths."""

    # Lookup that every test replaces with a fixed return value.
    _CTX_LOOKUP = "agent.context_compressor.get_model_context_length"

    def test_default_floor_in_threshold(self):
        """Without an override, floor and threshold equal the default minimum."""
        from agent.context_compressor import ContextCompressor

        with patch(self._CTX_LOOKUP, return_value=100_000):
            compressor = ContextCompressor(model="test", quiet_mode=True)
            assert compressor._minimum_context_floor == MINIMUM_CONTEXT_LENGTH
            assert compressor.threshold_tokens == MINIMUM_CONTEXT_LENGTH

    def test_small_model_with_lowered_floor(self):
        """A 24K override on a 48K window gives a 24K floor and threshold."""
        from agent.context_compressor import ContextCompressor

        with patch(self._CTX_LOOKUP, return_value=48_000):
            compressor = ContextCompressor(
                model="test",
                quiet_mode=True,
                minimum_context_floor=24_000,
            )
            assert compressor._minimum_context_floor == 24_000
            assert compressor.threshold_tokens == 24_000

    def test_floor_dominates_on_large_models_too(self):
        """A 2% threshold on a 1M window is still raised to the 32K floor."""
        from agent.context_compressor import ContextCompressor

        with patch(self._CTX_LOOKUP, return_value=1_000_000):
            compressor = ContextCompressor(
                model="test",
                quiet_mode=True,
                threshold_percent=0.02,
                minimum_context_floor=32_000,
            )
            assert compressor.threshold_tokens == 32_000

    def test_update_model_uses_floor(self):
        """Switching to a 64K model keeps the threshold at the 40K floor."""
        from agent.context_compressor import ContextCompressor

        with patch(self._CTX_LOOKUP, return_value=200_000):
            compressor = ContextCompressor(
                model="test",
                quiet_mode=True,
                minimum_context_floor=40_000,
            )
            compressor.update_model("switched", context_length=64_000)
            assert compressor.threshold_tokens == 40_000