mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
Merge pull request #52112 from NousResearch/revert/52053-minimum-context-floor
revert(plugins): revert configurable minimum context floor (#52053)
This commit is contained in:
commit
ed1fdb5b61
8 changed files with 12 additions and 187 deletions
|
|
@ -1358,17 +1358,6 @@ def init_agent(
|
|||
compression_in_place = is_truthy_value(
|
||||
_compression_cfg.get("in_place"), default=False
|
||||
)
|
||||
# Allow users to lower the compression floor for models whose
|
||||
# structured output degrades well before the default 64K tokens.
|
||||
# Clamped to a hard-coded safety limit of 16K by the compressor.
|
||||
_raw_floor = _compression_cfg.get("minimum_context_floor", None)
|
||||
if _raw_floor is not None:
|
||||
try:
|
||||
compression_minimum_context_floor = int(_raw_floor)
|
||||
except (TypeError, ValueError):
|
||||
compression_minimum_context_floor = None
|
||||
else:
|
||||
compression_minimum_context_floor = None
|
||||
|
||||
# Read optional explicit context_length override for the auxiliary
|
||||
# compression model. Custom endpoints often cannot report this via
|
||||
|
|
@ -1587,7 +1576,6 @@ def init_agent(
|
|||
api_mode=agent.api_mode,
|
||||
abort_on_summary_failure=compression_abort_on_summary_failure,
|
||||
max_tokens=agent.max_tokens,
|
||||
minimum_context_floor=compression_minimum_context_floor,
|
||||
)
|
||||
agent.compression_enabled = compression_enabled
|
||||
agent.compression_in_place = compression_in_place
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ from agent.auxiliary_client import call_llm, _is_connection_error, aux_interrupt
|
|||
from agent.context_engine import ContextEngine
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
get_configurable_minimum_context,
|
||||
get_model_context_length,
|
||||
estimate_messages_tokens_rough,
|
||||
)
|
||||
|
|
@ -684,7 +683,6 @@ class ContextCompressor(ContextEngine):
|
|||
self.max_tokens = self._coerce_max_tokens(max_tokens)
|
||||
self.threshold_tokens = self._compute_threshold_tokens(
|
||||
context_length, self.threshold_percent, self.max_tokens,
|
||||
minimum_floor=self._minimum_context_floor,
|
||||
)
|
||||
# Recalculate token budgets for the new context length so the
|
||||
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
|
||||
|
|
@ -743,7 +741,6 @@ class ContextCompressor(ContextEngine):
|
|||
@staticmethod
|
||||
def _compute_threshold_tokens(
|
||||
context_length: int, threshold_percent: float, max_tokens: int | None = None,
|
||||
minimum_floor: int = MINIMUM_CONTEXT_LENGTH,
|
||||
) -> int:
|
||||
"""Compute the compaction trigger threshold in tokens.
|
||||
|
||||
|
|
@ -773,7 +770,7 @@ class ContextCompressor(ContextEngine):
|
|||
if effective_window <= 0:
|
||||
effective_window = context_length
|
||||
pct_value = int(effective_window * threshold_percent)
|
||||
floored = max(pct_value, minimum_floor)
|
||||
floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
|
||||
# If flooring pushed the threshold to/over the effective window it can
|
||||
# never be reached. Trigger at 85% of the effective input budget so a
|
||||
# minimum-context model rides most of its budget before compacting
|
||||
|
|
@ -799,7 +796,6 @@ class ContextCompressor(ContextEngine):
|
|||
api_mode: str = "",
|
||||
abort_on_summary_failure: bool = False,
|
||||
max_tokens: int | None = None,
|
||||
minimum_context_floor: int | None = None,
|
||||
):
|
||||
self.model = model
|
||||
self.base_url = base_url
|
||||
|
|
@ -824,30 +820,19 @@ class ContextCompressor(ContextEngine):
|
|||
# deterministic "summary unavailable" handoff and drop the middle window.
|
||||
self.abort_on_summary_failure = abort_on_summary_failure
|
||||
|
||||
# Configurable compression floor — allows users with large-context
|
||||
# models that degrade before 64K tokens to lower the bar (never below
|
||||
# the hard-coded safety limit of 16K). When None, the default
|
||||
# MINIMUM_CONTEXT_LENGTH (64K) applies.
|
||||
self._minimum_context_floor = get_configurable_minimum_context(
|
||||
minimum_context_floor
|
||||
)
|
||||
|
||||
self.context_length = get_model_context_length(
|
||||
model, base_url=base_url, api_key=api_key,
|
||||
config_context_length=config_context_length,
|
||||
provider=provider,
|
||||
)
|
||||
# Floor: never compress below the configured minimum context floor
|
||||
# even if the percentage would suggest a lower value. This prevents
|
||||
# premature compression on large-context models at 50% while keeping
|
||||
# the % sane for models right at the minimum. _compute_threshold_tokens
|
||||
# also guards the degenerate case where the floor would equal/exceed the
|
||||
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
|
||||
# the percentage would suggest a lower value. This prevents premature
|
||||
# compression on large-context models at 50% while keeping the % sane
|
||||
# for models right at the minimum. _compute_threshold_tokens also
|
||||
# guards the degenerate case where the floor would equal/exceed the
|
||||
# window (small models), so auto-compression can still fire (#14690).
|
||||
# The floor is configurable via compression.minimum_context_floor for
|
||||
# models whose structured output degrades well below 64K tokens.
|
||||
self.threshold_tokens = self._compute_threshold_tokens(
|
||||
self.context_length, threshold_percent, self.max_tokens,
|
||||
minimum_floor=self._minimum_context_floor,
|
||||
)
|
||||
self.compression_count = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -94,15 +94,9 @@ def check_compression_model_feasibility(agent: Any) -> None:
|
|||
)
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
get_configurable_minimum_context,
|
||||
get_model_context_length,
|
||||
)
|
||||
|
||||
# Configurable compression floor from the compressor instance
|
||||
_compression_floor = getattr(
|
||||
agent.context_compressor, "_minimum_context_floor", MINIMUM_CONTEXT_LENGTH
|
||||
)
|
||||
|
||||
client, aux_model = get_text_auxiliary_client(
|
||||
"compression",
|
||||
main_runtime=agent._current_main_runtime(),
|
||||
|
|
@ -162,18 +156,18 @@ def check_compression_model_feasibility(agent: Any) -> None:
|
|||
)
|
||||
|
||||
# Hard floor: the auxiliary compression model must have at least
|
||||
# the configured compression floor's worth of context. The main model
|
||||
# is already required to meet its floor (checked earlier in
|
||||
# MINIMUM_CONTEXT_LENGTH (64K) tokens of context. The main model
|
||||
# is already required to meet this floor (checked earlier in
|
||||
# __init__), so the compression model must too — otherwise it
|
||||
# cannot summarise a full threshold-sized window of main-model
|
||||
# content. Mirrors the main-model rejection pattern.
|
||||
if aux_context and aux_context < _compression_floor:
|
||||
if aux_context and aux_context < MINIMUM_CONTEXT_LENGTH:
|
||||
raise ValueError(
|
||||
f"Auxiliary compression model {aux_model} has a context "
|
||||
f"window of {aux_context:,} tokens, which is below the "
|
||||
f"minimum {_compression_floor:,} required by Hermes "
|
||||
f"minimum {MINIMUM_CONTEXT_LENGTH:,} required by Hermes "
|
||||
f"Agent. Choose a compression model with at least "
|
||||
f"{_compression_floor // 1000}K context (set "
|
||||
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context (set "
|
||||
f"auxiliary.compression.model in config.yaml), or set "
|
||||
f"auxiliary.compression.context_length to override the "
|
||||
f"detected value if it is wrong."
|
||||
|
|
|
|||
|
|
@ -184,25 +184,6 @@ DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
|
|||
# Sessions, model switches, and cron jobs should reject models below this.
|
||||
MINIMUM_CONTEXT_LENGTH = 64_000
|
||||
|
||||
# Lower bound for user-configured compression floor overrides.
|
||||
# Users with large-context models that degrade before 64K tokens
|
||||
# (e.g. Gemini Flash via proxies) can use ``compression.minimum_context_floor``
|
||||
# in config.yaml to lower this, but never below this hard safety limit.
|
||||
_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT = 16_000
|
||||
|
||||
|
||||
def get_configurable_minimum_context(config_floor: int | None = None) -> int:
|
||||
"""Return the effective minimum context floor for compression.
|
||||
|
||||
When *config_floor* is provided (from ``compression.minimum_context_floor``),
|
||||
it is clamped to the hard limit and returned, allowing users to lower the
|
||||
default 64K compression floor for models that degrade earlier. When
|
||||
*config_floor* is ``None`` the default ``MINIMUM_CONTEXT_LENGTH`` is used.
|
||||
"""
|
||||
if config_floor is None:
|
||||
return MINIMUM_CONTEXT_LENGTH
|
||||
return max(config_floor, _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT)
|
||||
|
||||
# Thin fallback defaults — only broad model family patterns.
|
||||
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
||||
# all miss. Replaced the previous 80+ entry dict.
|
||||
|
|
|
|||
|
|
@ -13490,7 +13490,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
("model", "max_tokens"),
|
||||
("compression", "enabled"),
|
||||
("compression", "threshold"),
|
||||
("compression", "minimum_context_floor"),
|
||||
("compression", "target_ratio"),
|
||||
("compression", "protect_last_n"),
|
||||
("agent", "disabled_toolsets"),
|
||||
|
|
|
|||
|
|
@ -226,7 +226,6 @@ class TestExtractCacheBustingConfig:
|
|||
"compression": {
|
||||
"enabled": False,
|
||||
"threshold": 0.6,
|
||||
"minimum_context_floor": 32_000,
|
||||
"target_ratio": 0.3,
|
||||
"protect_last_n": 25,
|
||||
"some_other_key": "ignored",
|
||||
|
|
@ -235,7 +234,6 @@ class TestExtractCacheBustingConfig:
|
|||
)
|
||||
assert out["compression.enabled"] is False
|
||||
assert out["compression.threshold"] == 0.6
|
||||
assert out["compression.minimum_context_floor"] == 32_000
|
||||
assert out["compression.target_ratio"] == 0.3
|
||||
assert out["compression.protect_last_n"] == 25
|
||||
|
||||
|
|
@ -388,7 +386,7 @@ class TestExtractCacheBustingConfig:
|
|||
extracted cache_keys change produces a new signature."""
|
||||
from gateway.run import GatewayRunner
|
||||
|
||||
runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
|
||||
runtime = {"api_key": "k", "base_url": "u", "provider": "p"}
|
||||
cfg_before = {
|
||||
"model": {"context_length": 200_000},
|
||||
"compression": {"threshold": 0.50, "enabled": True},
|
||||
|
|
@ -411,25 +409,6 @@ class TestExtractCacheBustingConfig:
|
|||
"gateway's cached agent so the new threshold takes effect."
|
||||
)
|
||||
|
||||
def test_minimum_context_floor_edit_busts_cache(self):
|
||||
"""Gateway sessions must rebuild the agent when the new floor changes."""
|
||||
from gateway.run import GatewayRunner
|
||||
|
||||
runtime = {"api_key": "***", "base_url": "u", "provider": "p"}
|
||||
cfg_before = {"compression": {"minimum_context_floor": 64_000}}
|
||||
cfg_after = {"compression": {"minimum_context_floor": 32_000}}
|
||||
|
||||
sig_before = GatewayRunner._agent_config_signature(
|
||||
"m", runtime, [], "",
|
||||
cache_keys=GatewayRunner._extract_cache_busting_config(cfg_before),
|
||||
)
|
||||
sig_after = GatewayRunner._agent_config_signature(
|
||||
"m", runtime, [], "",
|
||||
cache_keys=GatewayRunner._extract_cache_busting_config(cfg_after),
|
||||
)
|
||||
|
||||
assert sig_before != sig_after
|
||||
|
||||
|
||||
class TestAgentCacheLifecycle:
|
||||
"""End-to-end cache behavior with real AIAgent construction."""
|
||||
|
|
|
|||
|
|
@ -31,7 +31,6 @@ def _make_agent(
|
|||
compression_enabled: bool = True,
|
||||
threshold_percent: float = 0.50,
|
||||
main_context: int = 200_000,
|
||||
minimum_context_floor: int = 64_000,
|
||||
) -> AIAgent:
|
||||
"""Build a minimal AIAgent with a compressor, skipping __init__."""
|
||||
agent = AIAgent.__new__(AIAgent)
|
||||
|
|
@ -58,7 +57,6 @@ def _make_agent(
|
|||
compressor = MagicMock(spec=ContextCompressor)
|
||||
compressor.context_length = main_context
|
||||
compressor.threshold_tokens = int(main_context * threshold_percent)
|
||||
compressor._minimum_context_floor = minimum_context_floor
|
||||
agent.context_compressor = compressor
|
||||
|
||||
return agent
|
||||
|
|
@ -123,30 +121,6 @@ def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
|
|||
assert "below the minimum" in err
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_configured_floor_allows_smaller_aux_model(mock_get_client, mock_ctx_len):
|
||||
"""A lowered compression floor also lowers the aux-model hard floor."""
|
||||
agent = _make_agent(
|
||||
main_context=96_000,
|
||||
threshold_percent=0.50,
|
||||
minimum_context_floor=32_000,
|
||||
)
|
||||
mock_client = MagicMock()
|
||||
mock_client.base_url = "https://openrouter.ai/api/v1"
|
||||
mock_client.api_key = "sk-aux"
|
||||
mock_get_client.return_value = (mock_client, "small-aux-model")
|
||||
|
||||
messages = []
|
||||
agent._emit_status = lambda msg: messages.append(msg)
|
||||
|
||||
agent._check_compression_model_feasibility()
|
||||
|
||||
assert messages
|
||||
assert "Auto-lowered" in messages[0]
|
||||
assert agent.context_compressor.threshold_tokens == 32_768
|
||||
|
||||
|
||||
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
|
||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||
def test_no_warning_when_aux_context_sufficient(mock_get_client, mock_ctx_len):
|
||||
|
|
|
|||
|
|
@ -1,75 +0,0 @@
|
|||
"""Configurable minimum context compression floor (#31600)."""
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
_MINIMUM_CONTEXT_FLOOR_HARD_LIMIT,
|
||||
get_configurable_minimum_context,
|
||||
)
|
||||
|
||||
|
||||
class TestConfigurableMinimumContext:
|
||||
"""Unit tests for ``get_configurable_minimum_context``."""
|
||||
|
||||
def test_default_returns_minimum_context_length(self):
|
||||
assert get_configurable_minimum_context(None) == MINIMUM_CONTEXT_LENGTH
|
||||
assert get_configurable_minimum_context() == MINIMUM_CONTEXT_LENGTH
|
||||
|
||||
def test_config_floor_respected(self):
|
||||
assert get_configurable_minimum_context(32_000) == 32_000
|
||||
assert get_configurable_minimum_context(65_536) == 65_536
|
||||
|
||||
def test_config_floor_clamped_to_hard_limit(self):
|
||||
assert get_configurable_minimum_context(1_000) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
|
||||
assert get_configurable_minimum_context(0) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
|
||||
assert get_configurable_minimum_context(-1) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
|
||||
assert get_configurable_minimum_context(15_999) == _MINIMUM_CONTEXT_FLOOR_HARD_LIMIT
|
||||
|
||||
|
||||
class TestContextCompressorFloor:
|
||||
"""Verify ContextCompressor uses the configurable floor in real paths."""
|
||||
|
||||
def test_default_floor_in_threshold(self):
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
cc = ContextCompressor(model="test", quiet_mode=True)
|
||||
|
||||
assert cc._minimum_context_floor == MINIMUM_CONTEXT_LENGTH
|
||||
assert cc.threshold_tokens == MINIMUM_CONTEXT_LENGTH
|
||||
|
||||
def test_small_model_with_lowered_floor(self):
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=48_000):
|
||||
cc = ContextCompressor(
|
||||
model="test", quiet_mode=True, minimum_context_floor=24_000
|
||||
)
|
||||
|
||||
assert cc._minimum_context_floor == 24_000
|
||||
assert cc.threshold_tokens == 24_000
|
||||
|
||||
def test_floor_dominates_on_large_models_too(self):
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||
cc = ContextCompressor(
|
||||
model="test",
|
||||
quiet_mode=True,
|
||||
threshold_percent=0.02,
|
||||
minimum_context_floor=32_000,
|
||||
)
|
||||
|
||||
assert cc.threshold_tokens == 32_000
|
||||
|
||||
def test_update_model_uses_floor(self):
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||
cc = ContextCompressor(
|
||||
model="test", quiet_mode=True, minimum_context_floor=40_000
|
||||
)
|
||||
|
||||
cc.update_model("switched", context_length=64_000)
|
||||
assert cc.threshold_tokens == 40_000
|
||||
Loading…
Add table
Add a link
Reference in a new issue