mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 10:52:21 +00:00
fix(compression): auto-compression triggers at minimum context length (#14690)
The compaction threshold is max(context_length * threshold_percent, MINIMUM_CONTEXT_LENGTH=64000). The floor prevents premature compression on large models, but degenerates at small windows: a model at exactly 64000 ctx gets max(32000, 64000) = 64000 — a threshold equal to the ENTIRE window. should_compress() can then never fire, because the provider rejects the request before usage reaches 100%. Auto-compression silently never triggers for any model whose context_length <= MINIMUM / threshold_percent (e.g. 64K-per-slot local models). Centralize the calc in _compute_threshold_tokens(). When the floor would meet or exceed the context window, trigger at 85% of the window (_MIN_CTX_TRIGGER_RATIO) — high enough that a minimum-context model uses most of its budget before compacting (compacting at the 50% percentage would waste half the small window), but below 100% so compaction actually fires before the provider rejects the request. This mirrors the existing gpt-5.5/Codex 85% autoraise rationale. Large-context behavior (floor at 64000) is unchanged; both call sites (__init__ and update_model) use the shared helper. Co-authored-by: soynchux <soynchuux@gmail.com> Co-authored-by: LeonSGP43 <154585401+LeonSGP43@users.noreply.github.com> Co-authored-by: Tranquil-Flow <tranquil_flow@protonmail.com>
This commit is contained in:
parent
c6a0929875
commit
3509be7124
2 changed files with 79 additions and 7 deletions
|
|
@ -656,9 +656,8 @@ class ContextCompressor(ContextEngine):
|
|||
self.provider = provider
|
||||
self.api_mode = api_mode
|
||||
self.context_length = context_length
|
||||
self.threshold_tokens = max(
|
||||
int(context_length * self.threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
self.threshold_tokens = self._compute_threshold_tokens(
|
||||
context_length, self.threshold_percent
|
||||
)
|
||||
# Recalculate token budgets for the new context length so the
|
||||
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
|
||||
|
|
@ -690,6 +689,40 @@ class ContextCompressor(ContextEngine):
|
|||
self.awaiting_real_usage_after_compression = False
|
||||
self._ineffective_compression_count = 0
|
||||
|
||||
# When the MINIMUM_CONTEXT_LENGTH floor meets/exceeds a small context
|
||||
# window, compacting at the percentage (50% → 32K of a 64K window) wastes
|
||||
# half the usable context. Trigger near the top of the window instead so a
|
||||
# minimum-context model uses most of its budget before compacting — same
|
||||
# rationale as the gpt-5.5/Codex 85% autoraise.
|
||||
_MIN_CTX_TRIGGER_RATIO = 0.85
|
||||
|
||||
@staticmethod
def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
    """Compute the compaction trigger threshold in tokens.

    The base value is ``int(context_length * threshold_percent)``, raised to
    ``MINIMUM_CONTEXT_LENGTH`` when it is lower so large-context models
    don't compress prematurely at the percentage.

    That floor degenerates at small windows. For a model whose
    ``context_length`` is at/below the minimum (e.g. a 64K local model),
    ``max(0.5 * 64000, 64000) == 64000`` makes the threshold equal the
    ENTIRE window, so auto-compression can never fire because the provider
    rejects the request before usage reaches 100% (#14690). The same
    problem exists for windows only slightly larger than the floor: a
    65,536-token window floored at 64,000 would trigger at ~97.7% usage.

    Whenever the floor lifts the threshold above
    ``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window, the trigger is capped
    at that 85% mark: high enough that a small model uses most of its
    context before compacting, but with real headroom below 100%.

    Args:
        context_length: Size of the model's context window, in tokens.
        threshold_percent: Fraction of the window at which compaction
            should trigger (e.g. ``0.50``).

    Returns:
        The token count at which compaction should trigger. For a positive
        ``context_length`` that is at/below the floor the result is always
        in ``[1, context_length - 1]`` (or ``1`` when ``context_length``
        is 1). For a non-positive ``context_length`` the floored value is
        returned unchanged.
    """
    pct_value = int(context_length * threshold_percent)
    floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)

    if context_length <= 0:
        # No usable window size to cap against; keep the plain floor.
        return floored

    # 85% of the window, clamped so it is at least 1 and strictly below the
    # window itself.
    near_top = max(
        1,
        min(
            int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
            context_length - 1,
        ),
    )

    if floored >= context_length:
        # The threshold met/exceeded the window and could never be reached.
        return near_top

    if pct_value < MINIMUM_CONTEXT_LENGTH and floored > near_top:
        # The floor (not the configured percentage) pushed the trigger past
        # the 85% mark of a window just above the minimum. Cap it there, but
        # never drop below the percentage the caller actually asked for.
        return max(pct_value, near_top)

    return floored
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
|
|
@ -730,10 +763,11 @@ class ContextCompressor(ContextEngine):
|
|||
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
|
||||
# the percentage would suggest a lower value. This prevents premature
|
||||
# compression on large-context models at 50% while keeping the % sane
|
||||
# for models right at the minimum.
|
||||
self.threshold_tokens = max(
|
||||
int(self.context_length * threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
# for models right at the minimum. _compute_threshold_tokens also
|
||||
# guards the degenerate case where the floor would equal/exceed the
|
||||
# window (small models), so auto-compression can still fire (#14690).
|
||||
self.threshold_tokens = self._compute_threshold_tokens(
|
||||
self.context_length, threshold_percent
|
||||
)
|
||||
self.compression_count = 0
|
||||
|
||||
|
|
|
|||
|
|
@ -204,6 +204,44 @@ class TestCompress:
|
|||
f"#49307), found {count}x:\n{summary}"
|
||||
)
|
||||
|
||||
def test_threshold_below_window_at_minimum_ctx(self):
    """Regression for #14690.

    With ``context_length == MINIMUM_CONTEXT_LENGTH`` the floored threshold
    used to equal the whole window, so auto-compression could never fire.
    The trigger must now sit at 85% of the window: high enough not to
    waste the small budget, below 100% so it actually fires.
    """
    from agent.context_compressor import MINIMUM_CONTEXT_LENGTH

    window = MINIMUM_CONTEXT_LENGTH
    trigger = ContextCompressor._compute_threshold_tokens(window, 0.50)

    # Strictly inside the window ...
    assert trigger < MINIMUM_CONTEXT_LENGTH
    # ... and exactly 85% of 64000.
    assert trigger == 54400
|
||||
|
||||
def test_threshold_below_window_for_small_ctx(self):
    """A 32K window is smaller than the 64000 floor, so the trigger is 85%."""
    window = 32000
    trigger = ContextCompressor._compute_threshold_tokens(window, 0.50)

    # 85% of 32000.
    assert trigger == 27200
    assert trigger < 32000
|
||||
|
||||
def test_threshold_floored_for_large_ctx(self):
    """Large windows keep the pre-existing percentage/floor behaviour."""
    from agent.context_compressor import MINIMUM_CONTEXT_LENGTH

    compute = ContextCompressor._compute_threshold_tokens

    # 200K window: 50% is 100000, which is above the floor, so unchanged.
    assert compute(200000, 0.50) == 100000
    # 100K window: 50% is 50000, below the floor, so raised to MINIMUM.
    assert compute(100000, 0.50) == MINIMUM_CONTEXT_LENGTH
|
||||
|
||||
def test_minimum_ctx_model_can_actually_compress(self):
    """End-to-end check for a model at exactly the minimum context length.

    ``should_compress()`` must fire below the window (at the 85% trigger),
    not only at 100%.
    """
    # NOTE(review): the extent of the original ``with`` block is not
    # recoverable from this view; the whole body is kept under the patch,
    # which is the conservative reading.
    with patch("agent.context_compressor.get_model_context_length", return_value=64000):
        compressor = ContextCompressor(model="small-64k", quiet_mode=True)

        # Pin the window and recompute the trigger explicitly.
        compressor.context_length = 64000
        compressor.threshold_tokens = compressor._compute_threshold_tokens(
            64000, compressor.threshold_percent
        )

        assert compressor.threshold_tokens == 54400
        assert compressor.threshold_tokens < 64000

        # At 85%+ usage compaction fires; below it, it doesn't
        # (no premature compact).
        assert compressor.should_compress(55000) is True
        assert compressor.should_compress(40000) is False
|
||||
|
||||
def test_compression_increments_count(self, compressor):
|
||||
msgs = self._make_messages(10)
|
||||
# Default config (abort_on_summary_failure=False) — fallback path
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue