fix(compression): auto-compression triggers at minimum context length (#14690)

The compaction threshold is max(context_length * threshold_percent,
MINIMUM_CONTEXT_LENGTH=64000). The floor prevents premature compression on
large models, but degenerates at small windows: a model at exactly 64000
ctx gets max(32000, 64000) = 64000 — a threshold equal to the ENTIRE
window. should_compress() can then never fire, because the provider
rejects the request before usage reaches 100%. Auto-compression silently
never triggers for any model whose context_length <= MINIMUM_CONTEXT_LENGTH
(e.g. 64K-per-slot local models).

Centralize the calc in _compute_threshold_tokens(). When the floor would
meet or exceed the context window, trigger at 85% of the window
(_MIN_CTX_TRIGGER_RATIO) — high enough that a minimum-context model uses
most of its budget before compacting (compacting at the 50% percentage
would waste half the small window), but below 100% so compaction actually
fires before the provider rejects the request. This mirrors the existing
gpt-5.5/Codex 85% autoraise rationale. Large-context behavior (floor at
64000) is unchanged; both call sites (__init__ and update_model) use the
shared helper.

Co-authored-by: soynchux <soynchuux@gmail.com>
Co-authored-by: LeonSGP43 <154585401+LeonSGP43@users.noreply.github.com>
Co-authored-by: Tranquil-Flow <tranquil_flow@protonmail.com>
This commit is contained in:
teknium1 2026-06-20 23:32:38 -07:00 committed by Teknium
parent c6a0929875
commit 3509be7124
2 changed files with 79 additions and 7 deletions

View file

@ -656,9 +656,8 @@ class ContextCompressor(ContextEngine):
self.provider = provider
self.api_mode = api_mode
self.context_length = context_length
self.threshold_tokens = max(
int(context_length * self.threshold_percent),
MINIMUM_CONTEXT_LENGTH,
self.threshold_tokens = self._compute_threshold_tokens(
context_length, self.threshold_percent
)
# Recalculate token budgets for the new context length so the
# compressor stays calibrated after a model switch (e.g. 200K → 32K).
@ -690,6 +689,40 @@ class ContextCompressor(ContextEngine):
self.awaiting_real_usage_after_compression = False
self._ineffective_compression_count = 0
# Fraction of the context window used as the compaction trigger by
# _compute_threshold_tokens() when the MINIMUM_CONTEXT_LENGTH floor
# meets/exceeds a small context window. Compacting at the percentage
# instead (50% → 32K of a 64K window) wastes half the usable context, so
# trigger near the top of the window: a minimum-context model uses most of
# its budget before compacting — same rationale as the gpt-5.5/Codex 85%
# autoraise.
_MIN_CTX_TRIGGER_RATIO = 0.85
@staticmethod
def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
    """Return the token count at which auto-compaction should trigger.

    Normally this is ``context_length * threshold_percent`` raised to at
    least ``MINIMUM_CONTEXT_LENGTH``, so large-context models are not
    compacted prematurely by the percentage alone.

    That floor breaks down for small windows: once the floored value is
    greater than or equal to ``context_length`` the threshold can never be
    reached — the provider rejects the request before usage hits 100% of
    the window (#14690). In that case the trigger falls back to
    ``_MIN_CTX_TRIGGER_RATIO`` of the window, capped at
    ``context_length - 1`` and never less than 1.

    Args:
        context_length: Size of the model's context window, in tokens. A
            non-positive value skips the small-window fallback and yields
            the floored value.
        threshold_percent: Fraction of the window at which to compact
            (e.g. ``0.50``).

    Returns:
        The trigger threshold in tokens.
    """
    floor_applied = max(
        int(context_length * threshold_percent),
        MINIMUM_CONTEXT_LENGTH,
    )

    # The floored value is usable whenever it sits strictly inside the
    # window (or the window size is not a positive number).
    reachable = context_length <= 0 or floor_applied < context_length
    if reachable:
        return floor_applied

    # Otherwise trigger near the top of the window, but always strictly
    # below it and never below a single token.
    near_top = int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO)
    capped = min(near_top, context_length - 1)
    return max(1, capped)
def __init__(
self,
model: str,
@ -730,10 +763,11 @@ class ContextCompressor(ContextEngine):
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
# the percentage would suggest a lower value. This prevents premature
# compression on large-context models at 50% while keeping the % sane
# for models right at the minimum.
self.threshold_tokens = max(
int(self.context_length * threshold_percent),
MINIMUM_CONTEXT_LENGTH,
# for models right at the minimum. _compute_threshold_tokens also
# guards the degenerate case where the floor would equal/exceed the
# window (small models), so auto-compression can still fire (#14690).
self.threshold_tokens = self._compute_threshold_tokens(
self.context_length, threshold_percent
)
self.compression_count = 0

View file

@ -204,6 +204,44 @@ class TestCompress:
f"#49307), found {count}x:\n{summary}"
)
def test_threshold_below_window_at_minimum_ctx(self):
    """Regression for #14690.

    With ``context_length == MINIMUM_CONTEXT_LENGTH`` the floored threshold
    used to equal the whole window, so auto-compression could never fire.
    The trigger is now 85% of the window — high enough not to waste the
    small budget, yet below 100% so it actually fires.
    """
    from agent.context_compressor import MINIMUM_CONTEXT_LENGTH

    window = MINIMUM_CONTEXT_LENGTH
    threshold = ContextCompressor._compute_threshold_tokens(window, 0.50)

    assert threshold < window
    assert threshold == 54400  # 85% of 64000
def test_threshold_below_window_for_small_ctx(self):
    """A 32K window is smaller than the 64000 floor, so the 85% trigger applies."""
    window = 32000
    threshold = ContextCompressor._compute_threshold_tokens(window, 0.50)

    assert threshold == 27200  # 85% of 32000
    assert threshold < window
def test_threshold_floored_for_large_ctx(self):
    """Large windows keep the pre-existing percentage-or-floor behavior."""
    from agent.context_compressor import MINIMUM_CONTEXT_LENGTH

    compute = ContextCompressor._compute_threshold_tokens

    # 200K window: 50% is 100000, already above the floor — left as-is.
    assert compute(200000, 0.50) == 100000
    # 100K window: 50% is 50000, below the floor — raised to MINIMUM.
    assert compute(100000, 0.50) == MINIMUM_CONTEXT_LENGTH
def test_minimum_ctx_model_can_actually_compress(self):
    """End-to-end: a model at exactly the minimum context length must have
    should_compress() fire below its window (at the 85% trigger), not only
    at 100%."""
    # Stub the context-length lookup to report 64000 while constructing.
    with patch("agent.context_compressor.get_model_context_length", return_value=64000):
        c = ContextCompressor(model="small-64k", quiet_mode=True)
    # NOTE(review): the two assignments below overwrite whatever __init__
    # computed, so the assertions that follow pin the helper's result rather
    # than __init__'s own wiring — presumably defensive against other
    # context-length resolution paths; confirm whether they can be dropped.
    c.context_length = 64000
    c.threshold_tokens = c._compute_threshold_tokens(64000, c.threshold_percent)
    assert c.threshold_tokens == 54400
    assert c.threshold_tokens < 64000
    # At 85%+ usage compaction fires; below it, it doesn't (no premature compact).
    assert c.should_compress(55000) is True
    assert c.should_compress(40000) is False
def test_compression_increments_count(self, compressor):
msgs = self._make_messages(10)
# Default config (abort_on_summary_failure=False) — fallback path