diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 70588940eda..2eb896a9934 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -656,9 +656,8 @@ class ContextCompressor(ContextEngine):
         self.provider = provider
         self.api_mode = api_mode
         self.context_length = context_length
-        self.threshold_tokens = max(
-            int(context_length * self.threshold_percent),
-            MINIMUM_CONTEXT_LENGTH,
+        self.threshold_tokens = self._compute_threshold_tokens(
+            context_length, self.threshold_percent
         )
         # Recalculate token budgets for the new context length so the
         # compressor stays calibrated after a model switch (e.g. 200K → 32K).
@@ -690,6 +689,56 @@ class ContextCompressor(ContextEngine):
         self.awaiting_real_usage_after_compression = False
         self._ineffective_compression_count = 0
 
+    # When the MINIMUM_CONTEXT_LENGTH floor meets/exceeds a small context
+    # window, compacting at the percentage (50% → 32K of a 64K window) wastes
+    # half the usable context. Trigger near the top of the window instead so a
+    # minimum-context model uses most of its budget before compacting — same
+    # rationale as the gpt-5.5/Codex 85% autoraise.
+    _MIN_CTX_TRIGGER_RATIO = 0.85
+
+    @staticmethod
+    def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
+        """Compute the compaction trigger threshold in tokens.
+
+        The base value is ``context_length * threshold_percent``, floored at
+        ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
+        prematurely at 50%. BUT that floor degenerates at small windows: for a
+        model whose ``context_length`` is at/below the minimum (e.g. a 64K
+        local model), ``max(0.5*64000, 64000) == 64000`` makes the threshold
+        equal the ENTIRE window — auto-compression can never fire because the
+        provider rejects the request before usage reaches 100% (#14690).
+
+        When the floor would meet or exceed the context window, trigger at
+        ``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window — high enough that a
+        small model uses most of its context before compacting, but below
+        100% so compaction fires before the provider rejects the request.
+
+        The same cap applies to windows just above the minimum (e.g. 65K),
+        where the raw floor would sit below the window but still at ~98% of
+        it: the floor never lifts the threshold past the 85% trigger point.
+        A percentage value above the floor and below the window is unchanged.
+        """
+        pct_value = int(context_length * threshold_percent)
+        floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
+        if context_length <= 0:
+            # Unknown/invalid window: nothing to cap against, keep the floor.
+            return floored
+        # Highest value the floor may produce: 85% of the window, and always
+        # at least one token below the full window.
+        trigger_cap = max(
+            1,
+            min(
+                int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
+                context_length - 1,
+            ),
+        )
+        # If flooring pushed the threshold to/over the window it can never be
+        # reached, so fall back to the 85% trigger point.
+        if floored >= context_length:
+            return trigger_cap
+        # Otherwise apply the floor, but never let it exceed the trigger cap.
+        return max(pct_value, min(MINIMUM_CONTEXT_LENGTH, trigger_cap))
+
     def __init__(
         self,
         model: str,
@@ -730,10 +779,11 @@ class ContextCompressor(ContextEngine):
         # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
         # the percentage would suggest a lower value. This prevents premature
         # compression on large-context models at 50% while keeping the % sane
-        # for models right at the minimum.
-        self.threshold_tokens = max(
-            int(self.context_length * threshold_percent),
-            MINIMUM_CONTEXT_LENGTH,
+        # for models right at the minimum. _compute_threshold_tokens also
+        # guards the degenerate case where the floor would equal/exceed the
+        # window (small models), so auto-compression can still fire (#14690).
+        self.threshold_tokens = self._compute_threshold_tokens(
+            self.context_length, threshold_percent
         )
 
         self.compression_count = 0
diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py
index 24b1c4cbe2b..084cb446b4d 100644
--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@@ -204,6 +204,53 @@ class TestCompress:
                 f"#49307), found {count}x:\n{summary}"
             )
 
+    def test_threshold_below_window_at_minimum_ctx(self):
+        """Regression for #14690: at context_length == MINIMUM_CONTEXT_LENGTH
+        the floored threshold used to equal the whole window, so
+        auto-compression could never fire. It now triggers at 85% of the
+        window — high enough not to waste the small budget, below 100% so it
+        actually fires."""
+        from agent.context_compressor import MINIMUM_CONTEXT_LENGTH
+        t = ContextCompressor._compute_threshold_tokens(MINIMUM_CONTEXT_LENGTH, 0.50)
+        assert t < MINIMUM_CONTEXT_LENGTH
+        assert t == 54400  # 85% of 64000
+
+    def test_threshold_below_window_for_small_ctx(self):
+        # 32K model: the 64000 floor exceeds the window — trigger at 85%.
+        t = ContextCompressor._compute_threshold_tokens(32000, 0.50)
+        assert t == 27200  # 85% of 32000
+        assert t < 32000
+
+    def test_threshold_floored_for_large_ctx(self):
+        from agent.context_compressor import MINIMUM_CONTEXT_LENGTH
+        # 200K model at 50% = 100000 (above floor) — unchanged.
+        assert ContextCompressor._compute_threshold_tokens(200000, 0.50) == 100000
+        # 100K model at 50% = 50000 (below floor) — floored to MINIMUM.
+        assert ContextCompressor._compute_threshold_tokens(100000, 0.50) == MINIMUM_CONTEXT_LENGTH
+
+    def test_threshold_capped_just_above_minimum_ctx(self):
+        from agent.context_compressor import MINIMUM_CONTEXT_LENGTH
+        # 65K model: the 64000 floor is below the window but ~98% of it, so
+        # the floor is capped at the 85% trigger instead.
+        t = ContextCompressor._compute_threshold_tokens(65000, 0.50)
+        assert t == 55250  # 85% of 65000
+        # 80K model: 85% (68000) is above the floor — plain floor applies.
+        assert ContextCompressor._compute_threshold_tokens(80000, 0.50) == MINIMUM_CONTEXT_LENGTH
+
+    def test_minimum_ctx_model_can_actually_compress(self):
+        """End-to-end: a model at exactly the minimum context length must have
+        should_compress() fire below its window (at the 85% trigger), not only
+        at 100%."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=64000):
+            c = ContextCompressor(model="small-64k", quiet_mode=True)
+        c.context_length = 64000
+        c.threshold_tokens = c._compute_threshold_tokens(64000, c.threshold_percent)
+        assert c.threshold_tokens == 54400
+        assert c.threshold_tokens < 64000
+        # At 85%+ usage compaction fires; below it, it doesn't (no premature compact).
+        assert c.should_compress(55000) is True
+        assert c.should_compress(40000) is False
+
     def test_compression_increments_count(self, compressor):
         msgs = self._make_messages(10)
         # Default config (abort_on_summary_failure=False) — fallback path