fix(compression): auto-compression triggers at minimum context length (#14690)

The compaction threshold is max(context_length * threshold_percent, MINIMUM_CONTEXT_LENGTH=64000). The floor prevents premature compression on large models, but degenerates at small windows: a model at exactly 64000 ctx gets max(32000, 64000) = 64000 — a threshold equal to the ENTIRE window. should_compress() can then never fire, because the provider rejects the request before usage reaches 100%. Auto-compression silently never triggers for any model whose context_length <= MINIMUM / threshold_percent (e.g. 64K-per-slot local models). Centralize the calc in _compute_threshold_tokens(). When the floor would meet or exceed the context window, trigger at 85% of the window (_MIN_CTX_TRIGGER_RATIO) — high enough that a minimum-context model uses most of its budget before compacting (compacting at the 50% percentage would waste half the small window), but below 100% so compaction actually fires before the provider rejects the request. This mirrors the existing gpt-5.5/Codex 85% autoraise rationale. Large-context behavior (floor at 64000) is unchanged; both call sites (__init__ and update_model) use the shared helper. Co-authored-by: soynchux <soynchuux@gmail.com> Co-authored-by: LeonSGP43 <154585401+LeonSGP43@users.noreply.github.com> Co-authored-by: Tranquil-Flow <tranquil_flow@protonmail.com>
2026-06-24 10:52:21 +00:00 · 2026-06-20 23:32:38 -07:00 · 2026-06-20 23:32:38 -07:00 · 3509be7124
commit 3509be7124
parent c6a0929875
2 changed files with 79 additions and 7 deletions
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -656,9 +656,8 @@ class ContextCompressor(ContextEngine):
        self.provider = provider
        self.api_mode = api_mode
        self.context_length = context_length
-        self.threshold_tokens = max(
-            int(context_length * self.threshold_percent),
-            MINIMUM_CONTEXT_LENGTH,
+        self.threshold_tokens = self._compute_threshold_tokens(
+            context_length, self.threshold_percent
        )
        # Recalculate token budgets for the new context length so the
        # compressor stays calibrated after a model switch (e.g. 200K → 32K).
@ -690,6 +689,40 @@ class ContextCompressor(ContextEngine):
        self.awaiting_real_usage_after_compression = False
        self._ineffective_compression_count = 0

+    # When the MINIMUM_CONTEXT_LENGTH floor meets/exceeds a small context
+    # window, compacting at the percentage (50% → 32K of a 64K window) wastes
+    # half the usable context. Trigger near the top of the window instead so a
+    # minimum-context model uses most of its budget before compacting — same
+    # rationale as the gpt-5.5/Codex 85% autoraise.
+    _MIN_CTX_TRIGGER_RATIO = 0.85
+
+    @staticmethod
+    def _compute_threshold_tokens(context_length: int, threshold_percent: float) -> int:
+        """Compute the compaction trigger threshold in tokens.
+
+        The base value is ``context_length * threshold_percent``, floored at
+        ``MINIMUM_CONTEXT_LENGTH`` so large-context models don't compress
+        prematurely at 50%. BUT that floor degenerates at small windows: for a
+        model whose ``context_length`` is at/below the minimum (e.g. a 64K
+        local model), ``max(0.5*64000, 64000) == 64000`` makes the threshold
+        equal the ENTIRE window — auto-compression can never fire because the
+        provider rejects the request before usage reaches 100% (#14690).
+
+        When the floor would meet or exceed the context window, trigger at
+        ``_MIN_CTX_TRIGGER_RATIO`` (85%) of the window — high enough that a
+        small model uses most of its context before compacting, but below
+        100% so compaction fires before the provider rejects the request.
+        """
+        pct_value = int(context_length * threshold_percent)
+        floored = max(pct_value, MINIMUM_CONTEXT_LENGTH)
+        # If flooring pushed the threshold to/over the window it can never be
+        # reached. Trigger at 85% of the window so a minimum-context model
+        # rides most of its budget before compacting instead of wasting half.
+        if context_length > 0 and floored >= context_length:
+            return max(1, min(int(context_length * ContextCompressor._MIN_CTX_TRIGGER_RATIO),
+                              context_length - 1))
+        return floored
+
    def __init__(
        self,
        model: str,
@ -730,10 +763,11 @@ class ContextCompressor(ContextEngine):
        # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
        # the percentage would suggest a lower value.  This prevents premature
        # compression on large-context models at 50% while keeping the % sane
-        # for models right at the minimum.
-        self.threshold_tokens = max(
-            int(self.context_length * threshold_percent),
-            MINIMUM_CONTEXT_LENGTH,
+        # for models right at the minimum. _compute_threshold_tokens also
+        # guards the degenerate case where the floor would equal/exceed the
+        # window (small models), so auto-compression can still fire (#14690).
+        self.threshold_tokens = self._compute_threshold_tokens(
+            self.context_length, threshold_percent
        )
        self.compression_count = 0

--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@ -204,6 +204,44 @@ class TestCompress:
            f"#49307), found {count}x:\n{summary}"
        )

+    def test_threshold_below_window_at_minimum_ctx(self):
+        """Regression for #14690: at context_length == MINIMUM_CONTEXT_LENGTH
+        the floored threshold used to equal the whole window, so
+        auto-compression could never fire. It now triggers at 85% of the
+        window — high enough not to waste the small budget, below 100% so it
+        actually fires."""
+        from agent.context_compressor import MINIMUM_CONTEXT_LENGTH
+        t = ContextCompressor._compute_threshold_tokens(MINIMUM_CONTEXT_LENGTH, 0.50)
+        assert t < MINIMUM_CONTEXT_LENGTH
+        assert t == 54400  # 85% of 64000
+
+    def test_threshold_below_window_for_small_ctx(self):
+        # 32K model: the 64000 floor exceeds the window — trigger at 85%.
+        t = ContextCompressor._compute_threshold_tokens(32000, 0.50)
+        assert t == 27200  # 85% of 32000
+        assert t < 32000
+
+    def test_threshold_floored_for_large_ctx(self):
+        from agent.context_compressor import MINIMUM_CONTEXT_LENGTH
+        # 200K model at 50% = 100000 (above floor) — unchanged.
+        assert ContextCompressor._compute_threshold_tokens(200000, 0.50) == 100000
+        # 100K model at 50% = 50000 (below floor) — floored to MINIMUM.
+        assert ContextCompressor._compute_threshold_tokens(100000, 0.50) == MINIMUM_CONTEXT_LENGTH
+
+    def test_minimum_ctx_model_can_actually_compress(self):
+        """End-to-end: a model at exactly the minimum context length must have
+        should_compress() fire below its window (at the 85% trigger), not only
+        at 100%."""
+        with patch("agent.context_compressor.get_model_context_length", return_value=64000):
+            c = ContextCompressor(model="small-64k", quiet_mode=True)
+            c.context_length = 64000
+            c.threshold_tokens = c._compute_threshold_tokens(64000, c.threshold_percent)
+        assert c.threshold_tokens == 54400
+        assert c.threshold_tokens < 64000
+        # At 85%+ usage compaction fires; below it, it doesn't (no premature compact).
+        assert c.should_compress(55000) is True
+        assert c.should_compress(40000) is False
+
    def test_compression_increments_count(self, compressor):
        msgs = self._make_messages(10)
        # Default config (abort_on_summary_failure=False) — fallback path