fix(compression): three bugs causing auto-compression to never trigger

1. MINIMUM_CONTEXT_LENGTH floor makes threshold=100% when context_length==64000
   - When context_length equals MINIMUM_CONTEXT_LENGTH (64000), the floor
     value in threshold_tokens calculation dominates, making the threshold
     equal to 100% of the context window. The API errors out before
     prompt_tokens can reach that value, so compression never fires.
   - Fix: fall back to percentage-based value when floor >= context_length.
   - Closes #14690

2. Anti-thrashing protection permanently disables compression with no recovery
   - After 2 consecutive ineffective compressions (<10% savings each),
     should_compress() returns False forever. No timeout, decay, or
     auto-recovery mechanism exists — only /new resets the counter.
   - Fix: add time-based auto-recovery (300s). If enough time has passed
     since the last compression attempt, reset the counter.
   - Closes #14694

3. Post-compression token estimate excludes tools schema
   - After compression, last_prompt_tokens is set using
     estimate_messages_tokens_rough() which omits tools schema tokens
     (20-30K with 50+ tools). This causes the next compression cycle
     to trigger much later than the configured threshold.
   - Fix: use estimate_request_tokens_rough() which includes tools schema,
     consistent with the preflight compression check pattern.
   - Closes #14695
This commit is contained in:
devilardis 2026-04-24 03:06:09 +08:00
parent ce089169d5
commit c1c62b6eef
2 changed files with 42 additions and 11 deletions

View file

@@ -296,6 +296,7 @@ class ContextCompressor(ContextEngine):
         self._previous_summary = None
         self._last_compression_savings_pct = 100.0
         self._ineffective_compression_count = 0
+        self._last_compression_time = 0.0

     def update_model(
         self,
@@ -317,6 +318,8 @@ class ContextCompressor(ContextEngine):
             int(context_length * self.threshold_percent),
             MINIMUM_CONTEXT_LENGTH,
         )
+        if self.threshold_tokens >= context_length:
+            self.threshold_tokens = int(context_length * self.threshold_percent)

     def __init__(
         self,
@@ -353,10 +356,17 @@ class ContextCompressor(ContextEngine):
         # the percentage would suggest a lower value. This prevents premature
         # compression on large-context models at 50% while keeping the % sane
         # for models right at the minimum.
+        # However, when context_length <= MINIMUM_CONTEXT_LENGTH the floor
+        # would make threshold >= 100% of context, which is unreachable — the
+        # API errors out before prompt_tokens can reach that value. In that
+        # case fall back to the percentage-based value so compression can
+        # actually trigger.
         self.threshold_tokens = max(
             int(self.context_length * threshold_percent),
             MINIMUM_CONTEXT_LENGTH,
         )
+        if self.threshold_tokens >= self.context_length:
+            self.threshold_tokens = int(self.context_length * threshold_percent)

         self.compression_count = 0
         # Derive token budgets: ratio is relative to the threshold, not total context
@@ -388,6 +398,8 @@ class ContextCompressor(ContextEngine):
         # Anti-thrashing: track whether last compression was effective
         self._last_compression_savings_pct: float = 100.0
         self._ineffective_compression_count: int = 0
+        self._last_compression_time: float = 0.0
+        self._ANTI_THRASH_RECOVERY_SECONDS: float = 300.0
         self._summary_failure_cooldown_until: float = 0.0

     def update_from_response(self, usage: Dict[str, Any]):
@@ -406,15 +418,28 @@ class ContextCompressor(ContextEngine):
         if tokens < self.threshold_tokens:
             return False

         # Anti-thrashing: back off if recent compressions were ineffective
+        # Auto-recovery: if enough time has passed since the last compression
+        # attempt, reset the counter. Without this, a session that had two
+        # ineffective compressions early on will never auto-compress again,
+        # even as the context grows far beyond the threshold.
         if self._ineffective_compression_count >= 2:
-            if not self.quiet_mode:
-                logger.warning(
-                    "Compression skipped — last %d compressions saved <10%% each. "
-                    "Consider /new to start a fresh session, or /compress <topic> "
-                    "for focused compression.",
-                    self._ineffective_compression_count,
-                )
-            return False
+            _elapsed = time.monotonic() - self._last_compression_time
+            if _elapsed > self._ANTI_THRASH_RECOVERY_SECONDS:
+                self._ineffective_compression_count = 0
+                if not self.quiet_mode:
+                    logger.info(
+                        "Anti-thrashing reset: %.0fs since last compression attempt",
+                        _elapsed,
+                    )
+            else:
+                if not self.quiet_mode:
+                    logger.warning(
+                        "Compression skipped — last %d compressions saved <10%% each. "
+                        "Consider /new to start a fresh session, or /compress <topic> "
+                        "for focused compression.",
+                        self._ineffective_compression_count,
+                    )
+                return False

         return True

     # ------------------------------------------------------------------
@@ -1258,6 +1283,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
         # Anti-thrashing: track compression effectiveness
         savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0
         self._last_compression_savings_pct = savings_pct
+        self._last_compression_time = time.monotonic()
         if savings_pct < 10:
             self._ineffective_compression_count += 1
         else:

View file

@@ -7595,9 +7595,14 @@ class AIAgent:
         # Update token estimate after compaction so pressure calculations
         # use the post-compression count, not the stale pre-compression one.
-        _compressed_est = (
-            estimate_tokens_rough(new_system_prompt)
-            + estimate_messages_tokens_rough(compressed)
+        # Use estimate_request_tokens_rough (not estimate_messages_tokens_rough)
+        # to include tools schema tokens — with 50+ tools enabled, schemas alone
+        # can add 20-30K tokens, and omitting them causes the next compression
+        # cycle to trigger much later than the configured threshold.
+        _compressed_est = estimate_request_tokens_rough(
+            compressed,
+            system_prompt=new_system_prompt or "",
+            tools=self.tools or None,
         )
         self.context_compressor.last_prompt_tokens = _compressed_est
         self.context_compressor.last_completion_tokens = 0