fix(compression): three bugs causing auto-compression to never trigger

1. MINIMUM_CONTEXT_LENGTH floor makes threshold=100% when context_length==64000
   - When context_length equals MINIMUM_CONTEXT_LENGTH (64000), the floor
     value in threshold_tokens calculation dominates, making the threshold
     equal to 100% of the context window. The API errors out before
     prompt_tokens can reach that value, so compression never fires.
   - Fix: fall back to percentage-based value when floor >= context_length.
   - Closes #14690

2. Anti-thrashing protection permanently disables compression with no recovery
   - After 2 consecutive ineffective compressions (<10% savings each),
     should_compress() returns False forever. No timeout, decay, or
     auto-recovery mechanism exists — only /new resets the counter.
   - Fix: add time-based auto-recovery (300s). If enough time has passed
     since the last compression attempt, reset the counter.
   - Closes #14694

3. Post-compression token estimate excludes tools schema
   - After compression, last_prompt_tokens is set using
     estimate_messages_tokens_rough() which omits tools schema tokens
     (20-30K with 50+ tools). This causes the next compression cycle
     to trigger much later than the configured threshold.
   - Fix: use estimate_request_tokens_rough() which includes tools schema,
     consistent with the preflight compression check pattern.
   - Closes #14695
This commit is contained in:
devilardis 2026-04-24 03:06:09 +08:00
parent ce089169d5
commit c1c62b6eef
2 changed files with 42 additions and 11 deletions

View file

@@ -296,6 +296,7 @@ class ContextCompressor(ContextEngine):
         self._previous_summary = None
         self._last_compression_savings_pct = 100.0
         self._ineffective_compression_count = 0
+        self._last_compression_time = 0.0

     def update_model(
         self,
@@ -317,6 +318,8 @@ class ContextCompressor(ContextEngine):
             int(context_length * self.threshold_percent),
             MINIMUM_CONTEXT_LENGTH,
         )
+        if self.threshold_tokens >= context_length:
+            self.threshold_tokens = int(context_length * self.threshold_percent)

     def __init__(
         self,
@@ -353,10 +356,17 @@ class ContextCompressor(ContextEngine):
         # the percentage would suggest a lower value. This prevents premature
         # compression on large-context models at 50% while keeping the % sane
         # for models right at the minimum.
+        # However, when context_length <= MINIMUM_CONTEXT_LENGTH the floor
+        # would make threshold >= 100% of context, which is unreachable — the
+        # API errors out before prompt_tokens can reach that value. In that
+        # case fall back to the percentage-based value so compression can
+        # actually trigger.
         self.threshold_tokens = max(
             int(self.context_length * threshold_percent),
             MINIMUM_CONTEXT_LENGTH,
         )
+        if self.threshold_tokens >= self.context_length:
+            self.threshold_tokens = int(self.context_length * threshold_percent)

         self.compression_count = 0
         # Derive token budgets: ratio is relative to the threshold, not total context
@@ -388,6 +398,8 @@ class ContextCompressor(ContextEngine):
         # Anti-thrashing: track whether last compression was effective
         self._last_compression_savings_pct: float = 100.0
         self._ineffective_compression_count: int = 0
+        self._last_compression_time: float = 0.0
+        self._ANTI_THRASH_RECOVERY_SECONDS: float = 300.0
         self._summary_failure_cooldown_until: float = 0.0

     def update_from_response(self, usage: Dict[str, Any]):
@@ -406,15 +418,28 @@ class ContextCompressor(ContextEngine):
         if tokens < self.threshold_tokens:
             return False

         # Anti-thrashing: back off if recent compressions were ineffective
+        # Auto-recovery: if enough time has passed since the last compression
+        # attempt, reset the counter. Without this, a session that had two
+        # ineffective compressions early on will never auto-compress again,
+        # even as the context grows far beyond the threshold.
         if self._ineffective_compression_count >= 2:
-            if not self.quiet_mode:
-                logger.warning(
-                    "Compression skipped — last %d compressions saved <10%% each. "
-                    "Consider /new to start a fresh session, or /compress <topic> "
-                    "for focused compression.",
-                    self._ineffective_compression_count,
-                )
-            return False
+            _elapsed = time.monotonic() - self._last_compression_time
+            if _elapsed > self._ANTI_THRASH_RECOVERY_SECONDS:
+                self._ineffective_compression_count = 0
+                if not self.quiet_mode:
+                    logger.info(
+                        "Anti-thrashing reset: %.0fs since last compression attempt",
+                        _elapsed,
+                    )
+            else:
+                if not self.quiet_mode:
+                    logger.warning(
+                        "Compression skipped — last %d compressions saved <10%% each. "
+                        "Consider /new to start a fresh session, or /compress <topic> "
+                        "for focused compression.",
+                        self._ineffective_compression_count,
+                    )
+                return False

         return True

     # ------------------------------------------------------------------
@@ -1258,6 +1283,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
         # Anti-thrashing: track compression effectiveness
         savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0
         self._last_compression_savings_pct = savings_pct
+        self._last_compression_time = time.monotonic()
         if savings_pct < 10:
             self._ineffective_compression_count += 1
         else:

View file

@@ -7595,9 +7595,14 @@ class AIAgent:
         # Update token estimate after compaction so pressure calculations
         # use the post-compression count, not the stale pre-compression one.
-        _compressed_est = (
-            estimate_tokens_rough(new_system_prompt)
-            + estimate_messages_tokens_rough(compressed)
+        # Use estimate_request_tokens_rough (not estimate_messages_tokens_rough)
+        # to include tools schema tokens — with 50+ tools enabled, schemas alone
+        # can add 20-30K tokens, and omitting them causes the next compression
+        # cycle to trigger much later than the configured threshold.
+        _compressed_est = estimate_request_tokens_rough(
+            compressed,
+            system_prompt=new_system_prompt or "",
+            tools=self.tools or None,
         )
         self.context_compressor.last_prompt_tokens = _compressed_est
         self.context_compressor.last_completion_tokens = 0