fix(compression): replace dead summary_target_tokens with ratio-based scaling (#2554)

The summary_target_tokens parameter was accepted in the constructor,
stored on the instance, and never used — the summary budget was always
computed from hardcoded module constants (_SUMMARY_RATIO=0.20,
_MAX_SUMMARY_TOKENS=8000). This caused two compounding problems:

1. The config value was silently ignored, giving users no control
   over post-compression size.
2. Fixed budgets (20K tail, 8K summary cap) didn't scale with
   context window size. Switching from a 1M-context model to a
   200K model would trigger compression that nuked 350K tokens
   of conversation history down to ~30K.

Changes:
- Replace summary_target_tokens with summary_target_ratio (default 0.40)
  which sets the post-compression target as a fraction of context_length.
  Tail token budget and summary cap now scale proportionally:
    MiniMax 200K → ~80K post-compression
    GPT-5   1M  → ~400K post-compression
- Change threshold_percent default: 0.50 → 0.80 (don't fire until
  80% of context is consumed)
- Change protect_last_n default: 4 → 20 (preserve ~10 full turns)
- Summary token cap scales to 5% of context (was fixed 8K), capped
  at 32K ceiling
- Read target_ratio and protect_last_n from config.yaml compression
  section (both are now configurable)
- Remove hardcoded summary_target_tokens=500 from run_agent.py
- Add 5 new tests for ratio scaling, clamping, and new defaults
Commit metadata:
- Commit: 9231a335d4 (parent: 7efaa5968d)
- Authored by Teknium on 2026-03-24 17:45:49 -07:00; committed by GitHub
- Signature: no known key found in database (GPG key ID: B5690EEEBB952194)
- Diffstat: 4 changed files, 103 additions, 21 deletions

View file

@ -35,14 +35,12 @@ SUMMARY_PREFIX = (
) )
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:" LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
# Minimum / maximum tokens for the summary output # Minimum tokens for the summary output
_MIN_SUMMARY_TOKENS = 2000 _MIN_SUMMARY_TOKENS = 2000
_MAX_SUMMARY_TOKENS = 8000
# Proportion of compressed content to allocate for summary # Proportion of compressed content to allocate for summary
_SUMMARY_RATIO = 0.20 _SUMMARY_RATIO = 0.20
# Absolute ceiling for summary tokens (even on very large context windows)
# Token budget for tail protection (keep most-recent context) _SUMMARY_TOKENS_CEILING = 32_000
_DEFAULT_TAIL_TOKEN_BUDGET = 20_000
# Placeholder used when pruning old tool results # Placeholder used when pruning old tool results
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]" _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
@ -65,10 +63,10 @@ class ContextCompressor:
def __init__( def __init__(
self, self,
model: str, model: str,
threshold_percent: float = 0.50, threshold_percent: float = 0.80,
protect_first_n: int = 3, protect_first_n: int = 3,
protect_last_n: int = 4, protect_last_n: int = 20,
summary_target_tokens: int = 2500, summary_target_ratio: float = 0.40,
quiet_mode: bool = False, quiet_mode: bool = False,
summary_model_override: str = None, summary_model_override: str = None,
base_url: str = "", base_url: str = "",
@ -83,7 +81,7 @@ class ContextCompressor:
self.threshold_percent = threshold_percent self.threshold_percent = threshold_percent
self.protect_first_n = protect_first_n self.protect_first_n = protect_first_n
self.protect_last_n = protect_last_n self.protect_last_n = protect_last_n
self.summary_target_tokens = summary_target_tokens self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
self.quiet_mode = quiet_mode self.quiet_mode = quiet_mode
self.context_length = get_model_context_length( self.context_length = get_model_context_length(
@ -94,12 +92,22 @@ class ContextCompressor:
self.threshold_tokens = int(self.context_length * threshold_percent) self.threshold_tokens = int(self.context_length * threshold_percent)
self.compression_count = 0 self.compression_count = 0
# Derive token budgets from the target ratio and context length
target_tokens = int(self.context_length * self.summary_target_ratio)
self.tail_token_budget = target_tokens
self.max_summary_tokens = min(
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
)
if not quiet_mode: if not quiet_mode:
logger.info( logger.info(
"Context compressor initialized: model=%s context_length=%d " "Context compressor initialized: model=%s context_length=%d "
"threshold=%d (%.0f%%) provider=%s base_url=%s", "threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d "
"provider=%s base_url=%s",
model, self.context_length, self.threshold_tokens, model, self.context_length, self.threshold_tokens,
threshold_percent * 100, provider or "none", base_url or "none", threshold_percent * 100, self.summary_target_ratio * 100,
self.tail_token_budget,
provider or "none", base_url or "none",
) )
self._context_probed = False # True after a step-down from context error self._context_probed = False # True after a step-down from context error
@ -179,10 +187,15 @@ class ContextCompressor:
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int: def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int:
"""Scale summary token budget with the amount of content being compressed.""" """Scale summary token budget with the amount of content being compressed.
The maximum scales with the model's context window (5% of context,
capped at ``_SUMMARY_TOKENS_CEILING``) so large-context models get
richer summaries instead of being hard-capped at 8K tokens.
"""
content_tokens = estimate_messages_tokens_rough(turns_to_summarize) content_tokens = estimate_messages_tokens_rough(turns_to_summarize)
budget = int(content_tokens * _SUMMARY_RATIO) budget = int(content_tokens * _SUMMARY_RATIO)
return max(_MIN_SUMMARY_TOKENS, min(budget, _MAX_SUMMARY_TOKENS)) return max(_MIN_SUMMARY_TOKENS, min(budget, self.max_summary_tokens))
def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str:
"""Serialize conversation turns into labeled text for the summarizer. """Serialize conversation turns into labeled text for the summarizer.
@ -477,14 +490,20 @@ Write only the summary body. Do not include any preamble or prefix."""
def _find_tail_cut_by_tokens( def _find_tail_cut_by_tokens(
self, messages: List[Dict[str, Any]], head_end: int, self, messages: List[Dict[str, Any]], head_end: int,
token_budget: int = _DEFAULT_TAIL_TOKEN_BUDGET, token_budget: int | None = None,
) -> int: ) -> int:
"""Walk backward from the end of messages, accumulating tokens until """Walk backward from the end of messages, accumulating tokens until
the budget is reached. Returns the index where the tail starts. the budget is reached. Returns the index where the tail starts.
``token_budget`` defaults to ``self.tail_token_budget`` which is
derived from ``summary_target_ratio * context_length``, so it
scales automatically with the model's context window.
Never cuts inside a tool_call/result group. Falls back to the old Never cuts inside a tool_call/result group. Falls back to the old
``protect_last_n`` if the budget would protect fewer messages. ``protect_last_n`` if the budget would protect fewer messages.
""" """
if token_budget is None:
token_budget = self.tail_token_budget
n = len(messages) n = len(messages)
min_tail = self.protect_last_n min_tail = self.protect_last_n
accumulated = 0 accumulated = 0

View file

@ -232,19 +232,33 @@ browser:
# 1. Tracks actual token usage from API responses (not estimates) # 1. Tracks actual token usage from API responses (not estimates)
# 2. When prompt_tokens >= threshold% of model's context_length, triggers compression # 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
# 3. Protects first 3 turns (system prompt, initial request, first response) # 3. Protects first 3 turns (system prompt, initial request, first response)
# 4. Protects last 4 turns (recent context is most relevant) # 4. Protects last N turns (default 20 messages = ~10 full turns of recent context)
# 5. Summarizes middle turns using a fast/cheap model # 5. Summarizes middle turns using a fast/cheap model
# 6. Inserts summary as a user message, continues conversation seamlessly # 6. Inserts summary as a user message, continues conversation seamlessly
# #
# Post-compression size scales with the model's context window via target_ratio:
# MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
# GPT-5 1M context → ~400K post-compression (at 0.40 ratio)
#
compression: compression:
# Enable automatic context compression (default: true) # Enable automatic context compression (default: true)
# Set to false if you prefer to manage context manually or want errors on overflow # Set to false if you prefer to manage context manually or want errors on overflow
enabled: true enabled: true
# Trigger compression at this % of model's context limit (default: 0.85 = 85%) # Trigger compression at this % of model's context limit (default: 0.80 = 80%)
# Lower values = more aggressive compression, higher values = compress later # Lower values = more aggressive compression, higher values = compress later
threshold: 0.85 threshold: 0.80
# Target post-compression size as a fraction of context window (default: 0.40 = 40%)
# Controls how much context survives compression. Tail token budget and summary
# cap scale with this value. Range: 0.10 - 0.80
target_ratio: 0.40
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
# Higher values keep more recent conversation intact at the cost of more aggressive
# compression of older turns.
protect_last_n: 20
# Model to use for generating summaries (fast/cheap recommended) # Model to use for generating summaries (fast/cheap recommended)
# This model compresses the middle turns into a concise summary. # This model compresses the middle turns into a concise summary.
# IMPORTANT: it receives the full middle section of the conversation, so it # IMPORTANT: it receives the full middle section of the conversation, so it

View file

@ -1009,9 +1009,11 @@ class AIAgent:
_compression_cfg = _agent_cfg.get("compression", {}) _compression_cfg = _agent_cfg.get("compression", {})
if not isinstance(_compression_cfg, dict): if not isinstance(_compression_cfg, dict):
_compression_cfg = {} _compression_cfg = {}
compression_threshold = float(_compression_cfg.get("threshold", 0.50)) compression_threshold = float(_compression_cfg.get("threshold", 0.80))
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes") compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
compression_summary_model = _compression_cfg.get("summary_model") or None compression_summary_model = _compression_cfg.get("summary_model") or None
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
# Read explicit context_length override from model config # Read explicit context_length override from model config
_model_cfg = _agent_cfg.get("model", {}) _model_cfg = _agent_cfg.get("model", {})
@ -1050,8 +1052,8 @@ class AIAgent:
model=self.model, model=self.model,
threshold_percent=compression_threshold, threshold_percent=compression_threshold,
protect_first_n=3, protect_first_n=3,
protect_last_n=4, protect_last_n=compression_protect_last,
summary_target_tokens=500, summary_target_ratio=compression_target_ratio,
summary_model_override=compression_summary_model, summary_model_override=compression_summary_model,
quiet_mode=self.quiet_mode, quiet_mode=self.quiet_mode,
base_url=self.base_url, base_url=self.base_url,

View file

@ -217,7 +217,7 @@ class TestCompressWithClient:
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000): with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
with patch("agent.context_compressor.call_llm", return_value=mock_response): with patch("agent.context_compressor.call_llm", return_value=mock_response):
@ -513,3 +513,50 @@ class TestCompressWithClient:
for msg in result: for msg in result:
if msg.get("role") == "tool" and msg.get("tool_call_id"): if msg.get("role") == "tool" and msg.get("tool_call_id"):
assert msg["tool_call_id"] in called_ids assert msg["tool_call_id"] in called_ids
class TestSummaryTargetRatio:
    """Budget derivation from ``summary_target_ratio`` and the context window.

    Each test patches ``get_model_context_length`` so the compressor sees a
    fixed context size, then checks the derived budgets and new defaults.
    """

    @staticmethod
    def _build(context_length, **kwargs):
        # Construct a quiet compressor against a fake context window size.
        with patch(
            "agent.context_compressor.get_model_context_length",
            return_value=context_length,
        ):
            return ContextCompressor(model="test", quiet_mode=True, **kwargs)

    def test_tail_budget_scales_with_context(self):
        """tail_token_budget == context_length * summary_target_ratio."""
        small = self._build(200_000, summary_target_ratio=0.40)
        assert small.tail_token_budget == 80_000
        large = self._build(1_000_000, summary_target_ratio=0.40)
        assert large.tail_token_budget == 400_000

    def test_summary_cap_scales_with_context(self):
        """max_summary_tokens is 5% of context, capped at the 32K ceiling."""
        # 200K * 0.05 = 10K — below the ceiling, so the ratio applies.
        assert self._build(200_000).max_summary_tokens == 10_000
        # 1M * 0.05 = 50K — above the ceiling, so the 32K cap applies.
        assert self._build(1_000_000).max_summary_tokens == 32_000

    def test_ratio_clamped(self):
        """summary_target_ratio is clamped into the [0.10, 0.80] range."""
        too_low = self._build(100_000, summary_target_ratio=0.05)
        assert too_low.summary_target_ratio == 0.10
        too_high = self._build(100_000, summary_target_ratio=0.95)
        assert too_high.summary_target_ratio == 0.80

    def test_default_threshold_is_80_percent(self):
        """Default compression threshold fires at 80% of the context window."""
        compressor = self._build(100_000)
        assert compressor.threshold_percent == 0.80
        assert compressor.threshold_tokens == 80_000

    def test_default_protect_last_n_is_20(self):
        """protect_last_n defaults to 20 most-recent messages."""
        assert self._build(100_000).protect_last_n == 20