fix(compression): replace dead summary_target_tokens with ratio-based scaling (#2554)

The summary_target_tokens parameter was accepted in the constructor,
stored on the instance, and never used — the summary budget was always
computed from hardcoded module constants (_SUMMARY_RATIO=0.20,
_MAX_SUMMARY_TOKENS=8000). This caused two compounding problems:

1. The config value was silently ignored, giving users no control
   over post-compression size.
2. Fixed budgets (20K tail, 8K summary cap) didn't scale with
   context window size. Switching from a 1M-context model to a
   200K model would trigger compression that nuked 350K tokens
   of conversation history down to ~30K.

Changes:
- Replace summary_target_tokens with summary_target_ratio (default 0.40)
  which sets the post-compression target as a fraction of context_length.
  Tail token budget and summary cap now scale proportionally:
    MiniMax 200K → ~80K post-compression
    GPT-5   1M  → ~400K post-compression
- Change threshold_percent default: 0.50 → 0.80 (don't fire until
  80% of context is consumed)
- Change protect_last_n default: 4 → 20 (preserve ~10 full turns)
- Summary token cap scales to 5% of context (was fixed 8K), capped
  at 32K ceiling
- Read target_ratio and protect_last_n from config.yaml compression
  section (both are now configurable)
- Remove hardcoded summary_target_tokens=500 from run_agent.py
- Add 5 new tests for ratio scaling, clamping, and new defaults
Commit metadata:
- Commit: 9231a335d4 (parent: 7efaa5968d)
- Authored by Teknium on 2026-03-24 17:45:49 -07:00; committed by GitHub
- Signature: no known key found in database (GPG key ID: B5690EEEBB952194)
- Diffstat: 4 changed files, 103 additions, 21 deletions

View file

@ -35,14 +35,12 @@ SUMMARY_PREFIX = (
) )
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:" LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
# Minimum / maximum tokens for the summary output # Minimum tokens for the summary output
_MIN_SUMMARY_TOKENS = 2000 _MIN_SUMMARY_TOKENS = 2000
_MAX_SUMMARY_TOKENS = 8000
# Proportion of compressed content to allocate for summary # Proportion of compressed content to allocate for summary
_SUMMARY_RATIO = 0.20 _SUMMARY_RATIO = 0.20
# Absolute ceiling for summary tokens (even on very large context windows)
# Token budget for tail protection (keep most-recent context) _SUMMARY_TOKENS_CEILING = 32_000
_DEFAULT_TAIL_TOKEN_BUDGET = 20_000
# Placeholder used when pruning old tool results # Placeholder used when pruning old tool results
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]" _PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
@ -65,10 +63,10 @@ class ContextCompressor:
def __init__( def __init__(
self, self,
model: str, model: str,
threshold_percent: float = 0.50, threshold_percent: float = 0.80,
protect_first_n: int = 3, protect_first_n: int = 3,
protect_last_n: int = 4, protect_last_n: int = 20,
summary_target_tokens: int = 2500, summary_target_ratio: float = 0.40,
quiet_mode: bool = False, quiet_mode: bool = False,
summary_model_override: str = None, summary_model_override: str = None,
base_url: str = "", base_url: str = "",
@ -83,7 +81,7 @@ class ContextCompressor:
self.threshold_percent = threshold_percent self.threshold_percent = threshold_percent
self.protect_first_n = protect_first_n self.protect_first_n = protect_first_n
self.protect_last_n = protect_last_n self.protect_last_n = protect_last_n
self.summary_target_tokens = summary_target_tokens self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
self.quiet_mode = quiet_mode self.quiet_mode = quiet_mode
self.context_length = get_model_context_length( self.context_length = get_model_context_length(
@ -94,12 +92,22 @@ class ContextCompressor:
self.threshold_tokens = int(self.context_length * threshold_percent) self.threshold_tokens = int(self.context_length * threshold_percent)
self.compression_count = 0 self.compression_count = 0
# Derive token budgets from the target ratio and context length
target_tokens = int(self.context_length * self.summary_target_ratio)
self.tail_token_budget = target_tokens
self.max_summary_tokens = min(
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
)
if not quiet_mode: if not quiet_mode:
logger.info( logger.info(
"Context compressor initialized: model=%s context_length=%d " "Context compressor initialized: model=%s context_length=%d "
"threshold=%d (%.0f%%) provider=%s base_url=%s", "threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d "
"provider=%s base_url=%s",
model, self.context_length, self.threshold_tokens, model, self.context_length, self.threshold_tokens,
threshold_percent * 100, provider or "none", base_url or "none", threshold_percent * 100, self.summary_target_ratio * 100,
self.tail_token_budget,
provider or "none", base_url or "none",
) )
self._context_probed = False # True after a step-down from context error self._context_probed = False # True after a step-down from context error
@ -179,10 +187,15 @@ class ContextCompressor:
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int: def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int:
"""Scale summary token budget with the amount of content being compressed.""" """Scale summary token budget with the amount of content being compressed.
The maximum scales with the model's context window (5% of context,
capped at ``_SUMMARY_TOKENS_CEILING``) so large-context models get
richer summaries instead of being hard-capped at 8K tokens.
"""
content_tokens = estimate_messages_tokens_rough(turns_to_summarize) content_tokens = estimate_messages_tokens_rough(turns_to_summarize)
budget = int(content_tokens * _SUMMARY_RATIO) budget = int(content_tokens * _SUMMARY_RATIO)
return max(_MIN_SUMMARY_TOKENS, min(budget, _MAX_SUMMARY_TOKENS)) return max(_MIN_SUMMARY_TOKENS, min(budget, self.max_summary_tokens))
def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str: def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str:
"""Serialize conversation turns into labeled text for the summarizer. """Serialize conversation turns into labeled text for the summarizer.
@ -477,14 +490,20 @@ Write only the summary body. Do not include any preamble or prefix."""
def _find_tail_cut_by_tokens( def _find_tail_cut_by_tokens(
self, messages: List[Dict[str, Any]], head_end: int, self, messages: List[Dict[str, Any]], head_end: int,
token_budget: int = _DEFAULT_TAIL_TOKEN_BUDGET, token_budget: int | None = None,
) -> int: ) -> int:
"""Walk backward from the end of messages, accumulating tokens until """Walk backward from the end of messages, accumulating tokens until
the budget is reached. Returns the index where the tail starts. the budget is reached. Returns the index where the tail starts.
``token_budget`` defaults to ``self.tail_token_budget`` which is
derived from ``summary_target_ratio * context_length``, so it
scales automatically with the model's context window.
Never cuts inside a tool_call/result group. Falls back to the old Never cuts inside a tool_call/result group. Falls back to the old
``protect_last_n`` if the budget would protect fewer messages. ``protect_last_n`` if the budget would protect fewer messages.
""" """
if token_budget is None:
token_budget = self.tail_token_budget
n = len(messages) n = len(messages)
min_tail = self.protect_last_n min_tail = self.protect_last_n
accumulated = 0 accumulated = 0

View file

@ -232,19 +232,33 @@ browser:
# 1. Tracks actual token usage from API responses (not estimates) # 1. Tracks actual token usage from API responses (not estimates)
# 2. When prompt_tokens >= threshold% of model's context_length, triggers compression # 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
# 3. Protects first 3 turns (system prompt, initial request, first response) # 3. Protects first 3 turns (system prompt, initial request, first response)
# 4. Protects last 4 turns (recent context is most relevant) # 4. Protects last N turns (default 20 messages = ~10 full turns of recent context)
# 5. Summarizes middle turns using a fast/cheap model # 5. Summarizes middle turns using a fast/cheap model
# 6. Inserts summary as a user message, continues conversation seamlessly # 6. Inserts summary as a user message, continues conversation seamlessly
# #
# Post-compression size scales with the model's context window via target_ratio:
# MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
# GPT-5 1M context → ~400K post-compression (at 0.40 ratio)
#
compression: compression:
# Enable automatic context compression (default: true) # Enable automatic context compression (default: true)
# Set to false if you prefer to manage context manually or want errors on overflow # Set to false if you prefer to manage context manually or want errors on overflow
enabled: true enabled: true
# Trigger compression at this % of model's context limit (default: 0.85 = 85%) # Trigger compression at this % of model's context limit (default: 0.80 = 80%)
# Lower values = more aggressive compression, higher values = compress later # Lower values = more aggressive compression, higher values = compress later
threshold: 0.85 threshold: 0.80
# Target post-compression size as a fraction of context window (default: 0.40 = 40%)
# Controls how much context survives compression. Tail token budget and summary
# cap scale with this value. Range: 0.10 - 0.80
target_ratio: 0.40
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
# Higher values keep more recent conversation intact at the cost of more aggressive
# compression of older turns.
protect_last_n: 20
# Model to use for generating summaries (fast/cheap recommended) # Model to use for generating summaries (fast/cheap recommended)
# This model compresses the middle turns into a concise summary. # This model compresses the middle turns into a concise summary.
# IMPORTANT: it receives the full middle section of the conversation, so it # IMPORTANT: it receives the full middle section of the conversation, so it

View file

@ -1009,9 +1009,11 @@ class AIAgent:
_compression_cfg = _agent_cfg.get("compression", {}) _compression_cfg = _agent_cfg.get("compression", {})
if not isinstance(_compression_cfg, dict): if not isinstance(_compression_cfg, dict):
_compression_cfg = {} _compression_cfg = {}
compression_threshold = float(_compression_cfg.get("threshold", 0.50)) compression_threshold = float(_compression_cfg.get("threshold", 0.80))
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes") compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
compression_summary_model = _compression_cfg.get("summary_model") or None compression_summary_model = _compression_cfg.get("summary_model") or None
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
# Read explicit context_length override from model config # Read explicit context_length override from model config
_model_cfg = _agent_cfg.get("model", {}) _model_cfg = _agent_cfg.get("model", {})
@ -1050,8 +1052,8 @@ class AIAgent:
model=self.model, model=self.model,
threshold_percent=compression_threshold, threshold_percent=compression_threshold,
protect_first_n=3, protect_first_n=3,
protect_last_n=4, protect_last_n=compression_protect_last,
summary_target_tokens=500, summary_target_ratio=compression_target_ratio,
summary_model_override=compression_summary_model, summary_model_override=compression_summary_model,
quiet_mode=self.quiet_mode, quiet_mode=self.quiet_mode,
base_url=self.base_url, base_url=self.base_url,

View file

@ -217,7 +217,7 @@ class TestCompressWithClient:
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000): with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
with patch("agent.context_compressor.call_llm", return_value=mock_response): with patch("agent.context_compressor.call_llm", return_value=mock_response):
@ -513,3 +513,50 @@ class TestCompressWithClient:
for msg in result: for msg in result:
if msg.get("role") == "tool" and msg.get("tool_call_id"): if msg.get("role") == "tool" and msg.get("tool_call_id"):
assert msg["tool_call_id"] in called_ids assert msg["tool_call_id"] in called_ids
class TestSummaryTargetRatio:
    """Budget derivation from ``summary_target_ratio`` and the context window.

    Each test patches ``get_model_context_length`` so the compressor sees a
    fixed context size, then checks the derived budgets and new defaults.
    """

    @staticmethod
    def _build(context_length, **kwargs):
        # Construct a quiet compressor against a fake context window size.
        with patch(
            "agent.context_compressor.get_model_context_length",
            return_value=context_length,
        ):
            return ContextCompressor(model="test", quiet_mode=True, **kwargs)

    def test_tail_budget_scales_with_context(self):
        """tail_token_budget == context_length * summary_target_ratio."""
        small = self._build(200_000, summary_target_ratio=0.40)
        assert small.tail_token_budget == 80_000
        large = self._build(1_000_000, summary_target_ratio=0.40)
        assert large.tail_token_budget == 400_000

    def test_summary_cap_scales_with_context(self):
        """max_summary_tokens is 5% of context, capped at the 32K ceiling."""
        # 200K * 0.05 = 10K — below the ceiling, so the ratio applies.
        assert self._build(200_000).max_summary_tokens == 10_000
        # 1M * 0.05 = 50K — above the ceiling, so the 32K cap applies.
        assert self._build(1_000_000).max_summary_tokens == 32_000

    def test_ratio_clamped(self):
        """summary_target_ratio is clamped into the [0.10, 0.80] range."""
        too_low = self._build(100_000, summary_target_ratio=0.05)
        assert too_low.summary_target_ratio == 0.10
        too_high = self._build(100_000, summary_target_ratio=0.95)
        assert too_high.summary_target_ratio == 0.80

    def test_default_threshold_is_80_percent(self):
        """Default compression threshold fires at 80% of the context window."""
        compressor = self._build(100_000)
        assert compressor.threshold_percent == 0.80
        assert compressor.threshold_tokens == 80_000

    def test_default_protect_last_n_is_20(self):
        """protect_last_n defaults to 20 most-recent messages."""
        assert self._build(100_000).protect_last_n == 20