mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(compression): replace dead summary_target_tokens with ratio-based scaling (#2554)
The summary_target_tokens parameter was accepted in the constructor,
stored on the instance, and never used — the summary budget was always
computed from hardcoded module constants (_SUMMARY_RATIO=0.20,
_MAX_SUMMARY_TOKENS=8000). This caused two compounding problems:
1. The config value was silently ignored, giving users no control
over post-compression size.
2. Fixed budgets (20K tail, 8K summary cap) didn't scale with
context window size. Switching from a 1M-context model to a
200K model would trigger compression that nuked 350K tokens
of conversation history down to ~30K.
Changes:
- Replace summary_target_tokens with summary_target_ratio (default 0.40)
which sets the post-compression target as a fraction of context_length.
Tail token budget and summary cap now scale proportionally:
MiniMax 200K → ~80K post-compression
GPT-5 1M → ~400K post-compression
- Change threshold_percent default: 0.50 → 0.80 (don't fire until
80% of context is consumed)
- Change protect_last_n default: 4 → 20 (preserve ~10 full turns)
- Summary token cap scales to 5% of context (was fixed 8K), capped
at 32K ceiling
- Read target_ratio and protect_last_n from config.yaml compression
section (both are now configurable)
- Remove hardcoded summary_target_tokens=500 from run_agent.py
- Add 5 new tests for ratio scaling, clamping, and new defaults
This commit is contained in:
parent
7efaa5968d
commit
9231a335d4
4 changed files with 103 additions and 21 deletions
|
|
@ -35,14 +35,12 @@ SUMMARY_PREFIX = (
|
||||||
)
|
)
|
||||||
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
|
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
|
||||||
|
|
||||||
# Minimum / maximum tokens for the summary output
|
# Minimum tokens for the summary output
|
||||||
_MIN_SUMMARY_TOKENS = 2000
|
_MIN_SUMMARY_TOKENS = 2000
|
||||||
_MAX_SUMMARY_TOKENS = 8000
|
|
||||||
# Proportion of compressed content to allocate for summary
|
# Proportion of compressed content to allocate for summary
|
||||||
_SUMMARY_RATIO = 0.20
|
_SUMMARY_RATIO = 0.20
|
||||||
|
# Absolute ceiling for summary tokens (even on very large context windows)
|
||||||
# Token budget for tail protection (keep most-recent context)
|
_SUMMARY_TOKENS_CEILING = 32_000
|
||||||
_DEFAULT_TAIL_TOKEN_BUDGET = 20_000
|
|
||||||
|
|
||||||
# Placeholder used when pruning old tool results
|
# Placeholder used when pruning old tool results
|
||||||
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
|
||||||
|
|
@ -65,10 +63,10 @@ class ContextCompressor:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
model: str,
|
model: str,
|
||||||
threshold_percent: float = 0.50,
|
threshold_percent: float = 0.80,
|
||||||
protect_first_n: int = 3,
|
protect_first_n: int = 3,
|
||||||
protect_last_n: int = 4,
|
protect_last_n: int = 20,
|
||||||
summary_target_tokens: int = 2500,
|
summary_target_ratio: float = 0.40,
|
||||||
quiet_mode: bool = False,
|
quiet_mode: bool = False,
|
||||||
summary_model_override: str = None,
|
summary_model_override: str = None,
|
||||||
base_url: str = "",
|
base_url: str = "",
|
||||||
|
|
@ -83,7 +81,7 @@ class ContextCompressor:
|
||||||
self.threshold_percent = threshold_percent
|
self.threshold_percent = threshold_percent
|
||||||
self.protect_first_n = protect_first_n
|
self.protect_first_n = protect_first_n
|
||||||
self.protect_last_n = protect_last_n
|
self.protect_last_n = protect_last_n
|
||||||
self.summary_target_tokens = summary_target_tokens
|
self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
|
||||||
self.quiet_mode = quiet_mode
|
self.quiet_mode = quiet_mode
|
||||||
|
|
||||||
self.context_length = get_model_context_length(
|
self.context_length = get_model_context_length(
|
||||||
|
|
@ -94,12 +92,22 @@ class ContextCompressor:
|
||||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||||
self.compression_count = 0
|
self.compression_count = 0
|
||||||
|
|
||||||
|
# Derive token budgets from the target ratio and context length
|
||||||
|
target_tokens = int(self.context_length * self.summary_target_ratio)
|
||||||
|
self.tail_token_budget = target_tokens
|
||||||
|
self.max_summary_tokens = min(
|
||||||
|
int(self.context_length * 0.05), _SUMMARY_TOKENS_CEILING,
|
||||||
|
)
|
||||||
|
|
||||||
if not quiet_mode:
|
if not quiet_mode:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Context compressor initialized: model=%s context_length=%d "
|
"Context compressor initialized: model=%s context_length=%d "
|
||||||
"threshold=%d (%.0f%%) provider=%s base_url=%s",
|
"threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d "
|
||||||
|
"provider=%s base_url=%s",
|
||||||
model, self.context_length, self.threshold_tokens,
|
model, self.context_length, self.threshold_tokens,
|
||||||
threshold_percent * 100, provider or "none", base_url or "none",
|
threshold_percent * 100, self.summary_target_ratio * 100,
|
||||||
|
self.tail_token_budget,
|
||||||
|
provider or "none", base_url or "none",
|
||||||
)
|
)
|
||||||
self._context_probed = False # True after a step-down from context error
|
self._context_probed = False # True after a step-down from context error
|
||||||
|
|
||||||
|
|
@ -179,10 +187,15 @@ class ContextCompressor:
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int:
|
def _compute_summary_budget(self, turns_to_summarize: List[Dict[str, Any]]) -> int:
|
||||||
"""Scale summary token budget with the amount of content being compressed."""
|
"""Scale summary token budget with the amount of content being compressed.
|
||||||
|
|
||||||
|
The maximum scales with the model's context window (5% of context,
|
||||||
|
capped at ``_SUMMARY_TOKENS_CEILING``) so large-context models get
|
||||||
|
richer summaries instead of being hard-capped at 8K tokens.
|
||||||
|
"""
|
||||||
content_tokens = estimate_messages_tokens_rough(turns_to_summarize)
|
content_tokens = estimate_messages_tokens_rough(turns_to_summarize)
|
||||||
budget = int(content_tokens * _SUMMARY_RATIO)
|
budget = int(content_tokens * _SUMMARY_RATIO)
|
||||||
return max(_MIN_SUMMARY_TOKENS, min(budget, _MAX_SUMMARY_TOKENS))
|
return max(_MIN_SUMMARY_TOKENS, min(budget, self.max_summary_tokens))
|
||||||
|
|
||||||
def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str:
|
def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str:
|
||||||
"""Serialize conversation turns into labeled text for the summarizer.
|
"""Serialize conversation turns into labeled text for the summarizer.
|
||||||
|
|
@ -477,14 +490,20 @@ Write only the summary body. Do not include any preamble or prefix."""
|
||||||
|
|
||||||
def _find_tail_cut_by_tokens(
|
def _find_tail_cut_by_tokens(
|
||||||
self, messages: List[Dict[str, Any]], head_end: int,
|
self, messages: List[Dict[str, Any]], head_end: int,
|
||||||
token_budget: int = _DEFAULT_TAIL_TOKEN_BUDGET,
|
token_budget: int | None = None,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Walk backward from the end of messages, accumulating tokens until
|
"""Walk backward from the end of messages, accumulating tokens until
|
||||||
the budget is reached. Returns the index where the tail starts.
|
the budget is reached. Returns the index where the tail starts.
|
||||||
|
|
||||||
|
``token_budget`` defaults to ``self.tail_token_budget`` which is
|
||||||
|
derived from ``summary_target_ratio * context_length``, so it
|
||||||
|
scales automatically with the model's context window.
|
||||||
|
|
||||||
Never cuts inside a tool_call/result group. Falls back to the old
|
Never cuts inside a tool_call/result group. Falls back to the old
|
||||||
``protect_last_n`` if the budget would protect fewer messages.
|
``protect_last_n`` if the budget would protect fewer messages.
|
||||||
"""
|
"""
|
||||||
|
if token_budget is None:
|
||||||
|
token_budget = self.tail_token_budget
|
||||||
n = len(messages)
|
n = len(messages)
|
||||||
min_tail = self.protect_last_n
|
min_tail = self.protect_last_n
|
||||||
accumulated = 0
|
accumulated = 0
|
||||||
|
|
|
||||||
|
|
@ -232,19 +232,33 @@ browser:
|
||||||
# 1. Tracks actual token usage from API responses (not estimates)
|
# 1. Tracks actual token usage from API responses (not estimates)
|
||||||
# 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
|
# 2. When prompt_tokens >= threshold% of model's context_length, triggers compression
|
||||||
# 3. Protects first 3 turns (system prompt, initial request, first response)
|
# 3. Protects first 3 turns (system prompt, initial request, first response)
|
||||||
# 4. Protects last 4 turns (recent context is most relevant)
|
# 4. Protects last N messages (default 20 ≈ 10 full turns of recent context)
|
||||||
# 5. Summarizes middle turns using a fast/cheap model
|
# 5. Summarizes middle turns using a fast/cheap model
|
||||||
# 6. Inserts summary as a user message, continues conversation seamlessly
|
# 6. Inserts summary as a user message, continues conversation seamlessly
|
||||||
#
|
#
|
||||||
|
# Post-compression size scales with the model's context window via target_ratio:
|
||||||
|
# MiniMax 200K context → ~80K post-compression (at 0.40 ratio)
|
||||||
|
# GPT-5 1M context → ~400K post-compression (at 0.40 ratio)
|
||||||
|
#
|
||||||
compression:
|
compression:
|
||||||
# Enable automatic context compression (default: true)
|
# Enable automatic context compression (default: true)
|
||||||
# Set to false if you prefer to manage context manually or want errors on overflow
|
# Set to false if you prefer to manage context manually or want errors on overflow
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
||||||
# Trigger compression at this % of model's context limit (default: 0.85 = 85%)
|
# Trigger compression at this % of model's context limit (default: 0.80 = 80%)
|
||||||
# Lower values = more aggressive compression, higher values = compress later
|
# Lower values = more aggressive compression, higher values = compress later
|
||||||
threshold: 0.85
|
threshold: 0.80
|
||||||
|
|
||||||
|
# Target post-compression size as a fraction of context window (default: 0.40 = 40%)
|
||||||
|
# Controls how much context survives compression: the tail token budget is
|
||||||
|
# target_ratio * context_length (the summary cap is 5% of context, max 32K). Range: 0.10 - 0.80
|
||||||
|
target_ratio: 0.40
|
||||||
|
|
||||||
|
# Number of most-recent messages to always preserve (default: 20 ≈ 10 full turns)
|
||||||
|
# Higher values keep more recent conversation intact at the cost of more aggressive
|
||||||
|
# compression of older turns.
|
||||||
|
protect_last_n: 20
|
||||||
|
|
||||||
# Model to use for generating summaries (fast/cheap recommended)
|
# Model to use for generating summaries (fast/cheap recommended)
|
||||||
# This model compresses the middle turns into a concise summary.
|
# This model compresses the middle turns into a concise summary.
|
||||||
# IMPORTANT: it receives the full middle section of the conversation, so it
|
# IMPORTANT: it receives the full middle section of the conversation, so it
|
||||||
|
|
|
||||||
|
|
@ -1009,9 +1009,11 @@ class AIAgent:
|
||||||
_compression_cfg = _agent_cfg.get("compression", {})
|
_compression_cfg = _agent_cfg.get("compression", {})
|
||||||
if not isinstance(_compression_cfg, dict):
|
if not isinstance(_compression_cfg, dict):
|
||||||
_compression_cfg = {}
|
_compression_cfg = {}
|
||||||
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
|
compression_threshold = float(_compression_cfg.get("threshold", 0.80))
|
||||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
||||||
compression_summary_model = _compression_cfg.get("summary_model") or None
|
compression_summary_model = _compression_cfg.get("summary_model") or None
|
||||||
|
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.40))
|
||||||
|
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
||||||
|
|
||||||
# Read explicit context_length override from model config
|
# Read explicit context_length override from model config
|
||||||
_model_cfg = _agent_cfg.get("model", {})
|
_model_cfg = _agent_cfg.get("model", {})
|
||||||
|
|
@ -1050,8 +1052,8 @@ class AIAgent:
|
||||||
model=self.model,
|
model=self.model,
|
||||||
threshold_percent=compression_threshold,
|
threshold_percent=compression_threshold,
|
||||||
protect_first_n=3,
|
protect_first_n=3,
|
||||||
protect_last_n=4,
|
protect_last_n=compression_protect_last,
|
||||||
summary_target_tokens=500,
|
summary_target_ratio=compression_target_ratio,
|
||||||
summary_model_override=compression_summary_model,
|
summary_model_override=compression_summary_model,
|
||||||
quiet_mode=self.quiet_mode,
|
quiet_mode=self.quiet_mode,
|
||||||
base_url=self.base_url,
|
base_url=self.base_url,
|
||||||
|
|
|
||||||
|
|
@ -217,7 +217,7 @@ class TestCompressWithClient:
|
||||||
mock_client.chat.completions.create.return_value = mock_response
|
mock_client.chat.completions.create.return_value = mock_response
|
||||||
|
|
||||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||||
c = ContextCompressor(model="test", quiet_mode=True)
|
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
|
||||||
|
|
||||||
msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
|
msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
|
||||||
with patch("agent.context_compressor.call_llm", return_value=mock_response):
|
with patch("agent.context_compressor.call_llm", return_value=mock_response):
|
||||||
|
|
@ -513,3 +513,50 @@ class TestCompressWithClient:
|
||||||
for msg in result:
|
for msg in result:
|
||||||
if msg.get("role") == "tool" and msg.get("tool_call_id"):
|
if msg.get("role") == "tool" and msg.get("tool_call_id"):
|
||||||
assert msg["tool_call_id"] in called_ids
|
assert msg["tool_call_id"] in called_ids
|
||||||
|
|
||||||
|
|
||||||
|
class TestSummaryTargetRatio:
|
||||||
|
"""Verify that summary_target_ratio properly scales budgets with context window."""
|
||||||
|
|
||||||
|
def test_tail_budget_scales_with_context(self):
|
||||||
|
"""Tail token budget should be context_length * summary_target_ratio."""
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||||
|
assert c.tail_token_budget == 80_000
|
||||||
|
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
|
||||||
|
assert c.tail_token_budget == 400_000
|
||||||
|
|
||||||
|
def test_summary_cap_scales_with_context(self):
|
||||||
|
"""Max summary tokens should be 5% of context, capped at 32K."""
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
|
assert c.max_summary_tokens == 10_000 # 200K * 0.05
|
||||||
|
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
|
assert c.max_summary_tokens == 32_000 # capped at ceiling
|
||||||
|
|
||||||
|
def test_ratio_clamped(self):
|
||||||
|
"""Ratio should be clamped to [0.10, 0.80]."""
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.05)
|
||||||
|
assert c.summary_target_ratio == 0.10
|
||||||
|
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.95)
|
||||||
|
assert c.summary_target_ratio == 0.80
|
||||||
|
|
||||||
|
def test_default_threshold_is_80_percent(self):
|
||||||
|
"""Default compression threshold should be 80%."""
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
|
assert c.threshold_percent == 0.80
|
||||||
|
assert c.threshold_tokens == 80_000
|
||||||
|
|
||||||
|
def test_default_protect_last_n_is_20(self):
|
||||||
|
"""Default protect_last_n should be 20."""
|
||||||
|
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||||
|
c = ContextCompressor(model="test", quiet_mode=True)
|
||||||
|
assert c.protect_last_n == 20
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue