mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
feat(compression): make protect_first_n configurable
The number of head messages preserved verbatim across context compactions was previously hardcoded to 3 in AIAgent.__init__. Expose it as `compression.protect_first_n` in config, matching the existing `protect_last_n` pattern. Motivation: users who rely on rolling compaction for long-running sessions had the opening user/assistant exchange pinned as head forever, which doesn't always match how they want the session framed after many compactions. Lowering to 1 preserves the system prompt + first non-system message; lowering to 0 preserves only the system prompt and lets the entire first exchange age out naturally through the summary. Semantics: `protect_first_n` counts non-system head messages protected **in addition to** the system prompt, which is always implicitly protected when present. Same meaning across both code paths: protect_first_n=0 → system prompt only (or nothing if no system message) protect_first_n=2 → system prompt + first 2 non-system messages (default) This unifies the CLI path (which reads messages with the system prompt at position 0) and the gateway path (where the gateway /compress handler strips the system prompt before calling compress() — see gateway/run.py L9150-9154 on the parent fork). Previously these two paths disagreed: CLI path: protect_first_n=1 → protect system prompt only Gateway path: protect_first_n=1 → protect first USER turn forever In practice on long-running gateway sessions the old semantics pinned whatever stale aside happened to be the first user message, reinserting it into every compaction summary indefinitely. Default chosen as 2 (not 3) so that the effective protected head count remains 3 messages in the common case — assuming a system prompt is present, default protection becomes system + 2 non-system = 3 total, matching the pre-feature behaviour where `protect_first_n` was hardcoded to protect 3 messages total. 
Sessions without a system prompt will see a small behaviour change (2 protected head messages instead of 3), but this is the rare path and the new semantics make the system-prompt-present case the well-defined one. Changes: - agent/context_compressor.py: redefine protect_first_n as the count of non-system head messages protected beyond the implicit system-prompt guarantee; both paths converge. Constructor default updated to 2. - hermes_cli/config.py: add `compression.protect_first_n` default (2), matching the new semantics. `show_config` label tweaked to 'Protect first: N non-system head messages' for clarity. - run_agent.py: read protect_first_n from config; 0 is now valid (system prompt is always implicitly protected). - cli-config.yaml.example: document the new key and rationale. - tests/agent/test_context_compressor.py: cover default, override, the end-to-end `protect_first_n=0` and `protect_first_n=1` behaviour, the no-system-prompt (gateway) path, and the new shared-semantics regression test. Fixes #13751 Tested on Ubuntu 24.04.
This commit is contained in:
parent
ffbc21100d
commit
dee71a31e5
5 changed files with 149 additions and 11 deletions
|
|
@ -405,7 +405,7 @@ class ContextCompressor(ContextEngine):
|
|||
self,
|
||||
model: str,
|
||||
threshold_percent: float = 0.50,
|
||||
protect_first_n: int = 3,
|
||||
protect_first_n: int = 2,
|
||||
protect_last_n: int = 20,
|
||||
summary_target_ratio: float = 0.20,
|
||||
quiet_mode: bool = False,
|
||||
|
|
@ -1185,6 +1185,26 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||
idx += 1
|
||||
return idx
|
||||
|
||||
def _protect_head_size(self, messages: List[Dict[str, Any]]) -> int:
|
||||
"""Total count of head messages to protect.
|
||||
|
||||
``protect_first_n`` is defined as *additional* messages protected
|
||||
beyond the system prompt. The system prompt (if present at index 0)
|
||||
is always implicitly protected — it's load-bearing context that
|
||||
must never be summarised away. This keeps semantics stable across
|
||||
call paths where the system prompt may or may not be included in
|
||||
the ``messages`` list (e.g. the gateway ``/compress`` handler
|
||||
strips it before calling compress()).
|
||||
|
||||
Examples:
|
||||
protect_first_n=0 → system prompt only (or nothing if no system msg)
|
||||
protect_first_n=3 → system + first 3 non-system messages
|
||||
"""
|
||||
head = 0
|
||||
if messages and messages[0].get("role") == "system":
|
||||
head = 1
|
||||
return head + self.protect_first_n
|
||||
|
||||
def _align_boundary_backward(self, messages: List[Dict[str, Any]], idx: int) -> int:
|
||||
"""Pull a compress-end boundary backward to avoid splitting a
|
||||
tool_call / result group.
|
||||
|
|
@ -1343,7 +1363,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||
skip the LLM call when the transcript is still entirely inside
|
||||
the protected head/tail.
|
||||
"""
|
||||
compress_start = self._align_boundary_forward(messages, self.protect_first_n)
|
||||
compress_start = self._align_boundary_forward(messages, self._protect_head_size(messages))
|
||||
compress_end = self._find_tail_cut_by_tokens(messages, compress_start)
|
||||
return compress_start < compress_end
|
||||
|
||||
|
|
@ -1379,7 +1399,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||
self._last_aux_model_failure_model = None
|
||||
n_messages = len(messages)
|
||||
# Only need head + 3 tail messages minimum (token budget decides the real tail size)
|
||||
_min_for_compress = self.protect_first_n + 3 + 1
|
||||
_min_for_compress = self._protect_head_size(messages) + 3 + 1
|
||||
if n_messages <= _min_for_compress:
|
||||
if not self.quiet_mode:
|
||||
logger.warning(
|
||||
|
|
@ -1399,7 +1419,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||
logger.info("Pre-compression: pruned %d old tool result(s)", pruned_count)
|
||||
|
||||
# Phase 2: Determine boundaries
|
||||
compress_start = self.protect_first_n
|
||||
compress_start = self._protect_head_size(messages)
|
||||
compress_start = self._align_boundary_forward(messages, compress_start)
|
||||
|
||||
# Use token-budget tail protection instead of fixed message count
|
||||
|
|
|
|||
|
|
@ -364,6 +364,18 @@ compression:
|
|||
# compression of older turns.
|
||||
protect_last_n: 20
|
||||
|
||||
# Number of non-system messages to protect at the head of the transcript, in
|
||||
# ADDITION to the system prompt (which is always implicitly protected).
|
||||
# Head messages are NEVER summarized — they survive every compression
|
||||
# indefinitely. This gives stable early context for short/medium sessions,
|
||||
# but in long-running sessions that rely on rolling compaction the pinned
|
||||
# opening turns may not match how you want the session framed over time.
|
||||
# Set to 0 to preserve ONLY the system prompt (plus the rolling summary
|
||||
# and recent tail) — the cleanest configuration for long-running sessions.
|
||||
# Default 2 preserves the system prompt plus the first user/assistant
|
||||
# exchange (≈ 3 messages total when a system prompt is present).
|
||||
protect_first_n: 2
|
||||
|
||||
# To pin a specific model/provider for compression summaries, use the
|
||||
# auxiliary section below (auxiliary.compression.provider / model).
|
||||
|
||||
|
|
|
|||
|
|
@ -731,8 +731,13 @@ DEFAULT_CONFIG = {
|
|||
"target_ratio": 0.20, # fraction of threshold to preserve as recent tail
|
||||
"protect_last_n": 20, # minimum recent messages to keep uncompressed
|
||||
"hygiene_hard_message_limit": 400, # gateway session-hygiene force-compress threshold by message count
|
||||
"protect_first_n": 2, # non-system head messages always preserved beyond the system prompt
|
||||
# verbatim, in ADDITION to the system prompt
|
||||
# (which is always implicitly protected). Set to
|
||||
# 0 for long-running rolling-compaction sessions
|
||||
# where you want nothing pinned except the
|
||||
# system prompt + rolling summary + recent tail.
|
||||
},
|
||||
|
||||
# Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
|
||||
# cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored.
|
||||
"prompt_caching": {
|
||||
|
|
@ -4862,6 +4867,7 @@ def show_config():
|
|||
print(f" Threshold: {compression.get('threshold', 0.50) * 100:.0f}%")
|
||||
print(f" Target ratio: {compression.get('target_ratio', 0.20) * 100:.0f}% of threshold preserved")
|
||||
print(f" Protect last: {compression.get('protect_last_n', 20)} messages")
|
||||
print(f" Protect first: {compression.get('protect_first_n', 2)} non-system head messages")
|
||||
_aux_comp = config.get('auxiliary', {}).get('compression', {})
|
||||
_sm = _aux_comp.get('model', '') or '(auto)'
|
||||
print(f" Model: {_sm}")
|
||||
|
|
|
|||
11
run_agent.py
11
run_agent.py
|
|
@ -2115,6 +2115,15 @@ class AIAgent:
|
|||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"}
|
||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20))
|
||||
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
||||
# protect_first_n is the number of non-system messages to protect at
|
||||
# the head, in addition to the system prompt (which is always
|
||||
# implicitly protected by the compressor). Floor at 0 — a value of
|
||||
# 0 means "preserve only the system prompt + summary + tail", which
|
||||
# is a legitimate (and common) configuration for long-running
|
||||
# rolling-compaction sessions.
|
||||
compression_protect_first = max(
|
||||
0, int(_compression_cfg.get("protect_first_n", 2))
|
||||
)
|
||||
|
||||
# Read optional explicit context_length override for the auxiliary
|
||||
# compression model. Custom endpoints often cannot report this via
|
||||
|
|
@ -2315,7 +2324,7 @@ class AIAgent:
|
|||
self.context_compressor = ContextCompressor(
|
||||
model=self.model,
|
||||
threshold_percent=compression_threshold,
|
||||
protect_first_n=3,
|
||||
protect_first_n=compression_protect_first,
|
||||
protect_last_n=compression_protect_last,
|
||||
summary_target_ratio=compression_target_ratio,
|
||||
summary_model_override=None,
|
||||
|
|
|
|||
|
|
@ -991,9 +991,12 @@ class TestCompressWithClient:
|
|||
mock_client.chat.completions.create.return_value = mock_response
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2)
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
|
||||
|
||||
# Last head message (index 2) is "user" → summary should be "assistant"
|
||||
# NOTE: protect_first_n=2 preserves 2 non-system messages in addition to
|
||||
# the system prompt (always implicitly protected), yielding head [system,
|
||||
# user, user] with last head = user.
|
||||
msgs = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": "msg 1"},
|
||||
|
|
@ -1059,11 +1062,13 @@ class TestCompressWithClient:
|
|||
mock_response.choices[0].message.content = "summary text"
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=3)
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=3)
|
||||
|
||||
# Head: [system, user, assistant] → last head = assistant
|
||||
# Tail: [user, assistant, user] → first tail = user
|
||||
# summary_role="user" collides with tail, "assistant" collides with head → merge
|
||||
# NOTE: protect_first_n=2 preserves 2 non-system messages in addition to
|
||||
# the system prompt (always implicitly protected).
|
||||
msgs = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": "msg 1"},
|
||||
|
|
@ -1097,7 +1102,7 @@ class TestCompressWithClient:
|
|||
mock_response.choices[0].message.content = "summary text"
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=3)
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=3)
|
||||
|
||||
msgs = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
|
|
@ -1133,13 +1138,15 @@ class TestCompressWithClient:
|
|||
mock_response.choices[0].message.content = "summary text"
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=1, protect_last_n=2)
|
||||
|
||||
# Head: [system, user] → last head = user
|
||||
# Tail: [assistant, user, assistant] → first tail = assistant
|
||||
# summary_role="assistant" collides with tail, "user" collides with head → merge
|
||||
# NOTE: protect_first_n=1 preserves 1 non-system message in addition to
|
||||
# the system prompt (always implicitly protected).
|
||||
# With min_tail=3, tail = last 3 messages (indices 5-7).
|
||||
# Need 8 messages: min_for_compress = 2+3+1 = 6, must have > 6.
|
||||
# Need 8 messages: _min_for_compress = head(2) + 3 + 1 = 6, must have > 6.
|
||||
msgs = [
|
||||
{"role": "system", "content": "system prompt"},
|
||||
{"role": "user", "content": "msg 1"},
|
||||
|
|
@ -1292,6 +1299,90 @@ class TestSummaryTargetRatio:
|
|||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.protect_last_n == 20
|
||||
|
||||
def test_default_protect_first_n_is_2(self):
|
||||
"""Default protect_first_n is 2 (system + 2 extra non-system messages =
|
||||
3 protected messages total, preserving the pre-feature behaviour where
|
||||
protect_first_n was hardcoded to protect 3 head messages total).
|
||||
"""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.protect_first_n == 2
|
||||
|
||||
def test_protect_first_n_override(self):
|
||||
"""protect_first_n=0 should be honoured — for users who rely on rolling
|
||||
compaction and want NOTHING pinned at head except the system prompt
|
||||
(always implicitly protected)."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=0)
|
||||
assert c.protect_first_n == 0
|
||||
|
||||
def test_protect_first_n_0_preserves_only_system_prompt(self):
|
||||
"""End-to-end: when protect_first_n=0, compression should treat only
|
||||
the system prompt as head. All user/assistant messages between the
|
||||
system prompt and the protected tail become summarization candidates.
|
||||
|
||||
This is the cleanest configuration for long-running rolling-compaction
|
||||
sessions — no user/assistant turn gets pinned verbatim forever just
|
||||
because it happened to be early in the session."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(
|
||||
model="test",
|
||||
quiet_mode=True,
|
||||
protect_first_n=0,
|
||||
protect_last_n=2,
|
||||
)
|
||||
msgs = (
|
||||
[{"role": "system", "content": "System prompt"}]
|
||||
+ [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
|
||||
for i in range(8)]
|
||||
)
|
||||
result = c.compress(msgs)
|
||||
# System prompt (msg[0]) survives as head
|
||||
assert result[0]["role"] == "system"
|
||||
assert result[0]["content"].startswith("System prompt")
|
||||
# The first user/assistant exchange (msg 0, msg 1) should NOT be pinned
|
||||
# as head verbatim — those would have been summarized or absorbed.
|
||||
# Under default protect_first_n=2, result[1] and result[2] would be
|
||||
# the literal "msg 0" / "msg 1"; with protect_first_n=0 they aren't.
|
||||
assert result[1].get("content") != "msg 0"
|
||||
# Last 2 messages are tail-protected under protect_last_n=2
|
||||
assert result[-1]["content"] == msgs[-1]["content"]
|
||||
|
||||
def test_protect_first_n_semantics_stable_without_system_prompt(self):
|
||||
"""Regression: gateway /compress handler strips the system prompt
|
||||
before calling compress(). protect_first_n must mean the same thing
|
||||
in both paths — "N non-system head messages" — so configuring
|
||||
protect_first_n=0 pins NO non-system message at the head regardless of
|
||||
whether the system prompt is in the messages list.
|
||||
|
||||
Bug this covers: under the old semantics, protect_first_n counted
|
||||
literally from messages[0]. In the gateway path (no system prompt)
|
||||
that meant protect_first_n=1 would pin the first user turn of the
|
||||
session forever — a user-reported complaint that a week-old
|
||||
resolved question kept getting reinserted into every compaction
|
||||
summary."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(
|
||||
model="test",
|
||||
quiet_mode=True,
|
||||
protect_first_n=0,
|
||||
protect_last_n=2,
|
||||
)
|
||||
# No system prompt — this is what the gateway passes to compress().
|
||||
msgs = [
|
||||
{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
|
||||
for i in range(10)
|
||||
]
|
||||
head_size = c._protect_head_size(msgs)
|
||||
# With no system prompt and protect_first_n=0 → head is empty.
|
||||
# The first user message is NOT pinned as head.
|
||||
assert head_size == 0
|
||||
|
||||
# And with protect_first_n=3 on the same no-system-prompt list →
|
||||
# head size is 3 (the three earliest non-system messages).
|
||||
c.protect_first_n = 3
|
||||
assert c._protect_head_size(msgs) == 3
|
||||
|
||||
|
||||
class TestTokenBudgetTailProtection:
|
||||
"""Tests for token-budget-based tail protection (PR #6240).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue