def _protect_head_size(self, messages: List[Dict[str, Any]]) -> int:
    """Return the number of leading messages that must never be summarised.

    ``protect_first_n`` counts messages protected *in addition to* the
    system prompt.  When ``messages[0]`` has role ``"system"`` it is always
    counted as protected, so the setting means the same thing whether or
    not the caller included the system prompt in ``messages``.

    A negative ``protect_first_n`` is treated as 0.  Without the clamp a
    negative value would cancel out the system-prompt slot (or yield a
    negative boundary index) for any caller that builds the compressor
    directly with an unvalidated value.

    Args:
        messages: Transcript to inspect.  Only the role of the first entry
            is read; an empty list is allowed.

    Returns:
        ``max(0, protect_first_n)``, plus 1 when the transcript starts with
        a system message.  The result is not capped at ``len(messages)``.

    Examples:
        protect_first_n=0 → system prompt only (or nothing if no system msg)
        protect_first_n=3 → system + first 3 non-system messages
    """
    starts_with_system = bool(messages) and messages[0].get("role") == "system"
    # Clamp so a negative setting can never eat into the system-prompt slot.
    extra_protected = max(0, self.protect_first_n)
    return extra_protected + (1 if starts_with_system else 0)
""" - compress_start = self._align_boundary_forward(messages, self.protect_first_n) + compress_start = self._align_boundary_forward(messages, self._protect_head_size(messages)) compress_end = self._find_tail_cut_by_tokens(messages, compress_start) return compress_start < compress_end @@ -1379,7 +1399,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio self._last_aux_model_failure_model = None n_messages = len(messages) # Only need head + 3 tail messages minimum (token budget decides the real tail size) - _min_for_compress = self.protect_first_n + 3 + 1 + _min_for_compress = self._protect_head_size(messages) + 3 + 1 if n_messages <= _min_for_compress: if not self.quiet_mode: logger.warning( @@ -1399,7 +1419,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio logger.info("Pre-compression: pruned %d old tool result(s)", pruned_count) # Phase 2: Determine boundaries - compress_start = self.protect_first_n + compress_start = self._protect_head_size(messages) compress_start = self._align_boundary_forward(messages, compress_start) # Use token-budget tail protection instead of fixed message count diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 6daceba04a9..1bfec39698a 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -364,6 +364,18 @@ compression: # compression of older turns. protect_last_n: 20 + # Number of non-system messages to protect at the head of the transcript, in + # ADDITION to the system prompt (which is always implicitly protected). + # Head messages are NEVER summarized — they survive every compression + # indefinitely. This gives stable early context for short/medium sessions, + # but in long-running sessions that rely on rolling compaction the pinned + # opening turns may not match how you want the session framed over time. 
+ # Set to 0 to preserve ONLY the system prompt (plus the rolling summary + # and recent tail) — the cleanest configuration for long-running sessions. + # Default 2 preserves the system prompt plus the first user/assistant + # exchange (≈ 3 messages total when a system prompt is present). + protect_first_n: 2 + # To pin a specific model/provider for compression summaries, use the # auxiliary section below (auxiliary.compression.provider / model). diff --git a/hermes_cli/config.py b/hermes_cli/config.py index fd9784d7847..3feb2cbddbb 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -731,8 +731,13 @@ DEFAULT_CONFIG = { "target_ratio": 0.20, # fraction of threshold to preserve as recent tail "protect_last_n": 20, # minimum recent messages to keep uncompressed "hygiene_hard_message_limit": 400, # gateway session-hygiene force-compress threshold by message count + "protect_first_n": 2, # non-system head messages always preserved + # verbatim, in ADDITION to the system prompt + # (which is always implicitly protected). Set to + # 0 for long-running rolling-compaction sessions + # where you want nothing pinned except the + # system prompt + rolling summary + recent tail. }, - # Anthropic prompt caching (Claude via OpenRouter or native Anthropic API). # cache_ttl must be "5m" or "1h" (Anthropic-supported tiers); other values are ignored. 
"prompt_caching": { @@ -4862,6 +4867,7 @@ def show_config(): print(f" Threshold: {compression.get('threshold', 0.50) * 100:.0f}%") print(f" Target ratio: {compression.get('target_ratio', 0.20) * 100:.0f}% of threshold preserved") print(f" Protect last: {compression.get('protect_last_n', 20)} messages") + print(f" Protect first: {compression.get('protect_first_n', 2)} non-system head messages") _aux_comp = config.get('auxiliary', {}).get('compression', {}) _sm = _aux_comp.get('model', '') or '(auto)' print(f" Model: {_sm}") diff --git a/run_agent.py b/run_agent.py index 53177931b81..8c7dfe2b061 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2115,6 +2115,15 @@ class AIAgent: compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"} compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20)) compression_protect_last = int(_compression_cfg.get("protect_last_n", 20)) + # protect_first_n is the number of non-system messages to protect at + # the head, in addition to the system prompt (which is always + # implicitly protected by the compressor). Floor at 0 — a value of + # 0 means "preserve only the system prompt + summary + tail", which + # is a legitimate (and common) configuration for long-running + # rolling-compaction sessions. + compression_protect_first = max( + 0, int(_compression_cfg.get("protect_first_n", 2)) + ) # Read optional explicit context_length override for the auxiliary # compression model. 
Custom endpoints often cannot report this via @@ -2315,7 +2324,7 @@ class AIAgent: self.context_compressor = ContextCompressor( model=self.model, threshold_percent=compression_threshold, - protect_first_n=3, + protect_first_n=compression_protect_first, protect_last_n=compression_protect_last, summary_target_ratio=compression_target_ratio, summary_model_override=None, diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 97a7c7b3d0f..821d3c4c4b7 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -991,9 +991,12 @@ class TestCompressWithClient: mock_client.chat.completions.create.return_value = mock_response with patch("agent.context_compressor.get_model_context_length", return_value=100000): - c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2) + c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) # Last head message (index 2) is "user" → summary should be "assistant" + # NOTE: protect_first_n=2 preserves 2 non-system messages in addition to + # the system prompt (always implicitly protected), yielding head [system, + # user, user] with last head = user. 
msgs = [ {"role": "system", "content": "system prompt"}, {"role": "user", "content": "msg 1"}, @@ -1059,11 +1062,13 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "summary text" with patch("agent.context_compressor.get_model_context_length", return_value=100000): - c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=3) + c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=3) # Head: [system, user, assistant] → last head = assistant # Tail: [user, assistant, user] → first tail = user # summary_role="user" collides with tail, "assistant" collides with head → merge + # NOTE: protect_first_n=2 preserves 2 non-system messages in addition to + # the system prompt (always implicitly protected). msgs = [ {"role": "system", "content": "system prompt"}, {"role": "user", "content": "msg 1"}, @@ -1097,7 +1102,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "summary text" with patch("agent.context_compressor.get_model_context_length", return_value=100000): - c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=3) + c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=3) msgs = [ {"role": "system", "content": "system prompt"}, @@ -1133,13 +1138,15 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "summary text" with patch("agent.context_compressor.get_model_context_length", return_value=100000): - c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) + c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=1, protect_last_n=2) # Head: [system, user] → last head = user # Tail: [assistant, user, assistant] → first tail = assistant # summary_role="assistant" collides with tail, "user" collides with head → merge + # NOTE: protect_first_n=1 preserves 1 non-system message in addition to + # the system prompt (always 
def test_default_protect_first_n_is_2(self) -> None:
    """Default protect_first_n is 2 (system + 2 extra non-system messages =
    3 protected messages total, preserving the pre-feature behaviour where
    protect_first_n was hardcoded to protect 3 head messages total).
    """
    with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
        c = ContextCompressor(model="test", quiet_mode=True)
    assert c.protect_first_n == 2

def test_protect_first_n_override(self) -> None:
    """protect_first_n=0 should be honoured — for users who rely on rolling
    compaction and want NOTHING pinned at head except the system prompt
    (always implicitly protected)."""
    with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
        c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=0)
    assert c.protect_first_n == 0

def test_protect_first_n_0_preserves_only_system_prompt(self) -> None:
    """End-to-end: when protect_first_n=0, compression should treat only
    the system prompt as head. All user/assistant messages between the
    system prompt and the protected tail become summarization candidates.

    This is the cleanest configuration for long-running rolling-compaction
    sessions — no user/assistant turn gets pinned verbatim forever just
    because it happened to be early in the session."""
    with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
        c = ContextCompressor(
            model="test",
            quiet_mode=True,
            protect_first_n=0,
            protect_last_n=2,
        )
    # Transcript: one system prompt followed by eight alternating
    # user/assistant messages ("msg 0" .. "msg 7").
    msgs = (
        [{"role": "system", "content": "System prompt"}]
        + [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
           for i in range(8)]
    )
    # NOTE(review): statement nesting was reconstructed from a
    # whitespace-collapsed diff — confirm compress() is meant to run
    # outside the get_model_context_length patch.
    result = c.compress(msgs)
    # System prompt (msg[0]) survives as head
    assert result[0]["role"] == "system"
    assert result[0]["content"].startswith("System prompt")
    # The first user/assistant exchange (msg 0, msg 1) should NOT be pinned
    # as head verbatim — those would have been summarized or absorbed.
    # Only result[1] is checked: it must not be the literal "msg 0" that a
    # non-zero protect_first_n would have kept in place.
    assert result[1].get("content") != "msg 0"
    # The tail is protected under protect_last_n=2; only the final message
    # is asserted here.
    assert result[-1]["content"] == msgs[-1]["content"]

def test_protect_first_n_semantics_stable_without_system_prompt(self) -> None:
    """Regression: gateway /compress handler strips the system prompt
    before calling compress(). protect_first_n must mean the same thing
    in both paths — "N non-system head messages" — so configuring
    protect_first_n=0 preserves nothing at the head beyond the system
    prompt itself, which is counted only when it is present in the
    messages list.

    Bug this covers: under the old semantics, protect_first_n counted
    literally from messages[0]. In the gateway path (no system prompt)
    that meant protect_first_n=1 would pin the first user turn of the
    session forever — a user-reported complaint that a week-old
    resolved question kept getting reinserted into every compaction
    summary."""
    with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
        c = ContextCompressor(
            model="test",
            quiet_mode=True,
            protect_first_n=0,
            protect_last_n=2,
        )
    # No system prompt — this is what the gateway passes to compress().
    msgs = [
        {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
        for i in range(10)
    ]
    head_size = c._protect_head_size(msgs)
    # With no system prompt and protect_first_n=0 → head is empty.
    # The first user message is NOT pinned as head.
    assert head_size == 0

    # And with protect_first_n=3 on the same no-system-prompt list →
    # head size is 3 (the three earliest non-system messages).
    c.protect_first_n = 3
    assert c._protect_head_size(msgs) == 3