diff --git a/agent/agent_init.py b/agent/agent_init.py index 71b04e3e540..9b89028e3fa 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1105,6 +1105,9 @@ def init_agent( compression_protect_first = max( 0, int(_compression_cfg.get("protect_first_n", 3)) ) + compression_abort_on_summary_failure = str( + _compression_cfg.get("abort_on_summary_failure", False) + ).lower() in {"true", "1", "yes"} # Read optional explicit context_length override for the auxiliary # compression model. Custom endpoints often cannot report this via @@ -1319,6 +1322,7 @@ def init_agent( config_context_length=_config_context_length, provider=agent.provider, api_mode=agent.api_mode, + abort_on_summary_failure=compression_abort_on_summary_failure, ) agent.compression_enabled = compression_enabled diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 8ef9796df7f..62636809094 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -523,6 +523,7 @@ class ContextCompressor(ContextEngine): config_context_length: int | None = None, provider: str = "", api_mode: str = "", + abort_on_summary_failure: bool = False, ): self.model = model self.base_url = base_url @@ -534,6 +535,11 @@ class ContextCompressor(ContextEngine): self.protect_last_n = protect_last_n self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80)) self.quiet_mode = quiet_mode + # When True, summary-generation failure aborts compression entirely + # (returns messages unchanged, sets _last_compress_aborted=True). + # When False (default = historical behavior), insert a static + # "summary unavailable" placeholder and drop the middle window. + self.abort_on_summary_failure = abort_on_summary_failure self.context_length = get_model_context_length( model, base_url=base_url, api_key=api_key, @@ -1596,24 +1602,26 @@ The user has requested that this compaction PRIORITISE preserving all informatio # Phase 3: Generate structured summary summary = self._generate_summary(turns_to_summarize, focus_topic=focus_topic) - # If summary generation failed, ABORT compression entirely. Returning - # the original messages unchanged preserves the full conversation - # context. Previously this branch dropped every middle message and - # replaced them with a static "summary unavailable" placeholder, - # which silently lost N turns of work whenever the aux LLM hiccuped. - # Auto-compress callers detect the no-op (post-compress length == - # pre-compress length) and stop looping. The next call to - # _generate_summary is gated by _summary_failure_cooldown_until, so - # we don't burn the aux model every turn. Users can force a retry - # via /compress (which passes force=True to clear the cooldown). - if not summary: + # If summary generation failed, behavior splits on + # ``abort_on_summary_failure`` (config: compression.abort_on_summary_failure): + # True → ABORT compression entirely. Return messages unchanged + # and set _last_compress_aborted=True so callers can warn + # the user and stop the auto-compress retry loop. + # False → Fall through to the legacy fallback path below: insert + # a static "summary unavailable" placeholder and drop the + # middle window. Records _last_summary_fallback_used / + # _last_summary_dropped_count for gateway hygiene to + # surface a warning. + # Default is False (historical behavior). + if not summary and self.abort_on_summary_failure: n_skipped = compress_end - compress_start self._last_summary_dropped_count = 0 # nothing actually dropped self._last_summary_fallback_used = False self._last_compress_aborted = True if not self.quiet_mode: logger.warning( - "Summary generation failed — aborting compression. " + "Summary generation failed — aborting compression " + "(compression.abort_on_summary_failure=true). " "%d message(s) preserved unchanged. Conversation is " "frozen until the next /compress or /new.", n_skipped, @@ -1634,6 +1642,23 @@ The user has requested that this compaction PRIORITISE preserving all informatio ) compressed.append(msg) + # Legacy fallback path: LLM summary failed and abort_on_summary_failure + # is False (the default). Insert a static placeholder so the model + # knows context was lost rather than silently dropping everything. + if not summary: + if not self.quiet_mode: + logger.warning("Summary generation failed — inserting static fallback context marker") + n_dropped = compress_end - compress_start + self._last_summary_dropped_count = n_dropped + self._last_summary_fallback_used = True + summary = ( + f"{SUMMARY_PREFIX}\n" + f"Summary generation was unavailable. {n_dropped} message(s) were " + f"removed to free context space but could not be summarized. The removed " + f"messages contained earlier work in this session. Continue based on the " + f"recent messages below and the current state of any files or resources." + ) + _merge_summary_into_tail = False last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user" first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user" diff --git a/hermes_cli/config.py b/hermes_cli/config.py index e69c51a4d3b..ce3ddd54108 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -803,6 +803,17 @@ DEFAULT_CONFIG = { # 0 for long-running rolling-compaction sessions # where you want nothing pinned except the # system prompt + rolling summary + recent tail. + "abort_on_summary_failure": False, # When True, auto-compression that fails + # to generate a summary (aux LLM errored / returned + # non-JSON / timed out) aborts entirely instead of + # dropping the middle window with a static + # "summary unavailable" placeholder. Messages are + # preserved unchanged and the session "freezes" at + # its current size until the user runs /compress + # (which bypasses the failure cooldown) or /new. + # Default False matches historical behavior; set to + # True if you'd rather pause than silently lose + # context turns when your aux model is flaky. }, # Anthropic prompt caching (Claude via OpenRouter or native Anthropic API). diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index e952732075e..d8691fdf87c 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -64,31 +64,28 @@ class TestCompress: result = compressor.compress(msgs) assert result == msgs - def test_no_client_aborts_compression_with_messages_preserved(self, compressor): - """compressor has no provider configured, so _generate_summary returns - None → compression aborts entirely. Messages must be returned - unchanged (no placeholder, no drop) and _last_compress_aborted set.""" + def test_truncation_fallback_no_client(self, compressor): + # compressor has client=None and abort_on_summary_failure=False (default), + # so the LEGACY fallback path inserts a static "summary unavailable" + # placeholder and the middle window is dropped. msgs = [{"role": "system", "content": "System prompt"}] + self._make_messages(10) result = compressor.compress(msgs) - # Abort path: messages preserved byte-for-byte - assert result == msgs - assert compressor._last_compress_aborted is True - # Compression count NOT incremented on abort — nothing was compressed. - assert compressor.compression_count == 0 + assert len(result) < len(msgs) + # Should keep system message and last N + assert result[0]["role"] == "system" + assert compressor.compression_count == 1 + # Abort flag must NOT fire under the default config. + assert compressor._last_compress_aborted is False + assert compressor._last_summary_fallback_used is True def test_compression_increments_count(self, compressor): msgs = self._make_messages(10) - mock_resp = MagicMock() - mock_resp.choices = [MagicMock()] - mock_resp.choices[0].message.content = "summary text" - with patch("agent.context_compressor.call_llm", return_value=mock_resp): - compressor.compress(msgs) - assert compressor.compression_count == 1 - # Reset cooldown isn't needed (no prior failure) but reset - # iterative-summary state so the next call follows the same - # path as the first. - compressor.compress(msgs) - assert compressor.compression_count == 2 + # Default config (abort_on_summary_failure=False) — fallback path + # increments the count even on summary failure. + compressor.compress(msgs) + assert compressor.compression_count == 1 + compressor.compress(msgs) + assert compressor.compression_count == 2 def test_protects_first_and_last(self, compressor): msgs = self._make_messages(10) @@ -138,11 +135,7 @@ class TestGenerateSummaryNoneContent: {"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10) ] - mock_resp = MagicMock() - mock_resp.choices = [MagicMock()] - mock_resp.choices[0].message.content = "summary text" - with patch("agent.context_compressor.call_llm", return_value=mock_resp): - result = c.compress(msgs) + result = c.compress(msgs) assert len(result) < len(msgs) @@ -730,14 +723,12 @@ class TestAuxModelFallbackSurfacedToCallers: class TestSummaryFailureTrackingForGatewayWarning: - """When summary generation fails, the compressor must ABORT compression - entirely (return the original messages unchanged) and set the abort flag - so gateway hygiene & /compress can surface a visible warning. Previous - behavior of inserting a static "summary unavailable" placeholder while - silently dropping the middle window has been removed — losing N turns - of context is worse than freezing the chat until the user retries.""" + """Default behavior (compression.abort_on_summary_failure=False): + summary-generation failure inserts a static fallback placeholder and + records dropped count + fallback flag so gateway hygiene & /compress + can surface a visible warning.""" - def test_compress_aborts_and_preserves_messages_on_summary_failure(self): + def test_compress_records_fallback_and_dropped_count_on_summary_failure(self): with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) @@ -752,28 +743,20 @@ class TestSummaryFailureTrackingForGatewayWarning: {"role": "user", "content": "msg 7"}, ] - # Simulate summary LLM call failing — covers the 404 / model-not-found - # case from issue (auxiliary compression model misconfigured). with patch("agent.context_compressor.call_llm", side_effect=Exception("404 model not found")): result = c.compress(msgs) - # Abort flag set, error recorded - assert c._last_compress_aborted is True + assert c._last_summary_fallback_used is True + assert c._last_summary_dropped_count > 0 assert c._last_summary_error is not None - # No fallback inserted, no messages dropped - assert c._last_summary_fallback_used is False - assert c._last_summary_dropped_count == 0 - # Original messages preserved byte-for-byte — the agent loop's - # "did compression help?" check (len(after) < len(before)) sees a - # no-op and stops looping. - assert result == msgs - # No "Summary generation was unavailable" placeholder leaked in. - assert not any( + # Default mode: abort flag must NOT fire. + assert c._last_compress_aborted is False + assert any( isinstance(m.get("content"), str) and "Summary generation was unavailable" in m["content"] for m in result ) - def test_compress_clears_abort_flag_on_subsequent_success(self): + def test_compress_clears_fallback_flag_on_subsequent_success(self): mock_response = MagicMock() mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "summary text" @@ -792,12 +775,76 @@ class TestSummaryFailureTrackingForGatewayWarning: {"role": "user", "content": "msg 7"}, ] - # First call fails, second succeeds — abort flag must reset on second compress. + with patch("agent.context_compressor.call_llm", side_effect=Exception("boom")): + c.compress(msgs) + assert c._last_summary_fallback_used is True + + c._summary_failure_cooldown_until = 0.0 + with patch("agent.context_compressor.call_llm", return_value=mock_response): + c.compress(msgs) + assert c._last_summary_fallback_used is False + assert c._last_summary_dropped_count == 0 + + +class TestAbortOnSummaryFailure: + """Opt-in behavior (compression.abort_on_summary_failure=True): + summary-generation failure ABORTS compression entirely — returns the + original messages unchanged and sets _last_compress_aborted=True so + gateway hygiene & /compress can surface a visible warning.""" + + def _make_msgs(self): + return [ + {"role": "system", "content": "sys"}, + {"role": "user", "content": "msg 1"}, + {"role": "assistant", "content": "msg 2"}, + {"role": "user", "content": "msg 3"}, + {"role": "assistant", "content": "msg 4"}, + {"role": "user", "content": "msg 5"}, + {"role": "assistant", "content": "msg 6"}, + {"role": "user", "content": "msg 7"}, + ] + + def _make_compressor(self): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): + return ContextCompressor( + model="test", + quiet_mode=True, + protect_first_n=2, + protect_last_n=2, + abort_on_summary_failure=True, + ) + + def test_compress_aborts_and_preserves_messages_on_summary_failure(self): + c = self._make_compressor() + msgs = self._make_msgs() + with patch("agent.context_compressor.call_llm", side_effect=Exception("404 model not found")): + result = c.compress(msgs) + + assert c._last_compress_aborted is True + assert c._last_summary_error is not None + # No fallback inserted, no messages dropped + assert c._last_summary_fallback_used is False + assert c._last_summary_dropped_count == 0 + # Original messages preserved byte-for-byte. + assert result == msgs + # No "Summary generation was unavailable" placeholder leaked in. + assert not any( + isinstance(m.get("content"), str) and "Summary generation was unavailable" in m["content"] + for m in result + ) + + def test_compress_clears_abort_flag_on_subsequent_success(self): + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = "summary text" + + c = self._make_compressor() + msgs = self._make_msgs() + with patch("agent.context_compressor.call_llm", side_effect=Exception("boom")): c.compress(msgs) assert c._last_compress_aborted is True - # Reset cooldown to allow retry on second compress c._summary_failure_cooldown_until = 0.0 with patch("agent.context_compressor.call_llm", return_value=mock_response): c.compress(msgs) @@ -813,34 +860,17 @@ class TestSummaryFailureTrackingForGatewayWarning: mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "summary text" - with patch("agent.context_compressor.get_model_context_length", return_value=100000): - c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) + c = self._make_compressor() + msgs = self._make_msgs() - msgs = [ - {"role": "system", "content": "sys"}, - {"role": "user", "content": "msg 1"}, - {"role": "assistant", "content": "msg 2"}, - {"role": "user", "content": "msg 3"}, - {"role": "assistant", "content": "msg 4"}, - {"role": "user", "content": "msg 5"}, - {"role": "assistant", "content": "msg 6"}, - {"role": "user", "content": "msg 7"}, - ] - - # Pre-populate an active cooldown (as if a prior auto-compress aborted). import time as _time c._summary_failure_cooldown_until = _time.monotonic() + 999.0 - # Without force, _generate_summary would short-circuit on cooldown - # and return None → abort. With force=True the cooldown is cleared - # and the call goes through. with patch("agent.context_compressor.call_llm", return_value=mock_response): result = c.compress(msgs, force=True) assert c._last_compress_aborted is False - # Cooldown was cleared and a real summary attempt was made. assert c._summary_failure_cooldown_until == 0.0 - # Result is actually compressed (shorter than input). assert len(result) < len(msgs) @@ -1401,11 +1431,7 @@ class TestSummaryTargetRatio: + [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(8)] ) - mock_resp = MagicMock() - mock_resp.choices = [MagicMock()] - mock_resp.choices[0].message.content = "summary text" - with patch("agent.context_compressor.call_llm", return_value=mock_resp): - result = c.compress(msgs) + result = c.compress(msgs) # System prompt (msg[0]) survives as head assert result[0]["role"] == "system" assert result[0]["content"].startswith("System prompt")