fix(compress): make abort-on-summary-failure opt-in via config flag (#28117)

PR #28102 made the summary-failure abort path the unconditional default,
changing established behavior. Gate it behind the config.yaml flag
`compression.abort_on_summary_failure`, which defaults to False (the
historical fallback-placeholder behavior).

- hermes_cli/config.py: new `compression.abort_on_summary_failure` key,
  default False, documented inline.
- agent/agent_init.py: read the flag from compression config and pass to
  ContextCompressor.
- agent/context_compressor.py: `__init__` accepts `abort_on_summary_failure`
  (default False). `compress()` failure branch gates the abort on the
  flag; when False, falls through to the restored legacy fallback path
  (static "summary unavailable" placeholder + drop middle window).
- tests: restore original fallback expectations as default; add new
  TestAbortOnSummaryFailure class for the opt-in mode.

Gateway/CLI plumbing (force=True on /compress, hygiene/handler abort
detection, locale `gateway.compress.aborted` key) from PR #28102 stays
intact — those paths only fire when `_last_compress_aborted` is True,
which now only happens when the flag is enabled.
This commit is contained in:
Teknium 2026-05-18 10:28:20 -07:00 committed by GitHub
parent 5e40f83cb7
commit 9aae59feab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 150 additions and 84 deletions

View file

@ -1105,6 +1105,9 @@ def init_agent(
compression_protect_first = max(
0, int(_compression_cfg.get("protect_first_n", 3))
)
compression_abort_on_summary_failure = str(
_compression_cfg.get("abort_on_summary_failure", False)
).lower() in {"true", "1", "yes"}
# Read optional explicit context_length override for the auxiliary
# compression model. Custom endpoints often cannot report this via
@ -1319,6 +1322,7 @@ def init_agent(
config_context_length=_config_context_length,
provider=agent.provider,
api_mode=agent.api_mode,
abort_on_summary_failure=compression_abort_on_summary_failure,
)
agent.compression_enabled = compression_enabled

View file

@ -523,6 +523,7 @@ class ContextCompressor(ContextEngine):
config_context_length: int | None = None,
provider: str = "",
api_mode: str = "",
abort_on_summary_failure: bool = False,
):
self.model = model
self.base_url = base_url
@ -534,6 +535,11 @@ class ContextCompressor(ContextEngine):
self.protect_last_n = protect_last_n
self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
self.quiet_mode = quiet_mode
# When True, summary-generation failure aborts compression entirely
# (returns messages unchanged, sets _last_compress_aborted=True).
# When False (default = historical behavior), insert a static
# "summary unavailable" placeholder and drop the middle window.
self.abort_on_summary_failure = abort_on_summary_failure
self.context_length = get_model_context_length(
model, base_url=base_url, api_key=api_key,
@ -1596,24 +1602,26 @@ The user has requested that this compaction PRIORITISE preserving all informatio
# Phase 3: Generate structured summary
summary = self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
# If summary generation failed, ABORT compression entirely. Returning
# the original messages unchanged preserves the full conversation
# context. Previously this branch dropped every middle message and
# replaced them with a static "summary unavailable" placeholder,
# which silently lost N turns of work whenever the aux LLM hiccuped.
# Auto-compress callers detect the no-op (post-compress length ==
# pre-compress length) and stop looping. The next call to
# _generate_summary is gated by _summary_failure_cooldown_until, so
# we don't burn the aux model every turn. Users can force a retry
# via /compress (which passes force=True to clear the cooldown).
if not summary:
# If summary generation failed, behavior splits on
# ``abort_on_summary_failure`` (config: compression.abort_on_summary_failure):
# True → ABORT compression entirely. Return messages unchanged
# and set _last_compress_aborted=True so callers can warn
# the user and stop the auto-compress retry loop.
# False → Fall through to the legacy fallback path below: insert
# a static "summary unavailable" placeholder and drop the
# middle window. Records _last_summary_fallback_used /
# _last_summary_dropped_count for gateway hygiene to
# surface a warning.
# Default is False (historical behavior).
if not summary and self.abort_on_summary_failure:
n_skipped = compress_end - compress_start
self._last_summary_dropped_count = 0 # nothing actually dropped
self._last_summary_fallback_used = False
self._last_compress_aborted = True
if not self.quiet_mode:
logger.warning(
"Summary generation failed — aborting compression. "
"Summary generation failed — aborting compression "
"(compression.abort_on_summary_failure=true). "
"%d message(s) preserved unchanged. Conversation is "
"frozen until the next /compress or /new.",
n_skipped,
@ -1634,6 +1642,23 @@ The user has requested that this compaction PRIORITISE preserving all informatio
)
compressed.append(msg)
# Legacy fallback path: LLM summary failed and abort_on_summary_failure
# is False (the default). Insert a static placeholder so the model
# knows context was lost rather than silently dropping everything.
if not summary:
if not self.quiet_mode:
logger.warning("Summary generation failed — inserting static fallback context marker")
n_dropped = compress_end - compress_start
self._last_summary_dropped_count = n_dropped
self._last_summary_fallback_used = True
summary = (
f"{SUMMARY_PREFIX}\n"
f"Summary generation was unavailable. {n_dropped} message(s) were "
f"removed to free context space but could not be summarized. The removed "
f"messages contained earlier work in this session. Continue based on the "
f"recent messages below and the current state of any files or resources."
)
_merge_summary_into_tail = False
last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user"

View file

@ -803,6 +803,17 @@ DEFAULT_CONFIG = {
# 0 for long-running rolling-compaction sessions
# where you want nothing pinned except the
# system prompt + rolling summary + recent tail.
"abort_on_summary_failure": False, # When True, auto-compression that fails
# to generate a summary (aux LLM errored / returned
# non-JSON / timed out) aborts entirely instead of
# dropping the middle window with a static
# "summary unavailable" placeholder. Messages are
# preserved unchanged and the session "freezes" at
# its current size until the user runs /compress
# (which bypasses the failure cooldown) or /new.
# Default False matches historical behavior; set to
# True if you'd rather pause than silently lose
# context turns when your aux model is flaky.
},
# Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).

View file

@ -64,31 +64,28 @@ class TestCompress:
result = compressor.compress(msgs)
assert result == msgs
def test_no_client_aborts_compression_with_messages_preserved(self, compressor):
"""compressor has no provider configured, so _generate_summary returns
None → compression aborts entirely. Messages must be returned
unchanged (no placeholder, no drop) and _last_compress_aborted set."""
def test_truncation_fallback_no_client(self, compressor):
# compressor has client=None and abort_on_summary_failure=False (default),
# so the LEGACY fallback path inserts a static "summary unavailable"
# placeholder and the middle window is dropped.
msgs = [{"role": "system", "content": "System prompt"}] + self._make_messages(10)
result = compressor.compress(msgs)
# Abort path: messages preserved byte-for-byte
assert result == msgs
assert compressor._last_compress_aborted is True
# Compression count NOT incremented on abort — nothing was compressed.
assert compressor.compression_count == 0
assert len(result) < len(msgs)
# Should keep system message and last N
assert result[0]["role"] == "system"
assert compressor.compression_count == 1
# Abort flag must NOT fire under the default config.
assert compressor._last_compress_aborted is False
assert compressor._last_summary_fallback_used is True
def test_compression_increments_count(self, compressor):
msgs = self._make_messages(10)
mock_resp = MagicMock()
mock_resp.choices = [MagicMock()]
mock_resp.choices[0].message.content = "summary text"
with patch("agent.context_compressor.call_llm", return_value=mock_resp):
compressor.compress(msgs)
assert compressor.compression_count == 1
# Reset cooldown isn't needed (no prior failure) but reset
# iterative-summary state so the next call follows the same
# path as the first.
compressor.compress(msgs)
assert compressor.compression_count == 2
# Default config (abort_on_summary_failure=False) — fallback path
# increments the count even on summary failure.
compressor.compress(msgs)
assert compressor.compression_count == 1
compressor.compress(msgs)
assert compressor.compression_count == 2
def test_protects_first_and_last(self, compressor):
msgs = self._make_messages(10)
@ -138,11 +135,7 @@ class TestGenerateSummaryNoneContent:
{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
for i in range(10)
]
mock_resp = MagicMock()
mock_resp.choices = [MagicMock()]
mock_resp.choices[0].message.content = "summary text"
with patch("agent.context_compressor.call_llm", return_value=mock_resp):
result = c.compress(msgs)
result = c.compress(msgs)
assert len(result) < len(msgs)
@ -730,14 +723,12 @@ class TestAuxModelFallbackSurfacedToCallers:
class TestSummaryFailureTrackingForGatewayWarning:
"""When summary generation fails, the compressor must ABORT compression
entirely (return the original messages unchanged) and set the abort flag
so gateway hygiene & /compress can surface a visible warning. Previous
behavior of inserting a static "summary unavailable" placeholder while
silently dropping the middle window has been removed — losing N turns
of context is worse than freezing the chat until the user retries."""
"""Default behavior (compression.abort_on_summary_failure=False):
summary-generation failure inserts a static fallback placeholder and
records dropped count + fallback flag so gateway hygiene & /compress
can surface a visible warning."""
def test_compress_aborts_and_preserves_messages_on_summary_failure(self):
def test_compress_records_fallback_and_dropped_count_on_summary_failure(self):
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
@ -752,28 +743,20 @@ class TestSummaryFailureTrackingForGatewayWarning:
{"role": "user", "content": "msg 7"},
]
# Simulate summary LLM call failing — covers the 404 / model-not-found
# case from issue (auxiliary compression model misconfigured).
with patch("agent.context_compressor.call_llm", side_effect=Exception("404 model not found")):
result = c.compress(msgs)
# Abort flag set, error recorded
assert c._last_compress_aborted is True
assert c._last_summary_fallback_used is True
assert c._last_summary_dropped_count > 0
assert c._last_summary_error is not None
# No fallback inserted, no messages dropped
assert c._last_summary_fallback_used is False
assert c._last_summary_dropped_count == 0
# Original messages preserved byte-for-byte — the agent loop's
# "did compression help?" check (len(after) < len(before)) sees a
# no-op and stops looping.
assert result == msgs
# No "Summary generation was unavailable" placeholder leaked in.
assert not any(
# Default mode: abort flag must NOT fire.
assert c._last_compress_aborted is False
assert any(
isinstance(m.get("content"), str) and "Summary generation was unavailable" in m["content"]
for m in result
)
def test_compress_clears_abort_flag_on_subsequent_success(self):
def test_compress_clears_fallback_flag_on_subsequent_success(self):
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "summary text"
@ -792,12 +775,76 @@ class TestSummaryFailureTrackingForGatewayWarning:
{"role": "user", "content": "msg 7"},
]
# First call fails, second succeeds — fallback flag must reset on second compress.
with patch("agent.context_compressor.call_llm", side_effect=Exception("boom")):
c.compress(msgs)
assert c._last_summary_fallback_used is True
c._summary_failure_cooldown_until = 0.0
with patch("agent.context_compressor.call_llm", return_value=mock_response):
c.compress(msgs)
assert c._last_summary_fallback_used is False
assert c._last_summary_dropped_count == 0
class TestAbortOnSummaryFailure:
"""Opt-in behavior (compression.abort_on_summary_failure=True):
summary-generation failure ABORTS compression entirely — returns the
original messages unchanged and sets _last_compress_aborted=True so
gateway hygiene & /compress can surface a visible warning."""
def _make_msgs(self):
return [
{"role": "system", "content": "sys"},
{"role": "user", "content": "msg 1"},
{"role": "assistant", "content": "msg 2"},
{"role": "user", "content": "msg 3"},
{"role": "assistant", "content": "msg 4"},
{"role": "user", "content": "msg 5"},
{"role": "assistant", "content": "msg 6"},
{"role": "user", "content": "msg 7"},
]
def _make_compressor(self):
    """Build a ContextCompressor with abort_on_summary_failure=True.

    get_model_context_length is patched to return 100000 only for the
    duration of the constructor call; the patch is undone before the
    instance is handed back to the caller.
    """
    context_length_patch = patch(
        "agent.context_compressor.get_model_context_length",
        return_value=100000,
    )
    with context_length_patch:
        compressor = ContextCompressor(
            model="test",
            quiet_mode=True,
            protect_first_n=2,
            protect_last_n=2,
            abort_on_summary_failure=True,
        )
    return compressor
def test_compress_aborts_and_preserves_messages_on_summary_failure(self):
    """With the opt-in flag set, a failing call_llm must abort compression.

    call_llm is patched to raise, then compress() is expected to hand back
    the input unchanged, set the abort flag, record the error, and leave
    the fallback bookkeeping untouched.
    """
    compressor = self._make_compressor()
    original = self._make_msgs()
    failing_llm = patch(
        "agent.context_compressor.call_llm",
        side_effect=Exception("404 model not found"),
    )
    with failing_llm:
        result = compressor.compress(original)

    # Abort was signalled and the underlying error was recorded.
    assert compressor._last_compress_aborted is True
    assert compressor._last_summary_error is not None
    # No fallback inserted, no messages dropped.
    assert compressor._last_summary_fallback_used is False
    assert compressor._last_summary_dropped_count == 0
    # Original messages come back equal to the input.
    assert result == original
    # The static "summary unavailable" placeholder must not appear anywhere.
    leaked = [
        m
        for m in result
        if isinstance(m.get("content"), str)
        and "Summary generation was unavailable" in m["content"]
    ]
    assert not leaked
def test_compress_clears_abort_flag_on_subsequent_success(self):
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "summary text"
c = self._make_compressor()
msgs = self._make_msgs()
with patch("agent.context_compressor.call_llm", side_effect=Exception("boom")):
c.compress(msgs)
assert c._last_compress_aborted is True
# Reset cooldown to allow retry on second compress
c._summary_failure_cooldown_until = 0.0
with patch("agent.context_compressor.call_llm", return_value=mock_response):
c.compress(msgs)
@ -813,34 +860,17 @@ class TestSummaryFailureTrackingForGatewayWarning:
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "summary text"
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
c = self._make_compressor()
msgs = self._make_msgs()
msgs = [
{"role": "system", "content": "sys"},
{"role": "user", "content": "msg 1"},
{"role": "assistant", "content": "msg 2"},
{"role": "user", "content": "msg 3"},
{"role": "assistant", "content": "msg 4"},
{"role": "user", "content": "msg 5"},
{"role": "assistant", "content": "msg 6"},
{"role": "user", "content": "msg 7"},
]
# Pre-populate an active cooldown (as if a prior auto-compress aborted).
import time as _time
c._summary_failure_cooldown_until = _time.monotonic() + 999.0
# Without force, _generate_summary would short-circuit on cooldown
# and return None → abort. With force=True the cooldown is cleared
# and the call goes through.
with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs, force=True)
assert c._last_compress_aborted is False
# Cooldown was cleared and a real summary attempt was made.
assert c._summary_failure_cooldown_until == 0.0
# Result is actually compressed (shorter than input).
assert len(result) < len(msgs)
@ -1401,11 +1431,7 @@ class TestSummaryTargetRatio:
+ [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
for i in range(8)]
)
mock_resp = MagicMock()
mock_resp.choices = [MagicMock()]
mock_resp.choices[0].message.content = "summary text"
with patch("agent.context_compressor.call_llm", return_value=mock_resp):
result = c.compress(msgs)
result = c.compress(msgs)
# System prompt (msg[0]) survives as head
assert result[0]["role"] == "system"
assert result[0]["content"].startswith("System prompt")