fix(compression): abort on auth failure instead of rotating into a degraded session

When the auxiliary summary call fails with an authentication/permission
error (HTTP 401/403), context compression now ABORTS and preserves the
session unchanged instead of rotating into a child session with a
placeholder summary.

Before: a 401 (invalid/blocked key, or a token pointed at the wrong
inference host) fell through every transient-error check to 'return
None', and because compression.abort_on_summary_failure defaults False,
compress() took the static-fallback path and rotated the session anyway
(messages N->N). The user landed on a fresh-but-broken session that kept
failing the same way — paying for a full-context API call each turn with
no useful compression.

After: _generate_summary classifies 401/403 as a non-recoverable auth
failure (_last_summary_auth_failure) and compress() aborts on it
regardless of abort_on_summary_failure. A distinct auxiliary summary_model
that 401s still retries once on the main model first (its dedicated creds
may be the only broken thing); the abort only sticks when the main model
itself auth-fails or the fallback also auth-fails. The existing
_last_compress_aborted handling in conversation_compression.py already
skips rotation and emits a warning, so no session rotation occurs.

Tests: TestAuthFailureAborts — 401/403 flagging, compress() aborts despite
flag=False, non-auth failures keep the historical fallback path, and
aux-model auth failure recovers on main without aborting.
This commit is contained in:
teknium1 2026-06-20 10:56:56 -07:00 committed by Teknium
parent f22dd8a75a
commit 5a53e0f0f4
2 changed files with 161 additions and 8 deletions

View file

@ -761,6 +761,14 @@ class ContextCompressor(ContextEngine):
# this flag to know "compression was attempted but aborted, freeze
# the chat until the user manually retries via /compress".
self._last_compress_aborted: bool = False
# Set True when the summary call failed with an authentication /
# permission error (HTTP 401/403). Auth failures are non-recoverable
# at the request level — the credential or endpoint is broken — so
# compress() must ABORT (preserve the session unchanged) rather than
# rotate into a degraded child session with a placeholder summary.
# This is independent of the abort_on_summary_failure config flag:
# rotating on a broken credential is never the right behavior.
self._last_summary_auth_failure: bool = False
# When a user-configured summary model fails and we recover by
# retrying on the main model, record the failure so gateway /
# CLI callers can still warn the user even though compression
@ -1524,6 +1532,7 @@ This compaction should PRIORITISE preserving all information related to the focu
self._summary_failure_cooldown_until = 0.0
self._summary_model_fallen_back = False
self._last_summary_error = None
self._last_summary_auth_failure = False
return self._with_summary_prefix(summary)
except RuntimeError:
# No provider configured — long cooldown, unlikely to self-resolve
@ -1571,6 +1580,26 @@ This compaction should PRIORITISE preserving all information related to the focu
# back to the main model instead of entering a 60-second cooldown.
# See issue #18458.
_is_streaming_closed = _is_connection_error(e)
# Authentication / permission failures (401/403) are NOT transient
# and NOT fixable by retrying the same request: the credential is
# invalid/blocked/expired or the endpoint is wrong (e.g. a prod
# token sent to a staging inference URL). Flag them so compress()
# aborts and preserves the session instead of rotating into a
# degraded child with a placeholder summary. We still allow the
# one-shot fallback to the MAIN model below when the failure came
# from a distinct auxiliary summary_model (its dedicated creds may
# be the only broken thing); only a failure on the main model — or
# a fallback that also auth-fails — makes the abort stick.
_is_auth_error = (
_status in {401, 403}
or "invalid api key" in _err_str
or "invalid x-api-key" in _err_str
or ("api key" in _err_str and ("invalid" in _err_str or "blocked" in _err_str))
or "unauthorized" in _err_str
or "authentication" in _err_str
)
if _is_auth_error:
self._last_summary_auth_failure = True
if _is_json_decode and not _is_model_not_found and not _is_timeout:
logger.error(
"Context compression failed: auxiliary LLM returned a "
@ -2178,6 +2207,7 @@ This compaction should PRIORITISE preserving all information related to the focu
self._last_aux_model_failure_error = None
self._last_aux_model_failure_model = None
self._last_compress_aborted = False
self._last_summary_auth_failure = False
# Manual /compress (force=True) bypasses the failure cooldown so the
# user can retry immediately after an auto-compress abort. Without
@ -2293,19 +2323,38 @@ This compaction should PRIORITISE preserving all information related to the focu
# _last_summary_dropped_count for gateway hygiene to
# surface a warning.
# Default is False (historical behavior).
if not summary and self.abort_on_summary_failure:
#
# EXCEPTION — auth failures always abort. A 401/403 from the summary
# call means the credential or endpoint is broken (invalid/blocked
# key, or a token pointed at the wrong inference host). Rotating into
# a child session with a placeholder summary on a broken credential
# strands the user on a degraded session for zero benefit — every
# subsequent call fails the same way. So when the failure was an auth
# error we abort regardless of abort_on_summary_failure, preserving
# the conversation unchanged until the credential is fixed.
if not summary and (self.abort_on_summary_failure or self._last_summary_auth_failure):
n_skipped = compress_end - compress_start
self._last_summary_dropped_count = 0 # nothing actually dropped
self._last_summary_fallback_used = False
self._last_compress_aborted = True
if not self.quiet_mode:
logger.warning(
"Summary generation failed — aborting compression "
"(compression.abort_on_summary_failure=true). "
"%d message(s) preserved unchanged. Conversation is "
"frozen until the next /compress or /new.",
n_skipped,
)
if self._last_summary_auth_failure:
logger.warning(
"Summary generation failed with an authentication "
"error — aborting compression. %d message(s) preserved "
"unchanged; the session was NOT rotated. Check your "
"provider credential / inference endpoint, then retry "
"with /compress or start fresh with /new.",
n_skipped,
)
else:
logger.warning(
"Summary generation failed — aborting compression "
"(compression.abort_on_summary_failure=true). "
"%d message(s) preserved unchanged. Conversation is "
"frozen until the next /compress or /new.",
n_skipped,
)
return messages
# Phase 4: Assemble compressed message list

View file

@ -365,6 +365,110 @@ class TestSummaryFailureCooldown:
assert mock_call.call_count == 1
class TestAuthFailureAborts:
"""A 401/403 on the summary call must ABORT compression (preserve the
session unchanged) instead of rotating into a degraded child session
with a placeholder summary regardless of abort_on_summary_failure.
Real incident: a nous token pointed at a stale staging inference URL
401'd on every compression attempt, and because abort_on_summary_failure
defaults False the session rotated anyway (messages N->N), stranding the
user on a fresh-but-broken session that kept failing the same way.
"""
def _msgs(self, n=10):
return [
{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
for i in range(n)
]
def _auth_err(self, status=401):
err = Exception(
f"Error code: {status} - "
"{'status': 401, 'message': 'Your API key is invalid, blocked or out of funds.'}"
)
err.status_code = status
return err
def test_generate_summary_flags_auth_failure(self):
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True)
with patch("agent.context_compressor.call_llm", side_effect=self._auth_err(401)):
result = c._generate_summary(self._msgs())
assert result is None
assert c._last_summary_auth_failure is True
def test_403_also_flags_auth_failure(self):
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True)
with patch("agent.context_compressor.call_llm", side_effect=self._auth_err(403)):
c._generate_summary(self._msgs())
assert c._last_summary_auth_failure is True
def test_compress_aborts_on_auth_failure_despite_flag_false(self):
"""abort_on_summary_failure=False (the default), but a 401 must still
abort: messages returned unchanged, _last_compress_aborted=True."""
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(
model="test",
quiet_mode=True,
protect_first_n=2,
protect_last_n=2,
abort_on_summary_failure=False,
)
msgs = self._msgs(12)
with patch("agent.context_compressor.call_llm", side_effect=self._auth_err(401)):
result = c.compress(msgs, current_tokens=999999, force=True)
# Session must NOT be compressed/rotated — same messages back.
assert result == msgs
assert len(result) == len(msgs)
assert c._last_compress_aborted is True
assert c._last_summary_auth_failure is True
# Did NOT fall through to the static-fallback (drop-the-middle) path.
assert c._last_summary_fallback_used is False
def test_non_auth_failure_still_uses_fallback_path(self):
"""A generic (non-auth) failure with abort_on_summary_failure=False
keeps the historical behavior: insert a static fallback + drop the
middle window (does NOT abort)."""
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(
model="test",
quiet_mode=True,
protect_first_n=2,
protect_last_n=2,
abort_on_summary_failure=False,
)
msgs = self._msgs(12)
with patch("agent.context_compressor.call_llm", side_effect=Exception("boom 500")):
result = c.compress(msgs, current_tokens=999999, force=True)
assert c._last_summary_auth_failure is False
assert c._last_compress_aborted is False
assert len(result) < len(msgs) # middle window dropped
def test_aux_model_auth_failure_recovers_on_main_no_abort(self):
"""A 401 from a DISTINCT auxiliary summary_model retries on the main
model; if main succeeds, the auth flag is cleared and compression is
NOT aborted (the aux creds were the only broken thing)."""
mock_ok = MagicMock()
mock_ok.choices = [MagicMock()]
mock_ok.choices[0].message.content = "summary via main model"
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(
model="main-model",
summary_model_override="broken-aux-model",
quiet_mode=True,
)
with patch(
"agent.context_compressor.call_llm",
side_effect=[self._auth_err(401), mock_ok],
) as mock_call:
result = c._generate_summary(self._msgs())
assert mock_call.call_count == 2
assert isinstance(result, str)
assert c._last_summary_auth_failure is False # cleared on success
class TestSummaryFallbackToMainModel:
"""When ``summary_model`` differs from the main model and the summary LLM
call fails, the compressor should retry once on the main model before