fix(compressor): treat empty-content summary response as failure, not an empty summary (#50297)

When an OpenAI-compatible proxy (e.g. cmkey.cn, one-api Anthropic channels)
returns a well-formed HTTP 200 whose summary content is null or empty/
whitespace-only, _generate_summary coerced it to "" and stored a prefix-only
summary — silently replacing the compacted turns with nothing. The model then
lost all in-progress context after compression (#11978, #11914).

_validate_llm_response already guards None / empty-choices, so those never
reach the compressor; the gap was a well-formed response with empty *content*.
Now treat empty content as a summary failure: raise so it routes through the
existing main-model fallback then transient cooldown, dropping the turns
without a summary rather than wiping context with an empty one.

Also narrow the bare 'except RuntimeError' so only genuine 'No LLM provider
configured' errors take the 600s no-provider cooldown; empty/invalid-response
RuntimeErrors from a configured provider now correctly get the main-model
fallback instead of being misrouted into the long no-provider cooldown.

Reported by @Hung2124; area identified by @annguyenNous in #39590.
This commit is contained in:
Teknium 2026-06-21 11:27:07 -07:00 committed by GitHub
parent 296b290f8f
commit b6a4638b6d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 97 additions and 13 deletions

View file

@ -1591,6 +1591,22 @@ This compaction should PRIORITISE preserving all information related to the focu
# Handle cases where content is not a string (e.g., dict from llama.cpp)
if not isinstance(content, str):
content = str(content) if content else ""
# Some OpenAI-compatible proxies (e.g. cmkey.cn, one-api channels)
# return a well-formed HTTP 200 with an empty or whitespace-only
# ``content`` instead of an error or empty ``choices``. That payload
# passes ``_validate_llm_response`` (a ``message`` exists), so it
# reaches here and would otherwise be stored as a prefix-only
# summary with no body — silently wiping the compacted turns and
# making the model forget the in-progress task (#11978, #11914).
# Treat empty content as a failure so it routes through the same
# main-model fallback + cooldown machinery as a transport error,
# rather than replacing real context with an empty summary.
if not content.strip():
raise RuntimeError(
"Context compression LLM returned empty content "
f"(provider={self.provider or 'auto'} "
f"model={self.summary_model or self.model})"
)
# Redact the summary output as well — the summarizer LLM may
# ignore prompt instructions and echo back secrets verbatim.
summary = redact_sensitive_text(content.strip())
@ -1601,16 +1617,27 @@ This compaction should PRIORITISE preserving all information related to the focu
self._last_summary_error = None
self._last_summary_auth_failure = False
return self._with_summary_prefix(summary)
except RuntimeError:
# No provider configured — long cooldown, unlikely to self-resolve
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
self._last_summary_error = "no auxiliary LLM provider configured"
logger.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary "
"for %d seconds.",
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
return None
except Exception as e:
# ``call_llm`` raises ``RuntimeError`` for two very different cases:
# 1. No provider configured ("No LLM provider configured ...") —
# a permanent misconfiguration, long cooldown is correct.
# 2. An empty/invalid response from a configured provider
# (``_validate_llm_response`` empty-``choices``/``None``, or our
# empty-``content`` guard above) — a transient/proxy fault that
# should fall back to the main model first, exactly like the
# transport errors handled below.
# Only (1) belongs in the long no-provider cooldown; (2) and every
# other exception flow into the generic fallback logic so they get
# a main-model retry before any cooldown. (#11978, #11914)
if isinstance(e, RuntimeError) and "no llm provider configured" in str(e).lower():
# No provider configured — long cooldown, unlikely to self-resolve
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
self._last_summary_error = "no auxiliary LLM provider configured"
logger.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary "
"for %d seconds.",
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
return None
# If the summary model is different from the main model and the
# error looks permanent (model not found, 503, 404), fall back to
# using the main model instead of entering cooldown that leaves

View file

@ -357,11 +357,41 @@ class TestNonStringContent:
assert isinstance(summary, str)
assert summary.startswith(SUMMARY_PREFIX)
def test_none_content_coerced_to_empty(self):
def test_none_content_treated_as_failure_not_empty_summary(self):
"""Regression #11978/#11914: a well-formed response with ``content=None``
(some OpenAI-compatible proxies, e.g. cmkey.cn, return HTTP 200 with
null/empty content) must NOT be stored as a prefix-only summary that
silently wipes the compacted turns. It is treated as a summary failure
and routed through cooldown so the turns are dropped without a summary
rather than replaced by an empty one."""
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = None
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
# summary_model == model here, so no fallback path: straight to cooldown.
c = ContextCompressor(model="test", quiet_mode=True)
messages = [
{"role": "user", "content": "do something"},
{"role": "assistant", "content": "ok"},
]
with patch("agent.context_compressor.call_llm", return_value=mock_response):
summary = c._generate_summary(messages)
# Empty content → failure → None (drop turns), NOT a prefix-only summary.
assert summary is None
assert summary != SUMMARY_PREFIX
# Transient cooldown engaged so we don't immediately retry the bad proxy.
assert c._summary_failure_cooldown_until > 0
def test_empty_string_content_treated_as_failure(self):
"""An empty-string (or whitespace-only) ``content`` is handled the same
as ``None`` failure, not an empty summary (#11978)."""
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = " \n "
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(model="test", quiet_mode=True)
@ -372,9 +402,36 @@ class TestNonStringContent:
with patch("agent.context_compressor.call_llm", return_value=mock_response):
summary = c._generate_summary(messages)
# None content → empty string → standardized compaction handoff prefix added
assert summary is not None
assert summary == SUMMARY_PREFIX
assert summary is None
assert c._summary_failure_cooldown_until > 0
def test_empty_content_falls_back_to_main_model(self):
"""When the auxiliary summary model returns empty content and a distinct
main model is configured, compression falls back to the main model
before entering cooldown (#11978 glm-5.1 → glm-5 path)."""
mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = ""
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
c = ContextCompressor(
model="glm-5",
summary_model_override="glm-5.1",
quiet_mode=True,
)
messages = [
{"role": "user", "content": "do something"},
{"role": "assistant", "content": "ok"},
]
with patch("agent.context_compressor.call_llm", return_value=mock_response) as mock_call:
summary = c._generate_summary(messages)
# Two calls: aux model (glm-5.1) then fallback to main (glm-5).
assert mock_call.call_count == 2
assert c._summary_model_fallen_back is True
assert summary is None
assert c._summary_failure_cooldown_until > 0
def test_summary_call_does_not_force_temperature(self):
mock_response = MagicMock()