mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(compressor): treat empty-content summary response as failure, not an empty summary (#50297)
When an OpenAI-compatible proxy (e.g. cmkey.cn, one-api Anthropic channels) returns a well-formed HTTP 200 whose summary content is null or empty/ whitespace-only, _generate_summary coerced it to "" and stored a prefix-only summary — silently replacing the compacted turns with nothing. The model then lost all in-progress context after compression (#11978, #11914). _validate_llm_response already guards None / empty-choices, so those never reach the compressor; the gap was a well-formed response with empty *content*. Now treat empty content as a summary failure: raise so it routes through the existing main-model fallback then transient cooldown, dropping the turns without a summary rather than wiping context with an empty one. Also narrow the bare 'except RuntimeError' so only genuine 'No LLM provider configured' errors take the 600s no-provider cooldown; empty/invalid-response RuntimeErrors from a configured provider now correctly get the main-model fallback instead of being misrouted into the long no-provider cooldown. Reported by @Hung2124; area identified by @annguyenNous in #39590.
This commit is contained in:
parent
296b290f8f
commit
b6a4638b6d
2 changed files with 97 additions and 13 deletions
|
|
@ -1591,6 +1591,22 @@ This compaction should PRIORITISE preserving all information related to the focu
|
|||
# Handle cases where content is not a string (e.g., dict from llama.cpp)
|
||||
if not isinstance(content, str):
|
||||
content = str(content) if content else ""
|
||||
# Some OpenAI-compatible proxies (e.g. cmkey.cn, one-api channels)
|
||||
# return a well-formed HTTP 200 with an empty or whitespace-only
|
||||
# ``content`` instead of an error or empty ``choices``. That payload
|
||||
# passes ``_validate_llm_response`` (a ``message`` exists), so it
|
||||
# reaches here and would otherwise be stored as a prefix-only
|
||||
# summary with no body — silently wiping the compacted turns and
|
||||
# making the model forget the in-progress task (#11978, #11914).
|
||||
# Treat empty content as a failure so it routes through the same
|
||||
# main-model fallback + cooldown machinery as a transport error,
|
||||
# rather than replacing real context with an empty summary.
|
||||
if not content.strip():
|
||||
raise RuntimeError(
|
||||
"Context compression LLM returned empty content "
|
||||
f"(provider={self.provider or 'auto'} "
|
||||
f"model={self.summary_model or self.model})"
|
||||
)
|
||||
# Redact the summary output as well — the summarizer LLM may
|
||||
# ignore prompt instructions and echo back secrets verbatim.
|
||||
summary = redact_sensitive_text(content.strip())
|
||||
|
|
@ -1601,16 +1617,27 @@ This compaction should PRIORITISE preserving all information related to the focu
|
|||
self._last_summary_error = None
|
||||
self._last_summary_auth_failure = False
|
||||
return self._with_summary_prefix(summary)
|
||||
except RuntimeError:
|
||||
# No provider configured — long cooldown, unlikely to self-resolve
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
|
||||
self._last_summary_error = "no auxiliary LLM provider configured"
|
||||
logger.warning("Context compression: no provider available for "
|
||||
"summary. Middle turns will be dropped without summary "
|
||||
"for %d seconds.",
|
||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
|
||||
return None
|
||||
except Exception as e:
|
||||
# ``call_llm`` raises ``RuntimeError`` for two very different cases:
|
||||
# 1. No provider configured ("No LLM provider configured ...") —
|
||||
# a permanent misconfiguration, long cooldown is correct.
|
||||
# 2. An empty/invalid response from a configured provider
|
||||
# (``_validate_llm_response`` empty-``choices``/``None``, or our
|
||||
# empty-``content`` guard above) — a transient/proxy fault that
|
||||
# should fall back to the main model first, exactly like the
|
||||
# transport errors handled below.
|
||||
# Only (1) belongs in the long no-provider cooldown; (2) and every
|
||||
# other exception flow into the generic fallback logic so they get
|
||||
# a main-model retry before any cooldown. (#11978, #11914)
|
||||
if isinstance(e, RuntimeError) and "no llm provider configured" in str(e).lower():
|
||||
# No provider configured — long cooldown, unlikely to self-resolve
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
|
||||
self._last_summary_error = "no auxiliary LLM provider configured"
|
||||
logger.warning("Context compression: no provider available for "
|
||||
"summary. Middle turns will be dropped without summary "
|
||||
"for %d seconds.",
|
||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
|
||||
return None
|
||||
# If the summary model is different from the main model and the
|
||||
# error looks permanent (model not found, 503, 404), fall back to
|
||||
# using the main model instead of entering cooldown that leaves
|
||||
|
|
|
|||
|
|
@ -357,11 +357,41 @@ class TestNonStringContent:
|
|||
assert isinstance(summary, str)
|
||||
assert summary.startswith(SUMMARY_PREFIX)
|
||||
|
||||
def test_none_content_coerced_to_empty(self):
|
||||
def test_none_content_treated_as_failure_not_empty_summary(self):
|
||||
"""Regression #11978/#11914: a well-formed response with ``content=None``
|
||||
(some OpenAI-compatible proxies, e.g. cmkey.cn, return HTTP 200 with
|
||||
null/empty content) must NOT be stored as a prefix-only summary that
|
||||
silently wipes the compacted turns. It is treated as a summary failure
|
||||
and routed through cooldown so the turns are dropped without a summary
|
||||
rather than replaced by an empty one."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.choices = [MagicMock()]
|
||||
mock_response.choices[0].message.content = None
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
# summary_model == model here, so no fallback path: straight to cooldown.
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "do something"},
|
||||
{"role": "assistant", "content": "ok"},
|
||||
]
|
||||
|
||||
with patch("agent.context_compressor.call_llm", return_value=mock_response):
|
||||
summary = c._generate_summary(messages)
|
||||
# Empty content → failure → None (drop turns), NOT a prefix-only summary.
|
||||
assert summary is None
|
||||
assert summary != SUMMARY_PREFIX
|
||||
# Transient cooldown engaged so we don't immediately retry the bad proxy.
|
||||
assert c._summary_failure_cooldown_until > 0
|
||||
|
||||
def test_empty_string_content_treated_as_failure(self):
|
||||
"""An empty-string (or whitespace-only) ``content`` is handled the same
|
||||
as ``None`` — failure, not an empty summary (#11978)."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.choices = [MagicMock()]
|
||||
mock_response.choices[0].message.content = " \n "
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
|
||||
|
|
@ -372,9 +402,36 @@ class TestNonStringContent:
|
|||
|
||||
with patch("agent.context_compressor.call_llm", return_value=mock_response):
|
||||
summary = c._generate_summary(messages)
|
||||
# None content → empty string → standardized compaction handoff prefix added
|
||||
assert summary is not None
|
||||
assert summary == SUMMARY_PREFIX
|
||||
assert summary is None
|
||||
assert c._summary_failure_cooldown_until > 0
|
||||
|
||||
def test_empty_content_falls_back_to_main_model(self):
|
||||
"""When the auxiliary summary model returns empty content and a distinct
|
||||
main model is configured, compression falls back to the main model
|
||||
before entering cooldown (#11978 glm-5.1 → glm-5 path)."""
|
||||
mock_response = MagicMock()
|
||||
mock_response.choices = [MagicMock()]
|
||||
mock_response.choices[0].message.content = ""
|
||||
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
|
||||
c = ContextCompressor(
|
||||
model="glm-5",
|
||||
summary_model_override="glm-5.1",
|
||||
quiet_mode=True,
|
||||
)
|
||||
|
||||
messages = [
|
||||
{"role": "user", "content": "do something"},
|
||||
{"role": "assistant", "content": "ok"},
|
||||
]
|
||||
|
||||
with patch("agent.context_compressor.call_llm", return_value=mock_response) as mock_call:
|
||||
summary = c._generate_summary(messages)
|
||||
# Two calls: aux model (glm-5.1) then fallback to main (glm-5).
|
||||
assert mock_call.call_count == 2
|
||||
assert c._summary_model_fallen_back is True
|
||||
assert summary is None
|
||||
assert c._summary_failure_cooldown_until > 0
|
||||
|
||||
def test_summary_call_does_not_force_temperature(self):
|
||||
mock_response = MagicMock()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue