fix(context): handle JSON decode errors in compression — salvage of #22248 (#22416)

When an auxiliary LLM provider (or an upstream proxy) returns a non-JSON
body with `Content-Type: application/json` — e.g. an HTML 502 page from a
misconfigured gateway — the OpenAI SDK's `response.json()` raises a raw
`json.JSONDecodeError` (or wraps it in `APIResponseValidationError` whose
message contains "expecting value"). Previously this fell through to the
unknown-error branch and entered a 60s cooldown without retrying on the
main model, dropping the middle conversation turns instead.

This change folds JSON-decode detection into the existing fast-path
fallback chain: detect by `isinstance(e, JSONDecodeError)` OR substring
match for "expecting value", retry once on the main model, and use a
shorter 30s cooldown when already on main (the body shape tends to flip
back to valid quickly when the upstream proxy recovers).

The three duplicated fallback bodies (model-not-found, unknown-error,
JSON-decode) are consolidated into a single `_fallback_to_main_for_compression`
helper that handles the shared bookkeeping (record aux-model failure for
`/usage`-style callers, clear summary_model, clear cooldown).

Also adds three unit tests covering: raw `JSONDecodeError` retries on main,
substring-match for wrapped exceptions, and the 30s cooldown when already
on main.

Salvage of #22248 by @0xharryriddle. Closes #22244.

Co-authored-by: Harry Riddle <ntconguit@gmail.com>
This commit is contained in:
kshitij 2026-05-09 01:47:15 -07:00 committed by GitHub
parent aef297a45e
commit c7e8add120
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 161 additions and 35 deletions

View file

@ -763,6 +763,33 @@ class ContextCompressor(ContextEngine):
return "\n\n".join(parts)
def _fallback_to_main_for_compression(self, e: Exception, reason: str) -> None:
"""Switch from a separate ``summary_model`` back to the main model.
Centralises the bookkeeping shared by every fallback branch in
:meth:`_generate_summary` (model-not-found, timeout, JSON decode,
unknown error): record the aux-model failure for ``/usage``-style
callers, clear the summary model so the next call uses the main one,
and clear the cooldown so the immediate retry can run.
``reason`` is a short human-readable phrase ("unavailable",
"timed out", "returned invalid JSON", "failed") that is interpolated
into the warning log.
"""
self._summary_model_fallen_back = True
logging.warning(
"Summary model '%s' %s (%s). "
"Falling back to main model '%s' for compression.",
self.summary_model, reason, e, self.model,
)
_err_text = str(e).strip() or e.__class__.__name__
if len(_err_text) > 220:
_err_text = _err_text[:217].rstrip() + "..."
self._last_aux_model_failure_error = _err_text
self._last_aux_model_failure_model = self.summary_model
self.summary_model = "" # empty = use main model
self._summary_failure_cooldown_until = 0.0 # no cooldown — retry immediately
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]:
"""Generate a structured summary of conversation turns.
@ -961,28 +988,42 @@ The user has requested that this compaction PRIORITISE preserving all informatio
_status in (408, 429, 502, 504)
or "timeout" in _err_str
)
# Non-JSON / malformed-body responses from misconfigured providers
# or proxies (e.g. an HTML 502 page returned with
# ``Content-Type: application/json``) bubble up as
# ``json.JSONDecodeError`` from the OpenAI SDK's ``response.json()``,
# or as a wrapping ``APIResponseValidationError`` whose message
# carries the substring "expecting value". Treat these like a
# transient provider failure: one retry on the main model, then a
# short cooldown. Issue #22244.
_is_json_decode = (
isinstance(e, json.JSONDecodeError)
or "expecting value" in _err_str
)
if _is_json_decode and not _is_model_not_found and not _is_timeout:
logger.error(
"Context compression failed: auxiliary LLM returned a "
"non-JSON response. provider=%s summary_model=%s "
"main_model=%s base_url=%s err=%s",
self.provider or "auto",
self.summary_model or "(main)",
self.model,
self.base_url or "default",
e,
)
if (
(_is_model_not_found or _is_timeout)
(_is_model_not_found or _is_timeout or _is_json_decode)
and self.summary_model
and self.summary_model != self.model
and not getattr(self, "_summary_model_fallen_back", False)
):
self._summary_model_fallen_back = True
logging.warning(
"Summary model '%s' unavailable (%s). "
"Falling back to main model '%s' for compression.",
self.summary_model, e, self.model,
)
# Record the aux-model failure so callers can warn the user
# even if the retry-on-main succeeds — a misconfigured aux
# model is something the user needs to fix.
_err_text = str(e).strip() or e.__class__.__name__
if len(_err_text) > 220:
_err_text = _err_text[:217].rstrip() + "..."
self._last_aux_model_failure_error = _err_text
self._last_aux_model_failure_model = self.summary_model
self.summary_model = "" # empty = use main model
self._summary_failure_cooldown_until = 0.0 # no cooldown
if _is_json_decode:
_reason = "returned invalid JSON"
elif _is_model_not_found:
_reason = "unavailable"
else:
_reason = "timed out"
self._fallback_to_main_for_compression(e, _reason)
return self._generate_summary(turns_to_summarize, focus_topic=focus_topic) # retry immediately
# Unknown-error best-effort retry on main model. Losing N turns of
@ -999,26 +1040,13 @@ The user has requested that this compaction PRIORITISE preserving all informatio
and self.summary_model != self.model
and not getattr(self, "_summary_model_fallen_back", False)
):
self._summary_model_fallen_back = True
logging.warning(
"Summary model '%s' failed (%s). "
"Retrying on main model '%s' before giving up.",
self.summary_model, e, self.model,
)
# Record the aux-model failure (see 404 branch above) — user
# should know their configured model is broken even if main
# recovers the call.
_err_text = str(e).strip() or e.__class__.__name__
if len(_err_text) > 220:
_err_text = _err_text[:217].rstrip() + "..."
self._last_aux_model_failure_error = _err_text
self._last_aux_model_failure_model = self.summary_model
self.summary_model = "" # empty = use main model
self._summary_failure_cooldown_until = 0.0
self._fallback_to_main_for_compression(e, "failed")
return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
# Transient errors (timeout, rate limit, network) — shorter cooldown
_transient_cooldown = 60
# Transient errors (timeout, rate limit, network, JSON decode) —
# shorter cooldown for JSON decode since the body shape can flip
# back to valid quickly when an upstream proxy recovers.
_transient_cooldown = 30 if _is_json_decode else 60
self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
err_text = str(e).strip() or e.__class__.__name__
if len(err_text) > 220: