fix(context): handle JSON decode errors in compression — salvage of #22248 (#22416)

When an auxiliary LLM provider (or an upstream proxy) returns a non-JSON body with `Content-Type: application/json` — e.g. an HTML 502 page from a misconfigured gateway — the OpenAI SDK's `response.json()` raises a raw `json.JSONDecodeError` (or wraps it in `APIResponseValidationError` whose message contains "expecting value"). Previously this fell through to the unknown-error branch and entered a 60s cooldown without retrying on the main model, dropping the middle conversation turns instead. This change folds JSON-decode detection into the existing fast-path fallback chain: detect by `isinstance(e, JSONDecodeError)` OR substring match for "expecting value", retry once on the main model, and use a shorter 30s cooldown when already on main (the body shape tends to flip back to valid quickly when the upstream proxy recovers). The three duplicated fallback bodies (model-not-found, unknown-error, JSON-decode) are consolidated into a single `_fallback_to_main_for_compression` helper that handles the shared bookkeeping (record aux-model failure for `/usage`-style callers, clear summary_model, clear cooldown). Also adds three unit tests covering: raw `JSONDecodeError` retries on main, substring-match for wrapped exceptions, and the 30s cooldown when already on main. Salvage of #22248 by @0xharryriddle. Closes #22244. Co-authored-by: Harry Riddle <ntconguit@gmail.com>
2026-05-25 05:52:34 +00:00 · 2026-05-09 01:47:15 -07:00 · 2026-05-09 01:47:15 -07:00 · c7e8add120
commit c7e8add120
parent aef297a45e
2 changed files with 161 additions and 35 deletions
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -763,6 +763,33 @@ class ContextCompressor(ContextEngine):
        return "\n\n".join(parts)
    def _fallback_to_main_for_compression(self, e: Exception, reason: str) -> None:
        """Switch from a separate ``summary_model`` back to the main model.
        Centralises the bookkeeping shared by every fallback branch in
        :meth:`_generate_summary` (model-not-found, timeout, JSON decode,
        unknown error): record the aux-model failure for ``/usage``-style
        callers, clear the summary model so the next call uses the main one,
        and clear the cooldown so the immediate retry can run.
        ``reason`` is a short human-readable phrase ("unavailable",
        "timed out", "returned invalid JSON", "failed") that is interpolated
        into the warning log.
        """
        self._summary_model_fallen_back = True
        logging.warning(
            "Summary model '%s' %s (%s). "
            "Falling back to main model '%s' for compression.",
            self.summary_model, reason, e, self.model,
        )
        _err_text = str(e).strip() or e.__class__.__name__
        if len(_err_text) > 220:
            _err_text = _err_text[:217].rstrip() + "..."
        self._last_aux_model_failure_error = _err_text
        self._last_aux_model_failure_model = self.summary_model
        self.summary_model = ""  # empty = use main model
        self._summary_failure_cooldown_until = 0.0  # no cooldown — retry immediately
    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]:
        """Generate a structured summary of conversation turns.
@ -961,28 +988,42 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                _status in (408, 429, 502, 504)
                or "timeout" in _err_str
            )
            # Non-JSON / malformed-body responses from misconfigured providers
            # or proxies (e.g. an HTML 502 page returned with
            # ``Content-Type: application/json``) bubble up as
            # ``json.JSONDecodeError`` from the OpenAI SDK's ``response.json()``,
            # or as a wrapping ``APIResponseValidationError`` whose message
            # carries the substring "expecting value".  Treat these like a
            # transient provider failure: one retry on the main model, then a
            # short cooldown.  Issue #22244.
            _is_json_decode = (
                isinstance(e, json.JSONDecodeError)
                or "expecting value" in _err_str
            )
            if _is_json_decode and not _is_model_not_found and not _is_timeout:
                logger.error(
                    "Context compression failed: auxiliary LLM returned a "
                    "non-JSON response. provider=%s summary_model=%s "
                    "main_model=%s base_url=%s err=%s",
                    self.provider or "auto",
                    self.summary_model or "(main)",
                    self.model,
                    self.base_url or "default",
                    e,
                )
            if (
-                (_is_model_not_found or _is_timeout)
+                (_is_model_not_found or _is_timeout or _is_json_decode)
                and self.summary_model
                and self.summary_model != self.model
                and not getattr(self, "_summary_model_fallen_back", False)
            ):
-                self._summary_model_fallen_back = True
+                if _is_json_decode:
-                logging.warning(
+                    _reason = "returned invalid JSON"
-                    "Summary model '%s' unavailable (%s). "
+                elif _is_model_not_found:
-                    "Falling back to main model '%s' for compression.",
+                    _reason = "unavailable"
-                    self.summary_model, e, self.model,
+                else:
-                )
+                    _reason = "timed out"
-                # Record the aux-model failure so callers can warn the user
+                self._fallback_to_main_for_compression(e, _reason)
                # even if the retry-on-main succeeds — a misconfigured aux
                # model is something the user needs to fix.
                _err_text = str(e).strip() or e.__class__.__name__
                if len(_err_text) > 220:
                    _err_text = _err_text[:217].rstrip() + "..."
                self._last_aux_model_failure_error = _err_text
                self._last_aux_model_failure_model = self.summary_model
                self.summary_model = ""  # empty = use main model
                self._summary_failure_cooldown_until = 0.0  # no cooldown
                return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)  # retry immediately
            # Unknown-error best-effort retry on main model.  Losing N turns of
@ -999,26 +1040,13 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                and self.summary_model != self.model
                and not getattr(self, "_summary_model_fallen_back", False)
            ):
-                self._summary_model_fallen_back = True
+                self._fallback_to_main_for_compression(e, "failed")
                logging.warning(
                    "Summary model '%s' failed (%s). "
                    "Retrying on main model '%s' before giving up.",
                    self.summary_model, e, self.model,
                )
                # Record the aux-model failure (see 404 branch above) — user
                # should know their configured model is broken even if main
                # recovers the call.
                _err_text = str(e).strip() or e.__class__.__name__
                if len(_err_text) > 220:
                    _err_text = _err_text[:217].rstrip() + "..."
                self._last_aux_model_failure_error = _err_text
                self._last_aux_model_failure_model = self.summary_model
                self.summary_model = ""  # empty = use main model
                self._summary_failure_cooldown_until = 0.0
                return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
-            # Transient errors (timeout, rate limit, network) — shorter cooldown
+            # Transient errors (timeout, rate limit, network, JSON decode) —
-            _transient_cooldown = 60
+            # shorter cooldown for JSON decode since the body shape can flip
            # back to valid quickly when an upstream proxy recovers.
            _transient_cooldown = 30 if _is_json_decode else 60
            self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
            err_text = str(e).strip() or e.__class__.__name__
            if len(err_text) > 220:
--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@ -400,6 +400,104 @@ class TestSummaryFallbackToMainModel:
        assert result is None
        assert c._summary_model_fallen_back is True
    def test_json_decode_error_falls_back_to_main_and_succeeds(self):
        """JSONDecodeError from the OpenAI SDK's ``response.json()`` (raised
        when a misconfigured proxy returns HTML/plain-text with
        ``Content-Type: application/json``) should trigger the same
        retry-on-main path as 404/timeout.  Issue #22244."""
        import json as _json
        mock_ok = MagicMock()
        mock_ok.choices = [MagicMock()]
        mock_ok.choices[0].message.content = "summary via main model"
        # Simulate the SDK raising a raw JSONDecodeError with a realistic
        # error message ("Expecting value: line X column Y char Z").
        err_json = _json.JSONDecodeError(
            "Expecting value", "<!DOCTYPE html><html>...</html>", 0
        )
        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(
                model="main-model",
                summary_model_override="aux-via-broken-proxy",
                quiet_mode=True,
            )
        with patch(
            "agent.context_compressor.call_llm",
            side_effect=[err_json, mock_ok],
        ) as mock_call:
            result = c._generate_summary(self._msgs())
        assert mock_call.call_count == 2
        assert mock_call.call_args_list[0].kwargs.get("model") == "aux-via-broken-proxy"
        assert "model" not in mock_call.call_args_list[1].kwargs
        assert result is not None
        assert "summary via main model" in result
        # Aux-model failure recorded so /usage / gateway warnings can surface it
        assert c._last_aux_model_failure_model == "aux-via-broken-proxy"
        assert c._last_aux_model_failure_error is not None
        # The 220-char cap is shared with other fallback branches
        assert len(c._last_aux_model_failure_error) <= 220
    def test_json_decode_error_substring_match_in_wrapped_exception(self):
        """When the OpenAI SDK wraps the raw JSONDecodeError inside its own
        ``APIResponseValidationError`` (or similar), ``isinstance`` no longer
        matches but the substring "expecting value" still appears in
        ``str(e)``.  We detect this case by string match and fall back the
        same way."""
        mock_ok = MagicMock()
        mock_ok.choices = [MagicMock()]
        mock_ok.choices[0].message.content = "summary via main model"
        # A plain Exception with the canonical JSON decode error text — what
        # the SDK's APIResponseValidationError looks like at str() time.
        err_wrapped = Exception("Expecting value: line 1 column 1 (char 0)")
        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(
                model="main-model",
                summary_model_override="aux-model",
                quiet_mode=True,
            )
        with patch(
            "agent.context_compressor.call_llm",
            side_effect=[err_wrapped, mock_ok],
        ) as mock_call:
            result = c._generate_summary(self._msgs())
        assert mock_call.call_count == 2
        assert result is not None
        assert "summary via main model" in result
    def test_json_decode_error_on_main_uses_short_cooldown(self):
        """When already on the main model (no separate summary_model, or
        fallback already happened), a JSONDecodeError should set the short
        30s cooldown, not the default 60s — provider bodies tend to
        recover quickly when an upstream proxy comes back online."""
        import json as _json
        err_json = _json.JSONDecodeError("Expecting value", "<html/>", 0)
        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(
                model="main-model",
                # No summary_model_override → already on main, no fallback path.
                quiet_mode=True,
            )
        with patch(
            "agent.context_compressor.call_llm",
            side_effect=err_json,
        ), patch("agent.context_compressor.time.monotonic", return_value=1000.0):
            result = c._generate_summary(self._msgs())
        assert result is None
        # Short JSON-decode cooldown is 30s, not the default 60s.
        assert c._summary_failure_cooldown_until == 1030.0
 class TestAuxModelFallbackSurfacedToCallers:
    """When summary_model fails but retry-on-main succeeds, compress() must