From b6ca56f651505d6a8ec2489f1048da3d2c07d12e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 26 May 2026 22:01:17 -0700 Subject: [PATCH] fix(codex-responses): gracefully recover from invalid_encrypted_content (salvage #10144) (#33035) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(codex-responses): gracefully recover from invalid_encrypted_content (salvage #10144) When an OpenAI-compatible Responses API surface accepts an initial request but later rejects the replayed `codex_reasoning_items` encrypted blob with HTTP 400 `invalid_encrypted_content`, the session previously got stuck retrying the same poisoned payload. Recovery: classify the error as a dedicated FailoverReason, and on the first hit disable encrypted reasoning replay for the rest of the session, strip cached items from message history, and retry once. Changes: * error_classifier: add FailoverReason.invalid_encrypted_content branch in _classify_400 (before context_overflow so the messages that mention 'encrypted content … could not be verified' don't trip context heuristics), in _classify_by_error_code, and extend _extract_error_code to peek inside wrapped JSON in error.message and ignore the bare '400' as a code. * agent_init: initialize `_codex_reasoning_replay_enabled = True` on every agent. * run_agent: add AIAgent._disable_codex_reasoning_replay() helper that flips the flag and pops cached items. * codex_responses_adapter: thread a `replay_encrypted_reasoning` kwarg through _chat_messages_to_responses_input so that when the flag is False we don't replay codex_reasoning_items. * transports/codex.py: read `replay_encrypted_reasoning` from params, thread it into the adapter, and gate the `include=['reasoning.encrypted_content']` request hint on it. * chat_completion_helpers: pass the agent's replay flag through to the transport. 
* conversation_loop: in the retry loop, add an invalid_encrypted_content recovery branch that fires once per session, only when api_mode == codex_responses, only when replay is still enabled, and only when at least one assistant message in history actually carries cached reasoning items (otherwise the 400 has nothing to do with our cache and the normal retry path handles it). Tests: * test_error_classifier: new wrapped-JSON _extract_error_code case; new TestClassifyApiError cases proving the 400 is retryable with no fallback, that the broad message match doesn't catch a generic 'parsed' message, and that the error code match is case-insensitive. * test_run_agent_codex_responses: end-to-end test of the recovery branch firing once and disabling replay, plus a sibling test that proves the branch does *not* fire (and the flag stays True) when history has no cached reasoning items. Salvages PR #10144 onto the post-refactor module layout (error_classifier / codex_responses_adapter / transports/codex / conversation_loop / agent_init) since the original diff was written against the pre-refactor monolithic run_agent.py. 
* chore(release): map victorGPT in AUTHOR_MAP for #10144 salvage --------- Co-authored-by: victorGPT --- agent/agent_init.py | 7 ++ agent/chat_completion_helpers.py | 3 + agent/codex_responses_adapter.py | 15 ++- agent/conversation_loop.py | 44 ++++++++ agent/error_classifier.py | 66 ++++++++++- agent/transports/codex.py | 15 ++- run_agent.py | 33 ++++++ scripts/release.py | 1 + tests/agent/test_error_classifier.py | 59 ++++++++++ .../test_run_agent_codex_responses.py | 104 ++++++++++++++++++ 10 files changed, 342 insertions(+), 5 deletions(-) diff --git a/agent/agent_init.py b/agent/agent_init.py index e20755c5091..92b4a73448a 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1005,6 +1005,13 @@ def init_agent( # Track conversation messages for session logging agent._session_messages: List[Dict[str, Any]] = [] + # Responses encrypted reasoning replay state. Some OpenAI-compatible + # routes accept GPT-5 Responses requests but later reject replayed + # encrypted reasoning blobs (HTTP 400 ``invalid_encrypted_content``). + # When that happens we disable replay for the rest of the session and + # fall back to stateless continuity. See + # agent/conversation_loop.py's invalid_encrypted_content retry branch. 
+ agent._codex_reasoning_replay_enabled = True agent._memory_write_origin = "assistant_tool" agent._memory_write_context = "foreground" diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index 8fe6bcd20cb..975c16008a7 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -507,6 +507,9 @@ def build_api_kwargs(agent, api_messages: list) -> dict: is_codex_backend=is_codex_backend, is_xai_responses=is_xai_responses, github_reasoning_extra=agent._github_models_reasoning_extra_body() if is_github_responses else None, + replay_encrypted_reasoning=bool( + getattr(agent, "_codex_reasoning_replay_enabled", True) + ), ) # ── chat_completions (default) ───────────────────────────────────── diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py index 07ae5cc9506..c3affd185dc 100644 --- a/agent/codex_responses_adapter.py +++ b/agent/codex_responses_adapter.py @@ -248,6 +248,7 @@ def _chat_messages_to_responses_input( messages: List[Dict[str, Any]], *, is_xai_responses: bool = False, + replay_encrypted_reasoning: bool = True, ) -> List[Dict[str, Any]]: """Convert internal chat-style messages to Responses input items. @@ -261,6 +262,14 @@ def _chat_messages_to_responses_input( integration). We now replay encrypted reasoning on every Responses transport (xAI, native Codex, custom relays) and let xAI tell us explicitly if a specific surface ever rejects a payload. + + ``replay_encrypted_reasoning`` is the per-session kill switch. Some + OpenAI-compatible relays accept the request but later reject the + replayed encrypted blob with HTTP 400 ``invalid_encrypted_content``; + when that happens the retry loop calls + ``AIAgent._disable_codex_reasoning_replay`` which both strips cached + items from the conversation history and threads ``replay_encrypted_reasoning=False`` + through this converter so subsequent turns send no reasoning items. 
""" items: List[Dict[str, Any]] = [] seen_item_ids: set = set() @@ -290,7 +299,11 @@ def _chat_messages_to_responses_input( # This applies to every Responses transport including # xAI — see _chat_messages_to_responses_input docstring # for the May 2026 reversal of the earlier xAI gate. - codex_reasoning = msg.get("codex_reasoning_items") + codex_reasoning = ( + msg.get("codex_reasoning_items") + if replay_encrypted_reasoning + else None + ) has_codex_reasoning = False if isinstance(codex_reasoning, list): for ri in codex_reasoning: diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 35a64df48fe..078c62771ed 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -1019,6 +1019,7 @@ def run_conversation( nous_auth_retry_attempted=False copilot_auth_retry_attempted=False thinking_sig_retry_attempted = False + invalid_encrypted_content_retry_attempted = False image_shrink_retry_attempted = False multimodal_tool_content_retry_attempted = False oauth_1m_beta_retry_attempted = False @@ -2296,6 +2297,49 @@ def run_conversation( ) continue + # ── Invalid encrypted reasoning replay recovery ─────── + # OpenAI Responses API surfaces (and some compatible relays) + # return HTTP 400 ``invalid_encrypted_content`` when a + # replayed ``codex_reasoning_items`` blob from a previous + # turn fails verification (provider rotated the encryption + # key, the route doesn't actually persist reasoning state, + # etc.). Recovery: disable replay for the rest of the + # session, strip cached items from history, retry once. + # One-shot — if a second 400 fires we fall through to the + # normal retry/backoff path. Only fires for codex_responses + # mode with at least one assistant message that has cached + # ``codex_reasoning_items``; without replay state, the + # error is unrelated to our cache so the normal retry path + # handles it (the provider is rejecting something else). 
+ if ( + classified.reason == FailoverReason.invalid_encrypted_content + and not invalid_encrypted_content_retry_attempted + and agent.api_mode == "codex_responses" + and bool(getattr(agent, "_codex_reasoning_replay_enabled", True)) + and any( + isinstance(_m, dict) + and _m.get("role") == "assistant" + and isinstance(_m.get("codex_reasoning_items"), list) + and _m.get("codex_reasoning_items") + for _m in messages + ) + ): + invalid_encrypted_content_retry_attempted = True + replay_stats = agent._disable_codex_reasoning_replay(messages) + agent._vprint( + f"{agent.log_prefix}⚠️ Encrypted reasoning replay was rejected by the provider — " + f"disabled replay and stripped {replay_stats['items']} item(s) from " + f"{replay_stats['messages']} message(s), retrying...", + force=True, + ) + logger.warning( + "%sInvalid encrypted reasoning recovery: disabled replay and stripped %d items from %d messages", + agent.log_prefix, + replay_stats["items"], + replay_stats["messages"], + ) + continue + # ── llama.cpp grammar-parse recovery ────────────────── # llama.cpp's ``json-schema-to-grammar`` converter rejects # regex escape classes (``\d``, ``\w``, ``\s``) and most diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 0afcf66d445..a0726a4e02a 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -50,6 +50,7 @@ class FailoverReason(enum.Enum): # Request format format_error = "format_error" # 400 bad request — abort or strip + retry + invalid_encrypted_content = "invalid_encrypted_content" # Responses replay blob rejected — strip replay state and retry multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported" # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry # Provider-specific @@ -865,6 +866,26 @@ def _classify_400( retryable=True, ) + # Invalid encrypted reasoning replay blob (OpenAI Responses API). 
Must be + # checked BEFORE context_overflow because some surfaces emit messages that + # contain context-like phrasing ("encrypted content … could not be + # verified") which could otherwise trip the context_overflow heuristics. + # ``error_msg`` is lowercased upstream — match accordingly. + error_code_lower = (error_code or "").lower() + if ( + error_code_lower == "invalid_encrypted_content" + or "invalid_encrypted_content" in error_msg + or ( + "encrypted content for item" in error_msg + and "could not be verified" in error_msg + ) + ): + return result_fn( + FailoverReason.invalid_encrypted_content, + retryable=True, + should_fallback=False, + ) + # Context overflow from 400 if any(p in error_msg for p in _CONTEXT_OVERFLOW_PATTERNS): return result_fn( @@ -974,6 +995,13 @@ def _classify_by_error_code( should_compress=True, ) + if code_lower == "invalid_encrypted_content": + return result_fn( + FailoverReason.invalid_encrypted_content, + retryable=True, + should_fallback=False, + ) + return None @@ -1141,15 +1169,49 @@ def _extract_error_code(body: dict) -> str: """Extract an error code string from the response body.""" if not body: return "" + + def _code_from_payload(payload) -> str: + """Extract a code/type from a nested error payload dict (defensive).""" + if not isinstance(payload, dict): + return "" + payload_error = payload.get("error", {}) + if isinstance(payload_error, dict): + nested = payload_error.get("code") or payload_error.get("type") or "" + if isinstance(nested, str) and nested.strip() and nested.strip() != "400": + return nested.strip() + code = payload.get("code") or payload.get("error_code") or "" + if isinstance(code, (str, int)): + text = str(code).strip() + if text and text != "400": + return text + return "" + error_obj = body.get("error", {}) if isinstance(error_obj, dict): code = error_obj.get("code") or error_obj.get("type") or "" - if isinstance(code, str) and code.strip(): + if isinstance(code, str) and code.strip() and code.strip() != 
"400": return code.strip() + + # Some providers wrap the real JSON error body as a string inside + # error.message — peek into it for a nested code (e.g. Responses API + # surfaces ``invalid_encrypted_content`` this way). + message = error_obj.get("message") + if isinstance(message, str) and message.strip().startswith("{"): + import json + try: + inner = json.loads(message) + except (json.JSONDecodeError, TypeError): + inner = None + nested_code = _code_from_payload(inner) + if nested_code: + return nested_code + # Top-level code code = body.get("code") or body.get("error_code") or "" if isinstance(code, (str, int)): - return str(code).strip() + text = str(code).strip() + if text and text != "400": + return text return "" diff --git a/agent/transports/codex.py b/agent/transports/codex.py index 970692c0394..664b2eb6a55 100644 --- a/agent/transports/codex.py +++ b/agent/transports/codex.py @@ -27,6 +27,9 @@ class ResponsesApiTransport(ProviderTransport): return _chat_messages_to_responses_input( messages, is_xai_responses=bool(kwargs.get("is_xai_responses")), + replay_encrypted_reasoning=bool( + kwargs.get("replay_encrypted_reasoning", True) + ), ) def convert_tools(self, tools: List[Dict[str, Any]]) -> Any: @@ -79,6 +82,9 @@ class ResponsesApiTransport(ProviderTransport): is_github_responses = params.get("is_github_responses", False) is_codex_backend = params.get("is_codex_backend", False) is_xai_responses = params.get("is_xai_responses", False) + replay_encrypted_reasoning = bool( + params.get("replay_encrypted_reasoning", True) + ) # Resolve reasoning effort reasoning_effort = "medium" @@ -100,6 +106,7 @@ class ResponsesApiTransport(ProviderTransport): "input": _chat_messages_to_responses_input( payload_messages, is_xai_responses=is_xai_responses, + replay_encrypted_reasoning=replay_encrypted_reasoning, ), "tools": response_tools, "store": False, @@ -121,7 +128,9 @@ class ResponsesApiTransport(ProviderTransport): # replay them on subsequent turns for cross-turn 
coherence. # See agent/codex_responses_adapter._chat_messages_to_responses_input # for the May 2026 reversal of the earlier suppression gate. - kwargs["include"] = ["reasoning.encrypted_content"] + kwargs["include"] = ( + ["reasoning.encrypted_content"] if replay_encrypted_reasoning else [] + ) # xAI rejects `reasoning.effort` on grok-4 / grok-4-fast / grok-3 # / grok-code-fast / grok-4.20-0309-* with HTTP 400 even though # those models reason natively. Only send the effort dial when @@ -136,7 +145,9 @@ class ResponsesApiTransport(ProviderTransport): kwargs["reasoning"] = github_reasoning else: kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"} - kwargs["include"] = ["reasoning.encrypted_content"] + kwargs["include"] = ( + ["reasoning.encrypted_content"] if replay_encrypted_reasoning else [] + ) elif not is_github_responses and not is_xai_responses: kwargs["include"] = [] diff --git a/run_agent.py b/run_agent.py index d2d65314f75..ac7f928c2bb 100644 --- a/run_agent.py +++ b/run_agent.py @@ -717,6 +717,39 @@ class AIAgent: except Exception: logger.debug("status_callback error in _emit_warning", exc_info=True) + def _disable_codex_reasoning_replay( + self, + messages: Optional[List[Dict[str, Any]]] = None, + ) -> Dict[str, int]: + """Disable Responses encrypted reasoning replay and strip cached state. + + Called from the conversation_loop retry path when the provider + rejects a replayed ``codex_reasoning_items`` blob with HTTP 400 + ``invalid_encrypted_content``. Sets ``self._codex_reasoning_replay_enabled`` + to ``False`` (consumed by ``codex_responses_adapter._chat_messages_to_responses_input`` + and ``transports/codex.py`` to drop ``reasoning.encrypted_content`` + from subsequent requests) and pops ``codex_reasoning_items`` from + every assistant message in ``messages`` so they cannot be replayed + again later in the session. 
+ + Returns a small stats dict ``{"messages": int, "items": int}`` + counting what was stripped — purely for diagnostic logging. + """ + stripped_messages = 0 + stripped_items = 0 + target_messages = messages if isinstance(messages, list) else [] + + for msg in target_messages: + if not isinstance(msg, dict) or msg.get("role") != "assistant": + continue + items = msg.pop("codex_reasoning_items", None) + if isinstance(items, list) and items: + stripped_messages += 1 + stripped_items += len(items) + + self._codex_reasoning_replay_enabled = False + return {"messages": stripped_messages, "items": stripped_items} + # Stream-diagnostic class header preserved for backward compat — # actual list lives in ``agent.stream_diag.STREAM_DIAG_HEADERS``. from agent.stream_diag import STREAM_DIAG_HEADERS as _STREAM_DIAG_HEADERS # noqa: E402 diff --git a/scripts/release.py b/scripts/release.py index d422f52a6f3..82e2b382a5b 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -50,6 +50,7 @@ AUTHOR_MAP = { "276689385+carltonawong@users.noreply.github.com": "carltonawong", "wangpuv@hotmail.com": "wangpuv", "202622897+ticketclosed-wontfix@users.noreply.github.com": "ticketclosed-wontfix", + "wuxuebin1993@gmail.com": "victorGPT", # teknium (multiple emails) "teknium1@gmail.com": "teknium1", "kenyon1977@gmail.com": "kenyonxu", diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index 397d2673552..579b364d146 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -56,6 +56,7 @@ class TestFailoverReason: "overloaded", "server_error", "timeout", "context_overflow", "payload_too_large", "image_too_large", "model_not_found", "format_error", + "invalid_encrypted_content", "multimodal_tool_content_unsupported", "provider_policy_blocked", "thinking_signature", "long_context_tier", @@ -144,6 +145,19 @@ class TestExtractErrorCode: body = {"code": "model_not_found"} assert _extract_error_code(body) == 
"model_not_found" + def test_from_wrapped_json_message(self): + body = { + "error": { + "message": ( + '{"error":{"message":"The encrypted content for item rs_001 could not be verified. ' + 'Reason: Encrypted content could not be decrypted or parsed.",' + '"type":"invalid_request_error","param":"","code":"invalid_encrypted_content"}}' + ), + "type": "400", + } + } + assert _extract_error_code(body) == "invalid_encrypted_content" + def test_empty_when_no_code(self): assert _extract_error_code({}) == "" assert _extract_error_code({"error": {"message": "oops"}}) == "" @@ -535,6 +549,51 @@ class TestClassifyApiError: # Without "thinking" in the message, it shouldn't be thinking_signature assert result.reason != FailoverReason.thinking_signature + def test_invalid_encrypted_content_classified_as_retryable_replay_failure(self): + body = { + "error": { + "message": ( + '{"error":{"message":"The encrypted content for item rs_001 could not be verified. ' + 'Reason: Encrypted content could not be decrypted or parsed.",' + '"type":"invalid_request_error","param":"","code":"invalid_encrypted_content"}}' + ), + "type": "400", + } + } + e = MockAPIError( + "Error code: 400 - invalid_encrypted_content", + status_code=400, + body=body, + ) + result = classify_api_error(e, provider="custom", model="gpt-5.4") + assert result.reason == FailoverReason.invalid_encrypted_content + assert result.retryable is True + assert result.should_fallback is False + + def test_invalid_encrypted_content_broad_message_match_does_not_catch_generic_parse_error(self): + message = "Encrypted content could not be decrypted or parsed." 
+ e = MockAPIError( + message, + status_code=400, + body={"error": {"message": message}}, + ) + result = classify_api_error(e, provider="custom", model="gpt-5.4") + assert result.reason == FailoverReason.format_error + assert result.retryable is False + assert result.should_fallback is True + + @pytest.mark.parametrize("error_code", ["Invalid_Encrypted_Content", "INVALID_ENCRYPTED_CONTENT"]) + def test_invalid_encrypted_content_code_is_case_insensitive_for_400(self, error_code): + e = MockAPIError( + "Error code: 400 - bad request", + status_code=400, + body={"error": {"code": error_code, "message": "Bad request"}}, + ) + result = classify_api_error(e, provider="custom", model="gpt-5.4") + assert result.reason == FailoverReason.invalid_encrypted_content + assert result.retryable is True + assert result.should_fallback is False + # ── Provider-specific: llama.cpp grammar-parse ── def test_llama_cpp_grammar_parse_error(self): diff --git a/tests/run_agent/test_run_agent_codex_responses.py b/tests/run_agent/test_run_agent_codex_responses.py index fad3f68ffe3..4b58419ebcb 100644 --- a/tests/run_agent/test_run_agent_codex_responses.py +++ b/tests/run_agent/test_run_agent_codex_responses.py @@ -2041,3 +2041,107 @@ def test_preflight_codex_input_deduplicates_reasoning_ids(monkeypatch): # IDs must be stripped — with store=False the API 404s on id lookups. for it in reasoning_items: assert "id" not in it + + +def test_run_conversation_codex_disables_reasoning_replay_after_invalid_encrypted_content(monkeypatch): + agent = _build_agent(monkeypatch) + agent.provider = "custom" + agent.base_url = "https://api.example.com/v1" + + request_payloads = [] + + class _InvalidEncryptedContentError(Exception): + def __init__(self): + super().__init__( + "Error code: 400 - The encrypted content for item rs_001 could not be verified. " + "Reason: Encrypted content could not be decrypted or parsed." 
+ ) + self.status_code = 400 + self.body = { + "error": { + "message": ( + '{"error":{"message":"The encrypted content for item rs_001 could not be verified. ' + 'Reason: Encrypted content could not be decrypted or parsed.",' + '"type":"invalid_request_error","param":"","code":"invalid_encrypted_content"}}' + ), + "type": "400", + } + } + + responses = [_InvalidEncryptedContentError(), _codex_message_response("Recovered without replay.")] + + def _fake_api_call(api_kwargs): + request_payloads.append(api_kwargs) + current = responses.pop(0) + if isinstance(current, Exception): + raise current + return current + + monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call) + + history = [ + { + "role": "assistant", + "content": "", + "finish_reason": "incomplete", + "codex_reasoning_items": [ + {"type": "reasoning", "id": "rs_001", "encrypted_content": "enc_bad", "summary": []}, + ], + } + ] + + result = agent.run_conversation("continue", conversation_history=history) + + assert result["completed"] is True + assert result["final_response"] == "Recovered without replay." 
+ assert len(request_payloads) == 2 + assert any(item.get("type") == "reasoning" for item in request_payloads[0]["input"]) + assert not any(item.get("type") == "reasoning" for item in request_payloads[1]["input"]) + assert request_payloads[0].get("include") == ["reasoning.encrypted_content"] + assert request_payloads[1].get("include") == [] + assert result["messages"][0].get("codex_reasoning_items") is None + assert agent._codex_reasoning_replay_enabled is False + + +def test_run_conversation_codex_invalid_encrypted_content_without_replay_state_does_not_disable_replay(monkeypatch): + agent = _build_agent(monkeypatch) + agent.provider = "custom" + agent.base_url = "https://api.example.com/v1" + monkeypatch.setattr(run_agent, "jittered_backoff", lambda *args, **kwargs: 0) + + request_payloads = [] + + class _InvalidEncryptedContentError(Exception): + def __init__(self): + super().__init__("Error code: 400 - bad request") + self.status_code = 400 + self.body = { + "error": { + "code": "INVALID_ENCRYPTED_CONTENT", + "message": "Bad request", + } + } + + responses = [_InvalidEncryptedContentError(), _codex_message_response("Recovered after generic retry.")] + + def _fake_api_call(api_kwargs): + request_payloads.append(api_kwargs) + current = responses.pop(0) + if isinstance(current, Exception): + raise current + return current + + monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call) + + result = agent.run_conversation( + "continue", + conversation_history=[{"role": "assistant", "content": "No replay state here."}], + ) + + assert result["completed"] is True + assert result["final_response"] == "Recovered after generic retry." 
+ assert len(request_payloads) == 2 + assert all(payload.get("include") == ["reasoning.encrypted_content"] for payload in request_payloads) + assert all(not any(item.get("type") == "reasoning" for item in payload["input"]) for payload in request_payloads) + assert agent._codex_reasoning_replay_enabled is True + assert result["messages"][0].get("codex_reasoning_items") is None