diff --git a/agent/agent_init.py b/agent/agent_init.py index e20755c5091..92b4a73448a 100644 --- a/agent/agent_init.py +++ b/agent/agent_init.py @@ -1005,6 +1005,13 @@ def init_agent( # Track conversation messages for session logging agent._session_messages: List[Dict[str, Any]] = [] + # Responses encrypted reasoning replay state. Some OpenAI-compatible + # routes accept GPT-5 Responses requests but later reject replayed + # encrypted reasoning blobs (HTTP 400 ``invalid_encrypted_content``). + # When that happens we disable replay for the rest of the session and + # fall back to stateless continuity. See + # agent/conversation_loop.py's invalid_encrypted_content retry branch. + agent._codex_reasoning_replay_enabled = True agent._memory_write_origin = "assistant_tool" agent._memory_write_context = "foreground" diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index 8fe6bcd20cb..975c16008a7 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -507,6 +507,9 @@ def build_api_kwargs(agent, api_messages: list) -> dict: is_codex_backend=is_codex_backend, is_xai_responses=is_xai_responses, github_reasoning_extra=agent._github_models_reasoning_extra_body() if is_github_responses else None, + replay_encrypted_reasoning=bool( + getattr(agent, "_codex_reasoning_replay_enabled", True) + ), ) # ── chat_completions (default) ───────────────────────────────────── diff --git a/agent/codex_responses_adapter.py b/agent/codex_responses_adapter.py index 07ae5cc9506..c3affd185dc 100644 --- a/agent/codex_responses_adapter.py +++ b/agent/codex_responses_adapter.py @@ -248,6 +248,7 @@ def _chat_messages_to_responses_input( messages: List[Dict[str, Any]], *, is_xai_responses: bool = False, + replay_encrypted_reasoning: bool = True, ) -> List[Dict[str, Any]]: """Convert internal chat-style messages to Responses input items. @@ -261,6 +262,14 @@ def _chat_messages_to_responses_input( integration). 
We now replay encrypted reasoning on every Responses transport (xAI, native Codex, custom relays) and let xAI tell us explicitly if a specific surface ever rejects a payload. + + ``replay_encrypted_reasoning`` is the per-session kill switch. Some + OpenAI-compatible relays accept the request but later reject the + replayed encrypted blob with HTTP 400 ``invalid_encrypted_content``; + when that happens the retry loop calls + ``AIAgent._disable_codex_reasoning_replay`` which both strips cached + items from the conversation history and threads ``replay_encrypted_reasoning=False`` + through this converter so subsequent turns send no reasoning items. """ items: List[Dict[str, Any]] = [] seen_item_ids: set = set() @@ -290,7 +299,11 @@ def _chat_messages_to_responses_input( # This applies to every Responses transport including # xAI — see _chat_messages_to_responses_input docstring # for the May 2026 reversal of the earlier xAI gate. - codex_reasoning = msg.get("codex_reasoning_items") + codex_reasoning = ( + msg.get("codex_reasoning_items") + if replay_encrypted_reasoning + else None + ) has_codex_reasoning = False if isinstance(codex_reasoning, list): for ri in codex_reasoning: diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 35a64df48fe..078c62771ed 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -1019,6 +1019,7 @@ def run_conversation( nous_auth_retry_attempted=False copilot_auth_retry_attempted=False thinking_sig_retry_attempted = False + invalid_encrypted_content_retry_attempted = False image_shrink_retry_attempted = False multimodal_tool_content_retry_attempted = False oauth_1m_beta_retry_attempted = False @@ -2296,6 +2297,49 @@ def run_conversation( ) continue + # ── Invalid encrypted reasoning replay recovery ─────── + # OpenAI Responses API surfaces (and some compatible relays) + # return HTTP 400 ``invalid_encrypted_content`` when a + # replayed ``codex_reasoning_items`` blob from a previous + # turn fails verification 
(provider rotated the encryption + # key, the route doesn't actually persist reasoning state, + # etc.). Recovery: disable replay for the rest of the + # session, strip cached items from history, retry once. + # One-shot — if a second 400 fires we fall through to the + # normal retry/backoff path. Only fires for codex_responses + # mode with at least one assistant message that has cached + # ``codex_reasoning_items``; without replay state, the + # error is unrelated to our cache so the normal retry path + # handles it (the provider is rejecting something else). + if ( + classified.reason == FailoverReason.invalid_encrypted_content + and not invalid_encrypted_content_retry_attempted + and agent.api_mode == "codex_responses" + and bool(getattr(agent, "_codex_reasoning_replay_enabled", True)) + and any( + isinstance(_m, dict) + and _m.get("role") == "assistant" + and isinstance(_m.get("codex_reasoning_items"), list) + and _m.get("codex_reasoning_items") + for _m in messages + ) + ): + invalid_encrypted_content_retry_attempted = True + replay_stats = agent._disable_codex_reasoning_replay(messages) + agent._vprint( + f"{agent.log_prefix}⚠️ Encrypted reasoning replay was rejected by the provider — " + f"disabled replay and stripped {replay_stats['items']} item(s) from " + f"{replay_stats['messages']} message(s), retrying...", + force=True, + ) + logger.warning( + "%sInvalid encrypted reasoning recovery: disabled replay and stripped %d items from %d messages", + agent.log_prefix, + replay_stats["items"], + replay_stats["messages"], + ) + continue + # ── llama.cpp grammar-parse recovery ────────────────── # llama.cpp's ``json-schema-to-grammar`` converter rejects # regex escape classes (``\d``, ``\w``, ``\s``) and most diff --git a/agent/error_classifier.py b/agent/error_classifier.py index 0afcf66d445..a0726a4e02a 100644 --- a/agent/error_classifier.py +++ b/agent/error_classifier.py @@ -50,6 +50,7 @@ class FailoverReason(enum.Enum): # Request format format_error = 
"format_error" # 400 bad request — abort or strip + retry + invalid_encrypted_content = "invalid_encrypted_content" # Responses replay blob rejected — strip replay state and retry multimodal_tool_content_unsupported = "multimodal_tool_content_unsupported" # Provider rejected list-type content in tool messages (e.g. Xiaomi MiMo) — downgrade to text and retry # Provider-specific @@ -865,6 +866,26 @@ def _classify_400( retryable=True, ) + # Invalid encrypted reasoning replay blob (OpenAI Responses API). Must be + # checked BEFORE context_overflow because some surfaces emit messages that + # contain context-like phrasing ("encrypted content … could not be + # verified") which could otherwise trip the context_overflow heuristics. + # ``error_msg`` is lowercased upstream — match accordingly. + error_code_lower = (error_code or "").lower() + if ( + error_code_lower == "invalid_encrypted_content" + or "invalid_encrypted_content" in error_msg + or ( + "encrypted content for item" in error_msg + and "could not be verified" in error_msg + ) + ): + return result_fn( + FailoverReason.invalid_encrypted_content, + retryable=True, + should_fallback=False, + ) + # Context overflow from 400 if any(p in error_msg for p in _CONTEXT_OVERFLOW_PATTERNS): return result_fn( @@ -974,6 +995,13 @@ def _classify_by_error_code( should_compress=True, ) + if code_lower == "invalid_encrypted_content": + return result_fn( + FailoverReason.invalid_encrypted_content, + retryable=True, + should_fallback=False, + ) + return None @@ -1141,15 +1169,49 @@ def _extract_error_code(body: dict) -> str: """Extract an error code string from the response body.""" if not body: return "" + + def _code_from_payload(payload) -> str: + """Extract a code/type from a nested error payload dict (defensive).""" + if not isinstance(payload, dict): + return "" + payload_error = payload.get("error", {}) + if isinstance(payload_error, dict): + nested = payload_error.get("code") or payload_error.get("type") or "" + if 
isinstance(nested, str) and nested.strip() and nested.strip() != "400": + return nested.strip() + code = payload.get("code") or payload.get("error_code") or "" + if isinstance(code, (str, int)): + text = str(code).strip() + if text and text != "400": + return text + return "" + error_obj = body.get("error", {}) if isinstance(error_obj, dict): code = error_obj.get("code") or error_obj.get("type") or "" - if isinstance(code, str) and code.strip(): + if isinstance(code, str) and code.strip() and code.strip() != "400": return code.strip() + + # Some providers wrap the real JSON error body as a string inside + # error.message — peek into it for a nested code (e.g. Responses API + # surfaces ``invalid_encrypted_content`` this way). + message = error_obj.get("message") + if isinstance(message, str) and message.strip().startswith("{"): + import json + try: + inner = json.loads(message) + except (json.JSONDecodeError, TypeError): + inner = None + nested_code = _code_from_payload(inner) + if nested_code: + return nested_code + # Top-level code code = body.get("code") or body.get("error_code") or "" if isinstance(code, (str, int)): - return str(code).strip() + text = str(code).strip() + if text and text != "400": + return text return "" diff --git a/agent/transports/codex.py b/agent/transports/codex.py index 970692c0394..664b2eb6a55 100644 --- a/agent/transports/codex.py +++ b/agent/transports/codex.py @@ -27,6 +27,9 @@ class ResponsesApiTransport(ProviderTransport): return _chat_messages_to_responses_input( messages, is_xai_responses=bool(kwargs.get("is_xai_responses")), + replay_encrypted_reasoning=bool( + kwargs.get("replay_encrypted_reasoning", True) + ), ) def convert_tools(self, tools: List[Dict[str, Any]]) -> Any: @@ -79,6 +82,9 @@ class ResponsesApiTransport(ProviderTransport): is_github_responses = params.get("is_github_responses", False) is_codex_backend = params.get("is_codex_backend", False) is_xai_responses = params.get("is_xai_responses", False) + 
def _disable_codex_reasoning_replay(
    self,
    messages: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, int]:
    """Switch off encrypted-reasoning replay and purge cached reasoning items.

    Removes the ``codex_reasoning_items`` key from every assistant message
    in ``messages`` (mutating the dicts in place) and then sets
    ``self._codex_reasoning_replay_enabled`` to ``False``.

    Args:
        messages: Conversation history to scrub. Anything that is not a
            ``list`` is treated as empty; entries that are not dicts, or
            whose ``role`` is not ``"assistant"``, are left untouched.

    Returns:
        ``{"messages": int, "items": int}`` — how many assistant messages
        carried a non-empty list of cached items, and the total number of
        items removed from them. Intended for diagnostic logging.
    """
    counts = {"messages": 0, "items": 0}
    history = messages if isinstance(messages, list) else ()

    for entry in history:
        if isinstance(entry, dict) and entry.get("role") == "assistant":
            # Pop unconditionally so even an empty or malformed cache entry
            # is gone; only non-empty lists contribute to the stats.
            cached = entry.pop("codex_reasoning_items", None)
            if isinstance(cached, list) and cached:
                counts["messages"] += 1
                counts["items"] += len(cached)

    self._codex_reasoning_replay_enabled = False
    return counts
from agent.stream_diag import STREAM_DIAG_HEADERS as _STREAM_DIAG_HEADERS # noqa: E402 diff --git a/scripts/release.py b/scripts/release.py index d422f52a6f3..82e2b382a5b 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -50,6 +50,7 @@ AUTHOR_MAP = { "276689385+carltonawong@users.noreply.github.com": "carltonawong", "wangpuv@hotmail.com": "wangpuv", "202622897+ticketclosed-wontfix@users.noreply.github.com": "ticketclosed-wontfix", + "wuxuebin1993@gmail.com": "victorGPT", # teknium (multiple emails) "teknium1@gmail.com": "teknium1", "kenyon1977@gmail.com": "kenyonxu", diff --git a/tests/agent/test_error_classifier.py b/tests/agent/test_error_classifier.py index 397d2673552..579b364d146 100644 --- a/tests/agent/test_error_classifier.py +++ b/tests/agent/test_error_classifier.py @@ -56,6 +56,7 @@ class TestFailoverReason: "overloaded", "server_error", "timeout", "context_overflow", "payload_too_large", "image_too_large", "model_not_found", "format_error", + "invalid_encrypted_content", "multimodal_tool_content_unsupported", "provider_policy_blocked", "thinking_signature", "long_context_tier", @@ -144,6 +145,19 @@ class TestExtractErrorCode: body = {"code": "model_not_found"} assert _extract_error_code(body) == "model_not_found" + def test_from_wrapped_json_message(self): + body = { + "error": { + "message": ( + '{"error":{"message":"The encrypted content for item rs_001 could not be verified. 
' + 'Reason: Encrypted content could not be decrypted or parsed.",' + '"type":"invalid_request_error","param":"","code":"invalid_encrypted_content"}}' + ), + "type": "400", + } + } + assert _extract_error_code(body) == "invalid_encrypted_content" + def test_empty_when_no_code(self): assert _extract_error_code({}) == "" assert _extract_error_code({"error": {"message": "oops"}}) == "" @@ -535,6 +549,51 @@ class TestClassifyApiError: # Without "thinking" in the message, it shouldn't be thinking_signature assert result.reason != FailoverReason.thinking_signature + def test_invalid_encrypted_content_classified_as_retryable_replay_failure(self): + body = { + "error": { + "message": ( + '{"error":{"message":"The encrypted content for item rs_001 could not be verified. ' + 'Reason: Encrypted content could not be decrypted or parsed.",' + '"type":"invalid_request_error","param":"","code":"invalid_encrypted_content"}}' + ), + "type": "400", + } + } + e = MockAPIError( + "Error code: 400 - invalid_encrypted_content", + status_code=400, + body=body, + ) + result = classify_api_error(e, provider="custom", model="gpt-5.4") + assert result.reason == FailoverReason.invalid_encrypted_content + assert result.retryable is True + assert result.should_fallback is False + + def test_invalid_encrypted_content_broad_message_match_does_not_catch_generic_parse_error(self): + message = "Encrypted content could not be decrypted or parsed." 
+ e = MockAPIError( + message, + status_code=400, + body={"error": {"message": message}}, + ) + result = classify_api_error(e, provider="custom", model="gpt-5.4") + assert result.reason == FailoverReason.format_error + assert result.retryable is False + assert result.should_fallback is True + + @pytest.mark.parametrize("error_code", ["Invalid_Encrypted_Content", "INVALID_ENCRYPTED_CONTENT"]) + def test_invalid_encrypted_content_code_is_case_insensitive_for_400(self, error_code): + e = MockAPIError( + "Error code: 400 - bad request", + status_code=400, + body={"error": {"code": error_code, "message": "Bad request"}}, + ) + result = classify_api_error(e, provider="custom", model="gpt-5.4") + assert result.reason == FailoverReason.invalid_encrypted_content + assert result.retryable is True + assert result.should_fallback is False + # ── Provider-specific: llama.cpp grammar-parse ── def test_llama_cpp_grammar_parse_error(self): diff --git a/tests/run_agent/test_run_agent_codex_responses.py b/tests/run_agent/test_run_agent_codex_responses.py index fad3f68ffe3..4b58419ebcb 100644 --- a/tests/run_agent/test_run_agent_codex_responses.py +++ b/tests/run_agent/test_run_agent_codex_responses.py @@ -2041,3 +2041,107 @@ def test_preflight_codex_input_deduplicates_reasoning_ids(monkeypatch): # IDs must be stripped — with store=False the API 404s on id lookups. for it in reasoning_items: assert "id" not in it + + +def test_run_conversation_codex_disables_reasoning_replay_after_invalid_encrypted_content(monkeypatch): + agent = _build_agent(monkeypatch) + agent.provider = "custom" + agent.base_url = "https://api.example.com/v1" + + request_payloads = [] + + class _InvalidEncryptedContentError(Exception): + def __init__(self): + super().__init__( + "Error code: 400 - The encrypted content for item rs_001 could not be verified. " + "Reason: Encrypted content could not be decrypted or parsed." 
+ ) + self.status_code = 400 + self.body = { + "error": { + "message": ( + '{"error":{"message":"The encrypted content for item rs_001 could not be verified. ' + 'Reason: Encrypted content could not be decrypted or parsed.",' + '"type":"invalid_request_error","param":"","code":"invalid_encrypted_content"}}' + ), + "type": "400", + } + } + + responses = [_InvalidEncryptedContentError(), _codex_message_response("Recovered without replay.")] + + def _fake_api_call(api_kwargs): + request_payloads.append(api_kwargs) + current = responses.pop(0) + if isinstance(current, Exception): + raise current + return current + + monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call) + + history = [ + { + "role": "assistant", + "content": "", + "finish_reason": "incomplete", + "codex_reasoning_items": [ + {"type": "reasoning", "id": "rs_001", "encrypted_content": "enc_bad", "summary": []}, + ], + } + ] + + result = agent.run_conversation("continue", conversation_history=history) + + assert result["completed"] is True + assert result["final_response"] == "Recovered without replay." 
+ assert len(request_payloads) == 2 + assert any(item.get("type") == "reasoning" for item in request_payloads[0]["input"]) + assert not any(item.get("type") == "reasoning" for item in request_payloads[1]["input"]) + assert request_payloads[0].get("include") == ["reasoning.encrypted_content"] + assert request_payloads[1].get("include") == [] + assert result["messages"][0].get("codex_reasoning_items") is None + assert agent._codex_reasoning_replay_enabled is False + + +def test_run_conversation_codex_invalid_encrypted_content_without_replay_state_does_not_disable_replay(monkeypatch): + agent = _build_agent(monkeypatch) + agent.provider = "custom" + agent.base_url = "https://api.example.com/v1" + monkeypatch.setattr(run_agent, "jittered_backoff", lambda *args, **kwargs: 0) + + request_payloads = [] + + class _InvalidEncryptedContentError(Exception): + def __init__(self): + super().__init__("Error code: 400 - bad request") + self.status_code = 400 + self.body = { + "error": { + "code": "INVALID_ENCRYPTED_CONTENT", + "message": "Bad request", + } + } + + responses = [_InvalidEncryptedContentError(), _codex_message_response("Recovered after generic retry.")] + + def _fake_api_call(api_kwargs): + request_payloads.append(api_kwargs) + current = responses.pop(0) + if isinstance(current, Exception): + raise current + return current + + monkeypatch.setattr(agent, "_interruptible_api_call", _fake_api_call) + + result = agent.run_conversation( + "continue", + conversation_history=[{"role": "assistant", "content": "No replay state here."}], + ) + + assert result["completed"] is True + assert result["final_response"] == "Recovered after generic retry." 
+ assert len(request_payloads) == 2 + assert all(payload.get("include") == ["reasoning.encrypted_content"] for payload in request_payloads) + assert all(not any(item.get("type") == "reasoning" for item in payload["input"]) for payload in request_payloads) + assert agent._codex_reasoning_replay_enabled is True + assert result["messages"][0].get("codex_reasoning_items") is None