mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
* fix(codex-responses): gracefully recover from invalid_encrypted_content (salvage #10144) When an OpenAI-compatible Responses API surface accepts an initial request but later rejects the replayed `codex_reasoning_items` encrypted blob with HTTP 400 `invalid_encrypted_content`, the session previously got stuck retrying the same poisoned payload. Recovery: classify the error as a dedicated FailoverReason, and on the first hit disable encrypted reasoning replay for the rest of the session, strip cached items from message history, and retry once. Changes: * error_classifier: add FailoverReason.invalid_encrypted_content branch in _classify_400 (before context_overflow so the messages that mention 'encrypted content … could not be verified' don't trip context heuristics), in _classify_by_error_code, and extend _extract_error_code to peek inside wrapped JSON in error.message and ignore the bare '400' as a code. * agent_init: initialize `_codex_reasoning_replay_enabled = True` on every agent. * run_agent: add AIAgent._disable_codex_reasoning_replay() helper that flips the flag and pops cached items. * codex_responses_adapter: thread a `replay_encrypted_reasoning` kwarg through _chat_messages_to_responses_input so that when the flag is False we don't replay codex_reasoning_items. * transports/codex.py: read `replay_encrypted_reasoning` from params, thread it into the adapter, and gate the `include=['reasoning.encrypted_content']` request hint on it. * chat_completion_helpers: pass the agent's replay flag through to the transport. * conversation_loop: in the retry loop, add an invalid_encrypted_content recovery branch that fires once per session, only when api_mode == codex_responses, only when replay is still enabled, and only when at least one assistant message in history actually carries cached reasoning items (otherwise the 400 has nothing to do with our cache and the normal retry path handles it). Tests: * test_error_classifier: new wrapped-JSON _extract_error_code case; new TestClassifyApiError cases proving the 400 is retryable with no fallback, that the broad message match doesn't catch a generic 'parsed' message, and that the error code match is case-insensitive. * test_run_agent_codex_responses: end-to-end test of the recovery branch firing once and disabling replay, plus a sibling test that proves the branch does *not* fire (and the flag stays True) when history has no cached reasoning items. Salvages PR #10144 onto the post-refactor module layout (error_classifier / codex_responses_adapter / transports/codex / conversation_loop / agent_init) since the original diff was written against the pre-refactor monolithic run_agent.py. * chore(release): map victorGPT in AUTHOR_MAP for #10144 salvage --------- Co-authored-by: victorGPT <wuxuebin1993@gmail.com>
1102 lines
49 KiB
Python
1102 lines
49 KiB
Python
"""Codex Responses API adapter.
|
|
|
|
Pure format-conversion and normalization logic for the OpenAI Responses API
|
|
(used by OpenAI Codex, xAI, GitHub Models, and other Responses-compatible endpoints).
|
|
|
|
Extracted from run_agent.py to isolate Responses API-specific logic from the
|
|
core agent loop. All functions are stateless — they operate on the data passed
|
|
in and return transformed results.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import re
|
|
import uuid
|
|
from types import SimpleNamespace
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from agent.prompt_builder import DEFAULT_AGENT_IDENTITY
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Matches Codex/Harmony tool-call serialization that occasionally leaks into
|
|
# assistant-message content when the model fails to emit a structured
|
|
# ``function_call`` item. Accepts the common forms:
|
|
#
|
|
# to=functions.exec_command
|
|
# assistant to=functions.exec_command
|
|
# <|channel|>commentary to=functions.exec_command
|
|
#
|
|
# ``to=functions.<name>`` is the stable marker — the optional ``assistant`` or
|
|
# Harmony channel prefix varies by degeneration mode. Case-insensitive to
|
|
# cover lowercase/uppercase ``assistant`` variants.
|
|
_TOOL_CALL_LEAK_PATTERN = re.compile(
|
|
r"(?:^|[\s>|])to=functions\.[A-Za-z_][\w.]*",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Multimodal content helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _chat_content_to_responses_parts(content: Any, *, role: str = "user") -> List[Dict[str, Any]]:
|
|
"""Convert chat-style multimodal content to Responses API input parts.
|
|
|
|
Input: ``[{"type":"text"|"image_url", ...}]`` (native OpenAI Chat format)
|
|
Output: ``[{"type":"input_text"|"output_text"|"input_image", ...}]`` (Responses format)
|
|
|
|
The ``role`` parameter controls the text content type:
|
|
- ``"user"`` (default) → ``"input_text"``
|
|
- ``"assistant"`` → ``"output_text"``
|
|
|
|
The Responses API rejects ``input_text`` inside assistant messages and
|
|
``output_text`` inside user messages, so callers MUST pass the correct
|
|
role for the message being converted.
|
|
|
|
Returns an empty list when ``content`` is not a list or contains no
|
|
recognized parts — callers fall back to the string path.
|
|
"""
|
|
text_type = "output_text" if role == "assistant" else "input_text"
|
|
if not isinstance(content, list):
|
|
return []
|
|
converted: List[Dict[str, Any]] = []
|
|
for part in content:
|
|
if isinstance(part, str):
|
|
if part:
|
|
converted.append({"type": text_type, "text": part})
|
|
continue
|
|
if not isinstance(part, dict):
|
|
continue
|
|
ptype = str(part.get("type") or "").strip().lower()
|
|
if ptype in {"text", "input_text", "output_text"}:
|
|
text = part.get("text")
|
|
if isinstance(text, str) and text:
|
|
converted.append({"type": text_type, "text": text})
|
|
continue
|
|
if ptype in {"image_url", "input_image"}:
|
|
image_ref = part.get("image_url")
|
|
detail = part.get("detail")
|
|
if isinstance(image_ref, dict):
|
|
url = image_ref.get("url")
|
|
detail = image_ref.get("detail", detail)
|
|
else:
|
|
url = image_ref
|
|
if not isinstance(url, str) or not url:
|
|
continue
|
|
image_part: Dict[str, Any] = {"type": "input_image", "image_url": url}
|
|
if isinstance(detail, str) and detail.strip():
|
|
image_part["detail"] = detail.strip()
|
|
converted.append(image_part)
|
|
return converted
|
|
|
|
|
|
def _summarize_user_message_for_log(content: Any) -> str:
|
|
"""Return a short text summary of a user message for logging/trajectory.
|
|
|
|
Multimodal messages arrive as a list of ``{type:"text"|"image_url", ...}``
|
|
parts from the API server. Logging, spinner previews, and trajectory
|
|
files all want a plain string — this helper extracts the first chunk of
|
|
text and notes any attached images. Returns an empty string for empty
|
|
lists and ``str(content)`` for unexpected scalar types.
|
|
"""
|
|
if content is None:
|
|
return ""
|
|
if isinstance(content, str):
|
|
return content
|
|
if isinstance(content, list):
|
|
text_bits: List[str] = []
|
|
image_count = 0
|
|
for part in content:
|
|
if isinstance(part, str):
|
|
if part:
|
|
text_bits.append(part)
|
|
continue
|
|
if not isinstance(part, dict):
|
|
continue
|
|
ptype = str(part.get("type") or "").strip().lower()
|
|
if ptype in {"text", "input_text", "output_text"}:
|
|
text = part.get("text")
|
|
if isinstance(text, str) and text:
|
|
text_bits.append(text)
|
|
elif ptype in {"image_url", "input_image"}:
|
|
image_count += 1
|
|
summary = " ".join(text_bits).strip()
|
|
if image_count:
|
|
note = f"[{image_count} image{'s' if image_count != 1 else ''}]"
|
|
summary = f"{note} {summary}" if summary else note
|
|
return summary
|
|
try:
|
|
return str(content)
|
|
except Exception:
|
|
return ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ID helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _deterministic_call_id(fn_name: str, arguments: str, index: int = 0) -> str:
|
|
"""Generate a deterministic call_id from tool call content.
|
|
|
|
Used as a fallback when the API doesn't provide a call_id.
|
|
Deterministic IDs prevent cache invalidation — random UUIDs would
|
|
make every API call's prefix unique, breaking OpenAI's prompt cache.
|
|
"""
|
|
seed = f"{fn_name}:{arguments}:{index}"
|
|
digest = hashlib.sha256(seed.encode("utf-8", errors="replace")).hexdigest()[:12]
|
|
return f"call_{digest}"
|
|
|
|
|
|
def _split_responses_tool_id(raw_id: Any) -> tuple[Optional[str], Optional[str]]:
|
|
"""Split a stored tool id into (call_id, response_item_id)."""
|
|
if not isinstance(raw_id, str):
|
|
return None, None
|
|
value = raw_id.strip()
|
|
if not value:
|
|
return None, None
|
|
if "|" in value:
|
|
call_id, response_item_id = value.split("|", 1)
|
|
call_id = call_id.strip() or None
|
|
response_item_id = response_item_id.strip() or None
|
|
return call_id, response_item_id
|
|
if value.startswith("fc_"):
|
|
return None, value
|
|
return value, None
|
|
|
|
|
|
def _derive_responses_function_call_id(
|
|
call_id: str,
|
|
response_item_id: Optional[str] = None,
|
|
) -> str:
|
|
"""Build a valid Responses `function_call.id` (must start with `fc_`)."""
|
|
if isinstance(response_item_id, str):
|
|
candidate = response_item_id.strip()
|
|
if candidate.startswith("fc_"):
|
|
return candidate
|
|
|
|
source = (call_id or "").strip()
|
|
if source.startswith("fc_"):
|
|
return source
|
|
if source.startswith("call_") and len(source) > len("call_"):
|
|
return f"fc_{source[len('call_'):]}"
|
|
|
|
sanitized = re.sub(r"[^A-Za-z0-9_-]", "", source)
|
|
if sanitized.startswith("fc_"):
|
|
return sanitized
|
|
if sanitized.startswith("call_") and len(sanitized) > len("call_"):
|
|
return f"fc_{sanitized[len('call_'):]}"
|
|
if sanitized:
|
|
return f"fc_{sanitized[:48]}"
|
|
|
|
seed = source or str(response_item_id or "") or uuid.uuid4().hex
|
|
digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:24]
|
|
return f"fc_{digest}"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Schema conversion
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _responses_tools(tools: Optional[List[Dict[str, Any]]] = None) -> Optional[List[Dict[str, Any]]]:
|
|
"""Convert chat-completions tool schemas to Responses function-tool schemas."""
|
|
if not tools:
|
|
return None
|
|
|
|
converted: List[Dict[str, Any]] = []
|
|
for item in tools:
|
|
fn = item.get("function", {}) if isinstance(item, dict) else {}
|
|
name = fn.get("name")
|
|
if not isinstance(name, str) or not name.strip():
|
|
continue
|
|
converted.append({
|
|
"type": "function",
|
|
"name": name,
|
|
"description": fn.get("description", ""),
|
|
"strict": False,
|
|
"parameters": fn.get("parameters", {"type": "object", "properties": {}}),
|
|
})
|
|
return converted or None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Message format conversion
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_RESPONSE_MESSAGE_STATUSES = {"completed", "incomplete", "in_progress"}
|
|
|
|
|
|
def _normalize_responses_message_status(value: Any, *, default: str = "completed") -> str:
|
|
"""Normalize a Responses assistant message status for replay.
|
|
|
|
The API accepts completed/incomplete/in_progress on replayed assistant
|
|
output messages. Preserve those exactly (modulo case/hyphen spelling) so
|
|
incomplete Codex continuation turns don't get falsely marked completed.
|
|
"""
|
|
if isinstance(value, str):
|
|
status = value.strip().lower().replace("-", "_").replace(" ", "_")
|
|
if status in _RESPONSE_MESSAGE_STATUSES:
|
|
return status
|
|
return default
|
|
|
|
|
|
def _chat_messages_to_responses_input(
|
|
messages: List[Dict[str, Any]],
|
|
*,
|
|
is_xai_responses: bool = False,
|
|
replay_encrypted_reasoning: bool = True,
|
|
) -> List[Dict[str, Any]]:
|
|
"""Convert internal chat-style messages to Responses input items.
|
|
|
|
``is_xai_responses`` is kept for transport signature compatibility but
|
|
no longer suppresses encrypted reasoning replay. Earlier (PR #26644,
|
|
May 2026) we believed xAI's OAuth/SuperGrok ``/v1/responses`` surface
|
|
rejected replayed ``encrypted_content`` reasoning items minted by
|
|
prior turns, and we stripped them. That decision was wrong — xAI
|
|
explicitly relies on Hermes threading encrypted reasoning back across
|
|
turns for cross-turn coherence (the whole point of their partnership
|
|
integration). We now replay encrypted reasoning on every Responses
|
|
transport (xAI, native Codex, custom relays) and let xAI tell us
|
|
explicitly if a specific surface ever rejects a payload.
|
|
|
|
``replay_encrypted_reasoning`` is the per-session kill switch. Some
|
|
OpenAI-compatible relays accept the request but later reject the
|
|
replayed encrypted blob with HTTP 400 ``invalid_encrypted_content``;
|
|
when that happens the retry loop calls
|
|
``AIAgent._disable_codex_reasoning_replay`` which both strips cached
|
|
items from the conversation history and threads ``replay_enabled=False``
|
|
through this converter so subsequent turns send no reasoning items.
|
|
"""
|
|
items: List[Dict[str, Any]] = []
|
|
seen_item_ids: set = set()
|
|
|
|
for msg in messages:
|
|
if not isinstance(msg, dict):
|
|
continue
|
|
role = msg.get("role")
|
|
if role == "system":
|
|
continue
|
|
|
|
if role in {"user", "assistant"}:
|
|
content = msg.get("content", "")
|
|
if isinstance(content, list):
|
|
content_parts = _chat_content_to_responses_parts(content, role=role)
|
|
text_type = "output_text" if role == "assistant" else "input_text"
|
|
content_text = "".join(
|
|
p.get("text", "") for p in content_parts if p.get("type") == text_type
|
|
)
|
|
else:
|
|
content_parts = []
|
|
content_text = str(content) if content is not None else ""
|
|
|
|
if role == "assistant":
|
|
# Replay encrypted reasoning items from previous turns
|
|
# so the API can maintain coherent reasoning chains.
|
|
# This applies to every Responses transport including
|
|
# xAI — see _chat_messages_to_responses_input docstring
|
|
# for the May 2026 reversal of the earlier xAI gate.
|
|
codex_reasoning = (
|
|
msg.get("codex_reasoning_items")
|
|
if replay_encrypted_reasoning
|
|
else None
|
|
)
|
|
has_codex_reasoning = False
|
|
if isinstance(codex_reasoning, list):
|
|
for ri in codex_reasoning:
|
|
if isinstance(ri, dict) and ri.get("encrypted_content"):
|
|
item_id = ri.get("id")
|
|
if item_id and item_id in seen_item_ids:
|
|
continue
|
|
# Strip the "id" field — with store=False the
|
|
# Responses API cannot look up items by ID and
|
|
# returns 404. The encrypted_content blob is
|
|
# self-contained for reasoning chain continuity.
|
|
replay_item = {k: v for k, v in ri.items() if k != "id"}
|
|
items.append(replay_item)
|
|
if item_id:
|
|
seen_item_ids.add(item_id)
|
|
has_codex_reasoning = True
|
|
|
|
# Replay exact assistant message items (with id/phase) from
|
|
# previous turns so the API can maintain prefix-cache hits.
|
|
# OpenAI docs: "preserve and resend phase on all assistant
|
|
# messages — dropping it can degrade performance."
|
|
codex_message_items = msg.get("codex_message_items")
|
|
replayed_message_items = 0
|
|
if isinstance(codex_message_items, list):
|
|
for raw_item in codex_message_items:
|
|
if not isinstance(raw_item, dict):
|
|
continue
|
|
if raw_item.get("type") != "message" or raw_item.get("role") != "assistant":
|
|
continue
|
|
raw_content_parts = raw_item.get("content")
|
|
if not isinstance(raw_content_parts, list):
|
|
continue
|
|
|
|
normalized_content_parts = []
|
|
for part in raw_content_parts:
|
|
if not isinstance(part, dict):
|
|
continue
|
|
part_type = str(part.get("type") or "").strip()
|
|
if part_type not in {"output_text", "text"}:
|
|
continue
|
|
text = part.get("text", "")
|
|
if text is None:
|
|
text = ""
|
|
if not isinstance(text, str):
|
|
text = str(text)
|
|
normalized_content_parts.append({"type": "output_text", "text": text})
|
|
|
|
if not normalized_content_parts:
|
|
continue
|
|
|
|
replay_item = {
|
|
"type": "message",
|
|
"role": "assistant",
|
|
"status": _normalize_responses_message_status(raw_item.get("status")),
|
|
"content": normalized_content_parts,
|
|
}
|
|
item_id = raw_item.get("id")
|
|
if isinstance(item_id, str) and item_id.strip():
|
|
replay_item["id"] = item_id.strip()
|
|
phase = raw_item.get("phase")
|
|
if isinstance(phase, str) and phase.strip():
|
|
replay_item["phase"] = phase.strip()
|
|
items.append(replay_item)
|
|
replayed_message_items += 1
|
|
|
|
if replayed_message_items > 0:
|
|
pass
|
|
elif content_parts:
|
|
items.append({"role": "assistant", "content": content_parts})
|
|
elif content_text.strip():
|
|
items.append({"role": "assistant", "content": content_text})
|
|
elif has_codex_reasoning:
|
|
# The Responses API requires a following item after each
|
|
# reasoning item (otherwise: missing_following_item error).
|
|
# When the assistant produced only reasoning with no visible
|
|
# content, emit an empty assistant message as the required
|
|
# following item.
|
|
items.append({"role": "assistant", "content": ""})
|
|
|
|
tool_calls = msg.get("tool_calls")
|
|
if isinstance(tool_calls, list):
|
|
for tc in tool_calls:
|
|
if not isinstance(tc, dict):
|
|
continue
|
|
fn = tc.get("function", {})
|
|
fn_name = fn.get("name")
|
|
if not isinstance(fn_name, str) or not fn_name.strip():
|
|
continue
|
|
|
|
embedded_call_id, embedded_response_item_id = _split_responses_tool_id(
|
|
tc.get("id")
|
|
)
|
|
call_id = tc.get("call_id")
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
call_id = embedded_call_id
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
if (
|
|
isinstance(embedded_response_item_id, str)
|
|
and embedded_response_item_id.startswith("fc_")
|
|
and len(embedded_response_item_id) > len("fc_")
|
|
):
|
|
call_id = f"call_{embedded_response_item_id[len('fc_'):]}"
|
|
else:
|
|
_raw_args = str(fn.get("arguments", "{}"))
|
|
call_id = _deterministic_call_id(fn_name, _raw_args, len(items))
|
|
call_id = call_id.strip()
|
|
|
|
arguments = fn.get("arguments", "{}")
|
|
if isinstance(arguments, dict):
|
|
arguments = json.dumps(arguments, ensure_ascii=False)
|
|
elif not isinstance(arguments, str):
|
|
arguments = str(arguments)
|
|
arguments = arguments.strip() or "{}"
|
|
|
|
items.append({
|
|
"type": "function_call",
|
|
"call_id": call_id,
|
|
"name": fn_name,
|
|
"arguments": arguments,
|
|
})
|
|
continue
|
|
|
|
# Non-assistant (user) role: emit multimodal parts when present,
|
|
# otherwise fall back to the text payload.
|
|
if content_parts:
|
|
items.append({"role": role, "content": content_parts})
|
|
else:
|
|
items.append({"role": role, "content": content_text})
|
|
continue
|
|
|
|
if role == "tool":
|
|
raw_tool_call_id = msg.get("tool_call_id")
|
|
call_id, _ = _split_responses_tool_id(raw_tool_call_id)
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
if isinstance(raw_tool_call_id, str) and raw_tool_call_id.strip():
|
|
call_id = raw_tool_call_id.strip()
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
continue
|
|
|
|
# Multimodal tool result: convert OpenAI-style content list into
|
|
# Responses ``function_call_output.output`` array. The Responses
|
|
# API accepts ``output`` as either a string or an array of
|
|
# ``input_text``/``input_image`` items. See
|
|
# https://developers.openai.com/api/reference/python/resources/responses/.
|
|
tool_content = msg.get("content")
|
|
output_value: Any
|
|
if isinstance(tool_content, list):
|
|
converted = _chat_content_to_responses_parts(
|
|
tool_content, role="user",
|
|
)
|
|
if converted:
|
|
output_value = converted
|
|
else:
|
|
output_value = ""
|
|
else:
|
|
output_value = str(tool_content or "")
|
|
|
|
items.append({
|
|
"type": "function_call_output",
|
|
"call_id": call_id,
|
|
"output": output_value,
|
|
})
|
|
|
|
return items
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Input preflight / validation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
|
|
if not isinstance(raw_items, list):
|
|
raise ValueError("Codex Responses input must be a list of input items.")
|
|
|
|
normalized: List[Dict[str, Any]] = []
|
|
seen_ids: set = set()
|
|
for idx, item in enumerate(raw_items):
|
|
if not isinstance(item, dict):
|
|
raise ValueError(f"Codex Responses input[{idx}] must be an object.")
|
|
|
|
item_type = item.get("type")
|
|
if item_type == "function_call":
|
|
call_id = item.get("call_id")
|
|
name = item.get("name")
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
raise ValueError(f"Codex Responses input[{idx}] function_call is missing call_id.")
|
|
if not isinstance(name, str) or not name.strip():
|
|
raise ValueError(f"Codex Responses input[{idx}] function_call is missing name.")
|
|
|
|
arguments = item.get("arguments", "{}")
|
|
if isinstance(arguments, dict):
|
|
arguments = json.dumps(arguments, ensure_ascii=False)
|
|
elif not isinstance(arguments, str):
|
|
arguments = str(arguments)
|
|
arguments = arguments.strip() or "{}"
|
|
|
|
normalized.append(
|
|
{
|
|
"type": "function_call",
|
|
"call_id": call_id.strip(),
|
|
"name": name.strip(),
|
|
"arguments": arguments,
|
|
}
|
|
)
|
|
continue
|
|
|
|
if item_type == "function_call_output":
|
|
call_id = item.get("call_id")
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
raise ValueError(f"Codex Responses input[{idx}] function_call_output is missing call_id.")
|
|
output = item.get("output", "")
|
|
if output is None:
|
|
output = ""
|
|
# Output may be a string OR an array of structured content
|
|
# items (input_text / input_image) for multimodal tool results.
|
|
# Both shapes are accepted by the Responses API. We preserve
|
|
# the array form when present.
|
|
if isinstance(output, list):
|
|
# Validate each item is a recognised content shape; drop
|
|
# anything else to avoid 4xx from the API.
|
|
cleaned: List[Dict[str, Any]] = []
|
|
for part in output:
|
|
if not isinstance(part, dict):
|
|
continue
|
|
ptype = part.get("type")
|
|
if ptype == "input_text":
|
|
text = part.get("text")
|
|
if isinstance(text, str) and text:
|
|
cleaned.append({"type": "input_text", "text": text})
|
|
elif ptype == "input_image":
|
|
url = part.get("image_url")
|
|
if isinstance(url, str) and url:
|
|
entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
|
|
detail = part.get("detail")
|
|
if isinstance(detail, str) and detail.strip():
|
|
entry["detail"] = detail.strip()
|
|
cleaned.append(entry)
|
|
normalized.append(
|
|
{
|
|
"type": "function_call_output",
|
|
"call_id": call_id.strip(),
|
|
"output": cleaned if cleaned else "",
|
|
}
|
|
)
|
|
continue
|
|
if not isinstance(output, str):
|
|
output = str(output)
|
|
|
|
normalized.append(
|
|
{
|
|
"type": "function_call_output",
|
|
"call_id": call_id.strip(),
|
|
"output": output,
|
|
}
|
|
)
|
|
continue
|
|
|
|
if item_type == "reasoning":
|
|
encrypted = item.get("encrypted_content")
|
|
if isinstance(encrypted, str) and encrypted:
|
|
item_id = item.get("id")
|
|
if isinstance(item_id, str) and item_id:
|
|
if item_id in seen_ids:
|
|
continue
|
|
seen_ids.add(item_id)
|
|
reasoning_item = {"type": "reasoning", "encrypted_content": encrypted}
|
|
# Do NOT include the "id" in the outgoing item — with
|
|
# store=False (our default) the API tries to resolve the
|
|
# id server-side and returns 404. The id is still used
|
|
# above for local deduplication via seen_ids.
|
|
summary = item.get("summary")
|
|
if isinstance(summary, list):
|
|
reasoning_item["summary"] = summary
|
|
else:
|
|
reasoning_item["summary"] = []
|
|
normalized.append(reasoning_item)
|
|
continue
|
|
|
|
if item_type == "message":
|
|
role = item.get("role")
|
|
if role != "assistant":
|
|
raise ValueError(f"Codex Responses input[{idx}] message items must have role='assistant'.")
|
|
content = item.get("content")
|
|
if not isinstance(content, list):
|
|
raise ValueError(f"Codex Responses input[{idx}] message item must have content list.")
|
|
normalized_content = []
|
|
for part_idx, part in enumerate(content):
|
|
if not isinstance(part, dict):
|
|
raise ValueError(
|
|
f"Codex Responses input[{idx}] message content[{part_idx}] must be an object."
|
|
)
|
|
part_type = part.get("type")
|
|
if part_type not in {"output_text", "text"}:
|
|
raise ValueError(
|
|
f"Codex Responses input[{idx}] message content[{part_idx}] has unsupported type {part_type!r}."
|
|
)
|
|
text = part.get("text", "")
|
|
if text is None:
|
|
text = ""
|
|
if not isinstance(text, str):
|
|
text = str(text)
|
|
normalized_content.append({"type": "output_text", "text": text})
|
|
if not normalized_content:
|
|
raise ValueError(f"Codex Responses input[{idx}] message item must contain at least one text part.")
|
|
normalized_item: Dict[str, Any] = {
|
|
"type": "message",
|
|
"role": "assistant",
|
|
"status": _normalize_responses_message_status(item.get("status")),
|
|
"content": normalized_content,
|
|
}
|
|
item_id = item.get("id")
|
|
if isinstance(item_id, str) and item_id.strip():
|
|
normalized_item["id"] = item_id.strip()
|
|
phase = item.get("phase")
|
|
if isinstance(phase, str) and phase.strip():
|
|
normalized_item["phase"] = phase.strip()
|
|
normalized.append(normalized_item)
|
|
continue
|
|
|
|
role = item.get("role")
|
|
if role in {"user", "assistant"}:
|
|
content = item.get("content", "")
|
|
if content is None:
|
|
content = ""
|
|
if isinstance(content, list):
|
|
# Multimodal content from ``_chat_messages_to_responses_input``
|
|
# is already in Responses format (``input_text`` / ``output_text``
|
|
# / ``input_image``). Validate each part and pass through.
|
|
# Use the correct text type for the role — ``output_text`` for
|
|
# assistant messages, ``input_text`` for user messages.
|
|
text_type = "output_text" if role == "assistant" else "input_text"
|
|
validated: List[Dict[str, Any]] = []
|
|
for part_idx, part in enumerate(content):
|
|
if isinstance(part, str):
|
|
if part:
|
|
validated.append({"type": text_type, "text": part})
|
|
continue
|
|
if not isinstance(part, dict):
|
|
raise ValueError(
|
|
f"Codex Responses input[{idx}].content[{part_idx}] must be an object or string."
|
|
)
|
|
ptype = str(part.get("type") or "").strip().lower()
|
|
if ptype in {"input_text", "text", "output_text"}:
|
|
text = part.get("text", "")
|
|
if not isinstance(text, str):
|
|
text = str(text or "")
|
|
validated.append({"type": text_type, "text": text})
|
|
elif ptype in {"input_image", "image_url"}:
|
|
image_ref = part.get("image_url", "")
|
|
detail = part.get("detail")
|
|
if isinstance(image_ref, dict):
|
|
url = image_ref.get("url", "")
|
|
detail = image_ref.get("detail", detail)
|
|
else:
|
|
url = image_ref
|
|
if not isinstance(url, str):
|
|
url = str(url or "")
|
|
image_part: Dict[str, Any] = {"type": "input_image", "image_url": url}
|
|
if isinstance(detail, str) and detail.strip():
|
|
image_part["detail"] = detail.strip()
|
|
validated.append(image_part)
|
|
else:
|
|
raise ValueError(
|
|
f"Codex Responses input[{idx}].content[{part_idx}] has unsupported type {part.get('type')!r}."
|
|
)
|
|
normalized.append({"role": role, "content": validated})
|
|
continue
|
|
if not isinstance(content, str):
|
|
content = str(content)
|
|
|
|
normalized.append({"role": role, "content": content})
|
|
continue
|
|
|
|
raise ValueError(
|
|
f"Codex Responses input[{idx}] has unsupported item shape (type={item_type!r}, role={role!r})."
|
|
)
|
|
|
|
return normalized
|
|
|
|
|
|
def _preflight_codex_api_kwargs(
|
|
api_kwargs: Any,
|
|
*,
|
|
allow_stream: bool = False,
|
|
) -> Dict[str, Any]:
|
|
if not isinstance(api_kwargs, dict):
|
|
raise ValueError("Codex Responses request must be a dict.")
|
|
|
|
required = {"model", "instructions", "input"}
|
|
missing = [key for key in required if key not in api_kwargs]
|
|
if missing:
|
|
raise ValueError(f"Codex Responses request missing required field(s): {', '.join(sorted(missing))}.")
|
|
|
|
model = api_kwargs.get("model")
|
|
if not isinstance(model, str) or not model.strip():
|
|
raise ValueError("Codex Responses request 'model' must be a non-empty string.")
|
|
model = model.strip()
|
|
|
|
instructions = api_kwargs.get("instructions")
|
|
if instructions is None:
|
|
instructions = ""
|
|
if not isinstance(instructions, str):
|
|
instructions = str(instructions)
|
|
instructions = instructions.strip() or DEFAULT_AGENT_IDENTITY
|
|
|
|
normalized_input = _preflight_codex_input_items(api_kwargs.get("input"))
|
|
|
|
tools = api_kwargs.get("tools")
|
|
normalized_tools = None
|
|
if tools is not None:
|
|
if not isinstance(tools, list):
|
|
raise ValueError("Codex Responses request 'tools' must be a list when provided.")
|
|
normalized_tools = []
|
|
for idx, tool in enumerate(tools):
|
|
if not isinstance(tool, dict):
|
|
raise ValueError(f"Codex Responses tools[{idx}] must be an object.")
|
|
if tool.get("type") != "function":
|
|
raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.")
|
|
|
|
name = tool.get("name")
|
|
parameters = tool.get("parameters")
|
|
if not isinstance(name, str) or not name.strip():
|
|
raise ValueError(f"Codex Responses tools[{idx}] is missing a valid name.")
|
|
if not isinstance(parameters, dict):
|
|
raise ValueError(f"Codex Responses tools[{idx}] is missing valid parameters.")
|
|
|
|
description = tool.get("description", "")
|
|
if description is None:
|
|
description = ""
|
|
if not isinstance(description, str):
|
|
description = str(description)
|
|
|
|
strict = tool.get("strict", False)
|
|
if not isinstance(strict, bool):
|
|
strict = bool(strict)
|
|
|
|
normalized_tools.append(
|
|
{
|
|
"type": "function",
|
|
"name": name.strip(),
|
|
"description": description,
|
|
"strict": strict,
|
|
"parameters": parameters,
|
|
}
|
|
)
|
|
|
|
store = api_kwargs.get("store", False)
|
|
if store is not False:
|
|
raise ValueError("Codex Responses contract requires 'store' to be false.")
|
|
|
|
allowed_keys = {
|
|
"model", "instructions", "input", "tools", "store",
|
|
"reasoning", "include", "max_output_tokens", "temperature",
|
|
"tool_choice", "parallel_tool_calls", "prompt_cache_key", "service_tier",
|
|
"extra_headers", "extra_body", "timeout",
|
|
}
|
|
normalized: Dict[str, Any] = {
|
|
"model": model,
|
|
"instructions": instructions,
|
|
"input": normalized_input,
|
|
"store": False,
|
|
}
|
|
if normalized_tools is not None:
|
|
normalized["tools"] = normalized_tools
|
|
|
|
# Pass through reasoning config
|
|
reasoning = api_kwargs.get("reasoning")
|
|
if isinstance(reasoning, dict):
|
|
normalized["reasoning"] = reasoning
|
|
include = api_kwargs.get("include")
|
|
if isinstance(include, list):
|
|
normalized["include"] = include
|
|
service_tier = api_kwargs.get("service_tier")
|
|
if isinstance(service_tier, str) and service_tier.strip():
|
|
normalized["service_tier"] = service_tier.strip()
|
|
|
|
# Pass through max_output_tokens and temperature
|
|
max_output_tokens = api_kwargs.get("max_output_tokens")
|
|
if isinstance(max_output_tokens, (int, float)) and max_output_tokens > 0:
|
|
normalized["max_output_tokens"] = int(max_output_tokens)
|
|
timeout = api_kwargs.get("timeout")
|
|
if (
|
|
isinstance(timeout, (int, float))
|
|
and not isinstance(timeout, bool)
|
|
and 0 < float(timeout) < float("inf")
|
|
):
|
|
normalized["timeout"] = float(timeout)
|
|
temperature = api_kwargs.get("temperature")
|
|
if isinstance(temperature, (int, float)):
|
|
normalized["temperature"] = float(temperature)
|
|
|
|
# Pass through tool_choice, parallel_tool_calls, prompt_cache_key
|
|
for passthrough_key in ("tool_choice", "parallel_tool_calls", "prompt_cache_key"):
|
|
val = api_kwargs.get(passthrough_key)
|
|
if val is not None:
|
|
normalized[passthrough_key] = val
|
|
|
|
extra_headers = api_kwargs.get("extra_headers")
|
|
if extra_headers is not None:
|
|
if not isinstance(extra_headers, dict):
|
|
raise ValueError("Codex Responses request 'extra_headers' must be an object.")
|
|
normalized_headers: Dict[str, str] = {}
|
|
for key, value in extra_headers.items():
|
|
if not isinstance(key, str) or not key.strip():
|
|
raise ValueError("Codex Responses request 'extra_headers' keys must be non-empty strings.")
|
|
if value is None:
|
|
continue
|
|
normalized_headers[key.strip()] = str(value)
|
|
if normalized_headers:
|
|
normalized["extra_headers"] = normalized_headers
|
|
|
|
extra_body = api_kwargs.get("extra_body")
|
|
if extra_body is not None:
|
|
if not isinstance(extra_body, dict):
|
|
raise ValueError("Codex Responses request 'extra_body' must be an object.")
|
|
# Pass extra_body through verbatim — used by xAI Responses to
|
|
# carry `prompt_cache_key` as a body-level field (the documented
|
|
# cache-routing surface on /v1/responses). The openai SDK
|
|
# serializes extra_body into the JSON body without per-field
|
|
# type checks, so it survives Responses.stream() kwarg-signature
|
|
# changes that would otherwise raise TypeError before the wire.
|
|
if extra_body:
|
|
normalized["extra_body"] = dict(extra_body)
|
|
|
|
if allow_stream:
|
|
stream = api_kwargs.get("stream")
|
|
if stream is not None and stream is not True:
|
|
raise ValueError("Codex Responses 'stream' must be true when set.")
|
|
if stream is True:
|
|
normalized["stream"] = True
|
|
allowed_keys.add("stream")
|
|
elif "stream" in api_kwargs:
|
|
raise ValueError("Codex Responses stream flag is only allowed in fallback streaming requests.")
|
|
|
|
unexpected = sorted(key for key in api_kwargs if key not in allowed_keys)
|
|
if unexpected:
|
|
raise ValueError(
|
|
f"Codex Responses request has unsupported field(s): {', '.join(unexpected)}."
|
|
)
|
|
|
|
return normalized
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Response extraction helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _extract_responses_message_text(item: Any) -> str:
|
|
"""Extract assistant text from a Responses message output item."""
|
|
content = getattr(item, "content", None)
|
|
if not isinstance(content, list):
|
|
return ""
|
|
|
|
chunks: List[str] = []
|
|
for part in content:
|
|
ptype = getattr(part, "type", None)
|
|
if ptype not in {"output_text", "text"}:
|
|
continue
|
|
text = getattr(part, "text", None)
|
|
if isinstance(text, str) and text:
|
|
chunks.append(text)
|
|
return "".join(chunks).strip()
|
|
|
|
|
|
def _extract_responses_reasoning_text(item: Any) -> str:
|
|
"""Extract a compact reasoning text from a Responses reasoning item."""
|
|
summary = getattr(item, "summary", None)
|
|
if isinstance(summary, list):
|
|
chunks: List[str] = []
|
|
for part in summary:
|
|
text = getattr(part, "text", None)
|
|
if isinstance(text, str) and text:
|
|
chunks.append(text)
|
|
if chunks:
|
|
return "\n".join(chunks).strip()
|
|
text = getattr(item, "text", None)
|
|
if isinstance(text, str) and text:
|
|
return text.strip()
|
|
return ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Full response normalization
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _normalize_codex_response(response: Any) -> tuple[Any, str]:
|
|
"""Normalize a Responses API object to an assistant_message-like object."""
|
|
output = getattr(response, "output", None)
|
|
if not isinstance(output, list) or not output:
|
|
# The Codex backend can return empty output when the answer was
|
|
# delivered entirely via stream events. Check output_text as a
|
|
# last-resort fallback before raising.
|
|
out_text = getattr(response, "output_text", None)
|
|
if isinstance(out_text, str) and out_text.strip():
|
|
logger.debug(
|
|
"Codex response has empty output but output_text is present (%d chars); "
|
|
"synthesizing output item.", len(out_text.strip()),
|
|
)
|
|
output = [SimpleNamespace(
|
|
type="message", role="assistant", status="completed",
|
|
content=[SimpleNamespace(type="output_text", text=out_text.strip())],
|
|
)]
|
|
response.output = output
|
|
else:
|
|
raise RuntimeError("Responses API returned no output items")
|
|
|
|
response_status = getattr(response, "status", None)
|
|
if isinstance(response_status, str):
|
|
response_status = response_status.strip().lower()
|
|
else:
|
|
response_status = None
|
|
|
|
if response_status in {"failed", "cancelled"}:
|
|
error_obj = getattr(response, "error", None)
|
|
if isinstance(error_obj, dict):
|
|
error_msg = error_obj.get("message") or str(error_obj)
|
|
else:
|
|
error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'"
|
|
raise RuntimeError(error_msg)
|
|
|
|
content_parts: List[str] = []
|
|
reasoning_parts: List[str] = []
|
|
reasoning_items_raw: List[Dict[str, Any]] = []
|
|
message_items_raw: List[Dict[str, Any]] = []
|
|
tool_calls: List[Any] = []
|
|
has_incomplete_items = response_status in {"queued", "in_progress", "incomplete"}
|
|
saw_commentary_phase = False
|
|
saw_final_answer_phase = False
|
|
|
|
for item in output:
|
|
item_type = getattr(item, "type", None)
|
|
item_status = getattr(item, "status", None)
|
|
if isinstance(item_status, str):
|
|
item_status = item_status.strip().lower()
|
|
else:
|
|
item_status = None
|
|
|
|
if item_status in {"queued", "in_progress", "incomplete"}:
|
|
has_incomplete_items = True
|
|
|
|
if item_type == "message":
|
|
item_phase = getattr(item, "phase", None)
|
|
normalized_phase = None
|
|
if isinstance(item_phase, str):
|
|
normalized_phase = item_phase.strip().lower()
|
|
if normalized_phase in {"commentary", "analysis"}:
|
|
saw_commentary_phase = True
|
|
elif normalized_phase in {"final_answer", "final"}:
|
|
saw_final_answer_phase = True
|
|
message_text = _extract_responses_message_text(item)
|
|
if message_text:
|
|
content_parts.append(message_text)
|
|
raw_message_item: Dict[str, Any] = {
|
|
"type": "message",
|
|
"role": "assistant",
|
|
"status": _normalize_responses_message_status(item_status),
|
|
"content": [{"type": "output_text", "text": message_text}],
|
|
}
|
|
item_id = getattr(item, "id", None)
|
|
if isinstance(item_id, str) and item_id:
|
|
raw_message_item["id"] = item_id
|
|
if normalized_phase:
|
|
raw_message_item["phase"] = normalized_phase
|
|
message_items_raw.append(raw_message_item)
|
|
elif item_type == "reasoning":
|
|
reasoning_text = _extract_responses_reasoning_text(item)
|
|
if reasoning_text:
|
|
reasoning_parts.append(reasoning_text)
|
|
# Capture the full reasoning item for multi-turn continuity.
|
|
# encrypted_content is an opaque blob the API needs back on
|
|
# subsequent turns to maintain coherent reasoning chains.
|
|
encrypted = getattr(item, "encrypted_content", None)
|
|
if isinstance(encrypted, str) and encrypted:
|
|
raw_item = {"type": "reasoning", "encrypted_content": encrypted}
|
|
item_id = getattr(item, "id", None)
|
|
if isinstance(item_id, str) and item_id:
|
|
raw_item["id"] = item_id
|
|
# Capture summary — required by the API when replaying reasoning items
|
|
summary = getattr(item, "summary", None)
|
|
if isinstance(summary, list):
|
|
raw_summary = []
|
|
for part in summary:
|
|
text = getattr(part, "text", None)
|
|
if isinstance(text, str):
|
|
raw_summary.append({"type": "summary_text", "text": text})
|
|
raw_item["summary"] = raw_summary
|
|
reasoning_items_raw.append(raw_item)
|
|
elif item_type == "function_call":
|
|
if item_status in {"queued", "in_progress", "incomplete"}:
|
|
continue
|
|
fn_name = getattr(item, "name", "") or ""
|
|
arguments = getattr(item, "arguments", "{}")
|
|
if not isinstance(arguments, str):
|
|
arguments = json.dumps(arguments, ensure_ascii=False)
|
|
raw_call_id = getattr(item, "call_id", None)
|
|
raw_item_id = getattr(item, "id", None)
|
|
embedded_call_id, _ = _split_responses_tool_id(raw_item_id)
|
|
call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
call_id = _deterministic_call_id(fn_name, arguments, len(tool_calls))
|
|
call_id = call_id.strip()
|
|
response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
|
|
response_item_id = _derive_responses_function_call_id(call_id, response_item_id)
|
|
tool_calls.append(SimpleNamespace(
|
|
id=call_id,
|
|
call_id=call_id,
|
|
response_item_id=response_item_id,
|
|
type="function",
|
|
function=SimpleNamespace(name=fn_name, arguments=arguments),
|
|
))
|
|
elif item_type == "custom_tool_call":
|
|
fn_name = getattr(item, "name", "") or ""
|
|
arguments = getattr(item, "input", "{}")
|
|
if not isinstance(arguments, str):
|
|
arguments = json.dumps(arguments, ensure_ascii=False)
|
|
raw_call_id = getattr(item, "call_id", None)
|
|
raw_item_id = getattr(item, "id", None)
|
|
embedded_call_id, _ = _split_responses_tool_id(raw_item_id)
|
|
call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
|
|
if not isinstance(call_id, str) or not call_id.strip():
|
|
call_id = _deterministic_call_id(fn_name, arguments, len(tool_calls))
|
|
call_id = call_id.strip()
|
|
response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
|
|
response_item_id = _derive_responses_function_call_id(call_id, response_item_id)
|
|
tool_calls.append(SimpleNamespace(
|
|
id=call_id,
|
|
call_id=call_id,
|
|
response_item_id=response_item_id,
|
|
type="function",
|
|
function=SimpleNamespace(name=fn_name, arguments=arguments),
|
|
))
|
|
|
|
final_text = "\n".join([p for p in content_parts if p]).strip()
|
|
if not final_text and hasattr(response, "output_text"):
|
|
out_text = getattr(response, "output_text", "")
|
|
if isinstance(out_text, str):
|
|
final_text = out_text.strip()
|
|
|
|
# ── Tool-call leak recovery ──────────────────────────────────
|
|
# gpt-5.x on the Codex Responses API sometimes degenerates and emits
|
|
# what should be a structured `function_call` item as plain assistant
|
|
# text using the Harmony/Codex serialization (``to=functions.foo
|
|
# {json}`` or ``assistant to=functions.foo {json}``). The model
|
|
# intended to call a tool, but the intent never made it into
|
|
# ``response.output`` as a ``function_call`` item, so ``tool_calls``
|
|
# is empty here. If we pass this through, the parent sees a
|
|
# confident-looking summary with no audit trail (empty ``tool_trace``)
|
|
# and no tools actually ran — the Taiwan-embassy-email incident.
|
|
#
|
|
# Detection: leaked tokens always contain ``to=functions.<name>`` and
|
|
# the assistant message has no real tool calls. Treat it as incomplete
|
|
# so the existing Codex-incomplete continuation path (3 retries,
|
|
# handled in run_agent.py) gets a chance to re-elicit a proper
|
|
# ``function_call`` item. The existing loop already handles message
|
|
# append, dedup, and retry budget.
|
|
leaked_tool_call_text = False
|
|
if final_text and not tool_calls and _TOOL_CALL_LEAK_PATTERN.search(final_text):
|
|
leaked_tool_call_text = True
|
|
logger.warning(
|
|
"Codex response contains leaked tool-call text in assistant content "
|
|
"(no structured function_call items). Treating as incomplete so the "
|
|
"continuation path can re-elicit a proper tool call. Leaked snippet: %r",
|
|
final_text[:300],
|
|
)
|
|
# Clear the text so downstream code doesn't surface the garbage as
|
|
# a summary. The encrypted reasoning items (if any) are preserved
|
|
# so the model keeps its chain-of-thought on the retry.
|
|
final_text = ""
|
|
|
|
assistant_message = SimpleNamespace(
|
|
content=final_text,
|
|
tool_calls=tool_calls,
|
|
reasoning="\n\n".join(reasoning_parts).strip() if reasoning_parts else None,
|
|
reasoning_content=None,
|
|
reasoning_details=None,
|
|
codex_reasoning_items=reasoning_items_raw or None,
|
|
codex_message_items=message_items_raw or None,
|
|
)
|
|
|
|
if tool_calls:
|
|
finish_reason = "tool_calls"
|
|
elif leaked_tool_call_text:
|
|
finish_reason = "incomplete"
|
|
elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
|
|
finish_reason = "incomplete"
|
|
elif reasoning_items_raw and not final_text:
|
|
# Response contains only reasoning (encrypted thinking state) with
|
|
# no visible content or tool calls. The model is still thinking and
|
|
# needs another turn to produce the actual answer. Marking this as
|
|
# "stop" would send it into the empty-content retry loop which burns
|
|
# 3 retries then fails — treat it as incomplete instead so the Codex
|
|
# continuation path handles it correctly.
|
|
finish_reason = "incomplete"
|
|
else:
|
|
finish_reason = "stop"
|
|
return assistant_message, finish_reason
|