def _convert_content_for_responses(content: Any) -> Any:
    """Convert chat.completions message content to Responses API format.

    chat.completions uses:
      {"type": "text", "text": "..."}
      {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}

    Responses API uses:
      {"type": "input_text", "text": "..."}
      {"type": "input_image", "image_url": "data:image/png;base64,..."}

    Args:
        content: The ``content`` value of a chat.completions message. May be
            a plain string, a list of content parts, ``None``, or any other
            value.

    Returns:
        * The string itself when ``content`` is a string (the Responses API
          accepts strings directly for text-only messages).
        * ``str(content)`` for a truthy non-list value, ``""`` for a falsy one.
        * A list of Responses-style part dicts when at least one part could
          be converted, otherwise ``""``.

    Missing or ``None`` ``text`` / ``image_url`` / ``url`` values are
    normalised to ``""`` so that neither ``None`` nor the literal string
    ``"None"`` is ever emitted as a text or URL value.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content) if content else ""

    # NOTE(review): every part is emitted as an ``input_*`` type regardless of
    # the message role; confirm whether assistant-role history needs a
    # different part type before relying on this for multi-turn input.
    converted: List[Dict[str, Any]] = []
    for part in content:
        if isinstance(part, str):
            # A bare string inside a content list is still user text; keep it
            # instead of silently dropping it.
            if part:
                converted.append({"type": "input_text", "text": part})
            continue
        if not isinstance(part, dict):
            continue

        ptype = part.get("type", "")
        if ptype == "text":
            # ``or ""`` also covers an explicit ``"text": None``.
            converted.append({"type": "input_text", "text": part.get("text") or ""})
        elif ptype == "image_url":
            # chat.completions nests the URL: {"image_url": {"url": "..."}}
            image_data = part.get("image_url")
            detail = None
            if isinstance(image_data, dict):
                url = image_data.get("url") or ""
                detail = image_data.get("detail")
            elif image_data is None:
                # Treat an explicit None like a missing key rather than
                # stringifying it to "None".
                url = ""
            else:
                url = str(image_data)
            entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
            # Preserve detail if specified
            if detail:
                entry["detail"] = detail
            converted.append(entry)
        elif ptype in ("input_text", "input_image"):
            # Already in Responses format — pass through unchanged
            converted.append(part)
        else:
            # Unknown content type — try to preserve as text
            text = part.get("text", "")
            if text:
                converted.append({"type": "input_text", "text": text})

    # An empty list is collapsed to "" so the message still has string content.
    return converted or ""
# Tools support for flush_memories and similar callers tools = kwargs.get("tools") @@ -438,6 +488,12 @@ def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[st logger.warning("auxiliary.provider=openai but OPENAI_API_KEY not set") return client, model + if forced == "codex": + client, model = _try_codex() + if client is None: + logger.warning("auxiliary.provider=codex but no Codex OAuth token found (run: hermes model)") + return client, model + if forced == "main": # "main" = skip OpenRouter/Nous, use the main chat model's credentials. for try_fn in (_try_custom_endpoint, _try_codex, _resolve_api_key_provider): @@ -515,21 +571,21 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: auto-detects. Callers may override the returned model with AUXILIARY_VISION_MODEL. - In auto mode, only OpenRouter and Nous Portal are tried because they - are known to support multimodal (Gemini). Custom endpoints, Codex, - and API-key providers are skipped — they may not handle vision input - and would produce confusing errors. To use one of those providers - for vision, set AUXILIARY_VISION_PROVIDER explicitly. + In auto mode, only providers known to support multimodal are tried: + OpenRouter, Nous Portal, and Codex OAuth (gpt-5.3-codex supports + vision via the Responses API). Custom endpoints and API-key + providers are skipped — they may not handle vision input. To use + them, set AUXILIARY_VISION_PROVIDER explicitly. 
""" forced = _get_auxiliary_provider("vision") if forced != "auto": return _resolve_forced_provider(forced) # Auto: only multimodal-capable providers - for try_fn in (_try_openrouter, _try_nous): + for try_fn in (_try_openrouter, _try_nous, _try_codex): client, model = try_fn() if client is not None: return client, model - logger.debug("Auxiliary vision client: none available (auto only tries OpenRouter/Nous)") + logger.debug("Auxiliary vision client: none available (auto only tries OpenRouter/Nous/Codex)") return None, None diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index f035993197..8454bbea47 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -167,12 +167,14 @@ class TestVisionClientFallback: assert client is None assert model is None - def test_vision_auto_skips_codex(self, codex_auth_dir): - """Even with Codex available, vision auto mode returns None (Codex can't do multimodal).""" - with patch("agent.auxiliary_client._read_nous_auth", return_value=None): + def test_vision_auto_includes_codex(self, codex_auth_dir): + """Codex supports vision (gpt-5.3-codex), so auto mode should use it.""" + with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ + patch("agent.auxiliary_client.OpenAI"): client, model = get_vision_auxiliary_client() - assert client is None - assert model is None + from agent.auxiliary_client import CodexAuxiliaryClient + assert isinstance(client, CodexAuxiliaryClient) + assert model == "gpt-5.3-codex" def test_vision_auto_skips_custom_endpoint(self, monkeypatch): """Custom endpoint is skipped in vision auto mode.""" diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 3ca2d18ba7..3f7214e2fa 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -478,10 +478,11 @@ AUXILIARY_VISION_MODEL=openai/gpt-4o | Provider | Description | Requirements 
| |----------|-------------|-------------| -| `"auto"` | Best available (default). Vision only tries OpenRouter + Nous Portal. | — | +| `"auto"` | Best available (default). Vision tries OpenRouter → Nous → Codex. | — | | `"openrouter"` | Force OpenRouter — routes to any model (Gemini, GPT-4o, Claude, etc.) | `OPENROUTER_API_KEY` | | `"nous"` | Force Nous Portal | `hermes login` | | `"openai"` | Force OpenAI direct API (`api.openai.com`). Supports vision (GPT-4o). | `OPENAI_API_KEY` | +| `"codex"` | Force Codex OAuth (ChatGPT account). Supports vision (gpt-5.3-codex). | `hermes model` → Codex | | `"main"` | Use your main chat model's provider. For local/self-hosted models. | Depends on your setup | ### Common Setups @@ -502,6 +503,14 @@ auxiliary: model: "openai/gpt-4o" # or "google/gemini-2.5-flash", etc. ``` +**Using Codex OAuth** (ChatGPT Pro/Plus account — no API key needed): +```yaml +auxiliary: + vision: + provider: "codex" # uses your ChatGPT OAuth token + # model defaults to gpt-5.3-codex (supports vision) +``` + **Using a local/self-hosted model:** ```yaml auxiliary: @@ -510,8 +519,12 @@ auxiliary: model: "my-local-model" ``` +:::tip +If you use Codex OAuth as your main model provider, vision works automatically — no extra configuration needed. Codex is included in the auto-detection chain for vision. +::: + :::warning -**Vision requires a multimodal model.** In `auto` mode, only OpenRouter and Nous Portal are tried (they route to Gemini, which supports images). If you set `provider: "main"`, make sure your endpoint supports multimodal/vision — otherwise image analysis will fail. The `"openai"` provider works for vision since GPT-4o supports image input. +**Vision requires a multimodal model.** If you set `provider: "main"`, make sure your endpoint supports multimodal/vision — otherwise image analysis will fail. ::: ### Environment Variables