From 735a6e7651a003053c92dd109aa96ce00e4e8e8c Mon Sep 17 00:00:00 2001
From: teknium1
Date: Sat, 14 Mar 2026 23:21:09 -0700
Subject: [PATCH] fix: convert anthropic image content blocks

---
 agent/anthropic_adapter.py           |  92 +++++++++++++--
 agent/auxiliary_client.py            |   5 +-
 run_agent.py                         | 153 ++++++++++++++++++++++++++-
 tests/agent/test_auxiliary_client.py |  10 +-
 tests/test_anthropic_adapter.py      |  53 ++++++++++
 tests/test_provider_parity.py        |   2 +-
 tests/test_run_agent.py              |  65 +++++++++++-
 7 files changed, 365 insertions(+), 15 deletions(-)

diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py
index 39efa219c..0b76c1e73 100644
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -406,6 +406,88 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
     return result
 
 
+def _image_source_from_openai_url(url: str) -> Dict[str, str]:
+    """Convert an OpenAI-style image URL or data URL into an Anthropic image source.
+
+    ``data:`` URLs become a ``base64`` source; every other non-empty value is
+    passed through as a ``url`` source. The media type is read from the data
+    URL header, lower-cased, and falls back to ``image/jpeg`` when the header
+    does not name an ``image/*`` type.
+    """
+    url = str(url or "").strip()
+    if not url:
+        return {"type": "url", "url": ""}
+
+    if url.startswith("data:"):
+        header, _, data = url.partition(",")
+        media_type = "image/jpeg"
+        # MIME types are case-insensitive; compare on the lower-cased form.
+        mime_part = header[len("data:"):].split(";", 1)[0].strip().lower()
+        if mime_part == "image/jpg":
+            # Non-standard alias of the registered "image/jpeg" type.
+            mime_part = "image/jpeg"
+        if mime_part.startswith("image/"):
+            media_type = mime_part
+        return {
+            "type": "base64",
+            "media_type": media_type,
+            "data": data,
+        }
+
+    return {"type": "url", "url": url}
+
+
+def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:
+    """Convert a single OpenAI-style content part to an Anthropic content block.
+
+    Returns ``None`` for a ``None`` part so callers can drop it. Strings and
+    other non-dict values become text blocks; ``input_text`` parts become text
+    blocks; ``image_url``/``input_image`` parts become image blocks. Any other
+    dict is shallow-copied unchanged.
+    """
+    if part is None:
+        return None
+    if isinstance(part, str):
+        return {"type": "text", "text": part}
+    if not isinstance(part, dict):
+        return {"type": "text", "text": str(part)}
+
+    ptype = part.get("type")
+
+    if ptype == "input_text":
+        # An explicit ``"text": None`` must not be forwarded as a null text.
+        block: Dict[str, Any] = {"type": "text", "text": part.get("text") or ""}
+    elif ptype in {"image_url", "input_image"}:
+        # ``image_url`` may be a {"url": ...} dict or a bare string; both
+        # shapes are accepted here.
+        image_value = part.get("image_url", {})
+        url = image_value.get("url", "") if isinstance(image_value, dict) else str(image_value or "")
+        block = {"type": "image", "source": _image_source_from_openai_url(url)}
+    else:
+        block = dict(part)
+
+    # Rebuilt blocks lose the part's extra keys; keep a dict-valued cache_control.
+    if isinstance(part.get("cache_control"), dict) and "cache_control" not in block:
+        block["cache_control"] = dict(part["cache_control"])
+    return block
+
+
+def _convert_content_to_anthropic(content: Any) -> Any:
+    """Convert an OpenAI-style multimodal content list to Anthropic blocks.
+
+    Non-list content (for example a plain string) is returned unchanged.
+    """
+    if not isinstance(content, list):
+        return content
+
+    converted = []
+    for part in content:
+        block = _convert_content_part_to_anthropic(part)
+        if block is not None:
+            converted.append(block)
+    return converted
+
+
 def convert_messages_to_anthropic(
     messages: List[Dict],
 ) -> Tuple[Optional[Any], List[Dict]]:
@@ -442,11 +524,9 @@ def convert_messages_to_anthropic(
             blocks = []
             if content:
                 if isinstance(content, list):
-                    for part in content:
-                        if isinstance(part, dict):
-                            blocks.append(dict(part))
-                        elif part is not None:
-                            blocks.append({"type": "text", "text": str(part)})
+                    converted_content = _convert_content_to_anthropic(content)
+                    if isinstance(converted_content, list):
+                        blocks.extend(converted_content)
                 else:
                     blocks.append({"type": "text", "text": str(content)})
             for tc in m.get("tool_calls", []):
@@ -495,7 +575,7 @@ def convert_messages_to_anthropic(
             continue
 
         # Regular user message
-        result.append({"role": "user", "content": content})
+        result.append({"role": "user", "content": _convert_content_to_anthropic(content)})
 
     # Strip orphaned tool_use blocks (no matching tool_result follows)
     tool_result_ids = set()
diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 15f152a09..bfbe3ce82 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -78,7 +78,10 @@ _AUTH_JSON_PATH = get_hermes_home() / "auth.json"
 
 # Codex fallback: uses the Responses API (the only endpoint the Codex
 # OAuth token can access) with a fast model for auxiliary tasks.
-_CODEX_AUX_MODEL = "gpt-5.3-codex" +# ChatGPT-backed Codex accounts currently reject gpt-5.3-codex for these +# auxiliary flows, while gpt-5.2-codex remains broadly available and supports +# vision via Responses. +_CODEX_AUX_MODEL = "gpt-5.2-codex" _CODEX_AUX_BASE_URL = "https://chatgpt.com/backend-api/codex" diff --git a/run_agent.py b/run_agent.py index f2f71aca7..8e78f9099 100644 --- a/run_agent.py +++ b/run_agent.py @@ -21,6 +21,8 @@ Usage: """ import atexit +import asyncio +import base64 import concurrent.futures import copy import hashlib @@ -31,6 +33,7 @@ import os import random import re import sys +import tempfile import time import threading import weakref @@ -503,6 +506,11 @@ class AIAgent: self._persist_user_message_idx = None self._persist_user_message_override = None + # Cache anthropic image-to-text fallbacks per image payload/URL so a + # single tool loop does not repeatedly re-run auxiliary vision on the + # same image history. + self._anthropic_image_fallback_cache: Dict[str, str] = {} + # Initialize LLM client via centralized provider router. # The router handles auth resolution, base URL, headers, and # Codex/Anthropic wrapping for all known providers. 
@@ -2921,13 +2929,196 @@ class AIAgent:
 
     # ── End provider fallback ──────────────────────────────────────────────
 
+    @staticmethod
+    def _content_has_image_parts(content: Any) -> bool:
+        """Return True when ``content`` is a list holding an image part."""
+        if not isinstance(content, list):
+            return False
+        return any(
+            isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}
+            for part in content
+        )
+
+    @staticmethod
+    def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
+        """Write the payload of a base64 ``data:`` URL to a temporary file.
+
+        Returns the file path twice, as ``str`` for the vision tool and as
+        ``Path`` for the caller to delete afterwards. Raises ``ValueError``
+        (``binascii.Error``) when the payload is not valid base64.
+        """
+        header, _, data = str(image_url or "").partition(",")
+        mime = "image/jpeg"
+        if header.startswith("data:"):
+            mime_part = header[len("data:"):].split(";", 1)[0].strip()
+            if mime_part.startswith("image/"):
+                mime = mime_part
+        suffix = {
+            "image/png": ".png",
+            "image/gif": ".gif",
+            "image/webp": ".webp",
+            "image/jpeg": ".jpg",
+            "image/jpg": ".jpg",
+        }.get(mime, ".jpg")
+        # Decode before creating the file: with delete=False a failed decode
+        # would otherwise leave an orphaned temp file behind.
+        payload = base64.b64decode(data)
+        with tempfile.NamedTemporaryFile(
+            prefix="anthropic_image_", suffix=suffix, delete=False
+        ) as tmp:
+            tmp.write(payload)
+        path = Path(tmp.name)
+        return str(path), path
+
+    @staticmethod
+    def _run_coroutine_blocking(coro: Any) -> Any:
+        """Run ``coro`` to completion from synchronous code and return its result.
+
+        ``asyncio.run`` raises ``RuntimeError`` when the calling thread already
+        has a running event loop, so in that case the coroutine is driven by
+        ``asyncio.run`` on a short-lived worker thread instead.
+        """
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            return asyncio.run(coro)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+            return pool.submit(asyncio.run, coro).result()
+
+    def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
+        """Return a text note describing ``image_url`` via the auxiliary vision tool.
+
+        Notes (including failure notes) are cached on the agent, keyed by the
+        SHA-256 of the image URL/payload. Failures, including a malformed data
+        URL, are reported inside the note rather than raised.
+        """
+        cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
+        cached = self._anthropic_image_fallback_cache.get(cache_key)
+        if cached:
+            return cached
+
+        role_label = {
+            "assistant": "assistant",
+            "tool": "tool result",
+        }.get(role, "user")
+        analysis_prompt = (
+            "Describe everything visible in this image in thorough detail. "
+            "Include any text, code, UI, data, objects, people, layout, colors, "
+            "and any other notable visual information."
+        )
+
+        vision_source = str(image_url or "")
+        is_data_url = vision_source.startswith("data:")
+        cleanup_path: Optional[Path] = None
+        description = ""
+        try:
+            from tools.vision_tools import vision_analyze_tool
+
+            if is_data_url:
+                # Inside the try so a bad base64 payload degrades to a failure
+                # note instead of aborting the whole API request.
+                vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)
+
+            result_json = self._run_coroutine_blocking(
+                vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
+            )
+            result = json.loads(result_json) if isinstance(result_json, str) else {}
+            description = (result.get("analysis") or "").strip()
+        except Exception as e:
+            description = f"Image analysis failed: {e}"
+        finally:
+            if cleanup_path is not None:
+                try:
+                    cleanup_path.unlink()
+                except OSError:
+                    pass
+
+        if not description:
+            description = "Image analysis failed."
+
+        note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
+        if vision_source and not is_data_url:
+            note += (
+                f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
+            )
+
+        self._anthropic_image_fallback_cache[cache_key] = note
+        return note
+
+    def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
+        """Flatten multimodal ``content`` into text, describing each image.
+
+        Content without image parts is returned unchanged. Otherwise the
+        result is a string: image notes first, then the text parts.
+        """
+        if not self._content_has_image_parts(content):
+            return content
+
+        text_parts: List[str] = []
+        image_notes: List[str] = []
+        for part in content:
+            if isinstance(part, str):
+                if part.strip():
+                    text_parts.append(part.strip())
+                continue
+            if not isinstance(part, dict):
+                continue
+
+            ptype = part.get("type")
+            if ptype in {"image_url", "input_image"}:
+                image_data = part.get("image_url", {})
+                image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
+                if image_url:
+                    image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
+                else:
+                    image_notes.append("[An image was attached but no image source was available.]")
+                continue
+
+            # "text"/"input_text" parts and any other dict carrying a "text" key.
+            text = str(part.get("text", "") or "").strip()
+            if text:
+                text_parts.append(text)
+
+        prefix = "\n\n".join(note for note in image_notes if note).strip()
+        suffix = "\n".join(text for text in text_parts if text).strip()
+        if prefix and suffix:
+            return f"{prefix}\n\n{suffix}"
+        if prefix:
+            return prefix
+        if suffix:
+            return suffix
+        return "[A multimodal message was converted to text for Anthropic compatibility.]"
+
+    def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
+        """Return ``api_messages`` with image content replaced by text notes.
+
+        The input list is returned as-is when no message carries an image part;
+        otherwise a deep copy is transformed so the caller's list is untouched.
+        """
+        if not any(
+            isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
+            for msg in api_messages
+        ):
+            return api_messages
+
+        transformed = copy.deepcopy(api_messages)
+        for msg in transformed:
+            if not isinstance(msg, dict):
+                continue
+            msg["content"] = self._preprocess_anthropic_content(
+                msg.get("content"),
+                str(msg.get("role", "user") or "user"),
+            )
+        return transformed
+
     def _build_api_kwargs(self, api_messages: list) -> dict:
         """Build the keyword arguments dict for the active API mode."""
         if self.api_mode == "anthropic_messages":
             from agent.anthropic_adapter import build_anthropic_kwargs
+            anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
             return build_anthropic_kwargs(
                 model=self.model,
-                messages=api_messages,
+                messages=anthropic_messages,
                 tools=self.tools,
                 max_tokens=self.max_tokens,
                 reasoning_config=self.reasoning_config,
diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py
index d9e07bc6f..ffbf69a2c 100644
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -169,7 +169,7 @@ class TestGetTextAuxiliaryClient:
         with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
              patch("agent.auxiliary_client.OpenAI") as mock_openai:
             client, model = get_text_auxiliary_client()
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
         # Returns a CodexAuxiliaryClient wrapper, not a raw OpenAI client
         from agent.auxiliary_client import CodexAuxiliaryClient
         assert isinstance(client, CodexAuxiliaryClient)
@@ -202,7 +202,7 @@ class TestVisionClientFallback:
             client, model = get_vision_auxiliary_client()
         from agent.auxiliary_client import CodexAuxiliaryClient
         assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
 
     def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch):
         """Custom endpoint is used as fallback in vision auto mode.
@@ -264,7 +264,7 @@ class TestVisionClientFallback:
             client, model = get_vision_auxiliary_client()
         from agent.auxiliary_client import CodexAuxiliaryClient
         assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
 
 
 class TestGetAuxiliaryProvider:
@@ -382,7 +382,7 @@ class TestResolveForcedProvider:
             client, model = _resolve_forced_provider("main")
         from agent.auxiliary_client import CodexAuxiliaryClient
         assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
 
     def test_forced_codex(self, codex_auth_dir, monkeypatch):
         with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
@@ -390,7 +390,7 @@ class TestResolveForcedProvider:
             client, model = _resolve_forced_provider("codex")
         from agent.auxiliary_client import CodexAuxiliaryClient
         assert isinstance(client, CodexAuxiliaryClient)
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
 
     def test_forced_codex_no_token(self, monkeypatch):
         with patch("agent.auxiliary_client._read_codex_access_token", return_value=None):
diff --git a/tests/test_anthropic_adapter.py b/tests/test_anthropic_adapter.py
index 541d8e2bc..02661eede 100644
--- a/tests/test_anthropic_adapter.py
+++ b/tests/test_anthropic_adapter.py
@@ -467,6 +467,59 @@ class TestConvertMessages:
         assert len(result) == 1
         assert result[0]["role"] == "user"
 
+    def test_converts_user_image_url_blocks_to_anthropic_image_blocks(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Can you see this?"},
+                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},  # dict form of image_url
+                ],
+            }
+        ]
+
+        _, result = convert_messages_to_anthropic(messages)  # first tuple element is not checked here
+
+        assert result == [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Can you see this?"},
+                    {"type": "image", "source": {"type": "url", "url": "https://example.com/cat.png"}},  # non-data URL stays a "url" source
+                ],
+            }
+        ]
+
+    def test_converts_data_url_image_blocks_to_base64_anthropic_image_blocks(self):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "input_text", "text": "What is in this screenshot?"},
+                    {"type": "input_image", "image_url": "data:image/png;base64,AAAA"},  # bare-string form of image_url
+                ],
+            }
+        ]
+
+        _, result = convert_messages_to_anthropic(messages)  # first tuple element is not checked here
+
+        assert result == [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What is in this screenshot?"},  # input_text is rewritten to text
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",  # taken from the data URL header
+                            "data": "AAAA",  # everything after the first comma
+                        },
+                    },
+                ],
+            }
+        ]
+
     def test_converts_tool_calls(self):
         messages = [
             {
diff --git a/tests/test_provider_parity.py b/tests/test_provider_parity.py
index bb91b055c..dc976b8f1 100644
--- a/tests/test_provider_parity.py
+++ b/tests/test_provider_parity.py
@@ -543,7 +543,7 @@ class TestAuxiliaryClientProviderPriority:
              patch("agent.auxiliary_client._read_codex_access_token", return_value="codex-tok"), \
              patch("agent.auxiliary_client.OpenAI"):
             client, model = get_text_auxiliary_client()
-        assert model == "gpt-5.3-codex"
+        assert model == "gpt-5.2-codex"
         assert isinstance(client, CodexAuxiliaryClient)
 
 
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index c3673eb1e..f9623f50a 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -12,7 +12,7 @@ import uuid
 from logging.handlers import RotatingFileHandler
 from pathlib import Path
 from types import SimpleNamespace
-from unittest.mock import MagicMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
 
@@ -1986,6 +1986,69 @@ class TestBuildApiKwargsAnthropicMaxTokens:
         assert call_args[0][3] is None
 
 
+class TestAnthropicImageFallback:
+    def test_build_api_kwargs_converts_multimodal_user_image_to_text(self, agent):
+        agent.api_mode = "anthropic_messages"
+        agent.reasoning_config = None
+
+        api_messages = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Can you see this now?"},
+                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
+            ],
+        }]
+
+        with (
+            patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=json.dumps({"success": True, "analysis": "A cat sitting on a chair."}))),
+            patch("agent.anthropic_adapter.build_anthropic_kwargs") as mock_build,
+        ):
+            mock_build.return_value = {"model": "claude-sonnet-4-20250514", "messages": [], "max_tokens": 4096}
+            agent._build_api_kwargs(api_messages)
+
+        kwargs = mock_build.call_args.kwargs or dict(zip(  # fall back to positional args when no kwargs were recorded
+            ["model", "messages", "tools", "max_tokens", "reasoning_config"],
+            mock_build.call_args.args,
+        ))
+        transformed = kwargs["messages"]
+        assert isinstance(transformed[0]["content"], str)  # multimodal list was flattened to text
+        assert "A cat sitting on a chair." in transformed[0]["content"]
+        assert "Can you see this now?" in transformed[0]["content"]
+        assert "vision_analyze with image_url: https://example.com/cat.png" in transformed[0]["content"]  # hint is added for non-data URLs
+
+    def test_build_api_kwargs_reuses_cached_image_analysis_for_duplicate_images(self, agent):
+        agent.api_mode = "anthropic_messages"
+        agent.reasoning_config = None
+        data_url = "data:image/png;base64,QUFBQQ=="  # base64 of b"AAAA"
+
+        api_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "first"},
+                    {"type": "input_image", "image_url": data_url},
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "second"},
+                    {"type": "input_image", "image_url": data_url},  # same payload as the first message
+                ],
+            },
+        ]
+
+        mock_vision = AsyncMock(return_value=json.dumps({"success": True, "analysis": "A small test image."}))
+        with (
+            patch("tools.vision_tools.vision_analyze_tool", new=mock_vision),
+            patch("agent.anthropic_adapter.build_anthropic_kwargs") as mock_build,
+        ):
+            mock_build.return_value = {"model": "claude-sonnet-4-20250514", "messages": [], "max_tokens": 4096}
+            agent._build_api_kwargs(api_messages)
+
+        assert mock_vision.await_count == 1  # the duplicate image must be served from the cache
+
+
 class TestFallbackAnthropicProvider:
     """Bug fix: _try_activate_fallback had no case for anthropic provider."""