Merge pull request #1408 from NousResearch/hermes/hermes-daa73839

fix: make Claude image handling work end-to-end
This commit is contained in:
Teknium 2026-03-14 23:45:03 -07:00 committed by GitHub
commit 779f8df6a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 347 additions and 20 deletions

View file

@ -497,6 +497,66 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
return result return result
def _image_source_from_openai_url(url: str) -> Dict[str, str]:
"""Convert an OpenAI-style image URL/data URL into Anthropic image source."""
url = str(url or "").strip()
if not url:
return {"type": "url", "url": ""}
if url.startswith("data:"):
header, _, data = url.partition(",")
media_type = "image/jpeg"
if header.startswith("data:"):
mime_part = header[len("data:"):].split(";", 1)[0].strip()
if mime_part.startswith("image/"):
media_type = mime_part
return {
"type": "base64",
"media_type": media_type,
"data": data,
}
return {"type": "url", "url": url}
def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:
"""Convert a single OpenAI-style content part to Anthropic format."""
if part is None:
return None
if isinstance(part, str):
return {"type": "text", "text": part}
if not isinstance(part, dict):
return {"type": "text", "text": str(part)}
ptype = part.get("type")
if ptype == "input_text":
block: Dict[str, Any] = {"type": "text", "text": part.get("text", "")}
elif ptype in {"image_url", "input_image"}:
image_value = part.get("image_url", {})
url = image_value.get("url", "") if isinstance(image_value, dict) else str(image_value or "")
block = {"type": "image", "source": _image_source_from_openai_url(url)}
else:
block = dict(part)
if isinstance(part.get("cache_control"), dict) and "cache_control" not in block:
block["cache_control"] = dict(part["cache_control"])
return block
def _convert_content_to_anthropic(content: Any) -> Any:
"""Convert OpenAI-style multimodal content arrays to Anthropic blocks."""
if not isinstance(content, list):
return content
converted = []
for part in content:
block = _convert_content_part_to_anthropic(part)
if block is not None:
converted.append(block)
return converted
def convert_messages_to_anthropic( def convert_messages_to_anthropic(
messages: List[Dict], messages: List[Dict],
) -> Tuple[Optional[Any], List[Dict]]: ) -> Tuple[Optional[Any], List[Dict]]:
@ -533,11 +593,9 @@ def convert_messages_to_anthropic(
blocks = [] blocks = []
if content: if content:
if isinstance(content, list): if isinstance(content, list):
for part in content: converted_content = _convert_content_to_anthropic(content)
if isinstance(part, dict): if isinstance(converted_content, list):
blocks.append(dict(part)) blocks.extend(converted_content)
elif part is not None:
blocks.append({"type": "text", "text": str(part)})
else: else:
blocks.append({"type": "text", "text": str(content)}) blocks.append({"type": "text", "text": str(content)})
for tc in m.get("tool_calls", []): for tc in m.get("tool_calls", []):
@ -587,12 +645,11 @@ def convert_messages_to_anthropic(
# Regular user message # Regular user message
if isinstance(content, list): if isinstance(content, list):
converted_blocks = [] converted_blocks = _convert_content_to_anthropic(content)
for part in content: result.append({
converted = _convert_user_content_part_to_anthropic(part) "role": "user",
if converted is not None: "content": converted_blocks or [{"type": "text", "text": ""}],
converted_blocks.append(converted) })
result.append({"role": "user", "content": converted_blocks or [{"type": "text", "text": ""}]})
else: else:
result.append({"role": "user", "content": content}) result.append({"role": "user", "content": content})

View file

@ -83,7 +83,10 @@ _AUTH_JSON_PATH = get_hermes_home() / "auth.json"
# Codex fallback: uses the Responses API (the only endpoint the Codex # Codex fallback: uses the Responses API (the only endpoint the Codex
# OAuth token can access) with a fast model for auxiliary tasks. # OAuth token can access) with a fast model for auxiliary tasks.
_CODEX_AUX_MODEL = "gpt-5.3-codex" # ChatGPT-backed Codex accounts currently reject gpt-5.3-codex for these
# auxiliary flows, while gpt-5.2-codex remains broadly available and supports
# vision via Responses.
_CODEX_AUX_MODEL = "gpt-5.2-codex"
_CODEX_AUX_BASE_URL = "https://chatgpt.com/backend-api/codex" _CODEX_AUX_BASE_URL = "https://chatgpt.com/backend-api/codex"

View file

@ -21,6 +21,8 @@ Usage:
""" """
import atexit import atexit
import asyncio
import base64
import concurrent.futures import concurrent.futures
import copy import copy
import hashlib import hashlib
@ -31,6 +33,7 @@ import os
import random import random
import re import re
import sys import sys
import tempfile
import time import time
import threading import threading
import weakref import weakref
@ -504,6 +507,11 @@ class AIAgent:
self._persist_user_message_idx = None self._persist_user_message_idx = None
self._persist_user_message_override = None self._persist_user_message_override = None
# Cache anthropic image-to-text fallbacks per image payload/URL so a
# single tool loop does not repeatedly re-run auxiliary vision on the
# same image history.
self._anthropic_image_fallback_cache: Dict[str, str] = {}
# Initialize LLM client via centralized provider router. # Initialize LLM client via centralized provider router.
# The router handles auth resolution, base URL, headers, and # The router handles auth resolution, base URL, headers, and
# Codex/Anthropic wrapping for all known providers. # Codex/Anthropic wrapping for all known providers.
@ -3034,13 +3042,156 @@ class AIAgent:
# ── End provider fallback ────────────────────────────────────────────── # ── End provider fallback ──────────────────────────────────────────────
@staticmethod
def _content_has_image_parts(content: Any) -> bool:
if not isinstance(content, list):
return False
for part in content:
if isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}:
return True
return False
@staticmethod
def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
header, _, data = str(image_url or "").partition(",")
mime = "image/jpeg"
if header.startswith("data:"):
mime_part = header[len("data:"):].split(";", 1)[0].strip()
if mime_part.startswith("image/"):
mime = mime_part
suffix = {
"image/png": ".png",
"image/gif": ".gif",
"image/webp": ".webp",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
}.get(mime, ".jpg")
tmp = tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False)
with tmp:
tmp.write(base64.b64decode(data))
path = Path(tmp.name)
return str(path), path
def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
cached = self._anthropic_image_fallback_cache.get(cache_key)
if cached:
return cached
role_label = {
"assistant": "assistant",
"tool": "tool result",
}.get(role, "user")
analysis_prompt = (
"Describe everything visible in this image in thorough detail. "
"Include any text, code, UI, data, objects, people, layout, colors, "
"and any other notable visual information."
)
vision_source = str(image_url or "")
cleanup_path: Optional[Path] = None
if vision_source.startswith("data:"):
vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)
description = ""
try:
from tools.vision_tools import vision_analyze_tool
result_json = asyncio.run(
vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
)
result = json.loads(result_json) if isinstance(result_json, str) else {}
description = (result.get("analysis") or "").strip()
except Exception as e:
description = f"Image analysis failed: {e}"
finally:
if cleanup_path and cleanup_path.exists():
try:
cleanup_path.unlink()
except OSError:
pass
if not description:
description = "Image analysis failed."
note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
if vision_source and not str(image_url or "").startswith("data:"):
note += (
f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
)
self._anthropic_image_fallback_cache[cache_key] = note
return note
def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
if not self._content_has_image_parts(content):
return content
text_parts: List[str] = []
image_notes: List[str] = []
for part in content:
if isinstance(part, str):
if part.strip():
text_parts.append(part.strip())
continue
if not isinstance(part, dict):
continue
ptype = part.get("type")
if ptype in {"text", "input_text"}:
text = str(part.get("text", "") or "").strip()
if text:
text_parts.append(text)
continue
if ptype in {"image_url", "input_image"}:
image_data = part.get("image_url", {})
image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
if image_url:
image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
else:
image_notes.append("[An image was attached but no image source was available.]")
continue
text = str(part.get("text", "") or "").strip()
if text:
text_parts.append(text)
prefix = "\n\n".join(note for note in image_notes if note).strip()
suffix = "\n".join(text for text in text_parts if text).strip()
if prefix and suffix:
return f"{prefix}\n\n{suffix}"
if prefix:
return prefix
if suffix:
return suffix
return "[A multimodal message was converted to text for Anthropic compatibility.]"
def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
if not any(
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
for msg in api_messages
):
return api_messages
transformed = copy.deepcopy(api_messages)
for msg in transformed:
if not isinstance(msg, dict):
continue
msg["content"] = self._preprocess_anthropic_content(
msg.get("content"),
str(msg.get("role", "user") or "user"),
)
return transformed
def _build_api_kwargs(self, api_messages: list) -> dict: def _build_api_kwargs(self, api_messages: list) -> dict:
"""Build the keyword arguments dict for the active API mode.""" """Build the keyword arguments dict for the active API mode."""
if self.api_mode == "anthropic_messages": if self.api_mode == "anthropic_messages":
from agent.anthropic_adapter import build_anthropic_kwargs from agent.anthropic_adapter import build_anthropic_kwargs
anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
return build_anthropic_kwargs( return build_anthropic_kwargs(
model=self.model, model=self.model,
messages=api_messages, messages=anthropic_messages,
tools=self.tools, tools=self.tools,
max_tokens=self.max_tokens, max_tokens=self.max_tokens,
reasoning_config=self.reasoning_config, reasoning_config=self.reasoning_config,

View file

@ -195,7 +195,7 @@ class TestGetTextAuxiliaryClient:
with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
patch("agent.auxiliary_client.OpenAI") as mock_openai: patch("agent.auxiliary_client.OpenAI") as mock_openai:
client, model = get_text_auxiliary_client() client, model = get_text_auxiliary_client()
assert model == "gpt-5.3-codex" assert model == "gpt-5.2-codex"
# Returns a CodexAuxiliaryClient wrapper, not a raw OpenAI client # Returns a CodexAuxiliaryClient wrapper, not a raw OpenAI client
from agent.auxiliary_client import CodexAuxiliaryClient from agent.auxiliary_client import CodexAuxiliaryClient
assert isinstance(client, CodexAuxiliaryClient) assert isinstance(client, CodexAuxiliaryClient)
@ -288,7 +288,7 @@ class TestVisionClientFallback:
client, model = get_vision_auxiliary_client() client, model = get_vision_auxiliary_client()
from agent.auxiliary_client import CodexAuxiliaryClient from agent.auxiliary_client import CodexAuxiliaryClient
assert isinstance(client, CodexAuxiliaryClient) assert isinstance(client, CodexAuxiliaryClient)
assert model == "gpt-5.3-codex" assert model == "gpt-5.2-codex"
def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch): def test_vision_auto_falls_back_to_custom_endpoint(self, monkeypatch):
"""Custom endpoint is used as fallback in vision auto mode. """Custom endpoint is used as fallback in vision auto mode.
@ -371,7 +371,7 @@ class TestVisionClientFallback:
client, model = get_vision_auxiliary_client() client, model = get_vision_auxiliary_client()
from agent.auxiliary_client import CodexAuxiliaryClient from agent.auxiliary_client import CodexAuxiliaryClient
assert isinstance(client, CodexAuxiliaryClient) assert isinstance(client, CodexAuxiliaryClient)
assert model == "gpt-5.3-codex" assert model == "gpt-5.2-codex"
class TestGetAuxiliaryProvider: class TestGetAuxiliaryProvider:
@ -489,7 +489,7 @@ class TestResolveForcedProvider:
client, model = _resolve_forced_provider("main") client, model = _resolve_forced_provider("main")
from agent.auxiliary_client import CodexAuxiliaryClient from agent.auxiliary_client import CodexAuxiliaryClient
assert isinstance(client, CodexAuxiliaryClient) assert isinstance(client, CodexAuxiliaryClient)
assert model == "gpt-5.3-codex" assert model == "gpt-5.2-codex"
def test_forced_codex(self, codex_auth_dir, monkeypatch): def test_forced_codex(self, codex_auth_dir, monkeypatch):
with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
@ -497,7 +497,7 @@ class TestResolveForcedProvider:
client, model = _resolve_forced_provider("codex") client, model = _resolve_forced_provider("codex")
from agent.auxiliary_client import CodexAuxiliaryClient from agent.auxiliary_client import CodexAuxiliaryClient
assert isinstance(client, CodexAuxiliaryClient) assert isinstance(client, CodexAuxiliaryClient)
assert model == "gpt-5.3-codex" assert model == "gpt-5.2-codex"
def test_forced_codex_no_token(self, monkeypatch): def test_forced_codex_no_token(self, monkeypatch):
with patch("agent.auxiliary_client._read_codex_access_token", return_value=None): with patch("agent.auxiliary_client._read_codex_access_token", return_value=None):

View file

@ -495,6 +495,59 @@ class TestConvertMessages:
assert len(result) == 1 assert len(result) == 1
assert result[0]["role"] == "user" assert result[0]["role"] == "user"
def test_converts_user_image_url_blocks_to_anthropic_image_blocks(self):
    """A remote ``image_url`` part becomes a URL-sourced Anthropic image block."""
    openai_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Can you see this?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
        ],
    }
    expected_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Can you see this?"},
            {"type": "image", "source": {"type": "url", "url": "https://example.com/cat.png"}},
        ],
    }

    _system, converted = convert_messages_to_anthropic([openai_message])

    assert converted == [expected_message]
def test_converts_data_url_image_blocks_to_base64_anthropic_image_blocks(self):
    """An ``input_image`` data URL becomes a base64-sourced Anthropic image block."""
    openai_message = {
        "role": "user",
        "content": [
            {"type": "input_text", "text": "What is in this screenshot?"},
            {"type": "input_image", "image_url": "data:image/png;base64,AAAA"},
        ],
    }
    expected_image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": "AAAA",
        },
    }
    expected_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this screenshot?"},
            expected_image_block,
        ],
    }

    _system, converted = convert_messages_to_anthropic([openai_message])

    assert converted == [expected_message]
def test_converts_tool_calls(self): def test_converts_tool_calls(self):
messages = [ messages = [
{ {

View file

@ -543,7 +543,7 @@ class TestAuxiliaryClientProviderPriority:
patch("agent.auxiliary_client._read_codex_access_token", return_value="codex-tok"), \ patch("agent.auxiliary_client._read_codex_access_token", return_value="codex-tok"), \
patch("agent.auxiliary_client.OpenAI"): patch("agent.auxiliary_client.OpenAI"):
client, model = get_text_auxiliary_client() client, model = get_text_auxiliary_client()
assert model == "gpt-5.3-codex" assert model == "gpt-5.2-codex"
assert isinstance(client, CodexAuxiliaryClient) assert isinstance(client, CodexAuxiliaryClient)

View file

@ -12,7 +12,7 @@ import uuid
from logging.handlers import RotatingFileHandler from logging.handlers import RotatingFileHandler
from pathlib import Path from pathlib import Path
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import MagicMock, patch from unittest.mock import AsyncMock, MagicMock, patch
import pytest import pytest
@ -1986,6 +1986,69 @@ class TestBuildApiKwargsAnthropicMaxTokens:
assert call_args[0][3] is None assert call_args[0][3] is None
class TestAnthropicImageFallback:
    """Image parts are replaced by text notes before Anthropic kwargs are built."""

    def test_build_api_kwargs_converts_multimodal_user_image_to_text(self, agent):
        """A text+image user message reaches the adapter as one text string."""
        agent.api_mode = "anthropic_messages"
        agent.reasoning_config = None

        api_messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": "Can you see this now?"},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }]
        vision_reply = json.dumps({"success": True, "analysis": "A cat sitting on a chair."})

        with (
            patch("tools.vision_tools.vision_analyze_tool", new=AsyncMock(return_value=vision_reply)),
            patch("agent.anthropic_adapter.build_anthropic_kwargs") as mock_build,
        ):
            mock_build.return_value = {"model": "claude-sonnet-4-20250514", "messages": [], "max_tokens": 4096}
            agent._build_api_kwargs(api_messages)

        # The adapter may be invoked with keyword or positional arguments.
        recorded_call = mock_build.call_args
        kwargs = recorded_call.kwargs or dict(zip(
            ["model", "messages", "tools", "max_tokens", "reasoning_config"],
            recorded_call.args,
        ))
        flattened = kwargs["messages"][0]["content"]

        assert isinstance(flattened, str)
        assert "A cat sitting on a chair." in flattened
        assert "Can you see this now?" in flattened
        assert "vision_analyze with image_url: https://example.com/cat.png" in flattened

    def test_build_api_kwargs_reuses_cached_image_analysis_for_duplicate_images(self, agent):
        """The same image appearing in two messages is analysed only once."""
        agent.api_mode = "anthropic_messages"
        agent.reasoning_config = None

        data_url = "data:image/png;base64,QUFBQQ=="
        api_messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": label},
                    {"type": "input_image", "image_url": data_url},
                ],
            }
            for label in ("first", "second")
        ]
        mock_vision = AsyncMock(
            return_value=json.dumps({"success": True, "analysis": "A small test image."})
        )

        with (
            patch("tools.vision_tools.vision_analyze_tool", new=mock_vision),
            patch("agent.anthropic_adapter.build_anthropic_kwargs") as mock_build,
        ):
            mock_build.return_value = {"model": "claude-sonnet-4-20250514", "messages": [], "max_tokens": 4096}
            agent._build_api_kwargs(api_messages)

        assert mock_vision.await_count == 1
class TestFallbackAnthropicProvider: class TestFallbackAnthropicProvider:
"""Bug fix: _try_activate_fallback had no case for anthropic provider.""" """Bug fix: _try_activate_fallback had no case for anthropic provider."""