diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index 39efa219c3..ad3dfe582d 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -391,6 +391,68 @@ def _sanitize_tool_id(tool_id: str) -> str: return sanitized or "tool_0" +def _convert_openai_image_part_to_anthropic(part: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Convert an OpenAI-style image block to Anthropic's image source format.""" + image_data = part.get("image_url", {}) + url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data) + if not isinstance(url, str) or not url.strip(): + return None + url = url.strip() + + if url.startswith("data:"): + header, sep, data = url.partition(",") + if sep and ";base64" in header: + media_type = header[5:].split(";", 1)[0] or "image/png" + return { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": data, + }, + } + + if url.startswith("http://") or url.startswith("https://"): + return { + "type": "image", + "source": { + "type": "url", + "url": url, + }, + } + + return None + + +def _convert_user_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]: + if isinstance(part, dict): + ptype = part.get("type") + if ptype == "text": + block = {"type": "text", "text": part.get("text", "")} + if isinstance(part.get("cache_control"), dict): + block["cache_control"] = dict(part["cache_control"]) + return block + if ptype == "image_url": + return _convert_openai_image_part_to_anthropic(part) + if ptype == "image" and part.get("source"): + return dict(part) + if ptype == "image" and part.get("data"): + media_type = part.get("mimeType") or part.get("media_type") or "image/png" + return { + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": part.get("data", ""), + }, + } + if ptype == "tool_result": + return dict(part) + elif part is not None: + return {"type": "text", "text": str(part)} + return None + + def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]: """Convert OpenAI tool definitions to Anthropic format.""" if not tools: @@ -495,7 +557,15 @@ def convert_messages_to_anthropic( continue # Regular user message - result.append({"role": "user", "content": content}) + if isinstance(content, list): + converted_blocks = [] + for part in content: + converted = _convert_user_content_part_to_anthropic(part) + if converted is not None: + converted_blocks.append(converted) + result.append({"role": "user", "content": converted_blocks or [{"type": "text", "text": ""}]}) + else: + result.append({"role": "user", "content": content}) # Strip orphaned tool_use blocks (no matching tool_result follows) tool_result_ids = set() diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index dd8f22bb7c..7794889c02 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -1,4 +1,4 @@ -"""Shared auxiliary OpenAI client for cheap/fast side tasks. +"""Shared auxiliary client router for side tasks. Provides a single resolution chain so every consumer (context compression, session search, web extraction, vision analysis, browser vision) picks up @@ -10,21 +10,21 @@ Resolution order for text tasks (auto mode): 3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) 4. Codex OAuth (Responses API via chatgpt.com with gpt-5.3-codex, wrapped to look like a chat.completions client) - 5. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, MiniMax-CN) - — checked via PROVIDER_REGISTRY entries with auth_type='api_key' - 6. None + 5. Native Anthropic + 6. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, MiniMax-CN) + 7. None Resolution order for vision/multimodal tasks (auto mode): - 1. OpenRouter - 2. Nous Portal - 3. Codex OAuth (gpt-5.3-codex supports vision via Responses API) - 4. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.) - 5. None (API-key providers like z.ai/Kimi/MiniMax are skipped — - they may not support multimodal) + 1. Selected main provider, if it is one of the supported vision backends below + 2. OpenRouter + 3. Nous Portal + 4. Codex OAuth (gpt-5.3-codex supports vision via Responses API) + 5. Native Anthropic + 6. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.) + 7. None Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER, -CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task: -"openrouter", "nous", "codex", or "main" (= steps 3-5). +CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task. Default "auto" follows the chains above. Per-task model overrides (e.g. AUXILIARY_VISION_MODEL, @@ -74,6 +74,7 @@ auxiliary_is_nous: bool = False _OPENROUTER_MODEL = "google/gemini-3-flash-preview" _NOUS_MODEL = "gemini-3-flash" _NOUS_DEFAULT_BASE_URL = "https://inference-api.nousresearch.com/v1" +_ANTHROPIC_DEFAULT_BASE_URL = "https://api.anthropic.com" _AUTH_JSON_PATH = get_hermes_home() / "auth.json" # Codex fallback: uses the Responses API (the only endpoint the Codex @@ -309,6 +310,114 @@ class AsyncCodexAuxiliaryClient: self.base_url = sync_wrapper.base_url +class _AnthropicCompletionsAdapter: + """OpenAI-client-compatible adapter for Anthropic Messages API.""" + + def __init__(self, real_client: Any, model: str): + self._client = real_client + self._model = model + + def create(self, **kwargs) -> Any: + from agent.anthropic_adapter import build_anthropic_kwargs, normalize_anthropic_response + + messages = kwargs.get("messages", []) + model = kwargs.get("model", self._model) + tools = kwargs.get("tools") + tool_choice = kwargs.get("tool_choice") + max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000 + temperature = kwargs.get("temperature") + + normalized_tool_choice = None + if isinstance(tool_choice, str): + normalized_tool_choice = tool_choice + elif isinstance(tool_choice, dict): + choice_type = str(tool_choice.get("type", "")).lower() + if choice_type == "function": + normalized_tool_choice = tool_choice.get("function", {}).get("name") + elif choice_type in {"auto", "required", "none"}: + normalized_tool_choice = choice_type + + anthropic_kwargs = build_anthropic_kwargs( + model=model, + messages=messages, + tools=tools, + max_tokens=max_tokens, + reasoning_config=None, + tool_choice=normalized_tool_choice, + ) + if temperature is not None: + anthropic_kwargs["temperature"] = temperature + + response = self._client.messages.create(**anthropic_kwargs) + assistant_message, finish_reason = normalize_anthropic_response(response) + + usage = None + if hasattr(response, "usage") and response.usage: + prompt_tokens = getattr(response.usage, "input_tokens", 0) or 0 + completion_tokens = getattr(response.usage, "output_tokens", 0) or 0 + total_tokens = getattr(response.usage, "total_tokens", 0) or (prompt_tokens + completion_tokens) + usage = SimpleNamespace( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + + choice = SimpleNamespace( + index=0, + message=assistant_message, + finish_reason=finish_reason, + ) + return SimpleNamespace( + choices=[choice], + model=model, + usage=usage, + ) + + +class _AnthropicChatShim: + def __init__(self, adapter: _AnthropicCompletionsAdapter): + self.completions = adapter + + +class AnthropicAuxiliaryClient: + """OpenAI-client-compatible wrapper over a native Anthropic client.""" + + def __init__(self, real_client: Any, model: str, api_key: str, base_url: str): + self._real_client = real_client + adapter = _AnthropicCompletionsAdapter(real_client, model) + self.chat = _AnthropicChatShim(adapter) + self.api_key = api_key + self.base_url = base_url + + def close(self): + close_fn = getattr(self._real_client, "close", None) + if callable(close_fn): + close_fn() + + +class _AsyncAnthropicCompletionsAdapter: + def __init__(self, sync_adapter: _AnthropicCompletionsAdapter): + self._sync = sync_adapter + + async def create(self, **kwargs) -> Any: + import asyncio + return await asyncio.to_thread(self._sync.create, **kwargs) + + +class _AsyncAnthropicChatShim: + def __init__(self, adapter: _AsyncAnthropicCompletionsAdapter): + self.completions = adapter + + +class AsyncAnthropicAuxiliaryClient: + def __init__(self, sync_wrapper: "AnthropicAuxiliaryClient"): + sync_adapter = sync_wrapper.chat.completions + async_adapter = _AsyncAnthropicCompletionsAdapter(sync_adapter) + self.chat = _AsyncAnthropicChatShim(async_adapter) + self.api_key = sync_wrapper.api_key + self.base_url = sync_wrapper.base_url + + def _read_nous_auth() -> Optional[dict]: """Read and validate ~/.hermes/auth.json for an active Nous provider. @@ -380,6 +489,9 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]: break if not api_key: continue + if provider_id == "anthropic": + return _try_anthropic() + # Resolve base URL (with optional env-var override) # Kimi Code keys (sk-kimi-) need api.kimi.com/coding/v1 env_url = "" @@ -484,6 +596,22 @@ def _try_codex() -> Tuple[Optional[Any], Optional[str]]: return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL +def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]: + try: + from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token + except ImportError: + return None, None + + token = resolve_anthropic_token() + if not token: + return None, None + + model = _API_KEY_PROVIDER_AUX_MODELS.get("anthropic", "claude-haiku-4-5-20251001") + logger.debug("Auxiliary client: Anthropic native (%s)", model) + real_client = build_anthropic_client(token, _ANTHROPIC_DEFAULT_BASE_URL) + return AnthropicAuxiliaryClient(real_client, model, token, _ANTHROPIC_DEFAULT_BASE_URL), model + + def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]: """Resolve a specific forced provider. Returns (None, None) if creds missing.""" if forced == "openrouter": @@ -546,6 +674,8 @@ def _to_async_client(sync_client, model: str): if isinstance(sync_client, CodexAuxiliaryClient): return AsyncCodexAuxiliaryClient(sync_client), model + if isinstance(sync_client, AnthropicAuxiliaryClient): + return AsyncAnthropicAuxiliaryClient(sync_client), model async_kwargs = { "api_key": sync_client.api_key, @@ -686,6 +816,14 @@ def resolve_provider_client( return None, None if pconfig.auth_type == "api_key": + if provider == "anthropic": + client, default_model = _try_anthropic() + if client is None: + logger.warning("resolve_provider_client: anthropic requested but no Anthropic credentials found") + return None, None + final_model = model or default_model + return (_to_async_client(client, final_model) if async_mode else (client, final_model)) + # Find the first configured API key api_key = "" for env_var in pconfig.api_key_env_vars: @@ -772,6 +910,7 @@ _VISION_AUTO_PROVIDER_ORDER = ( "openrouter", "nous", "openai-codex", + "anthropic", "custom", ) @@ -793,6 +932,8 @@ def _resolve_strict_vision_backend(provider: str) -> Tuple[Optional[Any], Option return _try_nous() if provider == "openai-codex": return _try_codex() + if provider == "anthropic": + return _try_anthropic() if provider == "custom": return _try_custom_endpoint() return None, None @@ -802,19 +943,36 @@ def _strict_vision_backend_available(provider: str) -> bool: return _resolve_strict_vision_backend(provider)[0] is not None +def _preferred_main_vision_provider() -> Optional[str]: + """Return the selected main provider when it is also a supported vision backend.""" + try: + from hermes_cli.config import load_config + + config = load_config() + model_cfg = config.get("model", {}) + if isinstance(model_cfg, dict): + provider = _normalize_vision_provider(model_cfg.get("provider", "")) + if provider in _VISION_AUTO_PROVIDER_ORDER: + return provider + except Exception: + pass + return None + + def get_available_vision_backends() -> List[str]: """Return the currently available vision backends in auto-selection order. This is the single source of truth for setup, tool gating, and runtime - auto-routing of vision tasks. Phase 1 keeps the auto list conservative: - OpenRouter, Nous Portal, Codex OAuth, then custom OpenAI-compatible - endpoints. Explicit provider overrides can still route elsewhere. + auto-routing of vision tasks. The selected main provider is preferred when + it is also a known-good vision backend; otherwise Hermes falls back through + the standard conservative order. """ - return [ - provider - for provider in _VISION_AUTO_PROVIDER_ORDER - if _strict_vision_backend_available(provider) - ] + ordered = list(_VISION_AUTO_PROVIDER_ORDER) + preferred = _preferred_main_vision_provider() + if preferred in ordered: + ordered.remove(preferred) + ordered.insert(0, preferred) + return [provider for provider in ordered if _strict_vision_backend_available(provider)] def resolve_vision_provider_client( diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 051de13c12..fcaff67cf6 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -1268,11 +1268,9 @@ def setup_model_provider(config: dict): _vision_needs_setup = not bool(_vision_backends) - if selected_provider in {"openrouter", "nous", "openai-codex"}: - # If the user just selected one of our known-good vision backends during - # setup, treat vision as covered. Auth/setup failure returns earlier. - _vision_needs_setup = False - elif selected_provider == "custom" and "custom" in _vision_backends: + if selected_provider in _vision_backends: + # If the user just selected a backend Hermes can already use for + # vision, treat it as covered. Auth/setup failure returns earlier. _vision_needs_setup = False if _vision_needs_setup: diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py index 57c73eb8bd..925772fa76 100644 --- a/tests/agent/test_auxiliary_client.py +++ b/tests/agent/test_auxiliary_client.py @@ -10,6 +10,8 @@ import pytest from agent.auxiliary_client import ( get_text_auxiliary_client, get_vision_auxiliary_client, + get_available_vision_backends, + resolve_provider_client, auxiliary_max_tokens_param, _read_codex_access_token, _get_auxiliary_provider, @@ -24,6 +26,7 @@ def _clean_env(monkeypatch): for key in ( "OPENROUTER_API_KEY", "OPENAI_BASE_URL", "OPENAI_API_KEY", "OPENAI_MODEL", "LLM_MODEL", "NOUS_INFERENCE_BASE_URL", + "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN", # Per-task provider/model overrides "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL", "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL", @@ -164,14 +167,74 @@ class TestGetTextAuxiliaryClient: class TestVisionClientFallback: - """Vision client auto mode only tries OpenRouter + Nous (multimodal-capable).""" + """Vision client auto mode resolves known-good multimodal backends.""" def test_vision_returns_none_without_any_credentials(self): - with patch("agent.auxiliary_client._read_nous_auth", return_value=None): + with ( + patch("agent.auxiliary_client._read_nous_auth", return_value=None), + patch("agent.auxiliary_client._try_anthropic", return_value=(None, None)), + ): client, model = get_vision_auxiliary_client() assert client is None assert model is None + def test_vision_auto_includes_anthropic_when_configured(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key") + with ( + patch("agent.auxiliary_client._read_nous_auth", return_value=None), + patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()), + patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"), + ): + backends = get_available_vision_backends() + + assert "anthropic" in backends + + def test_resolve_provider_client_returns_native_anthropic_wrapper(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key") + with ( + patch("agent.auxiliary_client._read_nous_auth", return_value=None), + patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()), + patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"), + ): + client, model = resolve_provider_client("anthropic") + + assert client is not None + assert client.__class__.__name__ == "AnthropicAuxiliaryClient" + assert model == "claude-haiku-4-5-20251001" + + def test_vision_auto_uses_anthropic_when_no_higher_priority_backend(self, monkeypatch): + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key") + with ( + patch("agent.auxiliary_client._read_nous_auth", return_value=None), + patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()), + patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"), + ): + client, model = get_vision_auxiliary_client() + + assert client is not None + assert client.__class__.__name__ == "AnthropicAuxiliaryClient" + assert model == "claude-haiku-4-5-20251001" + + def test_selected_anthropic_provider_is_preferred_for_vision_auto(self, monkeypatch): + monkeypatch.setenv("OPENROUTER_API_KEY", "or-key") + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key") + + def fake_load_config(): + return {"model": {"provider": "anthropic", "default": "claude-sonnet-4-6"}} + + with ( + patch("agent.auxiliary_client._read_nous_auth", return_value=None), + patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()), + patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"), + patch("agent.auxiliary_client.OpenAI") as mock_openai, + patch("hermes_cli.config.load_config", fake_load_config), + ): + client, model = get_vision_auxiliary_client() + + assert client is not None + assert client.__class__.__name__ == "AnthropicAuxiliaryClient" + assert model == "claude-haiku-4-5-20251001" + def test_vision_auto_includes_codex(self, codex_auth_dir): """Codex supports vision (gpt-5.3-codex), so auto mode should use it.""" with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \ diff --git a/tests/hermes_cli/test_setup_model_provider.py b/tests/hermes_cli/test_setup_model_provider.py index 8f7063ec5a..34b491066b 100644 --- a/tests/hermes_cli/test_setup_model_provider.py +++ b/tests/hermes_cli/test_setup_model_provider.py @@ -111,6 +111,7 @@ def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tm monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None) monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: []) monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: []) + monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: []) setup_model_provider(config) save_config(config) @@ -149,6 +150,7 @@ def test_setup_keep_current_anthropic_can_configure_openai_vision_default(tmp_pa monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None) monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: []) monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: []) + monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: []) setup_model_provider(config) env = _read_env(tmp_path) @@ -224,3 +226,17 @@ def test_setup_summary_marks_codex_auth_as_vision_available(tmp_path, monkeypatc assert "missing run 'hermes setup' to configure" not in output assert "Mixture of Agents" in output assert "missing OPENROUTER_API_KEY" in output + + +def test_setup_summary_marks_anthropic_auth_as_vision_available(tmp_path, monkeypatch, capsys): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + _clear_provider_env(monkeypatch) + monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key") + monkeypatch.setattr("shutil.which", lambda _name: None) + monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: ["anthropic"]) + + _print_setup_summary(load_config(), tmp_path) + output = capsys.readouterr().out + + assert "Vision (image analysis)" in output + assert "missing run 'hermes setup' to configure" not in output diff --git a/tests/test_anthropic_adapter.py b/tests/test_anthropic_adapter.py index 541d8e2bc6..1bc3af2e19 100644 --- a/tests/test_anthropic_adapter.py +++ b/tests/test_anthropic_adapter.py @@ -567,6 +567,56 @@ class TestConvertMessages: assert tool_block["content"] == "result" assert tool_block["cache_control"] == {"type": "ephemeral"} + def test_converts_data_url_image_to_anthropic_image_block(self): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image"}, + { + "type": "image_url", + "image_url": {"url": "data:image/png;base64,ZmFrZQ=="}, + }, + ], + } + ] + + _, result = convert_messages_to_anthropic(messages) + blocks = result[0]["content"] + assert blocks[0] == {"type": "text", "text": "Describe this image"} + assert blocks[1] == { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "ZmFrZQ==", + }, + } + + def test_converts_remote_image_url_to_anthropic_image_block(self): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this image"}, + { + "type": "image_url", + "image_url": {"url": "https://example.com/cat.png"}, + }, + ], + } + ] + + _, result = convert_messages_to_anthropic(messages) + blocks = result[0]["content"] + assert blocks[1] == { + "type": "image", + "source": { + "type": "url", + "url": "https://example.com/cat.png", + }, + } + def test_empty_cached_assistant_tool_turn_converts_without_empty_text_block(self): messages = apply_anthropic_cache_control([ {"role": "system", "content": "System prompt"}, diff --git a/tools/vision_tools.py b/tools/vision_tools.py index 37fb8fe464..954ffd283d 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -3,7 +3,8 @@ Vision Tools Module This module provides vision analysis tools that work with image URLs. -Uses Gemini 3 Flash Preview via OpenRouter API for intelligent image understanding. +Uses the centralized auxiliary vision router, which can select OpenRouter, +Nous, Codex, native Anthropic, or a custom OpenAI-compatible endpoint. Available tools: - vision_analyze_tool: Analyze images from URLs with custom prompts @@ -409,7 +410,7 @@ if __name__ == "__main__": if not api_available: print("❌ No auxiliary vision model available") - print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.") + print("Configure a supported multimodal backend (OpenRouter, Nous, Codex, Anthropic, or a custom OpenAI-compatible endpoint).") exit(1) else: print("✅ Vision model available")