fix(anthropic): revert inline vision, add hermes model flow, wire vision aux

Feedback fixes:

1. Revert _convert_vision_content — vision is handled by the vision_analyze
   tool, not by converting image blocks inline in conversation messages.
   Removed the function and its tests.

2. Add Anthropic to 'hermes model' (cmd_model in main.py):
   - Added to provider_labels dict
   - Added to providers selection list
   - Added _model_flow_anthropic() with Claude Code credential auto-detection,
     API key prompting, and model selection from catalog.

3. Wire up Anthropic as a vision-capable auxiliary provider:
   - Added _try_anthropic() to auxiliary_client.py, using
     claude-sonnet-4-20250514 as the vision model (Claude natively
     supports multimodal input)
   - Added to the get_vision_auxiliary_client() auto-detection chain
     (after OpenRouter/Nous, before Codex/custom)

Cache tracking note: the Anthropic cache metrics branch in run_agent.py
(cache_read_input_tokens / cache_creation_input_tokens) is in the correct
place — it's response-level parsing, same location as the existing
OpenRouter cache tracking. auxiliary_client.py has no cache tracking.
This commit is contained in:
teknium1 2026-03-12 16:09:04 -07:00
parent d7adfe8f61
commit 7086fde37e
4 changed files with 105 additions and 94 deletions

View file

@ -184,58 +184,6 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
return result
def _convert_vision_content(content: Any) -> Any:
"""Convert OpenAI multimodal content blocks to Anthropic format.
OpenAI format: [{"type": "image_url", "image_url": {"url": "data:...;base64,..."}}]
Anthropic format: [{"type": "image", "source": {"type": "base64", ...}}]
"""
if not isinstance(content, list):
return content
result = []
for block in content:
if not isinstance(block, dict):
result.append(block)
continue
if block.get("type") == "image_url":
image_url = block.get("image_url", {})
url = image_url.get("url", "") if isinstance(image_url, dict) else ""
if url.startswith("data:"):
# data:image/png;base64,iVBOR...
try:
header, b64_data = url.split(",", 1)
media_type = header.split(":")[1].split(";")[0]
result.append({
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": b64_data,
},
})
except (ValueError, IndexError):
logger.warning("Could not parse data URL for image, skipping")
else:
# Regular URL — Anthropic supports url source type
result.append({
"type": "image",
"source": {
"type": "url",
"url": url,
},
})
elif block.get("type") == "text":
result.append({"type": "text", "text": block.get("text", "")})
else:
# Pass through unknown block types
result.append(block)
return result
def convert_messages_to_anthropic(
messages: List[Dict],
) -> Tuple[Optional[Any], List[Dict]]:
@ -304,9 +252,8 @@ def convert_messages_to_anthropic(
result.append({"role": "user", "content": [tool_result]})
continue
# Regular user message — convert vision content if multimodal
converted = _convert_vision_content(content) if isinstance(content, list) else content
result.append({"role": "user", "content": converted})
# Regular user message
result.append({"role": "user", "content": content})
# Strip orphaned tool_use blocks (no matching tool_result follows)
tool_result_ids = set()

View file

@ -449,6 +449,21 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
return OpenAI(api_key=custom_key, base_url=custom_base), model
_ANTHROPIC_VISION_MODEL = "claude-sonnet-4-20250514"


def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]:
    """Resolve an Anthropic auxiliary client when credentials are available.

    Returns:
        ``(None, None)`` when ``resolve_anthropic_token()`` yields nothing;
        otherwise whatever ``resolve_provider_client`` returns for the
        ``"anthropic"`` provider with ``_ANTHROPIC_VISION_MODEL``.
    """
    from agent.anthropic_adapter import resolve_anthropic_token

    if resolve_anthropic_token():
        # The token is only an availability probe here; the client itself is
        # built by resolve_provider_client("anthropic").
        logger.debug("Auxiliary client: Anthropic (%s)", _ANTHROPIC_VISION_MODEL)
        return resolve_provider_client("anthropic", model=_ANTHROPIC_VISION_MODEL)
    return None, None
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
codex_token = _read_codex_access_token()
if not codex_token:
@ -753,8 +768,8 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
# back to the user's custom endpoint. Many local models (Qwen-VL,
# LLaVA, Pixtral, etc.) support vision — skipping them entirely
# caused silent failures for local-only users.
for try_fn in (_try_openrouter, _try_nous, _try_codex,
_try_custom_endpoint):
for try_fn in (_try_openrouter, _try_nous, _try_anthropic,
_try_codex, _try_custom_endpoint):
client, model = try_fn()
if client is not None:
return client, model

View file

@ -746,6 +746,7 @@ def cmd_model(args):
"openrouter": "OpenRouter",
"nous": "Nous Portal",
"openai-codex": "OpenAI Codex",
"anthropic": "Anthropic",
"zai": "Z.AI / GLM",
"kimi-coding": "Kimi / Moonshot",
"minimax": "MiniMax",
@ -764,6 +765,7 @@ def cmd_model(args):
("openrouter", "OpenRouter (100+ models, pay-per-use)"),
("nous", "Nous Portal (Nous Research subscription)"),
("openai-codex", "OpenAI Codex"),
("anthropic", "Anthropic (Claude models — API key or Claude Code)"),
("zai", "Z.AI / GLM (Zhipu AI direct API)"),
("kimi-coding", "Kimi / Moonshot (Moonshot AI direct API)"),
("minimax", "MiniMax (global direct API)"),
@ -832,6 +834,8 @@ def cmd_model(args):
_model_flow_named_custom(config, _custom_provider_map[selected_provider])
elif selected_provider == "remove-custom":
_remove_custom_provider(config)
elif selected_provider == "anthropic":
_model_flow_anthropic(config, current_model)
elif selected_provider == "kimi-coding":
_model_flow_kimi(config, current_model)
elif selected_provider in ("zai", "minimax", "minimax-cn"):
@ -1555,6 +1559,88 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""):
print("No change.")
def _model_flow_anthropic(config, current_model=""):
"""Interactive `hermes model` flow for the Anthropic provider.

Looks for existing Anthropic credentials (ANTHROPIC_API_KEY / ANTHROPIC_TOKEN
or valid Claude Code credentials), prompts for an API key when none are found,
lets the user choose a model, then records "anthropic" as the provider in the
saved config.

Args:
    config: Accepted for parity with the call site; not referenced in this body.
    current_model: Passed to the model picker as the current selection.

Returns:
    None. Outcomes are reported with print().
"""
# NOTE(review): `config` is never read below — the config is re-read with
# load_config() before saving. Confirm whether that is intentional.
import os
# NOTE(review): `_update_config_for_provider` is imported but not used in this
# function.
from hermes_cli.auth import (
PROVIDER_REGISTRY, _prompt_model_selection, _save_model_choice,
_update_config_for_provider, deactivate_provider,
)
from hermes_cli.config import get_env_value, save_env_value, load_config, save_config
from hermes_cli.models import _PROVIDER_MODELS
pconfig = PROVIDER_REGISTRY["anthropic"]
# Check for existing credentials (env vars or Claude Code)
# The first non-empty value wins: for each variable get_env_value() is tried
# before the process environment, and ANTHROPIC_API_KEY before ANTHROPIC_TOKEN.
existing_key = (
get_env_value("ANTHROPIC_API_KEY")
or os.getenv("ANTHROPIC_API_KEY", "")
or get_env_value("ANTHROPIC_TOKEN")
or os.getenv("ANTHROPIC_TOKEN", "")
)
# Check for Claude Code auto-discovery
cc_available = False
try:
from agent.anthropic_adapter import read_claude_code_credentials, is_claude_code_token_valid
cc_creds = read_claude_code_credentials()
if cc_creds and is_claude_code_token_valid(cc_creds):
cc_available = True
# Best-effort probe: any failure (including the import) is swallowed and
# leaves cc_available False.
except Exception:
pass
if existing_key:
# Only the first 12 characters of the credential are echoed.
print(f" Anthropic key: {existing_key[:12]}... ✓")
elif cc_available:
print(" Claude Code credentials: ✓ (auto-detected from ~/.claude/.credentials.json)")
else:
print("No Anthropic credentials found.")
try:
new_key = input("ANTHROPIC_API_KEY (or Enter to cancel): ").strip()
except (KeyboardInterrupt, EOFError):
# Ctrl-C / EOF at the prompt: print a newline and abort without saving.
print()
return
if not new_key:
print("Cancelled.")
return
save_env_value("ANTHROPIC_API_KEY", new_key)
print("API key saved.")
print()
# Model selection
# Use the catalog picker when the catalog lists Anthropic models; otherwise
# fall back to a free-text prompt.
model_list = _PROVIDER_MODELS.get("anthropic", [])
if model_list:
selected = _prompt_model_selection(model_list, current_model=current_model)
else:
try:
selected = input("Model name (e.g., claude-sonnet-4-20250514): ").strip()
except (KeyboardInterrupt, EOFError):
selected = None
if selected:
# Clear custom endpoint if set
if get_env_value("OPENAI_BASE_URL"):
save_env_value("OPENAI_BASE_URL", "")
save_env_value("OPENAI_API_KEY", "")
_save_model_choice(selected)
# Update config with provider
# The config is re-read after _save_model_choice(); presumably so the write
# below does not clobber what that call saved — TODO confirm.
cfg = load_config()
model = cfg.get("model")
# A non-dict "model" entry is converted to a dict (keeping a truthy value
# under "default") so provider/base_url can be stored beside it.
if not isinstance(model, dict):
model = {"default": model} if model else {}
cfg["model"] = model
model["provider"] = "anthropic"
model["base_url"] = pconfig.inference_base_url
save_config(cfg)
deactivate_provider()
print(f"Default model set to: {selected} (via Anthropic)")
else:
print("No change.")
def cmd_login(args):
"""Authenticate Hermes CLI with a provider."""
from hermes_cli.auth import login_command

View file

@ -413,43 +413,6 @@ class TestNormalizeResponse:
assert len(msg.tool_calls) == 1
# ---------------------------------------------------------------------------
# Vision content conversion
# ---------------------------------------------------------------------------
class TestVisionContentConversion:
    """Checks for ``_convert_vision_content`` (OpenAI -> Anthropic blocks)."""

    def test_base64_image(self):
        """A data: URL becomes a base64 image source; text is kept as text."""
        from agent.anthropic_adapter import _convert_vision_content

        text_block = {"type": "text", "text": "What's in this image?"}
        image_block = {
            "type": "image_url",
            "image_url": {"url": "data:image/png;base64,iVBOR"},
        }

        converted = _convert_vision_content([text_block, image_block])

        assert converted[0] == {"type": "text", "text": "What's in this image?"}
        image = converted[1]
        assert image["type"] == "image"
        source = image["source"]
        assert source["type"] == "base64"
        assert source["media_type"] == "image/png"
        assert source["data"] == "iVBOR"

    def test_url_image(self):
        """A plain http(s) URL becomes a url image source."""
        from agent.anthropic_adapter import _convert_vision_content

        remote = "https://example.com/img.png"
        converted = _convert_vision_content(
            [{"type": "image_url", "image_url": {"url": remote}}]
        )

        image = converted[0]
        assert image["type"] == "image"
        source = image["source"]
        assert source["type"] == "url"
        assert source["url"] == "https://example.com/img.png"

    def test_passthrough_non_list(self):
        """Non-list content is returned unchanged."""
        from agent.anthropic_adapter import _convert_vision_content

        plain = "plain text"
        assert _convert_vision_content(plain) == "plain text"
# ---------------------------------------------------------------------------
# Role alternation
# ---------------------------------------------------------------------------