mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(anthropic): revert inline vision, add hermes model flow, wire vision aux
Feedback fixes:
1. Revert _convert_vision_content — vision is handled by the vision_analyze
tool, not by converting image blocks inline in conversation messages.
Removed the function and its tests.
2. Add Anthropic to 'hermes model' (cmd_model in main.py):
- Added to provider_labels dict
- Added to providers selection list
- Added _model_flow_anthropic() with Claude Code credential auto-detection,
API key prompting, and model selection from catalog.
3. Wire up Anthropic as a vision-capable auxiliary provider:
- Added _try_anthropic() to auxiliary_client.py using claude-sonnet-4
as the vision model (Claude natively supports multimodal)
- Added to the get_vision_auxiliary_client() auto-detection chain
(after OpenRouter/Nous, before Codex/custom)
Cache tracking note: the Anthropic cache metrics branch in run_agent.py
(cache_read_input_tokens / cache_creation_input_tokens) is in the correct
place — it's response-level parsing, same location as the existing
OpenRouter cache tracking. auxiliary_client.py has no cache tracking.
This commit is contained in:
parent
d7adfe8f61
commit
7086fde37e
4 changed files with 105 additions and 94 deletions
|
|
@ -184,58 +184,6 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
|
|||
return result
|
||||
|
||||
|
||||
def _convert_vision_content(content: Any) -> Any:
|
||||
"""Convert OpenAI multimodal content blocks to Anthropic format.
|
||||
|
||||
OpenAI format: [{"type": "image_url", "image_url": {"url": "data:...;base64,..."}}]
|
||||
Anthropic format: [{"type": "image", "source": {"type": "base64", ...}}]
|
||||
"""
|
||||
if not isinstance(content, list):
|
||||
return content
|
||||
|
||||
result = []
|
||||
for block in content:
|
||||
if not isinstance(block, dict):
|
||||
result.append(block)
|
||||
continue
|
||||
|
||||
if block.get("type") == "image_url":
|
||||
image_url = block.get("image_url", {})
|
||||
url = image_url.get("url", "") if isinstance(image_url, dict) else ""
|
||||
|
||||
if url.startswith("data:"):
|
||||
# data:image/png;base64,iVBOR...
|
||||
try:
|
||||
header, b64_data = url.split(",", 1)
|
||||
media_type = header.split(":")[1].split(";")[0]
|
||||
result.append({
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": media_type,
|
||||
"data": b64_data,
|
||||
},
|
||||
})
|
||||
except (ValueError, IndexError):
|
||||
logger.warning("Could not parse data URL for image, skipping")
|
||||
else:
|
||||
# Regular URL — Anthropic supports url source type
|
||||
result.append({
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "url",
|
||||
"url": url,
|
||||
},
|
||||
})
|
||||
elif block.get("type") == "text":
|
||||
result.append({"type": "text", "text": block.get("text", "")})
|
||||
else:
|
||||
# Pass through unknown block types
|
||||
result.append(block)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def convert_messages_to_anthropic(
|
||||
messages: List[Dict],
|
||||
) -> Tuple[Optional[Any], List[Dict]]:
|
||||
|
|
@ -304,9 +252,8 @@ def convert_messages_to_anthropic(
|
|||
result.append({"role": "user", "content": [tool_result]})
|
||||
continue
|
||||
|
||||
# Regular user message — convert vision content if multimodal
|
||||
converted = _convert_vision_content(content) if isinstance(content, list) else content
|
||||
result.append({"role": "user", "content": converted})
|
||||
# Regular user message
|
||||
result.append({"role": "user", "content": content})
|
||||
|
||||
# Strip orphaned tool_use blocks (no matching tool_result follows)
|
||||
tool_result_ids = set()
|
||||
|
|
|
|||
|
|
@ -449,6 +449,21 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
|
|||
return OpenAI(api_key=custom_key, base_url=custom_base), model
|
||||
|
||||
|
||||
# Model requested when Anthropic serves as the auxiliary (vision) provider.
_ANTHROPIC_VISION_MODEL = "claude-sonnet-4-20250514"


def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]:
    """Try Anthropic credentials for auxiliary tasks (vision-capable).

    Returns:
        ``(None, None)`` when the Anthropic adapter cannot be imported or no
        token can be resolved, so the caller can move on to the next
        provider. Otherwise, whatever
        ``resolve_provider_client("anthropic", model=...)`` returns.
    """
    try:
        from agent.anthropic_adapter import resolve_anthropic_token
    except ImportError:
        # This is a best-effort probe inside a fallback chain: a missing
        # adapter must not prevent the remaining providers from being tried.
        logger.debug("Auxiliary client: Anthropic adapter unavailable, skipping")
        return None, None

    # The token is only an availability check here; it is not passed on.
    # The client itself is built by resolve_provider_client("anthropic").
    if not resolve_anthropic_token():
        return None, None

    logger.debug("Auxiliary client: Anthropic (%s)", _ANTHROPIC_VISION_MODEL)
    return resolve_provider_client("anthropic", model=_ANTHROPIC_VISION_MODEL)
|
||||
|
||||
|
||||
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
|
||||
codex_token = _read_codex_access_token()
|
||||
if not codex_token:
|
||||
|
|
@ -753,8 +768,8 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
|
|||
# back to the user's custom endpoint. Many local models (Qwen-VL,
|
||||
# LLaVA, Pixtral, etc.) support vision — skipping them entirely
|
||||
# caused silent failures for local-only users.
|
||||
for try_fn in (_try_openrouter, _try_nous, _try_codex,
|
||||
_try_custom_endpoint):
|
||||
for try_fn in (_try_openrouter, _try_nous, _try_anthropic,
|
||||
_try_codex, _try_custom_endpoint):
|
||||
client, model = try_fn()
|
||||
if client is not None:
|
||||
return client, model
|
||||
|
|
|
|||
|
|
@ -746,6 +746,7 @@ def cmd_model(args):
|
|||
"openrouter": "OpenRouter",
|
||||
"nous": "Nous Portal",
|
||||
"openai-codex": "OpenAI Codex",
|
||||
"anthropic": "Anthropic",
|
||||
"zai": "Z.AI / GLM",
|
||||
"kimi-coding": "Kimi / Moonshot",
|
||||
"minimax": "MiniMax",
|
||||
|
|
@ -764,6 +765,7 @@ def cmd_model(args):
|
|||
("openrouter", "OpenRouter (100+ models, pay-per-use)"),
|
||||
("nous", "Nous Portal (Nous Research subscription)"),
|
||||
("openai-codex", "OpenAI Codex"),
|
||||
("anthropic", "Anthropic (Claude models — API key or Claude Code)"),
|
||||
("zai", "Z.AI / GLM (Zhipu AI direct API)"),
|
||||
("kimi-coding", "Kimi / Moonshot (Moonshot AI direct API)"),
|
||||
("minimax", "MiniMax (global direct API)"),
|
||||
|
|
@ -832,6 +834,8 @@ def cmd_model(args):
|
|||
_model_flow_named_custom(config, _custom_provider_map[selected_provider])
|
||||
elif selected_provider == "remove-custom":
|
||||
_remove_custom_provider(config)
|
||||
elif selected_provider == "anthropic":
|
||||
_model_flow_anthropic(config, current_model)
|
||||
elif selected_provider == "kimi-coding":
|
||||
_model_flow_kimi(config, current_model)
|
||||
elif selected_provider in ("zai", "minimax", "minimax-cn"):
|
||||
|
|
@ -1555,6 +1559,88 @@ def _model_flow_api_key_provider(config, provider_id, current_model=""):
|
|||
print("No change.")
|
||||
|
||||
|
||||
def _model_flow_anthropic(config, current_model=""):
    """Flow for Anthropic provider — API key, setup-token, or Claude Code creds.

    Makes sure some Anthropic credential is available (prompting for an API
    key when none is found), lets the user pick a model, then records
    Anthropic as the active provider in the saved config.

    Args:
        config: Accepted for signature parity with the other ``_model_flow_*``
            functions; not read here (the config is re-loaded from disk after
            the model choice has been saved).
        current_model: Forwarded to the model picker as ``current_model``.
    """
    from hermes_cli.auth import (
        PROVIDER_REGISTRY, _prompt_model_selection, _save_model_choice,
        deactivate_provider,
    )
    from hermes_cli.config import get_env_value, save_env_value, load_config, save_config
    from hermes_cli.models import _PROVIDER_MODELS

    pconfig = PROVIDER_REGISTRY["anthropic"]

    if not _ensure_anthropic_credentials():
        return
    print()

    # Model selection
    model_list = _PROVIDER_MODELS.get("anthropic", [])
    if model_list:
        selected = _prompt_model_selection(model_list, current_model=current_model)
    else:
        try:
            selected = input("Model name (e.g., claude-sonnet-4-20250514): ").strip()
        except (KeyboardInterrupt, EOFError):
            # Finish the interrupted prompt line, as the key prompt does.
            print()
            selected = None

    if not selected:
        print("No change.")
        return

    # Clear custom endpoint if set
    if get_env_value("OPENAI_BASE_URL"):
        save_env_value("OPENAI_BASE_URL", "")
        save_env_value("OPENAI_API_KEY", "")

    _save_model_choice(selected)

    # Update config with provider. The "model" entry may be a bare value
    # rather than a dict, so normalise it to a dict before adding keys.
    cfg = load_config()
    model = cfg.get("model")
    if not isinstance(model, dict):
        model = {"default": model} if model else {}
        cfg["model"] = model
    model["provider"] = "anthropic"
    model["base_url"] = pconfig.inference_base_url
    save_config(cfg)
    deactivate_provider()

    print(f"Default model set to: {selected} (via Anthropic)")


def _ensure_anthropic_credentials():
    """Report existing Anthropic credentials, or prompt for an API key.

    Returns:
        True when a credential was found or a new key was just saved; False
        when the user cancelled the prompt.
    """
    import os
    from hermes_cli.config import get_env_value, save_env_value

    # Check for existing credentials (saved env values or process environment)
    existing_key = (
        get_env_value("ANTHROPIC_API_KEY")
        or os.getenv("ANTHROPIC_API_KEY", "")
        or get_env_value("ANTHROPIC_TOKEN")
        or os.getenv("ANTHROPIC_TOKEN", "")
    )
    if existing_key:
        print(f" Anthropic key: {existing_key[:12]}... ✓")
        return True

    # Check for Claude Code auto-discovery
    cc_available = False
    try:
        from agent.anthropic_adapter import read_claude_code_credentials, is_claude_code_token_valid
        cc_creds = read_claude_code_credentials()
        cc_available = bool(cc_creds and is_claude_code_token_valid(cc_creds))
    except Exception:
        # Deliberately best-effort: any failure while probing Claude Code
        # credentials just means falling back to the API-key prompt.
        pass
    if cc_available:
        print(" Claude Code credentials: ✓ (auto-detected from ~/.claude/.credentials.json)")
        return True

    print("No Anthropic credentials found.")
    try:
        new_key = input("ANTHROPIC_API_KEY (or Enter to cancel): ").strip()
    except (KeyboardInterrupt, EOFError):
        print()
        return False
    if not new_key:
        print("Cancelled.")
        return False
    save_env_value("ANTHROPIC_API_KEY", new_key)
    print("API key saved.")
    return True
|
||||
|
||||
|
||||
def cmd_login(args):
|
||||
"""Authenticate Hermes CLI with a provider."""
|
||||
from hermes_cli.auth import login_command
|
||||
|
|
|
|||
|
|
@ -413,43 +413,6 @@ class TestNormalizeResponse:
|
|||
assert len(msg.tool_calls) == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vision content conversion
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestVisionContentConversion:
    """Checks for ``_convert_vision_content`` (OpenAI → Anthropic blocks)."""

    def test_base64_image(self):
        """A base64 data URL becomes a base64 image source; text is kept."""
        from agent.anthropic_adapter import _convert_vision_content

        converted = _convert_vision_content([
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,iVBOR"}},
        ])

        text_block, image_block = converted[0], converted[1]
        assert text_block == {"type": "text", "text": "What's in this image?"}
        assert image_block["type"] == "image"

        source = image_block["source"]
        assert source["type"] == "base64"
        assert source["media_type"] == "image/png"
        assert source["data"] == "iVBOR"

    def test_url_image(self):
        """A plain https URL becomes a url-type image source."""
        from agent.anthropic_adapter import _convert_vision_content

        image_url = "https://example.com/img.png"
        converted = _convert_vision_content(
            [{"type": "image_url", "image_url": {"url": image_url}}]
        )

        first = converted[0]
        assert first["type"] == "image"
        assert first["source"]["type"] == "url"
        assert first["source"]["url"] == "https://example.com/img.png"

    def test_passthrough_non_list(self):
        """Non-list content is returned unchanged."""
        from agent.anthropic_adapter import _convert_vision_content

        plain = "plain text"
        assert _convert_vision_content(plain) == "plain text"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Role alternation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue