mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)
When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers. Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations. Changes - tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour. - agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts. - agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers. - run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime. - tests/conftest.py: clear runtime-main override between tests. Tests - tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux). - tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types. Live verified - Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.). PR replaces the closed prior efforts (#16506 shipped the inbound user- attached path; this PR closes the gap for tool-discovered images).
This commit is contained in:
parent
e62250453b
commit
3800972dd0
7 changed files with 757 additions and 10 deletions
|
|
@ -1463,7 +1463,16 @@ def _read_main_model() -> str:
|
|||
|
||||
config.yaml model.default is the single source of truth for the active
|
||||
model. Environment variables are no longer consulted.
|
||||
|
||||
Runtime override: when an AIAgent is active with a CLI/gateway-provided
|
||||
model that differs from config.yaml, ``set_runtime_main()`` records the
|
||||
override in a process-local global. This is consulted FIRST so tools
|
||||
that gate on "the active main model" (e.g. ``vision_analyze``'s native
|
||||
fast path) see the live runtime, not the persisted config default.
|
||||
"""
|
||||
override = _RUNTIME_MAIN_MODEL
|
||||
if isinstance(override, str) and override.strip():
|
||||
return override.strip()
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
|
|
@ -1484,7 +1493,13 @@ def _read_main_provider() -> str:
|
|||
|
||||
Returns the lowercase provider id (e.g. "alibaba", "openrouter") or ""
|
||||
if not configured.
|
||||
|
||||
Runtime override: see ``_read_main_model`` — same mechanism for the
|
||||
provider half of the runtime tuple.
|
||||
"""
|
||||
override = _RUNTIME_MAIN_PROVIDER
|
||||
if isinstance(override, str) and override.strip():
|
||||
return override.strip().lower()
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
|
|
@ -1498,6 +1513,32 @@ def _read_main_provider() -> str:
|
|||
return ""
|
||||
|
||||
|
||||
# Process-local override set by AIAgent at session/turn start. Single-threaded
|
||||
# per turn — no lock needed. Cleared by ``clear_runtime_main()``.
|
||||
_RUNTIME_MAIN_PROVIDER: str = ""
|
||||
_RUNTIME_MAIN_MODEL: str = ""
|
||||
|
||||
|
||||
def set_runtime_main(provider: str, model: str) -> None:
|
||||
"""Record the live runtime provider/model for the current AIAgent.
|
||||
|
||||
Called by ``run_agent.AIAgent._sync_runtime_main_for_aux_routing`` (or
|
||||
equivalent setter) at the top of each turn so that
|
||||
``_read_main_provider`` / ``_read_main_model`` reflect CLI/gateway
|
||||
overrides instead of the stale config.yaml default.
|
||||
"""
|
||||
global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
|
||||
_RUNTIME_MAIN_PROVIDER = (provider or "").strip().lower()
|
||||
_RUNTIME_MAIN_MODEL = (model or "").strip()
|
||||
|
||||
|
||||
def clear_runtime_main() -> None:
|
||||
"""Clear the runtime override (e.g. on session end)."""
|
||||
global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
|
||||
_RUNTIME_MAIN_PROVIDER = ""
|
||||
_RUNTIME_MAIN_MODEL = ""
|
||||
|
||||
|
||||
def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
||||
"""Resolve the active custom/main endpoint the same way the main CLI does.
|
||||
|
||||
|
|
|
|||
|
|
@ -410,10 +410,29 @@ def _chat_messages_to_responses_input(messages: List[Dict[str, Any]]) -> List[Di
|
|||
call_id = raw_tool_call_id.strip()
|
||||
if not isinstance(call_id, str) or not call_id.strip():
|
||||
continue
|
||||
|
||||
# Multimodal tool result: convert OpenAI-style content list into
|
||||
# Responses ``function_call_output.output`` array. The Responses
|
||||
# API accepts ``output`` as either a string or an array of
|
||||
# ``input_text``/``input_image`` items. See
|
||||
# https://developers.openai.com/api/reference/python/resources/responses/.
|
||||
tool_content = msg.get("content")
|
||||
output_value: Any
|
||||
if isinstance(tool_content, list):
|
||||
converted = _chat_content_to_responses_parts(
|
||||
tool_content, role="user",
|
||||
)
|
||||
if converted:
|
||||
output_value = converted
|
||||
else:
|
||||
output_value = ""
|
||||
else:
|
||||
output_value = str(tool_content or "")
|
||||
|
||||
items.append({
|
||||
"type": "function_call_output",
|
||||
"call_id": call_id,
|
||||
"output": str(msg.get("content", "") or ""),
|
||||
"output": output_value,
|
||||
})
|
||||
|
||||
return items
|
||||
|
|
@ -466,6 +485,38 @@ def _preflight_codex_input_items(raw_items: Any) -> List[Dict[str, Any]]:
|
|||
output = item.get("output", "")
|
||||
if output is None:
|
||||
output = ""
|
||||
# Output may be a string OR an array of structured content
|
||||
# items (input_text / input_image) for multimodal tool results.
|
||||
# Both shapes are accepted by the Responses API. We preserve
|
||||
# the array form when present.
|
||||
if isinstance(output, list):
|
||||
# Validate each item is a recognised content shape; drop
|
||||
# anything else to avoid 4xx from the API.
|
||||
cleaned: List[Dict[str, Any]] = []
|
||||
for part in output:
|
||||
if not isinstance(part, dict):
|
||||
continue
|
||||
ptype = part.get("type")
|
||||
if ptype == "input_text":
|
||||
text = part.get("text")
|
||||
if isinstance(text, str) and text:
|
||||
cleaned.append({"type": "input_text", "text": text})
|
||||
elif ptype == "input_image":
|
||||
url = part.get("image_url")
|
||||
if isinstance(url, str) and url:
|
||||
entry: Dict[str, Any] = {"type": "input_image", "image_url": url}
|
||||
detail = part.get("detail")
|
||||
if isinstance(detail, str) and detail.strip():
|
||||
entry["detail"] = detail.strip()
|
||||
cleaned.append(entry)
|
||||
normalized.append(
|
||||
{
|
||||
"type": "function_call_output",
|
||||
"call_id": call_id.strip(),
|
||||
"output": cleaned if cleaned else "",
|
||||
}
|
||||
)
|
||||
continue
|
||||
if not isinstance(output, str):
|
||||
output = str(output)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue