mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-22 05:22:09 +00:00
feat(vision): vision_analyze returns pixels to vision-capable models, not aux text (#22955)
When the active main model has native vision and the provider supports multimodal tool results (Anthropic, OpenAI Chat, Codex Responses, Gemini 3, OpenRouter, Nous), vision_analyze loads the image bytes and returns them to the model as a multimodal tool-result envelope. The model then sees the pixels directly on its next turn instead of receiving a lossy text description from an auxiliary LLM. Falls back to the legacy aux-LLM text path for non-vision models and unverified providers. Mirrors the architecture used in OpenCode, Claude Code, Codex CLI, and Cline. All four converge on the same pattern: tool results carry image content blocks for vision-capable provider/model combinations. Changes - tools/vision_tools.py: _vision_analyze_native fast path + provider capability table (_supports_media_in_tool_results). Schema description updated to reflect new behaviour. - agent/codex_responses_adapter.py: function_call_output.output now accepts the array form for multimodal tool results (was string-only). Preflight validates input_text/input_image parts. - agent/auxiliary_client.py: _RUNTIME_MAIN_PROVIDER/_MODEL globals so tools see the live CLI/gateway override, not the stale config.yaml default. set_runtime_main()/clear_runtime_main() helpers. - run_agent.py: AIAgent.run_conversation calls set_runtime_main at turn start so vision_analyze's fast-path check sees the actual runtime. - tests/conftest.py: clear runtime-main override between tests. Tests - tests/tools/test_vision_native_fast_path.py: provider capability table, envelope shape, fast-path gating (vision-capable model uses fast path; non-vision model falls through to aux). - tests/run_agent/test_codex_multimodal_tool_result.py: list tool content becomes function_call_output.output array; preflight preserves arrays and drops unknown part types. Live verified - Opus 4.6 + Sonnet 4.6 on OpenRouter: model calls vision_analyze on a typed filepath, gets pixels back, reads exact text from images that no aux description could capture (font color irony, multi-line fruit-count list, etc.). PR replaces the closed prior efforts (#16506 shipped the inbound user- attached path; this PR closes the gap for tool-discovered images).
This commit is contained in:
parent
e62250453b
commit
3800972dd0
7 changed files with 757 additions and 10 deletions
|
|
@ -1463,7 +1463,16 @@ def _read_main_model() -> str:
|
|||
|
||||
config.yaml model.default is the single source of truth for the active
|
||||
model. Environment variables are no longer consulted.
|
||||
|
||||
Runtime override: when an AIAgent is active with a CLI/gateway-provided
|
||||
model that differs from config.yaml, ``set_runtime_main()`` records the
|
||||
override in a process-local global. This is consulted FIRST so tools
|
||||
that gate on "the active main model" (e.g. ``vision_analyze``'s native
|
||||
fast path) see the live runtime, not the persisted config default.
|
||||
"""
|
||||
override = _RUNTIME_MAIN_MODEL
|
||||
if isinstance(override, str) and override.strip():
|
||||
return override.strip()
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
|
|
@ -1484,7 +1493,13 @@ def _read_main_provider() -> str:
|
|||
|
||||
Returns the lowercase provider id (e.g. "alibaba", "openrouter") or ""
|
||||
if not configured.
|
||||
|
||||
Runtime override: see ``_read_main_model`` — same mechanism for the
|
||||
provider half of the runtime tuple.
|
||||
"""
|
||||
override = _RUNTIME_MAIN_PROVIDER
|
||||
if isinstance(override, str) and override.strip():
|
||||
return override.strip().lower()
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
cfg = load_config()
|
||||
|
|
@ -1498,6 +1513,32 @@ def _read_main_provider() -> str:
|
|||
return ""
|
||||
|
||||
|
||||
# Process-local override set by AIAgent at session/turn start. Single-threaded
|
||||
# per turn — no lock needed. Cleared by ``clear_runtime_main()``.
|
||||
_RUNTIME_MAIN_PROVIDER: str = ""
|
||||
_RUNTIME_MAIN_MODEL: str = ""
|
||||
|
||||
|
||||
def set_runtime_main(provider: str, model: str) -> None:
|
||||
"""Record the live runtime provider/model for the current AIAgent.
|
||||
|
||||
Called by ``run_agent.AIAgent._sync_runtime_main_for_aux_routing`` (or
|
||||
equivalent setter) at the top of each turn so that
|
||||
``_read_main_provider`` / ``_read_main_model`` reflect CLI/gateway
|
||||
overrides instead of the stale config.yaml default.
|
||||
"""
|
||||
global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
|
||||
_RUNTIME_MAIN_PROVIDER = (provider or "").strip().lower()
|
||||
_RUNTIME_MAIN_MODEL = (model or "").strip()
|
||||
|
||||
|
||||
def clear_runtime_main() -> None:
|
||||
"""Clear the runtime override (e.g. on session end)."""
|
||||
global _RUNTIME_MAIN_PROVIDER, _RUNTIME_MAIN_MODEL
|
||||
_RUNTIME_MAIN_PROVIDER = ""
|
||||
_RUNTIME_MAIN_MODEL = ""
|
||||
|
||||
|
||||
def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str], Optional[str]]:
|
||||
"""Resolve the active custom/main endpoint the same way the main CLI does.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue