refactor(vision): consolidate native fast-path gate into one shared helper

The fast-path decision (image routing resolves to native, AND either the
provider is on the tool-result media allowlist OR the supports_vision
override is set) lived inline in vision_analyze and was copied into
browser_vision. Extract it to _should_use_native_vision_fast_path() so both
tools share one source of truth.

- vision_tools: gate logic now one helper; vision_analyze calls it in 3 lines
- browser_tool: thin envelope decoration over the shared helper, not a copy
- browser_vision typed Union[str, Dict] to match its real return shape
- tests slimmed to target the override path + text-mode-wins invariant
This commit is contained in:
teknium1 2026-05-29 03:40:26 -07:00 committed by Teknium
parent c3f28c651d
commit c77a697fa4
4 changed files with 140 additions and 193 deletions

View file

@ -251,6 +251,7 @@ class TestBrowserVisionConfig:
assert mock_llm.call_args.kwargs["timeout"] == 120.0
def test_browser_vision_native_fast_path_returns_multimodal(self, tmp_path):
"""supports_vision override → screenshot attached natively, no aux call."""
from agent.auxiliary_client import clear_runtime_main, set_runtime_main
from tools.browser_tool import browser_vision
@ -265,10 +266,7 @@ class TestBrowserVisionConfig:
"tools.browser_tool._run_browser_command",
return_value={
"success": True,
"data": {
"path": str(screenshot),
"annotations": annotations,
},
"data": {"path": str(screenshot), "annotations": annotations},
},
),
patch(
@ -278,9 +276,7 @@ class TestBrowserVisionConfig:
patch("tools.browser_tool._get_vision_model") as mock_get_vision_model,
patch("tools.browser_tool.call_llm") as mock_llm,
):
result = browser_vision(
"what is on the page?", annotate=True, task_id="test"
)
result = browser_vision("what is on the page?", annotate=True, task_id="test")
finally:
clear_runtime_main()
@ -289,55 +285,12 @@ class TestBrowserVisionConfig:
assert result["meta"]["screenshot_path"] == str(screenshot)
assert result["meta"]["annotations"] == annotations
assert any(p.get("type") == "image_url" for p in result["content"])
assert "what is on the page?" in result["content"][0]["text"]
assert str(screenshot) in result["content"][0]["text"]
assert "Screenshot path:" in result["text_summary"]
assert f"Screenshot path: {screenshot}" in result["text_summary"]
mock_get_vision_model.assert_not_called()
mock_llm.assert_not_called()
def test_browser_vision_native_mode_without_supports_vision_uses_aux_llm(self, tmp_path):
    """Native routing with no supports_vision override still uses the aux LLM.

    The runtime main is set to an unlisted provider/model and the config
    only requests ``image_input_mode: native``. The test expects
    ``browser_vision`` to return the JSON-string result carrying the
    auxiliary model's analysis, with the screenshot sent as a PNG data URL.
    """
    from agent.auxiliary_client import clear_runtime_main, set_runtime_main
    from tools.browser_tool import browser_vision

    shots_dir, screenshot = self._setup_screenshot(tmp_path)

    # Canned aux-LLM reply: one choice carrying the analysis text.
    aux_choice = MagicMock()
    aux_choice.message.content = "Fallback screenshot analysis"
    aux_reply = MagicMock()
    aux_reply.choices = [aux_choice]

    browser_reply = {"success": True, "data": {"path": str(screenshot)}}
    native_config = {"agent": {"image_input_mode": "native"}}

    set_runtime_main("brand-new-provider", "opaque-model")
    try:
        with (
            patch("hermes_constants.get_hermes_dir", return_value=shots_dir),
            patch("tools.browser_tool._cleanup_old_screenshots"),
            patch("tools.browser_tool._run_browser_command", return_value=browser_reply),
            patch("hermes_cli.config.load_config", return_value=native_config),
            patch("tools.browser_tool._get_vision_model", return_value="test-model"),
            patch("tools.browser_tool.call_llm", return_value=aux_reply) as mock_llm,
        ):
            raw_result = browser_vision("what is on the page?", task_id="test")
            result = json.loads(raw_result)
    finally:
        clear_runtime_main()

    assert result["success"] is True
    assert result["analysis"] == "Fallback screenshot analysis"
    assert result["screenshot_path"] == str(screenshot)

    mock_llm.assert_called_once()
    llm_kwargs = mock_llm.call_args.kwargs
    assert llm_kwargs["task"] == "vision"
    assert llm_kwargs["model"] == "test-model"

    # Second content part of the first message is the attached screenshot.
    image_part = llm_kwargs["messages"][0]["content"][1]
    assert image_part["type"] == "image_url"
    assert image_part["image_url"]["url"].startswith("data:image/png;base64,")
def test_browser_vision_text_mode_blocks_native_fast_path(self, tmp_path):
"""Explicit text routing → aux LLM used even with supports_vision."""
from agent.auxiliary_client import clear_runtime_main, set_runtime_main
from tools.browser_tool import browser_vision
@ -372,7 +325,6 @@ class TestBrowserVisionConfig:
assert result["success"] is True
assert result["analysis"] == "Text-mode screenshot analysis"
assert result["screenshot_path"] == str(screenshot)
mock_llm.assert_called_once()

View file

@ -146,35 +146,32 @@ class TestVisionAnalyzeNative:
class TestHandleVisionAnalyzeFastPath:
"""Verify the dispatcher chooses fast-path vs aux-LLM correctly."""
def test_native_mode_with_supported_transport_uses_fast_path(self, tmp_path):
"""Explicit native mode + known transport returns multimodal."""
def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch):
"""Main model supports native vision → fast path returns multimodal."""
img = tmp_path / "x.png"
img.write_bytes(_TINY_PNG)
async def _aux_sentinel(*args, **kwargs):
return '{"sentinel": "aux-path"}'
# Set runtime override so the handler thinks we're on opus@openrouter
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
set_runtime_main("openrouter", "anthropic/claude-opus-4.6")
try:
# Mock decide_image_input_mode to always return "native" so the
# fast path fires regardless of model-catalog state in CI.
with patch(
"hermes_cli.config.load_config",
return_value={"agent": {"image_input_mode": "native"}},
), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux:
result = asyncio.get_event_loop().run_until_complete(
_handle_vision_analyze({"image_url": str(img), "question": "?"})
)
"agent.image_routing.decide_image_input_mode",
return_value="native",
):
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
result = asyncio.get_event_loop().run_until_complete(coro)
finally:
clear_runtime_main()
assert isinstance(result, dict), (
assert isinstance(result, dict), \
f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
)
assert result.get("_multimodal") is True
mock_aux.assert_not_called()
def test_native_mode_with_unsupported_transport_falls_through(self, tmp_path):
"""Explicit native mode still respects the transport gate."""
def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch):
"""Non-vision main model → fast path skipped, aux LLM path attempted."""
img = tmp_path / "x.png"
img.write_bytes(_TINY_PNG)
@ -182,27 +179,39 @@ class TestHandleVisionAnalyzeFastPath:
return '{"sentinel": "aux-path"}'
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
set_runtime_main("brand-new-provider", "opaque-model")
set_runtime_main("openrouter", "qwen/qwen3-coder")
try:
with (
patch(
"hermes_cli.config.load_config",
return_value={"agent": {"image_input_mode": "native"}},
),
patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux,
):
result = asyncio.get_event_loop().run_until_complete(
_handle_vision_analyze({"image_url": str(img), "question": "?"})
)
with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
result = asyncio.get_event_loop().run_until_complete(coro)
finally:
clear_runtime_main()
assert isinstance(result, str)
assert json.loads(result) == {"sentinel": "aux-path"}
mock_aux.assert_called_once()
assert not (isinstance(result, dict) and result.get("_multimodal") is True), \
"Fast path fired for non-vision model; should have fallen through to aux LLM"
def test_supports_vision_bypasses_transport_gate(self, tmp_path):
"""supports_vision=True enables fast path even on unknown providers."""
def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch):
    """A vision-capable model on an unknown provider must not take the fast path.

    The handler's result is checked only for *not* being a multimodal
    envelope; the aux path itself is stubbed with a sentinel coroutine.
    """
    image_path = tmp_path / "x.png"
    image_path.write_bytes(_TINY_PNG)

    async def _aux_sentinel(*args, **kwargs):
        return '{"sentinel": "aux-path"}'

    from agent.auxiliary_client import set_runtime_main, clear_runtime_main

    set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6")
    try:
        with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel):
            pending = _handle_vision_analyze({"image_url": str(image_path), "question": "?"})
            result = asyncio.get_event_loop().run_until_complete(pending)
    finally:
        clear_runtime_main()

    fast_path_fired = isinstance(result, dict) and result.get("_multimodal") is True
    assert not fast_path_fired, (
        "Fast path fired for unknown provider; should have fallen through"
    )
def test_supports_vision_override_bypasses_provider_allowlist(self, tmp_path):
"""supports_vision=true enables the fast path on an unlisted provider."""
img = tmp_path / "x.png"
img.write_bytes(_TINY_PNG)
@ -215,21 +224,19 @@ class TestHandleVisionAnalyzeFastPath:
with patch(
"hermes_cli.config.load_config",
return_value={"model": {"supports_vision": True}},
), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux:
result = asyncio.get_event_loop().run_until_complete(
_handle_vision_analyze({"image_url": str(img), "question": "?"})
)
), patch(
"tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel,
) as mock_aux:
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
result = asyncio.get_event_loop().run_until_complete(coro)
finally:
clear_runtime_main()
assert isinstance(result, dict), (
f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}"
)
assert result.get("_multimodal") is True
assert isinstance(result, dict) and result.get("_multimodal") is True
mock_aux.assert_not_called()
def test_text_mode_still_blocks_fast_path_when_supports_vision_true(self, tmp_path):
"""Routing mode wins over supports_vision when text mode was chosen."""
def test_text_mode_wins_over_supports_vision_override(self, tmp_path):
"""Explicit text routing blocks the fast path even with supports_vision."""
img = tmp_path / "x.png"
img.write_bytes(_TINY_PNG)
@ -239,19 +246,17 @@ class TestHandleVisionAnalyzeFastPath:
from agent.auxiliary_client import set_runtime_main, clear_runtime_main
set_runtime_main("brand-new-provider", "llava-v1.6")
try:
with (
patch(
"hermes_cli.config.load_config",
return_value={
"agent": {"image_input_mode": "text"},
"model": {"supports_vision": True},
},
),
patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux,
):
result = asyncio.get_event_loop().run_until_complete(
_handle_vision_analyze({"image_url": str(img), "question": "?"})
)
with patch(
"hermes_cli.config.load_config",
return_value={
"agent": {"image_input_mode": "text"},
"model": {"supports_vision": True},
},
), patch(
"tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel,
) as mock_aux:
coro = _handle_vision_analyze({"image_url": str(img), "question": "?"})
result = asyncio.get_event_loop().run_until_complete(coro)
finally:
clear_runtime_main()

View file

@ -62,7 +62,7 @@ import tempfile
import threading
import time
import requests
from typing import Dict, Any, Optional, List, Tuple
from typing import Dict, Any, Optional, List, Tuple, Union
from pathlib import Path
from agent.auxiliary_client import call_llm
from hermes_constants import get_hermes_home
@ -3044,16 +3044,16 @@ def browser_get_images(task_id: Optional[str] = None) -> str:
return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False)
def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str:
def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> Union[str, Dict[str, Any]]:
"""
Take a screenshot of the current page for visual inspection.
This tool captures what's visually displayed in the browser. When the
active model supports native vision, the screenshot is attached directly
to the conversation so the model can inspect it on the next turn.
Otherwise Hermes falls back to the auxiliary vision model. Useful for
understanding visual content that the text-based snapshot may not capture
(CAPTCHAs, verification challenges, images, complex layouts, etc.).
Captures what's visually displayed in the browser. When the active model
supports native vision, the screenshot is attached directly to the
conversation so the model can inspect it on the next turn; otherwise Hermes
falls back to the auxiliary vision model and returns a text analysis. Useful
for visual content the text-based snapshot may not capture (CAPTCHAs,
verification challenges, images, complex layouts, etc.).
The screenshot is saved persistently and its file path is returned so it
can be shared with users via MEDIA:<path> in the response.
@ -3064,8 +3064,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
task_id: Task identifier for session isolation
Returns:
Either a JSON string with vision analysis results and screenshot_path,
or a multimodal tool-result envelope with the screenshot and metadata.
A JSON string with vision analysis results and screenshot_path, or a
multimodal tool-result envelope carrying the screenshot and metadata.
"""
if _is_camofox_mode():
from tools.browser_camofox import camofox_vision
@ -3190,55 +3190,33 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
_screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii")
data_url = f"data:image/png;base64,{_screenshot_b64}"
# Fast path: when the active main model supports native vision AND the
# provider supports image content inside tool results, short-circuit
# the auxiliary LLM and return the image bytes as a multimodal
# tool-result envelope. The user can force native vision with the
# supports_vision override. The main model sees the pixels directly on its
# next turn — no aux call, no information loss, no extra latency.
try:
from agent.auxiliary_client import _read_main_model, _read_main_provider
from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
from hermes_cli.config import load_config
from tools.vision_tools import (
_build_native_vision_tool_result,
_supports_media_in_tool_results,
)
# Fast path: when native image routing is in effect for the active main
# model, attach the screenshot directly instead of describing it through
# an auxiliary vision LLM. The model inspects the pixels on its next
# turn — no aux call, no information loss. Consistent with vision_analyze.
from tools.vision_tools import (
_build_native_vision_tool_result,
_should_use_native_vision_fast_path,
)
_provider = _read_main_provider()
_model = _read_main_model()
_cfg = load_config()
_mode = decide_image_input_mode(_provider, _model, _cfg)
_supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
if _mode == "native" and (
_supports_media_in_tool_results(_provider, _model)
or _supports_vision
):
native_result = _build_native_vision_tool_result(
image_url=str(screenshot_path),
question=question,
image_data_url=data_url,
image_size_bytes=len(_screenshot_bytes),
)
native_result.setdefault("meta", {})
native_result["meta"]["screenshot_path"] = str(screenshot_path)
if _lp_fallback_warning:
native_result["meta"]["fallback_warning"] = _lp_fallback_warning
if annotate and result.get("data", {}).get("annotations"):
native_result["meta"]["annotations"] = result["data"]["annotations"]
text_parts = native_result.get("content") or []
if text_parts and isinstance(text_parts[0], dict) and text_parts[0].get("type") == "text":
text_parts[0]["text"] = (
str(text_parts[0].get("text", ""))
+ f"\n\nScreenshot path: {screenshot_path}"
)
native_result["text_summary"] = (
str(native_result.get("text_summary") or "")
+ f" Screenshot path: {screenshot_path}"
).strip()
return native_result
except Exception:
pass
if _should_use_native_vision_fast_path():
native_result = _build_native_vision_tool_result(
image_url=str(screenshot_path),
question=question,
image_data_url=data_url,
image_size_bytes=len(_screenshot_bytes),
)
meta = native_result.setdefault("meta", {})
meta["screenshot_path"] = str(screenshot_path)
if _lp_fallback_warning:
meta["fallback_warning"] = _lp_fallback_warning
if annotate and result.get("data", {}).get("annotations"):
meta["annotations"] = result["data"]["annotations"]
native_result["text_summary"] = (
f"{native_result.get('text_summary', '')} "
f"Screenshot path: {screenshot_path}"
).strip()
return native_result
vision_prompt = (
f"You are analyzing a screenshot of a web browser.\n\n"

View file

@ -476,6 +476,36 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool:
return False
def _should_use_native_vision_fast_path() -> bool:
    """Report whether vision tools should hand the image straight to the main model.

    The fast path applies only when image routing for the active main
    provider/model resolves to ``native`` and at least one of these holds:

    * ``_supports_media_in_tool_results`` accepts the provider/model, or
    * the ``supports_vision`` lookup for the provider/model is exactly ``True``
      (the user-declared override for custom/local providers that are not
      in the static allowlist).

    Returns:
        ``True`` to use the native fast path; ``False`` to fall back to the
        auxiliary vision LLM. Any exception raised while resolving the
        decision is logged at debug level and treated as ``False``.
    """
    try:
        # Imported lazily inside the guard so an import failure also falls
        # back to the aux-LLM path instead of propagating.
        from agent.auxiliary_client import _read_main_provider, _read_main_model
        from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
        from hermes_cli.config import load_config

        main_provider = _read_main_provider()
        main_model = _read_main_model()
        config = load_config()

        routing_mode = decide_image_input_mode(main_provider, main_model, config)
        if routing_mode != "native":
            return False

        provider_accepts_media = _supports_media_in_tool_results(main_provider, main_model)
        # Short-circuits: the override lookup only runs when the allowlist
        # check did not already pass.
        return provider_accepts_media or (
            _lookup_supports_vision(main_provider, main_model, config) is True
        )
    except Exception as exc:
        logger.debug("Native vision fast-path check failed: %s", exc)
        return False
def _build_native_vision_tool_result(
image_url: str,
question: str,
@ -1030,33 +1060,15 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
image_url = args.get("image_url", "")
question = args.get("question", "")
# Fast path: when the active main model supports native vision AND the
# provider supports image content inside tool results, short-circuit
# the auxiliary LLM and return the image bytes as a multimodal
# tool-result envelope. The user can force native vision with the
# supports_vision override. The main model sees the pixels directly on its
# next turn — no aux call, no information loss, no extra latency.
try:
from agent.auxiliary_client import _read_main_provider, _read_main_model
from agent.image_routing import decide_image_input_mode, _lookup_supports_vision
from hermes_cli.config import load_config
_provider = _read_main_provider()
_model = _read_main_model()
_cfg = load_config()
_mode = decide_image_input_mode(_provider, _model, _cfg)
_supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True
if _mode == "native" and (
_supports_media_in_tool_results(_provider, _model)
or _supports_vision
):
logger.info(
"vision_analyze: native fast path (provider=%s, model=%s)",
_provider, _model,
)
return _vision_analyze_native(image_url, question)
except Exception as exc:
logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc)
# Fast path: when native image routing is in effect for the active main
# model (provider accepts images in tool results, or the user set the
# model.supports_vision override), short-circuit the auxiliary LLM and
# return the image bytes as a multimodal tool-result envelope. The main
# model sees the pixels directly on its next turn — no aux call, no
# information loss, no extra latency.
if _should_use_native_vision_fast_path():
logger.info("vision_analyze: native fast path")
return _vision_analyze_native(image_url, question)
# Legacy path: aux LLM describes the image and we return its text.
full_prompt = (