From c77a697fa4f3a5bc839bf569d6405489b0301c09 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 03:40:26 -0700 Subject: [PATCH] refactor(vision): consolidate native fast-path gate into one shared helper The fast-path decision (native routing + provider allowlist OR supports_vision override) lived inline in vision_analyze and was copied into browser_vision. Extract it to _should_use_native_vision_fast_path() so both tools share one source of truth. - vision_tools: gate logic now one helper; vision_analyze calls it in 3 lines - browser_tool: thin envelope decoration over the shared helper, not a copy - browser_vision typed Union[str, Dict] to match its real return shape - tests slimmed to target the override path + text-mode-wins invariant --- tests/tools/test_browser_console.py | 58 +--------- tests/tools/test_vision_native_fast_path.py | 115 ++++++++++---------- tools/browser_tool.py | 94 ++++++---------- tools/vision_tools.py | 66 ++++++----- 4 files changed, 140 insertions(+), 193 deletions(-) diff --git a/tests/tools/test_browser_console.py b/tests/tools/test_browser_console.py index bc79b9de350..6b49087a696 100644 --- a/tests/tools/test_browser_console.py +++ b/tests/tools/test_browser_console.py @@ -251,6 +251,7 @@ class TestBrowserVisionConfig: assert mock_llm.call_args.kwargs["timeout"] == 120.0 def test_browser_vision_native_fast_path_returns_multimodal(self, tmp_path): + """supports_vision override → screenshot attached natively, no aux call.""" from agent.auxiliary_client import clear_runtime_main, set_runtime_main from tools.browser_tool import browser_vision @@ -265,10 +266,7 @@ class TestBrowserVisionConfig: "tools.browser_tool._run_browser_command", return_value={ "success": True, - "data": { - "path": str(screenshot), - "annotations": annotations, - }, + "data": {"path": str(screenshot), "annotations": annotations}, }, ), patch( @@ -278,9 +276,7 @@ class TestBrowserVisionConfig: patch("tools.browser_tool._get_vision_model") as mock_get_vision_model, patch("tools.browser_tool.call_llm") as mock_llm, ): - result = browser_vision( - "what is on the page?", annotate=True, task_id="test" - ) + result = browser_vision("what is on the page?", annotate=True, task_id="test") finally: clear_runtime_main() @@ -289,55 +285,12 @@ class TestBrowserVisionConfig: assert result["meta"]["screenshot_path"] == str(screenshot) assert result["meta"]["annotations"] == annotations assert any(p.get("type") == "image_url" for p in result["content"]) - assert "what is on the page?" in result["content"][0]["text"] - assert str(screenshot) in result["content"][0]["text"] - assert "Screenshot path:" in result["text_summary"] + assert f"Screenshot path: {screenshot}" in result["text_summary"] mock_get_vision_model.assert_not_called() mock_llm.assert_not_called() - def test_browser_vision_native_mode_without_supports_vision_uses_aux_llm(self, tmp_path): - from agent.auxiliary_client import clear_runtime_main, set_runtime_main - from tools.browser_tool import browser_vision - - shots_dir, screenshot = self._setup_screenshot(tmp_path) - mock_response = MagicMock() - mock_choice = MagicMock() - mock_choice.message.content = "Fallback screenshot analysis" - mock_response.choices = [mock_choice] - - set_runtime_main("brand-new-provider", "opaque-model") - try: - with ( - patch("hermes_constants.get_hermes_dir", return_value=shots_dir), - patch("tools.browser_tool._cleanup_old_screenshots"), - patch( - "tools.browser_tool._run_browser_command", - return_value={"success": True, "data": {"path": str(screenshot)}}, - ), - patch( - "hermes_cli.config.load_config", - return_value={"agent": {"image_input_mode": "native"}}, - ), - patch("tools.browser_tool._get_vision_model", return_value="test-model"), - patch("tools.browser_tool.call_llm", return_value=mock_response) as mock_llm, - ): - result = json.loads(browser_vision("what is on the page?", task_id="test")) - finally: - clear_runtime_main() - - assert result["success"] is True - assert result["analysis"] == "Fallback screenshot analysis" - assert result["screenshot_path"] == str(screenshot) - mock_llm.assert_called_once() - kwargs = mock_llm.call_args.kwargs - assert kwargs["task"] == "vision" - assert kwargs["model"] == "test-model" - assert kwargs["messages"][0]["content"][1]["type"] == "image_url" - assert kwargs["messages"][0]["content"][1]["image_url"]["url"].startswith( - "data:image/png;base64," - ) - def test_browser_vision_text_mode_blocks_native_fast_path(self, tmp_path): + """Explicit text routing → aux LLM used even with supports_vision.""" from agent.auxiliary_client import clear_runtime_main, set_runtime_main from tools.browser_tool import browser_vision @@ -372,7 +325,6 @@ class TestBrowserVisionConfig: assert result["success"] is True assert result["analysis"] == "Text-mode screenshot analysis" - assert result["screenshot_path"] == str(screenshot) mock_llm.assert_called_once() diff --git a/tests/tools/test_vision_native_fast_path.py b/tests/tools/test_vision_native_fast_path.py index 1f2e9b4d4dc..9916ca369d5 100644 --- a/tests/tools/test_vision_native_fast_path.py +++ b/tests/tools/test_vision_native_fast_path.py @@ -146,35 +146,32 @@ class TestVisionAnalyzeNative: class TestHandleVisionAnalyzeFastPath: """Verify the dispatcher chooses fast-path vs aux-LLM correctly.""" - def test_native_mode_with_supported_transport_uses_fast_path(self, tmp_path): - """Explicit native mode + known transport returns multimodal.""" + def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch): + """Main model supports native vision → fast path returns multimodal.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) - async def _aux_sentinel(*args, **kwargs): - return '{"sentinel": "aux-path"}' - + # Set runtime override so the handler thinks we're on opus@openrouter from agent.auxiliary_client import set_runtime_main, clear_runtime_main set_runtime_main("openrouter", "anthropic/claude-opus-4.6") try: + # Mock decide_image_input_mode to always return "native" so the + # fast path fires regardless of model-catalog state in CI. with patch( - "hermes_cli.config.load_config", - return_value={"agent": {"image_input_mode": "native"}}, - ), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux: - result = asyncio.get_event_loop().run_until_complete( - _handle_vision_analyze({"image_url": str(img), "question": "?"}) - ) + "agent.image_routing.decide_image_input_mode", + return_value="native", + ): + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) finally: clear_runtime_main() - assert isinstance(result, dict), ( + assert isinstance(result, dict), \ f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}" - ) assert result.get("_multimodal") is True - mock_aux.assert_not_called() - def test_native_mode_with_unsupported_transport_falls_through(self, tmp_path): - """Explicit native mode still respects the transport gate.""" + def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch): + """Non-vision main model → fast path skipped, aux LLM path attempted.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) @@ -182,27 +179,39 @@ class TestHandleVisionAnalyzeFastPath: return '{"sentinel": "aux-path"}' from agent.auxiliary_client import set_runtime_main, clear_runtime_main - set_runtime_main("brand-new-provider", "opaque-model") + set_runtime_main("openrouter", "qwen/qwen3-coder") try: - with ( - patch( - "hermes_cli.config.load_config", - return_value={"agent": {"image_input_mode": "native"}}, - ), - patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux, - ): - result = asyncio.get_event_loop().run_until_complete( - _handle_vision_analyze({"image_url": str(img), "question": "?"}) - ) + with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel): + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) finally: clear_runtime_main() - assert isinstance(result, str) - assert json.loads(result) == {"sentinel": "aux-path"} - mock_aux.assert_called_once() + assert not (isinstance(result, dict) and result.get("_multimodal") is True), \ + "Fast path fired for non-vision model; should have fallen through to aux LLM" - def test_supports_vision_bypasses_transport_gate(self, tmp_path): - """supports_vision=True enables fast path even on unknown providers.""" + def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch): + """Even with vision-capable model, unknown provider → fall through.""" + img = tmp_path / "x.png" + img.write_bytes(_TINY_PNG) + + async def _aux_sentinel(*args, **kwargs): + return '{"sentinel": "aux-path"}' + + from agent.auxiliary_client import set_runtime_main, clear_runtime_main + set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6") + try: + with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel): + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) + finally: + clear_runtime_main() + + assert not (isinstance(result, dict) and result.get("_multimodal") is True), \ + "Fast path fired for unknown provider; should have fallen through" + + def test_supports_vision_override_bypasses_provider_allowlist(self, tmp_path): + """supports_vision=true enables the fast path on an unlisted provider.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) @@ -215,21 +224,19 @@ class TestHandleVisionAnalyzeFastPath: with patch( "hermes_cli.config.load_config", return_value={"model": {"supports_vision": True}}, - ), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux: - result = asyncio.get_event_loop().run_until_complete( - _handle_vision_analyze({"image_url": str(img), "question": "?"}) - ) + ), patch( + "tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel, + ) as mock_aux: + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) finally: clear_runtime_main() - assert isinstance(result, dict), ( - f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}" - ) - assert result.get("_multimodal") is True + assert isinstance(result, dict) and result.get("_multimodal") is True mock_aux.assert_not_called() - def test_text_mode_still_blocks_fast_path_when_supports_vision_true(self, tmp_path): - """Routing mode wins over supports_vision when text mode was chosen.""" + def test_text_mode_wins_over_supports_vision_override(self, tmp_path): + """Explicit text routing blocks the fast path even with supports_vision.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) @@ -239,19 +246,17 @@ class TestHandleVisionAnalyzeFastPath: from agent.auxiliary_client import set_runtime_main, clear_runtime_main set_runtime_main("brand-new-provider", "llava-v1.6") try: - with ( - patch( - "hermes_cli.config.load_config", - return_value={ - "agent": {"image_input_mode": "text"}, - "model": {"supports_vision": True}, - }, - ), - patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux, - ): - result = asyncio.get_event_loop().run_until_complete( - _handle_vision_analyze({"image_url": str(img), "question": "?"}) - ) + with patch( + "hermes_cli.config.load_config", + return_value={ + "agent": {"image_input_mode": "text"}, + "model": {"supports_vision": True}, + }, + ), patch( + "tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel, + ) as mock_aux: + coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) + result = asyncio.get_event_loop().run_until_complete(coro) finally: clear_runtime_main() diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 05ba3921f5f..124c490730c 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -62,7 +62,7 @@ import tempfile import threading import time import requests -from typing import Dict, Any, Optional, List, Tuple +from typing import Dict, Any, Optional, List, Tuple, Union from pathlib import Path from agent.auxiliary_client import call_llm from hermes_constants import get_hermes_home @@ -3044,16 +3044,16 @@ def browser_get_images(task_id: Optional[str] = None) -> str: return json.dumps(_copy_fallback_warning(response, result), ensure_ascii=False) -def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> str: +def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] = None) -> Union[str, Dict[str, Any]]: """ Take a screenshot of the current page for visual inspection. - This tool captures what's visually displayed in the browser. When the - active model supports native vision, the screenshot is attached directly - to the conversation so the model can inspect it on the next turn. - Otherwise Hermes falls back to the auxiliary vision model. Useful for - understanding visual content that the text-based snapshot may not capture - (CAPTCHAs, verification challenges, images, complex layouts, etc.). + Captures what's visually displayed in the browser. When the active model + supports native vision, the screenshot is attached directly to the + conversation so the model can inspect it on the next turn; otherwise Hermes + falls back to the auxiliary vision model and returns a text analysis. Useful + for visual content the text-based snapshot may not capture (CAPTCHAs, + verification challenges, images, complex layouts, etc.). The screenshot is saved persistently and its file path is returned so it can be shared with users via MEDIA: in the response. @@ -3064,8 +3064,8 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] task_id: Task identifier for session isolation Returns: - Either a JSON string with vision analysis results and screenshot_path, - or a multimodal tool-result envelope with the screenshot and metadata. + A JSON string with vision analysis results and screenshot_path, or a + multimodal tool-result envelope carrying the screenshot and metadata. """ if _is_camofox_mode(): from tools.browser_camofox import camofox_vision @@ -3190,55 +3190,33 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] _screenshot_b64 = base64.b64encode(_screenshot_bytes).decode("ascii") data_url = f"data:image/png;base64,{_screenshot_b64}" - # Fast path: when the active main model supports native vision AND the - # provider supports image content inside tool results, short-circuit - # the auxiliary LLM and return the image bytes as a multimodal - # tool-result envelope. The user can force native vision with the - # supports_vision override. The main model sees the pixels directly on its - # next turn — no aux call, no information loss, no extra latency. - try: - from agent.auxiliary_client import _read_main_model, _read_main_provider - from agent.image_routing import decide_image_input_mode, _lookup_supports_vision - from hermes_cli.config import load_config - from tools.vision_tools import ( - _build_native_vision_tool_result, - _supports_media_in_tool_results, - ) + # Fast path: when native image routing is in effect for the active main + # model, attach the screenshot directly instead of describing it through + # an auxiliary vision LLM. The model inspects the pixels on its next + # turn — no aux call, no information loss. Consistent with vision_analyze. + from tools.vision_tools import ( + _build_native_vision_tool_result, + _should_use_native_vision_fast_path, + ) - _provider = _read_main_provider() - _model = _read_main_model() - _cfg = load_config() - _mode = decide_image_input_mode(_provider, _model, _cfg) - _supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True - if _mode == "native" and ( - _supports_media_in_tool_results(_provider, _model) - or _supports_vision - ): - native_result = _build_native_vision_tool_result( - image_url=str(screenshot_path), - question=question, - image_data_url=data_url, - image_size_bytes=len(_screenshot_bytes), - ) - native_result.setdefault("meta", {}) - native_result["meta"]["screenshot_path"] = str(screenshot_path) - if _lp_fallback_warning: - native_result["meta"]["fallback_warning"] = _lp_fallback_warning - if annotate and result.get("data", {}).get("annotations"): - native_result["meta"]["annotations"] = result["data"]["annotations"] - text_parts = native_result.get("content") or [] - if text_parts and isinstance(text_parts[0], dict) and text_parts[0].get("type") == "text": - text_parts[0]["text"] = ( - str(text_parts[0].get("text", "")) - + f"\n\nScreenshot path: {screenshot_path}" - ) - native_result["text_summary"] = ( - str(native_result.get("text_summary") or "") - + f" Screenshot path: {screenshot_path}" - ).strip() - return native_result - except Exception: - pass + if _should_use_native_vision_fast_path(): + native_result = _build_native_vision_tool_result( + image_url=str(screenshot_path), + question=question, + image_data_url=data_url, + image_size_bytes=len(_screenshot_bytes), + ) + meta = native_result.setdefault("meta", {}) + meta["screenshot_path"] = str(screenshot_path) + if _lp_fallback_warning: + meta["fallback_warning"] = _lp_fallback_warning + if annotate and result.get("data", {}).get("annotations"): + meta["annotations"] = result["data"]["annotations"] + native_result["text_summary"] = ( + f"{native_result.get('text_summary', '')} " + f"Screenshot path: {screenshot_path}" + ).strip() + return native_result vision_prompt = ( f"You are analyzing a screenshot of a web browser.\n\n" diff --git a/tools/vision_tools.py b/tools/vision_tools.py index a6a18449916..986f9dab984 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -476,6 +476,36 @@ def _supports_media_in_tool_results(provider: str, model: str) -> bool: return False +def _should_use_native_vision_fast_path() -> bool: + """Whether vision tools should attach the image to the main model directly + instead of routing through the auxiliary vision LLM. + + True when image routing resolves to ``native`` AND either the provider is + known to accept images inside tool results, or the user explicitly declared + the model vision-capable via the ``model.supports_vision`` config override. + The override is the escape hatch for custom/local providers that aren't in + the static allowlist. Best-effort: any resolution failure returns False so + the caller falls back to the legacy aux-LLM path. + """ + try: + from agent.auxiliary_client import _read_main_provider, _read_main_model + from agent.image_routing import decide_image_input_mode, _lookup_supports_vision + from hermes_cli.config import load_config + + provider = _read_main_provider() + model = _read_main_model() + cfg = load_config() + if decide_image_input_mode(provider, model, cfg) != "native": + return False + return ( + _supports_media_in_tool_results(provider, model) + or _lookup_supports_vision(provider, model, cfg) is True + ) + except Exception as exc: + logger.debug("Native vision fast-path check failed: %s", exc) + return False + + def _build_native_vision_tool_result( image_url: str, question: str, @@ -1030,33 +1060,15 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: image_url = args.get("image_url", "") question = args.get("question", "") - # Fast path: when the active main model supports native vision AND the - # provider supports image content inside tool results, short-circuit - # the auxiliary LLM and return the image bytes as a multimodal - # tool-result envelope. The user can force native vision with the - # supports_vision override. The main model sees the pixels directly on its - # next turn — no aux call, no information loss, no extra latency. - try: - from agent.auxiliary_client import _read_main_provider, _read_main_model - from agent.image_routing import decide_image_input_mode, _lookup_supports_vision - from hermes_cli.config import load_config - - _provider = _read_main_provider() - _model = _read_main_model() - _cfg = load_config() - _mode = decide_image_input_mode(_provider, _model, _cfg) - _supports_vision = _lookup_supports_vision(_provider, _model, _cfg) is True - if _mode == "native" and ( - _supports_media_in_tool_results(_provider, _model) - or _supports_vision - ): - logger.info( - "vision_analyze: native fast path (provider=%s, model=%s)", - _provider, _model, - ) - return _vision_analyze_native(image_url, question) - except Exception as exc: - logger.debug("Native vision fast-path check failed; using aux LLM: %s", exc) + # Fast path: when native image routing is in effect for the active main + # model (provider accepts images in tool results, or the user set the + # model.supports_vision override), short-circuit the auxiliary LLM and + # return the image bytes as a multimodal tool-result envelope. The main + # model sees the pixels directly on its next turn — no aux call, no + # information loss, no extra latency. + if _should_use_native_vision_fast_path(): + logger.info("vision_analyze: native fast path") + return _vision_analyze_native(image_url, question) # Legacy path: aux LLM describes the image and we return its text. full_prompt = (