From 2402ec5e7b251d115efdc24c231463f33b587902 Mon Sep 17 00:00:00 2001 From: tillfalko Date: Thu, 21 May 2026 17:32:40 +0200 Subject: [PATCH] test: extend test coverage to native image routing --- tests/tools/test_browser_console.py | 125 ++++++++++++++++++++ tests/tools/test_vision_native_fast_path.py | 103 +++++++++++----- 2 files changed, 201 insertions(+), 27 deletions(-) diff --git a/tests/tools/test_browser_console.py b/tests/tools/test_browser_console.py index 948a213ce1d..bc79b9de350 100644 --- a/tests/tools/test_browser_console.py +++ b/tests/tools/test_browser_console.py @@ -250,6 +250,131 @@ class TestBrowserVisionConfig: assert mock_llm.call_args.kwargs["temperature"] == 0.1 assert mock_llm.call_args.kwargs["timeout"] == 120.0 + def test_browser_vision_native_fast_path_returns_multimodal(self, tmp_path): + from agent.auxiliary_client import clear_runtime_main, set_runtime_main + from tools.browser_tool import browser_vision + + shots_dir, screenshot = self._setup_screenshot(tmp_path) + annotations = [{"id": 1, "label": "Search box"}] + set_runtime_main("brand-new-provider", "llava-v1.6") + try: + with ( + patch("hermes_constants.get_hermes_dir", return_value=shots_dir), + patch("tools.browser_tool._cleanup_old_screenshots"), + patch( + "tools.browser_tool._run_browser_command", + return_value={ + "success": True, + "data": { + "path": str(screenshot), + "annotations": annotations, + }, + }, + ), + patch( + "hermes_cli.config.load_config", + return_value={"model": {"supports_vision": True}}, + ), + patch("tools.browser_tool._get_vision_model") as mock_get_vision_model, + patch("tools.browser_tool.call_llm") as mock_llm, + ): + result = browser_vision( + "what is on the page?", annotate=True, task_id="test" + ) + finally: + clear_runtime_main() + + assert isinstance(result, dict) + assert result["_multimodal"] is True + assert result["meta"]["screenshot_path"] == str(screenshot) + assert result["meta"]["annotations"] == annotations + assert any(p.get("type") == "image_url" for p in result["content"]) + assert "what is on the page?" in result["content"][0]["text"] + assert str(screenshot) in result["content"][0]["text"] + assert "Screenshot path:" in result["text_summary"] + mock_get_vision_model.assert_not_called() + mock_llm.assert_not_called() + + def test_browser_vision_native_mode_without_supports_vision_uses_aux_llm(self, tmp_path): + from agent.auxiliary_client import clear_runtime_main, set_runtime_main + from tools.browser_tool import browser_vision + + shots_dir, screenshot = self._setup_screenshot(tmp_path) + mock_response = MagicMock() + mock_choice = MagicMock() + mock_choice.message.content = "Fallback screenshot analysis" + mock_response.choices = [mock_choice] + + set_runtime_main("brand-new-provider", "opaque-model") + try: + with ( + patch("hermes_constants.get_hermes_dir", return_value=shots_dir), + patch("tools.browser_tool._cleanup_old_screenshots"), + patch( + "tools.browser_tool._run_browser_command", + return_value={"success": True, "data": {"path": str(screenshot)}}, + ), + patch( + "hermes_cli.config.load_config", + return_value={"agent": {"image_input_mode": "native"}}, + ), + patch("tools.browser_tool._get_vision_model", return_value="test-model"), + patch("tools.browser_tool.call_llm", return_value=mock_response) as mock_llm, + ): + result = json.loads(browser_vision("what is on the page?", task_id="test")) + finally: + clear_runtime_main() + + assert result["success"] is True + assert result["analysis"] == "Fallback screenshot analysis" + assert result["screenshot_path"] == str(screenshot) + mock_llm.assert_called_once() + kwargs = mock_llm.call_args.kwargs + assert kwargs["task"] == "vision" + assert kwargs["model"] == "test-model" + assert kwargs["messages"][0]["content"][1]["type"] == "image_url" + assert kwargs["messages"][0]["content"][1]["image_url"]["url"].startswith( + "data:image/png;base64," + ) + + def test_browser_vision_text_mode_blocks_native_fast_path(self, tmp_path): + from agent.auxiliary_client import clear_runtime_main, set_runtime_main + from tools.browser_tool import browser_vision + + shots_dir, screenshot = self._setup_screenshot(tmp_path) + mock_response = MagicMock() + mock_choice = MagicMock() + mock_choice.message.content = "Text-mode screenshot analysis" + mock_response.choices = [mock_choice] + + set_runtime_main("brand-new-provider", "llava-v1.6") + try: + with ( + patch("hermes_constants.get_hermes_dir", return_value=shots_dir), + patch("tools.browser_tool._cleanup_old_screenshots"), + patch( + "tools.browser_tool._run_browser_command", + return_value={"success": True, "data": {"path": str(screenshot)}}, + ), + patch( + "hermes_cli.config.load_config", + return_value={ + "agent": {"image_input_mode": "text"}, + "model": {"supports_vision": True}, + }, + ), + patch("tools.browser_tool._get_vision_model", return_value="test-model"), + patch("tools.browser_tool.call_llm", return_value=mock_response) as mock_llm, + ): + result = json.loads(browser_vision("what is on the page?", task_id="test")) + finally: + clear_runtime_main() + + assert result["success"] is True + assert result["analysis"] == "Text-mode screenshot analysis" + assert result["screenshot_path"] == str(screenshot) + mock_llm.assert_called_once() + # ── auto-recording config ──────────────────────────────────────────── diff --git a/tests/tools/test_vision_native_fast_path.py b/tests/tools/test_vision_native_fast_path.py index 89b9724e254..1f2e9b4d4dc 100644 --- a/tests/tools/test_vision_native_fast_path.py +++ b/tests/tools/test_vision_native_fast_path.py @@ -146,32 +146,35 @@ class TestVisionAnalyzeNative: class TestHandleVisionAnalyzeFastPath: """Verify the dispatcher chooses fast-path vs aux-LLM correctly.""" - def test_vision_capable_main_model_uses_fast_path(self, tmp_path, monkeypatch): - """Main model supports native vision → fast path returns multimodal.""" + def test_native_mode_with_supported_transport_uses_fast_path(self, tmp_path): + """Explicit native mode + known transport returns multimodal.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) - # Set runtime override so the handler thinks we're on opus@openrouter + async def _aux_sentinel(*args, **kwargs): + return '{"sentinel": "aux-path"}' + from agent.auxiliary_client import set_runtime_main, clear_runtime_main set_runtime_main("openrouter", "anthropic/claude-opus-4.6") try: - # Mock decide_image_input_mode to always return "native" so the - # fast path fires regardless of model-catalog state in CI. with patch( - "agent.image_routing.decide_image_input_mode", - return_value="native", - ): - coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) - result = asyncio.get_event_loop().run_until_complete(coro) + "hermes_cli.config.load_config", + return_value={"agent": {"image_input_mode": "native"}}, + ), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux: + result = asyncio.get_event_loop().run_until_complete( + _handle_vision_analyze({"image_url": str(img), "question": "?"}) + ) finally: clear_runtime_main() - assert isinstance(result, dict), \ + assert isinstance(result, dict), ( f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}" + ) assert result.get("_multimodal") is True + mock_aux.assert_not_called() - def test_non_vision_main_model_falls_through_to_aux(self, tmp_path, monkeypatch): - """Non-vision main model → fast path skipped, aux LLM path attempted.""" + def test_native_mode_with_unsupported_transport_falls_through(self, tmp_path): + """Explicit native mode still respects the transport gate.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) @@ -179,19 +182,27 @@ class TestHandleVisionAnalyzeFastPath: return '{"sentinel": "aux-path"}' from agent.auxiliary_client import set_runtime_main, clear_runtime_main - set_runtime_main("openrouter", "qwen/qwen3-coder") + set_runtime_main("brand-new-provider", "opaque-model") try: - with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel): - coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) - result = asyncio.get_event_loop().run_until_complete(coro) + with ( + patch( + "hermes_cli.config.load_config", + return_value={"agent": {"image_input_mode": "native"}}, + ), + patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux, + ): + result = asyncio.get_event_loop().run_until_complete( + _handle_vision_analyze({"image_url": str(img), "question": "?"}) + ) finally: clear_runtime_main() - assert not (isinstance(result, dict) and result.get("_multimodal") is True), \ - "Fast path fired for non-vision model; should have fallen through to aux LLM" + assert isinstance(result, str) + assert json.loads(result) == {"sentinel": "aux-path"} + mock_aux.assert_called_once() - def test_fast_path_disabled_for_unsupported_provider(self, tmp_path, monkeypatch): - """Even with vision-capable model, unknown provider → fall through.""" + def test_supports_vision_bypasses_transport_gate(self, tmp_path): + """supports_vision=True enables fast path even on unknown providers.""" img = tmp_path / "x.png" img.write_bytes(_TINY_PNG) @@ -199,13 +210,51 @@ class TestHandleVisionAnalyzeFastPath: return '{"sentinel": "aux-path"}' from agent.auxiliary_client import set_runtime_main, clear_runtime_main - set_runtime_main("brand-new-provider", "anthropic/claude-opus-4.6") + set_runtime_main("brand-new-provider", "llava-v1.6") try: - with patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel): - coro = _handle_vision_analyze({"image_url": str(img), "question": "?"}) - result = asyncio.get_event_loop().run_until_complete(coro) + with patch( + "hermes_cli.config.load_config", + return_value={"model": {"supports_vision": True}}, + ), patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux: + result = asyncio.get_event_loop().run_until_complete( + _handle_vision_analyze({"image_url": str(img), "question": "?"}) + ) finally: clear_runtime_main() - assert not (isinstance(result, dict) and result.get("_multimodal") is True), \ - "Fast path fired for unknown provider; should have fallen through" + assert isinstance(result, dict), ( + f"Expected multimodal envelope, got {type(result).__name__}: {str(result)[:200]}" + ) + assert result.get("_multimodal") is True + mock_aux.assert_not_called() + + def test_text_mode_still_blocks_fast_path_when_supports_vision_true(self, tmp_path): + """Routing mode wins over supports_vision when text mode was chosen.""" + img = tmp_path / "x.png" + img.write_bytes(_TINY_PNG) + + async def _aux_sentinel(*args, **kwargs): + return '{"sentinel": "aux-path"}' + + from agent.auxiliary_client import set_runtime_main, clear_runtime_main + set_runtime_main("brand-new-provider", "llava-v1.6") + try: + with ( + patch( + "hermes_cli.config.load_config", + return_value={ + "agent": {"image_input_mode": "text"}, + "model": {"supports_vision": True}, + }, + ), + patch("tools.vision_tools.vision_analyze_tool", side_effect=_aux_sentinel) as mock_aux, + ): + result = asyncio.get_event_loop().run_until_complete( + _handle_vision_analyze({"image_url": str(img), "question": "?"}) + ) + finally: + clear_runtime_main() + + assert isinstance(result, str) + assert json.loads(result) == {"sentinel": "aux-path"} + mock_aux.assert_called_once()