diff --git a/tests/tools/test_computer_use_capture_routing.py b/tests/tools/test_computer_use_capture_routing.py new file mode 100644 index 00000000000..44084fabbea --- /dev/null +++ b/tests/tools/test_computer_use_capture_routing.py @@ -0,0 +1,431 @@ +"""End-to-end regression for #24015 — capture routing via auxiliary.vision. + +When ``computer_use(action='capture', mode='som'|'vision')`` returns a +screenshot, ``_capture_response`` previously always returned a +``_multimodal`` envelope. For non-vision main models, or when the user +explicitly configured ``auxiliary.vision`` in ``config.yaml``, that +envelope tripped HTTP 404 / 400 at the provider boundary even though a +perfectly good vision backend was sitting in config waiting to be used. + +This file exercises the integrated ``_capture_response`` flow with +deterministic stubs for: + +* ``should_route_capture_to_aux_vision`` (the policy decision) +* ``_run_async`` (sync->async bridge) +* ``vision_analyze_tool`` (the aux LLM call) +* ``hermes_constants.get_hermes_dir`` (cache path) + +…so the full code path is covered without a live cua-driver, a real +auxiliary client, or network access. +""" + +from __future__ import annotations + +import base64 +import json +import os +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + + +# --------------------------------------------------------------------------- +# Fixtures / helpers +# --------------------------------------------------------------------------- + +# 1×1 PNG (transparent) — minimal bytes that decode cleanly. +_PNG_B64 = ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m" + "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" +) + +# 1×1 JPEG — used to verify mime detection works for either stream type. +_JPEG_B64 = ( + "/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEB" + "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/" +) + + +@pytest.fixture +def tmp_cache_dir(tmp_path): + """Override get_hermes_dir so cache writes land under tmp_path.""" + cache_dir = tmp_path / "cache_vision" + cache_dir.mkdir() + + def _fake_get(*_args, **_kw): + return cache_dir + + with patch("hermes_constants.get_hermes_dir", _fake_get): + yield cache_dir + + +def _make_capture( + *, + png_b64: str = _PNG_B64, + mode: str = "som", + elements=None, + app: str = "Safari", + window_title: str = "GitHub – Issue #24015", + width: int = 1280, + height: int = 800, +): + from tools.computer_use.backend import CaptureResult, UIElement + + elements = list(elements or [ + UIElement(index=0, role="AXButton", label="Sign in", + bounds=(10, 20, 80, 30)), + UIElement(index=1, role="AXTextField", label="username", + bounds=(10, 60, 200, 24)), + ]) + raw = base64.b64decode(png_b64, validate=False) + return CaptureResult( + mode=mode, + width=width, + height=height, + png_b64=png_b64, + elements=elements, + app=app, + window_title=window_title, + png_bytes_len=len(raw), + ) + + +def _stub_aux_analysis(text: str): + """Return a fake vision_analyze_tool coroutine result (JSON envelope).""" + return json.dumps({"success": True, "analysis": text}) + + +# --------------------------------------------------------------------------- +# _capture_response: routing OFF (current/native behaviour) +# --------------------------------------------------------------------------- + +class TestCaptureResponseDefaultPath: + """When routing helper says 'native', the existing multimodal envelope wins.""" + + def test_som_capture_returns_multimodal_envelope_when_native(self): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(png_b64=_PNG_B64, mode="som") + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=False): + resp = cu_tool._capture_response(cap) + + assert isinstance(resp, dict) + assert resp.get("_multimodal") is True + # Image part must use image/png MIME for a PNG payload. + image_part = next( + p for p in resp["content"] if p.get("type") == "image_url" + ) + url = image_part["image_url"]["url"] + assert url.startswith("data:image/png;base64,") + assert "vision_analysis" not in resp + + def test_jpeg_capture_returns_image_jpeg_mime_when_native(self): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(png_b64=_JPEG_B64, mode="som") + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=False): + resp = cu_tool._capture_response(cap) + + url = next(p for p in resp["content"] if p.get("type") == "image_url") + assert url["image_url"]["url"].startswith("data:image/jpeg;base64,") + + def test_ax_only_capture_returns_text_regardless_of_routing(self): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="ax", png_b64="") + # ax mode never has a PNG so neither path matters; assert pure text. + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True) as routing: + resp = cu_tool._capture_response(cap) + + # ax never even consults the routing helper — short-circuited above + # the image branch. + routing.assert_not_called() + assert isinstance(resp, str) + body = json.loads(resp) + assert body["mode"] == "ax" + + +# --------------------------------------------------------------------------- +# _capture_response: routing ON (the #24015 fix) +# --------------------------------------------------------------------------- + +class TestCaptureResponseRoutedToAuxVision: + """When routing helper says 'aux', the PNG is pre-analysed and a text + response is returned with no image_url parts at all.""" + + def test_som_capture_returns_text_with_vision_analysis( + self, tmp_cache_dir, + ): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="som") + + captured_calls = {} + + def _fake_run_async(coro): + captured_calls["called"] = True + return _stub_aux_analysis( + "A Safari window showing a GitHub issue page with a 'Sign " + "in' button and a 'username' text field." + ) + + # vision_analyze_tool is async; force a sync MagicMock so we can + # assert positional args without dealing with awaitables. + fake_vat = MagicMock(return_value="") + + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True), \ + patch("model_tools._run_async", side_effect=_fake_run_async), \ + patch("tools.vision_tools.vision_analyze_tool", + new_callable=lambda: fake_vat): + resp = cu_tool._capture_response(cap) + + # Must be a JSON string, NOT a multimodal envelope. This is exactly + # the contract that prevents #24015's HTTP 404 from firing on the + # next agent turn. + assert isinstance(resp, str) + body = json.loads(resp) + assert body["mode"] == "som" + assert body["app"] == "Safari" + assert "Sign in" in body["vision_analysis"] + assert body["vision_analysis_routed_via"] == "auxiliary.vision" + # The original AX-only metadata (window title, element index, app) + # is preserved alongside the new vision analysis so the agent loses + # no context vs the multimodal path. + assert body["window_title"] == "GitHub – Issue #24015" + assert len(body["elements"]) == 2 + + assert captured_calls.get("called") is True + # vision_analyze_tool was invoked with a path under the patched cache + # and a non-empty prompt. + args, _kwargs = fake_vat.call_args + path_arg, prompt_arg = args[0], args[1] + assert str(tmp_cache_dir) in path_arg + assert "macOS application screenshot" in prompt_arg + # AX summary is included so the aux model can ground its description + # against the same set-of-mark index the agent will see. + assert "Sign in" in prompt_arg + + def test_temp_screenshot_file_is_cleaned_up_after_routing( + self, tmp_cache_dir, + ): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="som") + # We capture the path the aux call sees so we can assert it's gone + # after _capture_response returns. + observed_path = {} + + def _fake_run_async(_coro): + return _stub_aux_analysis("description goes here") + + def _fake_vat(image_path, _prompt): + observed_path["path"] = image_path + # File must exist while aux is being arranged. + assert os.path.exists(image_path) + return "" + + fake_vat = MagicMock(side_effect=_fake_vat) + + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True), \ + patch("model_tools._run_async", side_effect=_fake_run_async), \ + patch("tools.vision_tools.vision_analyze_tool", + new_callable=lambda: fake_vat): + cu_tool._capture_response(cap) + + # File must be unlinked after _capture_response returns. + assert observed_path["path"] + assert not os.path.exists(observed_path["path"]) + + def test_temp_file_cleaned_up_even_when_aux_call_raises( + self, tmp_cache_dir, + ): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="som") + observed_path = {} + + def _fake_vat(image_path, _prompt): + observed_path["path"] = image_path + return "" + + def _fake_run_async(_coro): + raise RuntimeError("aux LLM down") + + fake_vat = MagicMock(side_effect=_fake_vat) + + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True), \ + patch("model_tools._run_async", side_effect=_fake_run_async), \ + patch("tools.vision_tools.vision_analyze_tool", + new_callable=lambda: fake_vat): + resp = cu_tool._capture_response(cap) + + # Aux failure → fall back to multimodal envelope (so the user still + # gets *something* useful even if vision is broken). + assert isinstance(resp, dict) + assert resp.get("_multimodal") is True + # Temp file must still be cleaned up. + assert observed_path["path"] + assert not os.path.exists(observed_path["path"]) + + def test_empty_aux_analysis_falls_back_to_multimodal(self, tmp_cache_dir): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="som") + + def _fake_run_async(_coro): + return _stub_aux_analysis("") + + fake_vat = MagicMock(return_value="") + + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True), \ + patch("model_tools._run_async", side_effect=_fake_run_async), \ + patch("tools.vision_tools.vision_analyze_tool", + new_callable=lambda: fake_vat): + resp = cu_tool._capture_response(cap) + + # Empty analysis is treated as failure — we'd rather show pixels + # than embed an empty 'vision_analysis' string into the result. + assert isinstance(resp, dict) + assert resp.get("_multimodal") is True + + def test_invalid_aux_response_falls_back_to_multimodal(self, tmp_cache_dir): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="som") + + def _fake_run_async(_coro): + return 1234 # not a string at all + + fake_vat = MagicMock(return_value="") + + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True), \ + patch("model_tools._run_async", side_effect=_fake_run_async), \ + patch("tools.vision_tools.vision_analyze_tool", + new_callable=lambda: fake_vat): + resp = cu_tool._capture_response(cap) + + assert isinstance(resp, dict) + assert resp.get("_multimodal") is True + + +# --------------------------------------------------------------------------- +# _should_route_through_aux_vision: end-to-end with real config plumbing +# --------------------------------------------------------------------------- + +class TestRoutingDecisionWiring: + """Verify _should_route_through_aux_vision wires the right config + helper.""" + + def test_explicit_aux_vision_in_config_routes_to_aux(self): + from tools.computer_use import tool as cu_tool + + cfg = { + "model": {"default": "tencent/hy3-preview", "provider": "openrouter"}, + "auxiliary": { + "vision": { + "provider": "openrouter", + "model": "google/gemini-2.5-flash", + } + }, + } + with patch("agent.auxiliary_client._read_main_provider", + return_value="openrouter"), \ + patch("agent.auxiliary_client._read_main_model", + return_value="tencent/hy3-preview"), \ + patch("hermes_cli.config.load_config", return_value=cfg): + assert cu_tool._should_route_through_aux_vision() is True + + def test_no_explicit_aux_and_vision_capable_main_keeps_multimodal(self): + from tools.computer_use import tool as cu_tool + + cfg = { + "model": {"default": "claude-opus-4-5", "provider": "anthropic"}, + } + with patch("agent.auxiliary_client._read_main_provider", + return_value="anthropic"), \ + patch("agent.auxiliary_client._read_main_model", + return_value="claude-opus-4-5"), \ + patch("hermes_cli.config.load_config", return_value=cfg), \ + patch("tools.computer_use.vision_routing._lookup_supports_vision", + return_value=True), \ + patch("tools.computer_use.vision_routing." + "_provider_accepts_multimodal_tool_result", + return_value=True): + assert cu_tool._should_route_through_aux_vision() is False + + def test_config_load_failure_disables_routing_safely(self): + from tools.computer_use import tool as cu_tool + + with patch("hermes_cli.config.load_config", + side_effect=RuntimeError("config.yaml unreadable")): + # No exception should bubble up — fail open by returning False + # so the legacy multimodal envelope continues to work. + assert cu_tool._should_route_through_aux_vision() is False + + def test_helper_decision_exception_is_swallowed(self): + from tools.computer_use import tool as cu_tool + from tools.computer_use import vision_routing as vr_mod + + with patch("agent.auxiliary_client._read_main_provider", + return_value="openrouter"), \ + patch("agent.auxiliary_client._read_main_model", + return_value="x"), \ + patch("hermes_cli.config.load_config", return_value={}), \ + patch.object(vr_mod, "should_route_capture_to_aux_vision", + side_effect=ValueError("policy bug")): + assert cu_tool._should_route_through_aux_vision() is False + + +# --------------------------------------------------------------------------- +# Bug reproduction marker — proves the fix is needed. +# --------------------------------------------------------------------------- + +class TestBugReproductionAnchor: + """Without the fix, this test would assert the wrong thing. + + On upstream/main HEAD prior to this branch, _capture_response returns a + multimodal envelope unconditionally — so when a non-vision main model + is configured, the captured PNG is delivered to the main provider as + image_url content and the request is rejected with HTTP 404. We don't + have a live provider here, but we can pin the contract: with routing + enabled the response MUST be a JSON string with no image_url parts. + """ + + def test_non_vision_main_model_never_returns_image_url_when_routed( + self, tmp_cache_dir, + ): + from tools.computer_use import tool as cu_tool + + cap = _make_capture(mode="som") + + def _fake_run_async(_coro): + return _stub_aux_analysis( + "Screenshot showing a GitHub.com window with a sign-in " + "form." + ) + + fake_vat = MagicMock(return_value="") + + with patch.object(cu_tool, "_should_route_through_aux_vision", + return_value=True), \ + patch("model_tools._run_async", side_effect=_fake_run_async), \ + patch("tools.vision_tools.vision_analyze_tool", + new_callable=lambda: fake_vat): + resp = cu_tool._capture_response(cap) + + # Must be a string (text-only result). + assert isinstance(resp, str) + # Must NOT contain a base64 image URL anywhere — that's what tripped + # 'No endpoints found that support image input' on the reporter's + # main provider in #24015. + assert "data:image" not in resp + assert "image_url" not in resp