test(computer_use): end-to-end regression for capture routing (#24015)

Add tests/tools/test_computer_use_capture_routing.py — 13 integration
tests that drive _capture_response end-to-end with deterministic stubs
for the routing helper, _run_async, vision_analyze_tool, and
get_hermes_dir, so the full code path is exercised without a live
cua-driver, real auxiliary client, or network access.

Coverage:

  * TestCaptureResponseDefaultPath (3 cases)
    - SOM PNG capture returns the legacy multimodal envelope when the
      routing helper says 'native' (image/png MIME).
    - Same path returns image/jpeg MIME for JPEG payloads (cua-driver
      can return either).
    - AX-only mode never even consults the routing helper because no
      PNG is present.

  * TestCaptureResponseRoutedToAuxVision (5 cases)
    - SOM capture with routing on returns a JSON string with the
      vision_analysis embedded, the AX/SOM index preserved, and NO
      image_url parts. Verifies the aux call receives a path under
      the configured cache and a prompt that grounds itself against
      the AX summary.
    - Temp screenshot file is unlinked after _capture_response returns,
      including when the aux call raises (the finally block runs).
    - Empty / malformed aux analysis falls back to the multimodal
      envelope so the user always gets *something* useful.

  * TestRoutingDecisionWiring (4 cases)
    - Explicit auxiliary.vision in config flips routing on regardless of
      main-model vision capability.
    - Vision-capable main + native tool-result support keeps multimodal.
    - Config load failure fails open (returns False, multimodal path
      continues to work).
    - Helper exception is swallowed and routes to legacy behaviour.

  * TestBugReproductionAnchor (1 case) - directly pins the #24015
    contract: when routing is on, the response must NEVER contain a
    'data:image' or 'image_url' substring. That is exactly what tripped
    the reporter's HTTP 404 ('No endpoints found that support image
    input') on tencent/hy3-preview before the fix.

Bug-reproduction proof:
  $ git checkout upstream/main -- tools/computer_use/tool.py
  $ scripts/run_tests.sh tests/tools/test_computer_use_capture_routing.py
  ============================== 13 failed in 1.29s ==============================

  $ # restore tool.py to this branch's HEAD
  $ scripts/run_tests.sh tests/tools/test_computer_use_capture_routing.py
  ============================== 13 passed in 1.04s ==============================

Total branch coverage:
  85 passed across test_computer_use.py, test_computer_use_vision_routing.py,
  test_computer_use_capture_routing.py
This commit is contained in:
xxxigm 2026-05-12 07:24:45 +07:00 committed by Teknium
parent e02a7e5e1c
commit bec2250d2c

View file

@ -0,0 +1,431 @@
"""End-to-end regression for #24015 — capture routing via auxiliary.vision.
When ``computer_use(action='capture', mode='som'|'vision')`` returns a
screenshot, ``_capture_response`` previously always returned a
``_multimodal`` envelope. For non-vision main models, or when the user
explicitly configured ``auxiliary.vision`` in ``config.yaml``, that
envelope tripped HTTP 404 / 400 at the provider boundary even though a
perfectly good vision backend was sitting in config waiting to be used.
This file exercises the integrated ``_capture_response`` flow with
deterministic stubs for:
* ``should_route_capture_to_aux_vision`` (the policy decision)
* ``_run_async`` (sync->async bridge)
* ``vision_analyze_tool`` (the aux LLM call)
* ``hermes_constants.get_hermes_dir`` (cache path)
so the full code path is covered without a live cua-driver, a real
auxiliary client, or network access.
"""
from __future__ import annotations
import base64
import json
import os
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
# ---------------------------------------------------------------------------
# Fixtures / helpers
# ---------------------------------------------------------------------------
# 1×1 PNG (transparent) — minimal bytes that decode cleanly.
_PNG_B64 = (
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
"NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
)
# 1×1 JPEG — used to verify mime detection works for either stream type.
_JPEG_B64 = (
"/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEB"
"AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/"
)
@pytest.fixture
def tmp_cache_dir(tmp_path):
"""Override get_hermes_dir so cache writes land under tmp_path."""
cache_dir = tmp_path / "cache_vision"
cache_dir.mkdir()
def _fake_get(*_args, **_kw):
return cache_dir
with patch("hermes_constants.get_hermes_dir", _fake_get):
yield cache_dir
def _make_capture(
*,
png_b64: str = _PNG_B64,
mode: str = "som",
elements=None,
app: str = "Safari",
window_title: str = "GitHub Issue #24015",
width: int = 1280,
height: int = 800,
):
from tools.computer_use.backend import CaptureResult, UIElement
elements = list(elements or [
UIElement(index=0, role="AXButton", label="Sign in",
bounds=(10, 20, 80, 30)),
UIElement(index=1, role="AXTextField", label="username",
bounds=(10, 60, 200, 24)),
])
raw = base64.b64decode(png_b64, validate=False)
return CaptureResult(
mode=mode,
width=width,
height=height,
png_b64=png_b64,
elements=elements,
app=app,
window_title=window_title,
png_bytes_len=len(raw),
)
def _stub_aux_analysis(text: str):
"""Return a fake vision_analyze_tool coroutine result (JSON envelope)."""
return json.dumps({"success": True, "analysis": text})
# ---------------------------------------------------------------------------
# _capture_response: routing OFF (current/native behaviour)
# ---------------------------------------------------------------------------
class TestCaptureResponseDefaultPath:
"""When routing helper says 'native', the existing multimodal envelope wins."""
def test_som_capture_returns_multimodal_envelope_when_native(self):
from tools.computer_use import tool as cu_tool
cap = _make_capture(png_b64=_PNG_B64, mode="som")
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=False):
resp = cu_tool._capture_response(cap)
assert isinstance(resp, dict)
assert resp.get("_multimodal") is True
# Image part must use image/png MIME for a PNG payload.
image_part = next(
p for p in resp["content"] if p.get("type") == "image_url"
)
url = image_part["image_url"]["url"]
assert url.startswith("data:image/png;base64,")
assert "vision_analysis" not in resp
def test_jpeg_capture_returns_image_jpeg_mime_when_native(self):
from tools.computer_use import tool as cu_tool
cap = _make_capture(png_b64=_JPEG_B64, mode="som")
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=False):
resp = cu_tool._capture_response(cap)
url = next(p for p in resp["content"] if p.get("type") == "image_url")
assert url["image_url"]["url"].startswith("data:image/jpeg;base64,")
def test_ax_only_capture_returns_text_regardless_of_routing(self):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="ax", png_b64="")
# ax mode never has a PNG so neither path matters; assert pure text.
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True) as routing:
resp = cu_tool._capture_response(cap)
# ax never even consults the routing helper — short-circuited above
# the image branch.
routing.assert_not_called()
assert isinstance(resp, str)
body = json.loads(resp)
assert body["mode"] == "ax"
# ---------------------------------------------------------------------------
# _capture_response: routing ON (the #24015 fix)
# ---------------------------------------------------------------------------
class TestCaptureResponseRoutedToAuxVision:
"""When routing helper says 'aux', the PNG is pre-analysed and a text
response is returned with no image_url parts at all."""
def test_som_capture_returns_text_with_vision_analysis(
self, tmp_cache_dir,
):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="som")
captured_calls = {}
def _fake_run_async(coro):
captured_calls["called"] = True
return _stub_aux_analysis(
"A Safari window showing a GitHub issue page with a 'Sign "
"in' button and a 'username' text field."
)
# vision_analyze_tool is async; force a sync MagicMock so we can
# assert positional args without dealing with awaitables.
fake_vat = MagicMock(return_value="<coro>")
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True), \
patch("model_tools._run_async", side_effect=_fake_run_async), \
patch("tools.vision_tools.vision_analyze_tool",
new_callable=lambda: fake_vat):
resp = cu_tool._capture_response(cap)
# Must be a JSON string, NOT a multimodal envelope. This is exactly
# the contract that prevents #24015's HTTP 404 from firing on the
# next agent turn.
assert isinstance(resp, str)
body = json.loads(resp)
assert body["mode"] == "som"
assert body["app"] == "Safari"
assert "Sign in" in body["vision_analysis"]
assert body["vision_analysis_routed_via"] == "auxiliary.vision"
# The original AX-only metadata (window title, element index, app)
# is preserved alongside the new vision analysis so the agent loses
# no context vs the multimodal path.
assert body["window_title"] == "GitHub Issue #24015"
assert len(body["elements"]) == 2
assert captured_calls.get("called") is True
# vision_analyze_tool was invoked with a path under the patched cache
# and a non-empty prompt.
args, _kwargs = fake_vat.call_args
path_arg, prompt_arg = args[0], args[1]
assert str(tmp_cache_dir) in path_arg
assert "macOS application screenshot" in prompt_arg
# AX summary is included so the aux model can ground its description
# against the same set-of-mark index the agent will see.
assert "Sign in" in prompt_arg
def test_temp_screenshot_file_is_cleaned_up_after_routing(
self, tmp_cache_dir,
):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="som")
# We capture the path the aux call sees so we can assert it's gone
# after _capture_response returns.
observed_path = {}
def _fake_run_async(_coro):
return _stub_aux_analysis("description goes here")
def _fake_vat(image_path, _prompt):
observed_path["path"] = image_path
# File must exist while aux is being arranged.
assert os.path.exists(image_path)
return "<coro>"
fake_vat = MagicMock(side_effect=_fake_vat)
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True), \
patch("model_tools._run_async", side_effect=_fake_run_async), \
patch("tools.vision_tools.vision_analyze_tool",
new_callable=lambda: fake_vat):
cu_tool._capture_response(cap)
# File must be unlinked after _capture_response returns.
assert observed_path["path"]
assert not os.path.exists(observed_path["path"])
def test_temp_file_cleaned_up_even_when_aux_call_raises(
self, tmp_cache_dir,
):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="som")
observed_path = {}
def _fake_vat(image_path, _prompt):
observed_path["path"] = image_path
return "<coro>"
def _fake_run_async(_coro):
raise RuntimeError("aux LLM down")
fake_vat = MagicMock(side_effect=_fake_vat)
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True), \
patch("model_tools._run_async", side_effect=_fake_run_async), \
patch("tools.vision_tools.vision_analyze_tool",
new_callable=lambda: fake_vat):
resp = cu_tool._capture_response(cap)
# Aux failure → fall back to multimodal envelope (so the user still
# gets *something* useful even if vision is broken).
assert isinstance(resp, dict)
assert resp.get("_multimodal") is True
# Temp file must still be cleaned up.
assert observed_path["path"]
assert not os.path.exists(observed_path["path"])
def test_empty_aux_analysis_falls_back_to_multimodal(self, tmp_cache_dir):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="som")
def _fake_run_async(_coro):
return _stub_aux_analysis("")
fake_vat = MagicMock(return_value="<coro>")
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True), \
patch("model_tools._run_async", side_effect=_fake_run_async), \
patch("tools.vision_tools.vision_analyze_tool",
new_callable=lambda: fake_vat):
resp = cu_tool._capture_response(cap)
# Empty analysis is treated as failure — we'd rather show pixels
# than embed an empty 'vision_analysis' string into the result.
assert isinstance(resp, dict)
assert resp.get("_multimodal") is True
def test_invalid_aux_response_falls_back_to_multimodal(self, tmp_cache_dir):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="som")
def _fake_run_async(_coro):
return 1234 # not a string at all
fake_vat = MagicMock(return_value="<coro>")
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True), \
patch("model_tools._run_async", side_effect=_fake_run_async), \
patch("tools.vision_tools.vision_analyze_tool",
new_callable=lambda: fake_vat):
resp = cu_tool._capture_response(cap)
assert isinstance(resp, dict)
assert resp.get("_multimodal") is True
# ---------------------------------------------------------------------------
# _should_route_through_aux_vision: end-to-end with real config plumbing
# ---------------------------------------------------------------------------
class TestRoutingDecisionWiring:
"""Verify _should_route_through_aux_vision wires the right config + helper."""
def test_explicit_aux_vision_in_config_routes_to_aux(self):
from tools.computer_use import tool as cu_tool
cfg = {
"model": {"default": "tencent/hy3-preview", "provider": "openrouter"},
"auxiliary": {
"vision": {
"provider": "openrouter",
"model": "google/gemini-2.5-flash",
}
},
}
with patch("agent.auxiliary_client._read_main_provider",
return_value="openrouter"), \
patch("agent.auxiliary_client._read_main_model",
return_value="tencent/hy3-preview"), \
patch("hermes_cli.config.load_config", return_value=cfg):
assert cu_tool._should_route_through_aux_vision() is True
def test_no_explicit_aux_and_vision_capable_main_keeps_multimodal(self):
from tools.computer_use import tool as cu_tool
cfg = {
"model": {"default": "claude-opus-4-5", "provider": "anthropic"},
}
with patch("agent.auxiliary_client._read_main_provider",
return_value="anthropic"), \
patch("agent.auxiliary_client._read_main_model",
return_value="claude-opus-4-5"), \
patch("hermes_cli.config.load_config", return_value=cfg), \
patch("tools.computer_use.vision_routing._lookup_supports_vision",
return_value=True), \
patch("tools.computer_use.vision_routing."
"_provider_accepts_multimodal_tool_result",
return_value=True):
assert cu_tool._should_route_through_aux_vision() is False
def test_config_load_failure_disables_routing_safely(self):
from tools.computer_use import tool as cu_tool
with patch("hermes_cli.config.load_config",
side_effect=RuntimeError("config.yaml unreadable")):
# No exception should bubble up — fail open by returning False
# so the legacy multimodal envelope continues to work.
assert cu_tool._should_route_through_aux_vision() is False
def test_helper_decision_exception_is_swallowed(self):
from tools.computer_use import tool as cu_tool
from tools.computer_use import vision_routing as vr_mod
with patch("agent.auxiliary_client._read_main_provider",
return_value="openrouter"), \
patch("agent.auxiliary_client._read_main_model",
return_value="x"), \
patch("hermes_cli.config.load_config", return_value={}), \
patch.object(vr_mod, "should_route_capture_to_aux_vision",
side_effect=ValueError("policy bug")):
assert cu_tool._should_route_through_aux_vision() is False
# ---------------------------------------------------------------------------
# Bug reproduction marker — proves the fix is needed.
# ---------------------------------------------------------------------------
class TestBugReproductionAnchor:
"""Without the fix, this test would assert the wrong thing.
On upstream/main HEAD prior to this branch, _capture_response returns a
multimodal envelope unconditionally so when a non-vision main model
is configured, the captured PNG is delivered to the main provider as
image_url content and the request is rejected with HTTP 404. We don't
have a live provider here, but we can pin the contract: with routing
enabled the response MUST be a JSON string with no image_url parts.
"""
def test_non_vision_main_model_never_returns_image_url_when_routed(
self, tmp_cache_dir,
):
from tools.computer_use import tool as cu_tool
cap = _make_capture(mode="som")
def _fake_run_async(_coro):
return _stub_aux_analysis(
"Screenshot showing a GitHub.com window with a sign-in "
"form."
)
fake_vat = MagicMock(return_value="<coro>")
with patch.object(cu_tool, "_should_route_through_aux_vision",
return_value=True), \
patch("model_tools._run_async", side_effect=_fake_run_async), \
patch("tools.vision_tools.vision_analyze_tool",
new_callable=lambda: fake_vat):
resp = cu_tool._capture_response(cap)
# Must be a string (text-only result).
assert isinstance(resp, str)
# Must NOT contain a base64 image URL anywhere — that's what tripped
# 'No endpoints found that support image input' on the reporter's
# main provider in #24015.
assert "data:image" not in resp
assert "image_url" not in resp