mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
Add tests/tools/test_computer_use_capture_routing.py — 13 integration
tests that drive _capture_response end-to-end with deterministic stubs
for the routing helper, _run_async, vision_analyze_tool, and
get_hermes_dir, so the full code path is exercised without a live
cua-driver, real auxiliary client, or network access.
Coverage:
  * TestCaptureResponseDefaultPath (3 cases)
    - SOM PNG capture returns the legacy multimodal envelope when the
      routing helper says 'native' (image/png MIME).
    - Same path returns image/jpeg MIME for JPEG payloads (cua-driver
      can return either).
    - AX-only mode never even consults the routing helper because no
      PNG is present.
  * TestCaptureResponseRoutedToAuxVision (5 cases)
    - SOM capture with routing on returns a JSON string with the
      vision_analysis embedded, the AX/SOM index preserved, and NO
      image_url parts. Verifies the aux call receives a path under
      the configured cache and a prompt that grounds itself against
      the AX summary.
    - Temp screenshot file is unlinked after _capture_response returns,
      including when the aux call raises (the finally block runs).
    - Empty / malformed aux analysis falls back to the multimodal
      envelope so the user always gets *something* useful.
  * TestRoutingDecisionWiring (4 cases)
    - Explicit auxiliary.vision in config flips routing on regardless of
      main-model vision capability.
    - Vision-capable main + native tool-result support keeps multimodal.
    - Config load failure fails open (returns False, multimodal path
      continues to work).
    - Helper exception is swallowed and routes to legacy behaviour.
  * TestBugReproductionAnchor (1 case) - directly pins the #24015
    contract: when routing is on, the response must NEVER contain a
    'data:image' or 'image_url' substring. That is exactly what tripped
    the reporter's HTTP 404 ('No endpoints found that support image
    input') on tencent/hy3-preview before the fix.
Bug-reproduction proof:
$ git checkout upstream/main -- tools/computer_use/tool.py
$ scripts/run_tests.sh tests/tools/test_computer_use_capture_routing.py
============================== 13 failed in 1.29s ==============================
$ # restore tool.py to this branch's HEAD
$ scripts/run_tests.sh tests/tools/test_computer_use_capture_routing.py
============================== 13 passed in 1.04s ==============================
Total branch coverage:
85 passed across test_computer_use.py, test_computer_use_vision_routing.py,
test_computer_use_capture_routing.py
431 lines
17 KiB
Python
431 lines
17 KiB
Python
"""End-to-end regression for #24015 — capture routing via auxiliary.vision.
|
||
|
||
When ``computer_use(action='capture', mode='som'|'vision')`` returns a
screenshot, ``_capture_response`` previously always returned a
``_multimodal`` envelope. For non-vision main models, or when the user
explicitly configured ``auxiliary.vision`` in ``config.yaml``, that
envelope tripped HTTP 404 / 400 at the provider boundary even though a
perfectly good vision backend was sitting in config waiting to be used.

This file exercises the integrated ``_capture_response`` flow with
deterministic stubs for:

  * ``should_route_capture_to_aux_vision`` (the policy decision)
  * ``_run_async`` (sync->async bridge)
  * ``vision_analyze_tool`` (the aux LLM call)
  * ``hermes_constants.get_hermes_dir`` (cache path)

…so the full code path is covered without a live cua-driver, a real
auxiliary client, or network access.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import base64
|
||
import json
|
||
import os
|
||
from pathlib import Path
|
||
from typing import Any
|
||
from unittest.mock import MagicMock, patch
|
||
|
||
import pytest
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Fixtures / helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# 1×1 PNG (transparent) — minimal bytes that decode cleanly.
_PNG_B64 = (
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42m"
    "NkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="
)

# JPEG header fragment — used to verify mime detection works for either
# stream type.  The payload opens with the JPEG SOI / JFIF magic bytes but
# is truncated (it carries no end-of-image marker), so it is only suitable
# for MIME sniffing, not for decoding as an image.
_JPEG_B64 = (
    "/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEB"
    "AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/"
)
|
||
|
||
|
||
@pytest.fixture
def tmp_cache_dir(tmp_path):
    """Override get_hermes_dir so cache writes land under tmp_path.

    Creates ``<tmp_path>/cache_vision`` and, for the duration of the test,
    replaces ``hermes_constants.get_hermes_dir`` with a stub that returns
    that directory no matter which arguments it is called with.

    Yields:
        The temporary cache directory (a ``pathlib.Path`` under
        ``tmp_path``).
    """
    cache_dir = tmp_path / "cache_vision"
    cache_dir.mkdir()

    def _fake_get(*_args, **_kw):
        # Arguments are accepted and ignored so the stub is compatible
        # with whatever call signature the code under test uses.
        return cache_dir

    with patch("hermes_constants.get_hermes_dir", _fake_get):
        yield cache_dir
|
||
|
||
|
||
def _make_capture(
    *,
    png_b64: str = _PNG_B64,
    mode: str = "som",
    elements=None,
    app: str = "Safari",
    window_title: str = "GitHub – Issue #24015",
    width: int = 1280,
    height: int = 800,
):
    """Build a ``CaptureResult`` populated with deterministic test data.

    Args:
        png_b64: Base64-encoded screenshot payload; pass ``""`` for a
            capture that carries no image.
        mode: Capture mode string stored on the result.
        elements: Iterable of ``UIElement`` objects.  ``None`` (the
            default) selects a two-element sample (a "Sign in" button and
            a "username" text field); an explicit empty iterable is
            honoured and yields a capture with no elements.
        app: Application name stored on the result.
        window_title: Window title stored on the result.
        width: Capture width stored on the result.
        height: Capture height stored on the result.

    Returns:
        A ``CaptureResult`` whose ``png_bytes_len`` is the decoded length
        of ``png_b64``.
    """
    # Imported lazily, matching the tests, so importing this module does
    # not pull in the backend at collection time.
    from tools.computer_use.backend import CaptureResult, UIElement

    # Compare against None rather than relying on truthiness: ``[]`` is a
    # legitimate request for "no elements" and must not be replaced by the
    # sample set.
    if elements is None:
        elements = [
            UIElement(index=0, role="AXButton", label="Sign in",
                      bounds=(10, 20, 80, 30)),
            UIElement(index=1, role="AXTextField", label="username",
                      bounds=(10, 60, 200, 24)),
        ]
    elements = list(elements)

    raw = base64.b64decode(png_b64, validate=False)
    return CaptureResult(
        mode=mode,
        width=width,
        height=height,
        png_b64=png_b64,
        elements=elements,
        app=app,
        window_title=window_title,
        png_bytes_len=len(raw),
    )
|
||
|
||
|
||
def _stub_aux_analysis(text: str) -> str:
    """Return a fake vision_analyze_tool coroutine result (JSON envelope).

    Args:
        text: Value to place under the ``"analysis"`` key.

    Returns:
        A JSON string of the form
        ``{"success": true, "analysis": <text>}``.
    """
    return json.dumps({"success": True, "analysis": text})
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _capture_response: routing OFF (current/native behaviour)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestCaptureResponseDefaultPath:
    """When routing helper says 'native', the existing multimodal envelope wins."""

    def test_som_capture_returns_multimodal_envelope_when_native(self):
        """A PNG SOM capture with routing off yields a data:image/png part."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(png_b64=_PNG_B64, mode="som")
        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=False):
            resp = cu_tool._capture_response(cap)

        assert isinstance(resp, dict)
        assert resp.get("_multimodal") is True
        # Image part must use image/png MIME for a PNG payload.
        image_part = next(
            p for p in resp["content"] if p.get("type") == "image_url"
        )
        url = image_part["image_url"]["url"]
        assert url.startswith("data:image/png;base64,")
        assert "vision_analysis" not in resp

    def test_jpeg_capture_returns_image_jpeg_mime_when_native(self):
        """A JPEG payload with routing off yields a data:image/jpeg part."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(png_b64=_JPEG_B64, mode="som")
        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=False):
            resp = cu_tool._capture_response(cap)

        image_part = next(
            p for p in resp["content"] if p.get("type") == "image_url"
        )
        assert image_part["image_url"]["url"].startswith(
            "data:image/jpeg;base64,"
        )

    def test_ax_only_capture_returns_text_regardless_of_routing(self):
        """An AX capture without image data returns JSON text, unrouted."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="ax", png_b64="")
        # ax mode never has a PNG so neither path matters; assert pure text.
        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True) as routing:
            resp = cu_tool._capture_response(cap)

        # ax never even consults the routing helper — short-circuited above
        # the image branch.
        routing.assert_not_called()
        assert isinstance(resp, str)
        body = json.loads(resp)
        assert body["mode"] == "ax"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _capture_response: routing ON (the #24015 fix)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestCaptureResponseRoutedToAuxVision:
    """When routing helper says 'aux', the PNG is pre-analysed and a text
    response is returned with no image_url parts at all."""

    def test_som_capture_returns_text_with_vision_analysis(
        self, tmp_cache_dir,
    ):
        """Routed SOM capture returns JSON text carrying the aux analysis."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="som")

        captured_calls = {}

        def _fake_run_async(coro):
            captured_calls["called"] = True
            return _stub_aux_analysis(
                "A Safari window showing a GitHub issue page with a 'Sign "
                "in' button and a 'username' text field."
            )

        # vision_analyze_tool is async; force a sync MagicMock so we can
        # assert positional args without dealing with awaitables.
        fake_vat = MagicMock(return_value="<coro>")

        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True), \
                patch("model_tools._run_async", side_effect=_fake_run_async), \
                patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
            resp = cu_tool._capture_response(cap)

        # Must be a JSON string, NOT a multimodal envelope.  This is exactly
        # the contract that prevents #24015's HTTP 404 from firing on the
        # next agent turn.
        assert isinstance(resp, str)
        body = json.loads(resp)
        assert body["mode"] == "som"
        assert body["app"] == "Safari"
        assert "Sign in" in body["vision_analysis"]
        assert body["vision_analysis_routed_via"] == "auxiliary.vision"
        # The original AX-only metadata (window title, element index, app)
        # is preserved alongside the new vision analysis so the agent loses
        # no context vs the multimodal path.
        assert body["window_title"] == "GitHub – Issue #24015"
        assert len(body["elements"]) == 2

        assert captured_calls.get("called") is True
        # vision_analyze_tool was invoked with a path under the patched cache
        # and a non-empty prompt.
        args, _kwargs = fake_vat.call_args
        path_arg, prompt_arg = args[0], args[1]
        assert str(tmp_cache_dir) in path_arg
        assert "macOS application screenshot" in prompt_arg
        # AX summary is included so the aux model can ground its description
        # against the same set-of-mark index the agent will see.
        assert "Sign in" in prompt_arg

    def test_temp_screenshot_file_is_cleaned_up_after_routing(
        self, tmp_cache_dir,
    ):
        """The temp screenshot exists during the aux call and is gone after."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="som")
        # We capture the path the aux call sees so we can assert it's gone
        # after _capture_response returns.
        observed_path = {}

        def _fake_run_async(_coro):
            return _stub_aux_analysis("description goes here")

        def _fake_vat(image_path, _prompt):
            observed_path["path"] = image_path
            # File must exist while aux is being arranged.
            assert os.path.exists(image_path)
            return "<coro>"

        fake_vat = MagicMock(side_effect=_fake_vat)

        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True), \
                patch("model_tools._run_async", side_effect=_fake_run_async), \
                patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
            cu_tool._capture_response(cap)

        # File must be unlinked after _capture_response returns.  ``.get``
        # turns "the stub was never called" into an assertion failure
        # instead of a KeyError.
        assert observed_path.get("path"), "vision_analyze_tool was not called"
        assert not os.path.exists(observed_path["path"])

    def test_temp_file_cleaned_up_even_when_aux_call_raises(
        self, tmp_cache_dir,
    ):
        """An aux failure falls back to multimodal and still removes the file."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="som")
        observed_path = {}

        def _fake_vat(image_path, _prompt):
            observed_path["path"] = image_path
            return "<coro>"

        def _fake_run_async(_coro):
            raise RuntimeError("aux LLM down")

        fake_vat = MagicMock(side_effect=_fake_vat)

        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True), \
                patch("model_tools._run_async", side_effect=_fake_run_async), \
                patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
            resp = cu_tool._capture_response(cap)

        # Aux failure → fall back to multimodal envelope (so the user still
        # gets *something* useful even if vision is broken).
        assert isinstance(resp, dict)
        assert resp.get("_multimodal") is True
        # Temp file must still be cleaned up.
        assert observed_path.get("path"), "vision_analyze_tool was not called"
        assert not os.path.exists(observed_path["path"])

    def test_empty_aux_analysis_falls_back_to_multimodal(self, tmp_cache_dir):
        """An empty analysis string is treated as an aux failure."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="som")

        def _fake_run_async(_coro):
            return _stub_aux_analysis("")

        fake_vat = MagicMock(return_value="<coro>")

        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True), \
                patch("model_tools._run_async", side_effect=_fake_run_async), \
                patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
            resp = cu_tool._capture_response(cap)

        # Empty analysis is treated as failure — we'd rather show pixels
        # than embed an empty 'vision_analysis' string into the result.
        assert isinstance(resp, dict)
        assert resp.get("_multimodal") is True

    def test_invalid_aux_response_falls_back_to_multimodal(self, tmp_cache_dir):
        """A non-string aux result is treated as an aux failure."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="som")

        def _fake_run_async(_coro):
            return 1234  # not a string at all

        fake_vat = MagicMock(return_value="<coro>")

        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True), \
                patch("model_tools._run_async", side_effect=_fake_run_async), \
                patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
            resp = cu_tool._capture_response(cap)

        assert isinstance(resp, dict)
        assert resp.get("_multimodal") is True
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# _should_route_through_aux_vision: end-to-end with real config plumbing
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestRoutingDecisionWiring:
    """Verify _should_route_through_aux_vision wires the right config + helper."""

    def test_explicit_aux_vision_in_config_routes_to_aux(self):
        """An explicit ``auxiliary.vision`` config entry turns routing on."""
        from tools.computer_use import tool as cu_tool

        cfg = {
            "model": {"default": "tencent/hy3-preview", "provider": "openrouter"},
            "auxiliary": {
                "vision": {
                    "provider": "openrouter",
                    "model": "google/gemini-2.5-flash",
                }
            },
        }
        with patch("agent.auxiliary_client._read_main_provider",
                   return_value="openrouter"), \
                patch("agent.auxiliary_client._read_main_model",
                      return_value="tencent/hy3-preview"), \
                patch("hermes_cli.config.load_config", return_value=cfg):
            assert cu_tool._should_route_through_aux_vision() is True

    def test_no_explicit_aux_and_vision_capable_main_keeps_multimodal(self):
        """Vision-capable main model without aux config keeps routing off."""
        from tools.computer_use import tool as cu_tool

        cfg = {
            "model": {"default": "claude-opus-4-5", "provider": "anthropic"},
        }
        with patch("agent.auxiliary_client._read_main_provider",
                   return_value="anthropic"), \
                patch("agent.auxiliary_client._read_main_model",
                      return_value="claude-opus-4-5"), \
                patch("hermes_cli.config.load_config", return_value=cfg), \
                patch("tools.computer_use.vision_routing._lookup_supports_vision",
                      return_value=True), \
                patch("tools.computer_use.vision_routing."
                      "_provider_accepts_multimodal_tool_result",
                      return_value=True):
            assert cu_tool._should_route_through_aux_vision() is False

    def test_config_load_failure_disables_routing_safely(self):
        """A failing ``load_config`` yields False instead of raising."""
        from tools.computer_use import tool as cu_tool

        with patch("hermes_cli.config.load_config",
                   side_effect=RuntimeError("config.yaml unreadable")):
            # No exception should bubble up — fail open by returning False
            # so the legacy multimodal envelope continues to work.
            assert cu_tool._should_route_through_aux_vision() is False

    def test_helper_decision_exception_is_swallowed(self):
        """An exception inside the policy helper yields False."""
        from tools.computer_use import tool as cu_tool
        from tools.computer_use import vision_routing as vr_mod

        with patch("agent.auxiliary_client._read_main_provider",
                   return_value="openrouter"), \
                patch("agent.auxiliary_client._read_main_model",
                      return_value="x"), \
                patch("hermes_cli.config.load_config", return_value={}), \
                patch.object(vr_mod, "should_route_capture_to_aux_vision",
                             side_effect=ValueError("policy bug")):
            assert cu_tool._should_route_through_aux_vision() is False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Bug reproduction marker — proves the fix is needed.
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestBugReproductionAnchor:
    """Pins the #24015 contract; this test fails on code without the fix.

    On upstream/main HEAD prior to this branch, _capture_response returns a
    multimodal envelope unconditionally — so when a non-vision main model
    is configured, the captured PNG is delivered to the main provider as
    image_url content and the request is rejected with HTTP 404.  We don't
    have a live provider here, but we can pin the contract: with routing
    enabled the response MUST be a JSON string with no image_url parts.
    """

    def test_non_vision_main_model_never_returns_image_url_when_routed(
        self, tmp_cache_dir,
    ):
        """A routed capture is plain text with no embedded image reference."""
        from tools.computer_use import tool as cu_tool

        cap = _make_capture(mode="som")

        def _fake_run_async(_coro):
            return _stub_aux_analysis(
                "Screenshot showing a GitHub.com window with a sign-in "
                "form."
            )

        fake_vat = MagicMock(return_value="<coro>")

        with patch.object(cu_tool, "_should_route_through_aux_vision",
                          return_value=True), \
                patch("model_tools._run_async", side_effect=_fake_run_async), \
                patch("tools.vision_tools.vision_analyze_tool", new=fake_vat):
            resp = cu_tool._capture_response(cap)

        # Must be a string (text-only result).
        assert isinstance(resp, str)
        # Must NOT contain a base64 image URL anywhere — that's what tripped
        # 'No endpoints found that support image input' on the reporter's
        # main provider in #24015.
        assert "data:image" not in resp
        assert "image_url" not in resp
|