fix(computer_use): route SOM/vision captures via auxiliary.vision (#24015)

When the active main model has no vision capability — or when the user
explicitly configured auxiliary.vision in config.yaml — sending the
captured screenshot back to the main model in a multimodal tool-result
envelope is the wrong move: it trips HTTP 404 / 400 at the provider
boundary (e.g. 'No endpoints found that support image input') and the
agent loop reports a hard tool failure for what should have been a
simple capture.

The reporter on #24015 hit this with:

  model:
    default: tencent/hy3-preview      # no vision support
    provider: openrouter
  auxiliary:
    vision:
      provider: openrouter
      model: google/gemini-2.5-flash  # explicitly configured

…and observed:

  computer_use(action='capture', mode='som')
  → ⚠️ API call failed (attempt 1/3): NotFoundError [HTTP 404]
     🔌 Provider: openrouter  Model: tencent/hy3-preview
     📝 Error: HTTP 404: No endpoints found that support image input

Fix: in tools/computer_use/tool.py::_capture_response, after a
screenshot is captured (modes 'som' / 'vision'), consult the routing
helper introduced earlier in this branch. When it says 'route to aux',
materialise the PNG to $HERMES_HOME/cache/vision/, run vision_analyze
on it (which honours auxiliary.vision via the standard async_call_llm
task='vision' router), and return a text-only JSON tool result that
embeds the analysis alongside the existing AX/SOM index. The main
model never sees the pixels — it sees an actionable text description
plus the same set-of-mark element index it normally uses.

The two new helpers (_should_route_through_aux_vision,
_route_capture_through_aux_vision) keep the policy and the IO
separated so each can be tested in isolation. Both fail open: if the
config import fails, if the aux call raises, or if the analysis is
empty, we fall back to the existing multimodal envelope so the
behaviour is at worst the pre-fix status quo. Temp screenshot files
are cleaned up unconditionally in a finally block — even on aux call
failure — to avoid leaving residue under cache/vision/.

The end-to-end regression for #24015 is added in the next commit.
This commit is contained in:
xxxigm 2026-05-12 07:18:09 +07:00 committed by Teknium
parent 5ce5fe3181
commit e02a7e5e1c

View file

@ -429,6 +429,21 @@ def _capture_response(cap: CaptureResult) -> Any:
summary = "\n".join(summary_lines)
if cap.png_b64 and cap.mode != "ax":
# Decide whether to hand the screenshot to the auxiliary.vision
# pipeline (text-only result) or keep the multimodal envelope (main
# model handles vision natively). Issue #24015: previously the
# multimodal envelope was returned unconditionally, so non-vision
# main models tripped HTTP 404 / 400 at the provider boundary even
# when auxiliary.vision was explicitly configured to handle this.
if _should_route_through_aux_vision():
routed = _route_capture_through_aux_vision(cap, summary)
if routed is not None:
return routed
# Aux routing was requested but failed (no vision client, aux
# call raised, etc.). Fall through to the multimodal envelope —
# better to surface a tool-result error from the main model
# than to silently drop the screenshot entirely.
# Detect actual image format from base64 magic bytes so the MIME type
# matches what the data contains (cua-driver may return JPEG or PNG).
# JPEG: base64 starts with /9j/ PNG: starts with iVBOR
@ -457,6 +472,140 @@ def _capture_response(cap: CaptureResult) -> Any:
})
# ---------------------------------------------------------------------------
# auxiliary.vision routing for captured screenshots (#24015)
# ---------------------------------------------------------------------------
def _should_route_through_aux_vision() -> bool:
    """Decide whether a captured screenshot should go through auxiliary vision.

    Looks up the active main provider and model together with the loaded
    config, then defers to ``should_route_capture_to_aux_vision`` for the
    verdict.

    The decision fails open: an import error, a failure while reading the
    provider/model/config, or an exception raised by the routing helper is
    logged at debug level and reported as ``False``, so the caller keeps
    returning the existing multimodal envelope.

    Returns:
        ``True`` only when the routing helper returns a truthy value;
        ``False`` otherwise, including on any failure along the way.
    """
    # One debug message per stage; ``stage`` records how far we got so the
    # single handler below can report which step actually failed.
    failure_logs = {
        "import": "computer_use: aux-vision routing import failed: %s",
        "config": "computer_use: aux-vision routing config read failed: %s",
        "decision": "computer_use: aux-vision routing decision failed: %s",
    }
    stage = "import"
    try:
        # Imported at call time so a failing import is caught here and
        # degrades to "do not route" instead of propagating.
        from agent.auxiliary_client import _read_main_model, _read_main_provider
        from hermes_cli.config import load_config
        from tools.computer_use.vision_routing import (
            should_route_capture_to_aux_vision,
        )

        stage = "config"
        main_provider = _read_main_provider()
        main_model = _read_main_model()
        loaded_config = load_config()

        stage = "decision"
        verdict = should_route_capture_to_aux_vision(
            main_provider, main_model, loaded_config
        )
        return bool(verdict)
    except Exception as err:  # pragma: no cover - defensive
        logger.debug(failure_logs[stage], err)
        return False
def _route_capture_through_aux_vision(
    cap: CaptureResult,
    summary: str,
) -> Optional[str]:
    """Pre-analyse the captured screenshot via ``vision_analyze_tool``.

    The base64 image in ``cap.png_b64`` is decoded and written to a uniquely
    named temp file in the directory returned by
    ``get_hermes_dir("cache/vision", "temp_vision_images")``. That file is
    handed to ``vision_analyze_tool`` together with a describe prompt that
    embeds ``summary``. The temp file is removed in a ``finally`` block
    whether or not the analysis succeeds.

    Args:
        cap: The capture whose image, metadata and elements are reported.
        summary: The text summary embedded in the prompt and echoed back in
            the result under the ``summary`` key.

    Returns:
        A JSON-encoded text result containing the capture metadata, the
        element list, ``summary`` and the analysis text on success.
        ``None`` on any failure (missing image, import failure, decode
        failure, analysis call raising, tool-reported failure, or empty
        analysis) so the caller can fall back to the multimodal envelope.
    """
    if not cap.png_b64:
        return None

    try:
        import base64 as _base64
        import os as _os
        import uuid as _uuid

        from hermes_constants import get_hermes_dir
        from model_tools import _run_async
        from tools.vision_tools import vision_analyze_tool
    except Exception as exc:  # pragma: no cover - defensive
        logger.debug("computer_use: aux-vision import failed: %s", exc)
        return None

    temp_image_path = None
    try:
        try:
            raw = _base64.b64decode(cap.png_b64, validate=False)
        except Exception as exc:
            logger.debug("computer_use: failed to decode capture base64: %s", exc)
            return None

        # Pick an extension that matches the on-disk bytes: base64-encoded
        # JPEG data starts with "/9j/"; anything else is treated as PNG.
        ext = ".jpg" if cap.png_b64.startswith("/9j/") else ".png"
        cache_dir = get_hermes_dir("cache/vision", "temp_vision_images")
        temp_image_path = cache_dir / f"computer_use_{_uuid.uuid4().hex}{ext}"
        temp_image_path.write_bytes(raw)

        prompt = (
            "Describe what is visible in this macOS application screenshot in "
            "concise but specific terms. Mention the app name and window "
            "title if visible, the overall layout, any labelled buttons, "
            "menus or text fields, and any prominent text content the user "
            "would need to know about. Do not invent details that are not "
            "actually visible.\n\n"
            f"AX/SOM index for cross-reference:\n{summary}"
        )
        result_json = _run_async(
            vision_analyze_tool(str(temp_image_path), prompt)
        )
    except Exception as exc:
        logger.warning(
            "computer_use: auxiliary.vision pre-analysis failed (%s); "
            "falling back to native multimodal envelope",
            exc,
        )
        return None
    finally:
        # Best-effort cleanup: never let a cleanup problem mask the real
        # outcome, but leave a trace when the file could not be removed.
        if temp_image_path is not None:
            try:
                _os.unlink(str(temp_image_path))
            except FileNotFoundError:
                # The write failed before the file was created.
                pass
            except Exception as exc:
                logger.debug(
                    "computer_use: failed to remove temp capture %s: %s",
                    temp_image_path,
                    exc,
                )

    analysis_text = ""
    if isinstance(result_json, str):
        try:
            parsed = json.loads(result_json)
        except (TypeError, json.JSONDecodeError):
            # Not JSON: treat the raw string as the analysis itself.
            analysis_text = result_json.strip()
        else:
            if isinstance(parsed, dict):
                # NOTE(review): assumes vision_analyze_tool reports failure
                # with ``"success": false`` (possibly alongside an
                # explanatory "analysis" string) - confirm against
                # tools.vision_tools. Such a payload is a failure, not a
                # description of the screenshot, so fail open.
                if parsed.get("success") is False:
                    logger.warning(
                        "computer_use: auxiliary.vision reported failure (%s); "
                        "falling back to native multimodal envelope",
                        parsed.get("error"),
                    )
                    return None
                analysis_text = str(parsed.get("analysis") or "").strip()

    if not analysis_text:
        return None

    return json.dumps({
        "mode": cap.mode,
        "width": cap.width,
        "height": cap.height,
        "app": cap.app,
        "window_title": cap.window_title,
        "elements": [_element_to_dict(e) for e in cap.elements],
        "summary": summary,
        "vision_analysis": analysis_text,
        "vision_analysis_routed_via": "auxiliary.vision",
    })
def _maybe_follow_capture(
backend: ComputerUseBackend, res: ActionResult, do_capture: bool,
) -> Any: