diff --git a/tools/computer_use/tool.py b/tools/computer_use/tool.py
index af7193e177b..4912b0f979a 100644
--- a/tools/computer_use/tool.py
+++ b/tools/computer_use/tool.py
@@ -429,6 +429,21 @@ def _capture_response(cap: CaptureResult) -> Any:
     summary = "\n".join(summary_lines)
 
     if cap.png_b64 and cap.mode != "ax":
+        # Decide whether to hand the screenshot to the auxiliary.vision
+        # pipeline (text-only result) or keep the multimodal envelope (main
+        # model handles vision natively). Issue #24015: previously the
+        # multimodal envelope was returned unconditionally, so non-vision
+        # main models tripped HTTP 404 / 400 at the provider boundary even
+        # when auxiliary.vision was explicitly configured to handle this.
+        if _should_route_through_aux_vision():
+            routed = _route_capture_through_aux_vision(cap, summary)
+            if routed is not None:
+                return routed
+            # Aux routing was requested but failed (no vision client, aux
+            # call raised, etc.). Fall through to the multimodal envelope —
+            # better to surface a tool-result error from the main model
+            # than to silently drop the screenshot entirely.
+
         # Detect actual image format from base64 magic bytes so the MIME type
         # matches what the data contains (cua-driver may return JPEG or PNG).
         # JPEG: base64 starts with /9j/ PNG: starts with iVBOR
@@ -457,6 +472,174 @@ def _capture_response(cap: CaptureResult) -> Any:
     })
 
 
+# ---------------------------------------------------------------------------
+# auxiliary.vision routing for captured screenshots (#24015)
+# ---------------------------------------------------------------------------
+
+def _should_route_through_aux_vision() -> bool:
+    """Return True when ``_capture_response`` should hand the PNG to aux vision.
+
+    Reads the active main provider/model and the loaded config and asks the
+    routing helper. Any failure (config import, runtime override missing,
+    etc.) returns False so the existing multimodal envelope continues to be
+    returned — fail open on the routing decision so a broken config can
+    never silently drop the screenshot for vision-capable main models.
+    """
+    # Deferred imports: a missing or broken module only disables routing.
+    try:
+        from agent.auxiliary_client import _read_main_model, _read_main_provider
+        from hermes_cli.config import load_config
+        from tools.computer_use.vision_routing import (
+            should_route_capture_to_aux_vision,
+        )
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug("computer_use: aux-vision routing import failed: %s", exc)
+        return False
+    try:
+        provider = _read_main_provider()
+        model = _read_main_model()
+        cfg = load_config()
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug("computer_use: aux-vision routing config read failed: %s", exc)
+        return False
+    try:
+        return bool(should_route_capture_to_aux_vision(provider, model, cfg))
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug("computer_use: aux-vision routing decision failed: %s", exc)
+        return False
+
+
+def _route_capture_through_aux_vision(
+    cap: CaptureResult,
+    summary: str,
+) -> Optional[str]:
+    """Pre-analyse the captured PNG via ``vision_analyze`` and return a text result.
+
+    The captured base64 image is decoded, written to a temporary file in the
+    directory returned by ``get_hermes_dir("cache/vision", ...)`` and handed
+    to ``vision_analyze_tool`` with a generic describe prompt that embeds
+    ``summary`` for cross-reference. The resulting description is attached
+    to the existing AX/SOM payload so the main model receives a single text
+    result that lists every element AND describes what the screenshot
+    looked like. The temporary file is removed again whether or not the
+    analysis succeeds.
+
+    Args:
+        cap: Capture result; ``png_b64`` holds the base64-encoded screenshot.
+        summary: AX/SOM summary text already built by the caller.
+
+    Returns:
+        A JSON-encoded text response on success.
+        ``None`` on failure — missing image, import/decode/analysis error,
+        a tool result flagged ``"success": false`` or an empty description —
+        so the caller falls back to the multimodal envelope.
+    """
+    if not cap.png_b64:
+        return None
+    try:
+        import base64 as _base64
+        import os as _os
+        import uuid as _uuid
+
+        from hermes_constants import get_hermes_dir
+        from model_tools import _run_async
+        from tools.vision_tools import vision_analyze_tool
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.debug("computer_use: aux-vision import failed: %s", exc)
+        return None
+
+    temp_image_path = None
+    try:
+        try:
+            raw = _base64.b64decode(cap.png_b64, validate=False)
+        except Exception as exc:
+            logger.debug("computer_use: failed to decode capture base64: %s", exc)
+            return None
+
+        # Pick an extension that matches the on-disk bytes so vision_analyze's
+        # MIME sniffing returns the right content-type.
+        ext = ".jpg" if cap.png_b64.startswith("/9j/") else ".png"
+        cache_dir = get_hermes_dir("cache/vision", "temp_vision_images")
+        # NOTE(review): nothing here shows get_hermes_dir creating the
+        # directory, so create it (idempotently) rather than letting the
+        # first capture on a fresh install fail at write_bytes().
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        temp_image_path = cache_dir / f"computer_use_{_uuid.uuid4().hex}{ext}"
+        temp_image_path.write_bytes(raw)
+
+        prompt = (
+            "Describe what is visible in this macOS application screenshot in "
+            "concise but specific terms. Mention the app name and window "
+            "title if visible, the overall layout, any labelled buttons, "
+            "menus or text fields, and any prominent text content the user "
+            "would need to know about. Do not invent details that are not "
+            "actually visible.\n\n"
+            f"AX/SOM index for cross-reference:\n{summary}"
+        )
+
+        result_json = _run_async(
+            vision_analyze_tool(str(temp_image_path), prompt)
+        )
+    except Exception as exc:
+        logger.warning(
+            "computer_use: auxiliary.vision pre-analysis failed (%s); "
+            "falling back to native multimodal envelope",
+            exc,
+        )
+        return None
+    finally:
+        if temp_image_path is not None:
+            try:
+                _os.unlink(str(temp_image_path))
+            except FileNotFoundError:
+                # write_bytes() never produced the file; nothing to remove.
+                pass
+            except Exception as exc:
+                # Cleanup stays best-effort, but leave a trace of leaked files.
+                logger.debug(
+                    "computer_use: failed to remove temp capture %s: %s",
+                    temp_image_path,
+                    exc,
+                )
+
+    analysis_text = ""
+    if isinstance(result_json, str):
+        try:
+            parsed = json.loads(result_json)
+        except (TypeError, json.JSONDecodeError):
+            # Not JSON: treat the raw string as the description itself.
+            analysis_text = result_json.strip()
+        else:
+            if isinstance(parsed, dict):
+                # NOTE(review): assumes vision_analyze_tool flags failures as
+                # {"success": false, ...}; confirm against tools.vision_tools.
+                # Without this guard an error placeholder in "analysis" would
+                # be presented to the main model as a real description.
+                if parsed.get("success") is False:
+                    logger.warning(
+                        "computer_use: auxiliary.vision reported failure (%s); "
+                        "falling back to native multimodal envelope",
+                        parsed.get("error"),
+                    )
+                    return None
+                analysis_text = str(parsed.get("analysis") or "").strip()
+
+    if not analysis_text:
+        return None
+
+    return json.dumps({
+        "mode": cap.mode,
+        "width": cap.width,
+        "height": cap.height,
+        "app": cap.app,
+        "window_title": cap.window_title,
+        "elements": [_element_to_dict(e) for e in cap.elements],
+        "summary": summary,
+        "vision_analysis": analysis_text,
+        "vision_analysis_routed_via": "auxiliary.vision",
+    })
+
+
 def _maybe_follow_capture(
     backend: ComputerUseBackend, res: ActionResult, do_capture: bool,
 ) -> Any: