mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-11 08:42:11 +00:00
fix(computer_use): honor custom vision routing
This commit is contained in:
parent
ffe665277c
commit
591e6fb8f4
6 changed files with 207 additions and 7 deletions
|
|
@ -126,6 +126,45 @@ def _parse_elements_from_tree(markdown: str) -> List[UIElement]:
|
|||
return elements
|
||||
|
||||
|
||||
def _image_dimensions_from_bytes(raw: bytes) -> Tuple[int, int]:
|
||||
"""Best-effort PNG/JPEG dimension sniffing without extra dependencies."""
|
||||
if raw.startswith(b"\x89PNG\r\n\x1a\n") and len(raw) >= 24:
|
||||
width = int.from_bytes(raw[16:20], "big")
|
||||
height = int.from_bytes(raw[20:24], "big")
|
||||
if width > 0 and height > 0:
|
||||
return width, height
|
||||
|
||||
if raw.startswith(b"\xff\xd8"):
|
||||
i = 2
|
||||
n = len(raw)
|
||||
while i + 9 < n:
|
||||
if raw[i] != 0xFF:
|
||||
i += 1
|
||||
continue
|
||||
marker = raw[i + 1]
|
||||
i += 2
|
||||
if marker in {0xD8, 0xD9} or 0xD0 <= marker <= 0xD7:
|
||||
continue
|
||||
if i + 2 > n:
|
||||
break
|
||||
segment_len = int.from_bytes(raw[i:i + 2], "big")
|
||||
if segment_len < 2 or i + segment_len > n:
|
||||
break
|
||||
if marker in {
|
||||
0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7,
|
||||
0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF,
|
||||
}:
|
||||
if segment_len >= 7:
|
||||
height = int.from_bytes(raw[i + 3:i + 5], "big")
|
||||
width = int.from_bytes(raw[i + 5:i + 7], "big")
|
||||
if width > 0 and height > 0:
|
||||
return width, height
|
||||
break
|
||||
i += segment_len
|
||||
|
||||
return 0, 0
|
||||
|
||||
|
||||
def _split_tree_text(full_text: str) -> Tuple[str, str]:
|
||||
"""Split get_window_state text into (summary_line, tree_markdown)."""
|
||||
lines = full_text.split("\n", 1)
|
||||
|
|
@ -491,7 +530,12 @@ class CuaDriverBackend(ComputerUseBackend):
|
|||
png_bytes_len = 0
|
||||
if png_b64:
|
||||
try:
|
||||
png_bytes_len = len(base64.b64decode(png_b64, validate=False))
|
||||
raw = base64.b64decode(png_b64, validate=False)
|
||||
png_bytes_len = len(raw)
|
||||
detected_width, detected_height = _image_dimensions_from_bytes(raw)
|
||||
if detected_width and detected_height:
|
||||
width = detected_width
|
||||
height = detected_height
|
||||
except Exception:
|
||||
png_bytes_len = len(png_b64) * 3 // 4
|
||||
|
||||
|
|
|
|||
|
|
@ -615,6 +615,7 @@ def _route_capture_through_aux_vision(
|
|||
# MIME sniffing returns the right content-type.
|
||||
ext = ".jpg" if cap.png_b64[:8].startswith("/9j/") else ".png"
|
||||
cache_dir = get_hermes_dir("cache/vision", "temp_vision_images")
|
||||
cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
temp_image_path = cache_dir / f"computer_use_{_uuid.uuid4().hex}{ext}"
|
||||
temp_image_path.write_bytes(raw)
|
||||
|
||||
|
|
|
|||
|
|
@ -28,6 +28,10 @@ Behaviour (mirrors ``vision_analyze`` for consistency)
|
|||
``provider``, ``model``, or ``base_url`` non-empty / not ``"auto"``),
|
||||
the screenshot is routed through the aux vision pipeline. Users who
|
||||
pay for a dedicated vision model usually want it used.
|
||||
* Otherwise, if the user explicitly declared the active model vision-capable
|
||||
via ``model.supports_vision`` / provider model config, return ``False``.
|
||||
This is the escape hatch for custom/local OpenAI-compatible VLM routes that
|
||||
are absent from models.dev and provider allowlists.
|
||||
* Otherwise, if the active main model+provider can carry an image inside
|
||||
a tool-result message AND the model reports ``supports_vision=True``
|
||||
in models.dev metadata, return ``False`` (use the multimodal path).
|
||||
|
|
@ -76,10 +80,52 @@ def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
|
|||
return True
|
||||
|
||||
|
||||
def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
|
||||
"""Return models.dev ``supports_vision`` for *(provider, model)* or None."""
|
||||
def _lookup_user_declared_supports_vision(
    provider: str,
    model: str,
    cfg: Optional[Dict[str, Any]],
) -> Optional[bool]:
    """Look up the config-declared ``supports_vision`` value for the active route.

    Delegates to ``agent.image_routing._supports_vision_override`` and passes
    its result through unchanged. Both the import and the call are guarded:
    any exception is logged at debug level and reported as ``None``.

    Args:
        provider: Provider identifier forwarded to the override lookup.
        model: Model identifier forwarded to the override lookup.
        cfg: Configuration mapping (or ``None``) forwarded to the lookup.

    Returns:
        Whatever the override helper returns, or ``None`` when the helper
        cannot be imported or raises.
    """
    # Imported lazily so a missing or broken helper module degrades to None
    # instead of raising out of this function.
    try:
        from agent.image_routing import (
            _supports_vision_override as override_lookup,
        )
    except Exception as import_error:  # pragma: no cover - defensive
        logger.debug(
            "computer_use vision_routing: config override lookup import failed: %s",
            import_error,
        )
        return None

    declared: Optional[bool] = None
    try:
        declared = override_lookup(cfg, provider, model)
    except Exception as lookup_error:  # pragma: no cover - defensive
        logger.debug(
            "computer_use vision_routing: config override lookup failed: %s",
            lookup_error,
        )
    return declared
|
||||
|
||||
|
||||
def _lookup_supports_vision(
|
||||
provider: str,
|
||||
model: str,
|
||||
cfg: Optional[Dict[str, Any]] = None,
|
||||
) -> Optional[bool]:
|
||||
"""Return config/models.dev ``supports_vision`` for *(provider, model)*."""
|
||||
if not provider or not model:
|
||||
return None
|
||||
try:
|
||||
from agent.image_routing import _lookup_supports_vision as _lookup_image_supports
|
||||
except Exception:
|
||||
_lookup_image_supports = None
|
||||
if _lookup_image_supports is not None:
|
||||
try:
|
||||
return _lookup_image_supports(provider, model, cfg)
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
logger.debug(
|
||||
"computer_use vision_routing: image-routing caps lookup failed "
|
||||
"for %s:%s — %s",
|
||||
provider, model, exc,
|
||||
)
|
||||
return None
|
||||
try:
|
||||
from agent.models_dev import get_model_capabilities
|
||||
caps = get_model_capabilities(provider, model)
|
||||
|
|
@ -137,11 +183,17 @@ def should_route_capture_to_aux_vision(
|
|||
if _explicit_aux_vision_override(cfg):
|
||||
return True
|
||||
|
||||
user_declared = _lookup_user_declared_supports_vision(provider, model, cfg)
|
||||
if user_declared is True:
|
||||
return False
|
||||
if user_declared is False:
|
||||
return True
|
||||
|
||||
accepts_tool_image = _provider_accepts_multimodal_tool_result(provider, model)
|
||||
if accepts_tool_image is None or accepts_tool_image is False:
|
||||
return True
|
||||
|
||||
supports_vision = _lookup_supports_vision(provider, model)
|
||||
supports_vision = _lookup_supports_vision(provider, model, cfg)
|
||||
if supports_vision is True:
|
||||
return False
|
||||
return True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue