mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
When local Ollama models are absent from models.dev, probe the Ollama server's /api/show capabilities so attached images are routed natively instead of being stripped as non-vision input.
740 lines
28 KiB
Python
740 lines
28 KiB
Python
"""Routing helpers for inbound user-attached images.
|
|
|
|
Two modes:
|
|
|
|
native — attach images as OpenAI-style ``image_url`` content parts on the
|
|
user turn. Provider adapters (Anthropic, Gemini, Bedrock, Codex,
|
|
OpenAI chat.completions) already translate these into their
|
|
vendor-specific multimodal formats.
|
|
|
|
text — run ``vision_analyze`` on each image up-front and prepend the
|
|
description to the user's text. The model never sees the pixels;
|
|
it only sees a lossy text summary. This is the pre-existing
|
|
behaviour and still the right choice for non-vision models.
|
|
|
|
The decision is made once per message turn by :func:`decide_image_input_mode`.
|
|
It reads ``agent.image_input_mode`` from config.yaml (``auto`` | ``native``
|
|
| ``text``, default ``auto``) and the active model's capability metadata.
|
|
|
|
In ``auto`` mode:
|
|
- If the user has explicitly configured ``auxiliary.vision.provider``
|
|
(i.e. not ``auto`` and not empty), we assume they want the text pipeline
|
|
regardless of the main model — they've opted in to a specific vision
|
|
backend for a reason (cost, quality, local-only, etc.).
|
|
- Otherwise, if the active model reports ``supports_vision=True`` in its
|
|
models.dev metadata, we attach natively.
|
|
- Otherwise (non-vision model, no explicit override), we fall back to text.
|
|
|
|
This keeps ``vision_analyze`` surfaced as a tool in every session — skills
|
|
and agent flows that chain it (browser screenshots, deeper inspection of
|
|
URL-referenced images, style-gating loops) keep working. The routing only
|
|
affects *how user-attached images on the current turn* are presented to the
|
|
main model.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import logging
|
|
import mimetypes
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
_VALID_MODES = frozenset({"auto", "native", "text"})
|
|
|
|
|
|
# Image extensions used by extract_image_refs(). Kept tight on purpose — we
|
|
# only auto-attach things the model can actually see. Documents/archives are
|
|
# excluded because the gateway's broader extract_local_files() also routes
|
|
# them differently (send_document), and we don't want to attach a PDF as a
|
|
# vision part.
|
|
_IMAGE_EXTS = (
|
|
".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".heic",
|
|
)
|
|
_IMAGE_EXT_PATTERN = "|".join(e.lstrip(".") for e in _IMAGE_EXTS)
|
|
|
|
# Absolute / home-relative local image path. Matches the same shape gateway's
|
|
# extract_local_files() uses: anchors to ``~/`` or ``/``, ignores matches inside
|
|
# URLs (the ``(?<![/:\w.])`` lookbehind), and case-insensitive on the extension.
|
|
_LOCAL_IMAGE_PATH_RE = re.compile(
|
|
r"(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:" + _IMAGE_EXT_PATTERN + r")\b",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
# http(s) URL ending in an image extension (optionally followed by a
|
|
# query string). Case-insensitive on the extension. Strict ``http(s)://``
|
|
# scheme so we don't accidentally grab ``file://`` URLs or other shapes.
|
|
_IMAGE_URL_RE = re.compile(
|
|
r"https?://[^\s<>\"']+?\.(?:" + _IMAGE_EXT_PATTERN + r")(?:\?[^\s<>\"']*)?",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def extract_image_refs(text: str) -> Tuple[List[str], List[str]]:
|
|
"""Scan free-form text for image references the model should see.
|
|
|
|
Returns ``(local_paths, urls)``:
|
|
|
|
* ``local_paths`` — absolute (``/``) or home-relative (``~/``) paths
|
|
whose suffix is an image extension AND whose expanded form exists
|
|
on disk as a file. Order-preserving, deduplicated.
|
|
* ``urls`` — ``http(s)://…`` URLs whose path ends in an image
|
|
extension (a ``?query`` is allowed after the extension).
|
|
Order-preserving, deduplicated.
|
|
|
|
Matches inside fenced code blocks (``` ``` ```) and inline backticks
|
|
(`` `…` ``) are skipped so that snippets pasted into a task body for
|
|
reference aren't mistaken for live attachments. This mirrors the
|
|
behaviour of ``gateway.platforms.base.BaseAdapter.extract_local_files``.
|
|
|
|
Local paths are validated against the filesystem; URLs are not
|
|
(the provider fetches them at request time).
|
|
"""
|
|
if not isinstance(text, str) or not text:
|
|
return [], []
|
|
|
|
# Build spans covered by fenced code blocks and inline code so we can
|
|
# ignore references the author embedded purely as example text.
|
|
code_spans: list[tuple[int, int]] = []
|
|
for m in re.finditer(r"```[^\n]*\n.*?```", text, re.DOTALL):
|
|
code_spans.append((m.start(), m.end()))
|
|
for m in re.finditer(r"`[^`\n]+`", text):
|
|
code_spans.append((m.start(), m.end()))
|
|
|
|
def _in_code(pos: int) -> bool:
|
|
return any(s <= pos < e for s, e in code_spans)
|
|
|
|
local_paths: list[str] = []
|
|
seen_paths: set[str] = set()
|
|
for match in _LOCAL_IMAGE_PATH_RE.finditer(text):
|
|
if _in_code(match.start()):
|
|
continue
|
|
raw = match.group(0)
|
|
expanded = os.path.expanduser(raw)
|
|
try:
|
|
if not os.path.isfile(expanded):
|
|
continue
|
|
except OSError:
|
|
# ENAMETOOLONG / EINVAL on pathological inputs — skip rather than crash.
|
|
continue
|
|
if expanded in seen_paths:
|
|
continue
|
|
seen_paths.add(expanded)
|
|
local_paths.append(expanded)
|
|
|
|
urls: list[str] = []
|
|
seen_urls: set[str] = set()
|
|
for match in _IMAGE_URL_RE.finditer(text):
|
|
if _in_code(match.start()):
|
|
continue
|
|
url = match.group(0)
|
|
# Strip trailing punctuation that's almost certainly prose, not part
|
|
# of the URL (e.g. "see https://x.com/a.png." or "/a.png)").
|
|
url = url.rstrip(".,;:!?)]>")
|
|
if url in seen_urls:
|
|
continue
|
|
seen_urls.add(url)
|
|
urls.append(url)
|
|
|
|
return local_paths, urls
|
|
|
|
|
|
# Strict YAML/JSON boolean coercion for capability overrides.
|
|
#
|
|
# ``bool("false")`` is True in Python because non-empty strings are truthy, so
|
|
# a user writing ``supports_vision: "false"`` (quoted — a common YAML mistake)
|
|
# would silently enable native vision routing on a model that can't actually
|
|
# handle it. Accept only the values YAML 1.1 / 1.2 treat as booleans, plus
|
|
# real ``bool`` and integer 0/1. Anything else returns None so the caller
|
|
# falls through to models.dev rather than honouring garbage.
|
|
_TRUE_TOKENS = frozenset({"true", "yes", "on", "1"})
|
|
_FALSE_TOKENS = frozenset({"false", "no", "off", "0"})
|
|
|
|
|
|
def _coerce_capability_bool(raw: Any) -> Optional[bool]:
|
|
"""Return True/False for recognised boolean values, None otherwise."""
|
|
if isinstance(raw, bool):
|
|
return raw
|
|
if isinstance(raw, int):
|
|
if raw in (0, 1):
|
|
return bool(raw)
|
|
return None
|
|
if isinstance(raw, str):
|
|
s = raw.strip().lower()
|
|
if s in _TRUE_TOKENS:
|
|
return True
|
|
if s in _FALSE_TOKENS:
|
|
return False
|
|
return None
|
|
|
|
|
|
def _supports_vision_override(
|
|
cfg: Optional[Dict[str, Any]],
|
|
provider: str,
|
|
model: str,
|
|
) -> Optional[bool]:
|
|
"""Resolve user-declared vision capability from config.yaml.
|
|
|
|
Resolution order, first hit wins:
|
|
1. ``model.supports_vision`` (top-level shortcut for the active model)
|
|
2. ``providers.<provider>.models.<model>.supports_vision``
|
|
(named custom providers — ``provider`` may be the runtime-resolved
|
|
value ``"custom"`` and/or the user-declared name under
|
|
``model.provider``; both are tried)
|
|
|
|
Returns None when no override is set, so the caller falls through to
|
|
models.dev. Returns False explicitly only when the user wrote a
|
|
recognised boolean false token.
|
|
"""
|
|
if not isinstance(cfg, dict):
|
|
return None
|
|
|
|
# 1. Top-level shortcut
|
|
model_cfg_raw = cfg.get("model")
|
|
model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
|
|
top = _coerce_capability_bool(model_cfg.get("supports_vision"))
|
|
if top is not None:
|
|
return top
|
|
|
|
# 2. Per-provider, per-model. Named custom providers (e.g. "my-vllm")
|
|
# get rewritten to provider="custom" at runtime
|
|
# (hermes_cli/runtime_provider.py:_resolve_named_custom_runtime), so the
|
|
# config still holds the user-declared name under model.provider. Try
|
|
# both as candidate provider keys.
|
|
config_provider = str(model_cfg.get("provider") or "").strip()
|
|
providers_raw = cfg.get("providers")
|
|
providers_cfg: Dict[str, Any] = providers_raw if isinstance(providers_raw, dict) else {}
|
|
for p in dict.fromkeys(filter(None, (provider, config_provider))):
|
|
entry_raw = providers_cfg.get(p)
|
|
entry: Dict[str, Any] = entry_raw if isinstance(entry_raw, dict) else {}
|
|
models_raw = entry.get("models")
|
|
models_cfg: Dict[str, Any] = models_raw if isinstance(models_raw, dict) else {}
|
|
per_model_raw = models_cfg.get(model)
|
|
per_model: Dict[str, Any] = per_model_raw if isinstance(per_model_raw, dict) else {}
|
|
coerced = _coerce_capability_bool(per_model.get("supports_vision"))
|
|
if coerced is not None:
|
|
return coerced
|
|
|
|
# 2b. Legacy list-style custom_providers. Entries are dicts with a
|
|
# "name" key and a nested "models" dict. Match by provider name (which
|
|
# may appear as the raw name or "custom:<name>" at runtime).
|
|
custom_providers = cfg.get("custom_providers")
|
|
if isinstance(custom_providers, list):
|
|
# Build candidate names: the provider value and the config provider
|
|
# value, both raw and with "custom:" prefix stripped/added.
|
|
candidate_names: set = set()
|
|
for p in filter(None, (provider, config_provider)):
|
|
candidate_names.add(p)
|
|
if p.startswith("custom:"):
|
|
candidate_names.add(p[len("custom:"):])
|
|
else:
|
|
candidate_names.add(f"custom:{p}")
|
|
for entry_raw in custom_providers:
|
|
if not isinstance(entry_raw, dict):
|
|
continue
|
|
entry_name = str(entry_raw.get("name") or "").strip()
|
|
if entry_name not in candidate_names:
|
|
continue
|
|
models_raw = entry_raw.get("models")
|
|
models_cfg = models_raw if isinstance(models_raw, dict) else {}
|
|
per_model_raw = models_cfg.get(model)
|
|
per_model = per_model_raw if isinstance(per_model_raw, dict) else {}
|
|
coerced = _coerce_capability_bool(per_model.get("supports_vision"))
|
|
if coerced is not None:
|
|
return coerced
|
|
|
|
return None
|
|
|
|
|
|
def _resolve_inference_base_url(
|
|
cfg: Optional[Dict[str, Any]],
|
|
provider: str,
|
|
) -> str:
|
|
"""Best-effort base URL for the active inference provider."""
|
|
try:
|
|
from agent.auxiliary_client import _RUNTIME_MAIN_BASE_URL
|
|
|
|
runtime = str(_RUNTIME_MAIN_BASE_URL or "").strip()
|
|
if runtime:
|
|
return runtime
|
|
except Exception:
|
|
pass
|
|
|
|
if not isinstance(cfg, dict):
|
|
return ""
|
|
|
|
model_cfg_raw = cfg.get("model")
|
|
model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
|
|
base_url = str(model_cfg.get("base_url") or "").strip()
|
|
if base_url:
|
|
return base_url
|
|
|
|
config_provider = str(model_cfg.get("provider") or "").strip()
|
|
candidate_names: set[str] = set()
|
|
for p in filter(None, (provider, config_provider)):
|
|
candidate_names.add(p)
|
|
if p.lower().startswith("custom:"):
|
|
candidate_names.add(p.split(":", 1)[1])
|
|
else:
|
|
candidate_names.add(f"custom:{p}")
|
|
|
|
providers_cfg = cfg.get("providers")
|
|
if isinstance(providers_cfg, dict):
|
|
for name in candidate_names:
|
|
entry = providers_cfg.get(name)
|
|
if isinstance(entry, dict):
|
|
bu = str(entry.get("base_url") or "").strip()
|
|
if bu:
|
|
return bu
|
|
|
|
custom_providers = cfg.get("custom_providers")
|
|
if isinstance(custom_providers, list):
|
|
lowered = {n.lower() for n in candidate_names}
|
|
for entry_raw in custom_providers:
|
|
if not isinstance(entry_raw, dict):
|
|
continue
|
|
entry_name = str(entry_raw.get("name") or "").strip()
|
|
if entry_name not in candidate_names and entry_name.lower() not in lowered:
|
|
continue
|
|
bu = str(entry_raw.get("base_url") or "").strip()
|
|
if bu:
|
|
return bu
|
|
|
|
return ""
|
|
|
|
|
|
def _should_probe_ollama_vision(provider: str, base_url: str) -> bool:
|
|
"""True when the active provider likely fronts a local Ollama server."""
|
|
p = (provider or "").strip().lower()
|
|
if p == "ollama":
|
|
return True
|
|
if not base_url:
|
|
return False
|
|
try:
|
|
from agent.model_metadata import detect_local_server_type
|
|
|
|
return detect_local_server_type(base_url) == "ollama"
|
|
except Exception:
|
|
return False
|
|
|
|
|
|
def _coerce_mode(raw: Any) -> str:
|
|
"""Normalize a config value into one of the valid modes."""
|
|
if not isinstance(raw, str):
|
|
return "auto"
|
|
val = raw.strip().lower()
|
|
if val in _VALID_MODES:
|
|
return val
|
|
return "auto"
|
|
|
|
|
|
def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
|
|
"""True when the user configured a specific auxiliary vision backend.
|
|
|
|
An explicit override means the user *wants* the text pipeline (they're
|
|
paying for a dedicated vision model), so we don't silently bypass it.
|
|
"""
|
|
if not isinstance(cfg, dict):
|
|
return False
|
|
aux = cfg.get("auxiliary") or {}
|
|
if not isinstance(aux, dict):
|
|
return False
|
|
vision = aux.get("vision") or {}
|
|
if not isinstance(vision, dict):
|
|
return False
|
|
|
|
provider = str(vision.get("provider") or "").strip().lower()
|
|
model = str(vision.get("model") or "").strip()
|
|
base_url = str(vision.get("base_url") or "").strip()
|
|
|
|
# "auto" / "" / blank = not explicit
|
|
if provider in {"", "auto"} and not model and not base_url:
|
|
return False
|
|
return True
|
|
|
|
|
|
def _lookup_supports_vision(
|
|
provider: str,
|
|
model: str,
|
|
cfg: Optional[Dict[str, Any]] = None,
|
|
) -> Optional[bool]:
|
|
"""Return True/False if we can resolve caps, None if unknown.
|
|
|
|
Consults the user's ``supports_vision`` override in config.yaml first
|
|
(so custom/local models declared as vision-capable don't fall through to
|
|
text routing in ``auto`` mode), then falls back to models.dev.
|
|
"""
|
|
override = _supports_vision_override(cfg, provider, model)
|
|
if override is not None:
|
|
return override
|
|
if not provider or not model:
|
|
return None
|
|
caps = None
|
|
try:
|
|
from agent.models_dev import get_model_capabilities
|
|
caps = get_model_capabilities(provider, model)
|
|
except Exception as exc: # pragma: no cover - defensive
|
|
logger.debug("image_routing: caps lookup failed for %s:%s — %s", provider, model, exc)
|
|
if caps is not None:
|
|
return bool(caps.supports_vision)
|
|
|
|
base_url = _resolve_inference_base_url(cfg, provider)
|
|
if not base_url and (provider or "").strip().lower() == "ollama":
|
|
base_url = "http://localhost:11434/v1"
|
|
if _should_probe_ollama_vision(provider, base_url):
|
|
try:
|
|
from agent.model_metadata import query_ollama_supports_vision
|
|
|
|
ollama_vision = query_ollama_supports_vision(model, base_url)
|
|
if ollama_vision is not None:
|
|
return ollama_vision
|
|
except Exception as exc: # pragma: no cover - defensive
|
|
logger.debug(
|
|
"image_routing: ollama vision probe failed for %s:%s — %s",
|
|
provider,
|
|
model,
|
|
exc,
|
|
)
|
|
return None
|
|
|
|
|
|
def decide_image_input_mode(
|
|
provider: str,
|
|
model: str,
|
|
cfg: Optional[Dict[str, Any]],
|
|
) -> str:
|
|
"""Return ``"native"`` or ``"text"`` for the given turn.
|
|
|
|
Args:
|
|
provider: active inference provider ID (e.g. ``"anthropic"``, ``"openrouter"``).
|
|
model: active model slug as it would be sent to the provider.
|
|
cfg: loaded config.yaml dict, or None. When None, behaves as auto.
|
|
"""
|
|
mode_cfg = "auto"
|
|
if isinstance(cfg, dict):
|
|
agent_cfg = cfg.get("agent") or {}
|
|
if isinstance(agent_cfg, dict):
|
|
mode_cfg = _coerce_mode(agent_cfg.get("image_input_mode"))
|
|
|
|
if mode_cfg == "native":
|
|
return "native"
|
|
if mode_cfg == "text":
|
|
return "text"
|
|
|
|
# auto
|
|
if _explicit_aux_vision_override(cfg):
|
|
return "text"
|
|
|
|
supports = _lookup_supports_vision(provider, model, cfg)
|
|
if supports is True:
|
|
return "native"
|
|
return "text"
|
|
|
|
|
|
# Image size handling is REACTIVE rather than proactive: we attempt native
|
|
# attachment at full size regardless of provider, and rely on
|
|
# ``run_agent._try_shrink_image_parts_in_messages`` to shrink + retry if
|
|
# the provider rejects the request (e.g. Anthropic's hard 5 MB per-image
|
|
# ceiling returned as HTTP 400 "image exceeds 5 MB maximum").
|
|
#
|
|
# Why reactive: our knowledge of provider ceilings is partial and evolving
|
|
# (OpenAI accepts 49 MB+, Anthropic 5 MB, Gemini 100 MB, others unknown).
|
|
# A proactive per-provider table would be stale the moment a provider raises
|
|
# or lowers its limit, and silently degrading quality for users on providers
|
|
# that would have accepted the full image is the worse failure mode.
|
|
# The shrink-on-reject path loses 1 API call + maybe 1s of Pillow work when
|
|
# it fires, which is cheaper than permanent quality loss.
|
|
|
|
|
|
def _sniff_mime_from_bytes(raw: bytes) -> Optional[str]:
|
|
"""Detect image MIME from magic bytes. Returns None if unrecognised.
|
|
|
|
Filename-based detection (``mimetypes.guess_type``) is unreliable when
|
|
upstream platforms lie about content-type. Discord, for example, can
|
|
serve a PNG with ``content_type=image/webp`` for proxied/animated
|
|
stickers, custom emoji previews, or images uploaded via certain bots.
|
|
Anthropic strictly validates that declared media_type matches the
|
|
actual bytes and returns HTTP 400 on mismatch, so we sniff to be safe.
|
|
"""
|
|
if not raw:
|
|
return None
|
|
# PNG: 89 50 4E 47 0D 0A 1A 0A
|
|
if raw.startswith(b"\x89PNG\r\n\x1a\n"):
|
|
return "image/png"
|
|
# JPEG: FF D8 FF
|
|
if raw.startswith(b"\xff\xd8\xff"):
|
|
return "image/jpeg"
|
|
# GIF87a / GIF89a
|
|
if raw[:6] in {b"GIF87a", b"GIF89a"}:
|
|
return "image/gif"
|
|
# WEBP: "RIFF" .... "WEBP"
|
|
if len(raw) >= 12 and raw[:4] == b"RIFF" and raw[8:12] == b"WEBP":
|
|
return "image/webp"
|
|
# BMP: "BM"
|
|
if raw.startswith(b"BM"):
|
|
return "image/bmp"
|
|
# ISO-BMFF family (HEIC/HEIF/AVIF): bytes 4..8 == 'ftyp', major brand at 8..12
|
|
if len(raw) >= 12 and raw[4:8] == b"ftyp":
|
|
brand = raw[8:12]
|
|
if brand in {b"avif", b"avis"}:
|
|
return "image/avif"
|
|
if brand in {
|
|
b"heic", b"heix", b"hevc", b"hevx",
|
|
b"mif1", b"msf1", b"heim", b"heis",
|
|
}:
|
|
return "image/heic"
|
|
# TIFF: II*\0 (little-endian) or MM\0* (big-endian)
|
|
if raw[:4] in {b"II*\x00", b"MM\x00*"}:
|
|
return "image/tiff"
|
|
# ICO: 00 00 01 00 (reserved=0, type=1=icon)
|
|
if raw[:4] == b"\x00\x00\x01\x00":
|
|
return "image/x-icon"
|
|
# SVG: text-based, look for an <svg tag near the start (skip BOM/whitespace)
|
|
head = raw[:512].lstrip().lower()
|
|
if head.startswith(b"<?xml") or head.startswith(b"<svg"):
|
|
if b"<svg" in head:
|
|
return "image/svg+xml"
|
|
return None
|
|
|
|
|
|
# Formats every major vision provider (Anthropic, OpenAI, Gemini, Bedrock)
|
|
# accepts natively. Anything outside this set has to be transcoded to PNG
|
|
# before we declare media_type, otherwise the provider returns HTTP 400
|
|
# ("Could not process image" / "Unsupported image media type") and the
|
|
# whole turn fails with no salvage path.
|
|
#
|
|
# Discord (and a few other chat platforms) freely accept attachments in
|
|
# formats outside this set -- AVIF screenshots from Chromium, HEIC from
|
|
# iPhones, TIFF from scanners, BMP from old Windows tools, ICO -- so users
|
|
# do hit this in practice. SVG is vector and Pillow cannot rasterize it;
|
|
# it is skipped (logged) rather than transcoded.
|
|
_UNIVERSALLY_SUPPORTED_MIMES = frozenset({
|
|
"image/png", "image/jpeg", "image/gif", "image/webp",
|
|
})
|
|
|
|
|
|
def _transcode_to_png(raw: bytes) -> Optional[bytes]:
|
|
"""Decode arbitrary image bytes with Pillow and re-encode as PNG.
|
|
|
|
Returns None if Pillow isn't installed or can't decode the input
|
|
(rare formats, corrupted bytes, missing optional decoder plugin for
|
|
HEIC/AVIF, or vector formats like SVG). Caller falls back to skipping
|
|
the image so the rest of the turn still works.
|
|
|
|
HEIC/HEIF and AVIF need optional Pillow plugins; we try to register
|
|
them on demand and swallow ImportError so a missing plugin just
|
|
looks like 'Pillow can't decode this' rather than crashing.
|
|
"""
|
|
try:
|
|
from PIL import Image
|
|
except ImportError:
|
|
logger.info(
|
|
"image_routing: Pillow not installed; cannot transcode "
|
|
"non-standard image format to PNG. Install with `pip install Pillow` "
|
|
"(and `pillow-heif` / `pillow-avif-plugin` for those formats)."
|
|
)
|
|
return None
|
|
# Optional plugin registration. Silent on failure: an unsupported
|
|
# format will just fall through to Image.open raising below.
|
|
try:
|
|
import pillow_heif # type: ignore
|
|
|
|
pillow_heif.register_heif_opener()
|
|
except Exception:
|
|
pass
|
|
try:
|
|
import pillow_avif # type: ignore # noqa: F401 -- registers AVIF on import
|
|
except Exception:
|
|
pass
|
|
try:
|
|
from io import BytesIO
|
|
|
|
with Image.open(BytesIO(raw)) as im:
|
|
# Pick an output mode PNG can serialise. Anything other than
|
|
# the standard set gets normalised to RGBA so transparency is
|
|
# preserved where the source had it.
|
|
if im.mode not in {"RGB", "RGBA", "L", "LA", "P"}:
|
|
im = im.convert("RGBA")
|
|
buf = BytesIO()
|
|
im.save(buf, format="PNG", optimize=False)
|
|
return buf.getvalue()
|
|
except Exception as exc:
|
|
logger.info(
|
|
"image_routing: Pillow could not transcode image to PNG -- %s", exc
|
|
)
|
|
return None
|
|
|
|
|
|
def _guess_mime(path: Path, raw: Optional[bytes] = None) -> str:
|
|
"""Return image MIME type for *path*.
|
|
|
|
If *raw* bytes are provided, magic-byte sniffing wins (authoritative).
|
|
Otherwise we fall back to ``mimetypes`` then suffix-based defaults.
|
|
"""
|
|
if raw is not None:
|
|
sniffed = _sniff_mime_from_bytes(raw)
|
|
if sniffed:
|
|
return sniffed
|
|
mime, _ = mimetypes.guess_type(str(path))
|
|
if mime and mime.startswith("image/"):
|
|
return mime
|
|
# mimetypes on some Linux distros mis-maps .jpg; default to jpeg when
|
|
# the suffix looks imagey.
|
|
suffix = path.suffix.lower()
|
|
return {
|
|
".jpg": "image/jpeg",
|
|
".jpeg": "image/jpeg",
|
|
".png": "image/png",
|
|
".gif": "image/gif",
|
|
".webp": "image/webp",
|
|
".bmp": "image/bmp",
|
|
}.get(suffix, "image/jpeg")
|
|
|
|
|
|
def _file_to_data_url(path: Path) -> Optional[str]:
|
|
"""Encode a local image as a base64 data URL at its native size.
|
|
|
|
Size limits are NOT enforced here — the agent retry loop
|
|
(``run_agent._try_shrink_image_parts_in_messages``) shrinks on the
|
|
provider's first rejection. Keeping this simple means providers that
|
|
accept large images (OpenAI 49 MB+, Gemini 100 MB) don't pay a silent
|
|
quality tax just because one other provider is stricter.
|
|
|
|
Format compatibility IS handled here: if the sniffed MIME isn't one
|
|
of ``_UNIVERSALLY_SUPPORTED_MIMES`` (i.e. it's something like AVIF,
|
|
HEIC, BMP, TIFF, or ICO that some providers reject outright), we
|
|
transcode to PNG with Pillow before declaring media_type. This fixes
|
|
the user-visible "Could not process image" HTTP 400 from Anthropic on
|
|
Discord-attached AVIF/HEIC/BMP files.
|
|
|
|
Returns None if the file can't be read OR if the format isn't
|
|
universally supported AND Pillow can't transcode it (Pillow missing,
|
|
HEIC/AVIF plugin missing, vector format like SVG, corrupt bytes). The
|
|
caller reports those paths in ``skipped`` and the rest of the turn
|
|
proceeds.
|
|
"""
|
|
try:
|
|
raw = path.read_bytes()
|
|
except Exception as exc:
|
|
logger.warning("image_routing: failed to read %s — %s", path, exc)
|
|
return None
|
|
mime = _guess_mime(path, raw=raw)
|
|
if mime not in _UNIVERSALLY_SUPPORTED_MIMES:
|
|
transcoded = _transcode_to_png(raw)
|
|
if transcoded is None:
|
|
logger.warning(
|
|
"image_routing: %s is %s which is not accepted by all major "
|
|
"vision providers and could not be transcoded to PNG; "
|
|
"skipping this attachment.",
|
|
path, mime,
|
|
)
|
|
return None
|
|
logger.info(
|
|
"image_routing: transcoded %s (%s) -> image/png for provider compatibility",
|
|
path.name, mime,
|
|
)
|
|
raw = transcoded
|
|
mime = "image/png"
|
|
b64 = base64.b64encode(raw).decode("ascii")
|
|
return f"data:{mime};base64,{b64}"
|
|
|
|
|
|
def build_native_content_parts(
|
|
user_text: str,
|
|
image_paths: List[str],
|
|
image_urls: Optional[List[str]] = None,
|
|
) -> Tuple[List[Dict[str, Any]], List[str]]:
|
|
"""Build an OpenAI-style ``content`` list for a user turn.
|
|
|
|
Shape:
|
|
[{"type": "text", "text": "...\\n\\n[Image attached at: /local/path]"},
|
|
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
|
|
{"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
|
|
...]
|
|
|
|
Local paths are read from disk and embedded as base64 ``data:`` URLs.
|
|
Remote URLs (``http(s)://``) are passed through verbatim — the provider
|
|
fetches them server-side. The model still sees the pixels either way.
|
|
|
|
For each successfully attached image, a hint is appended to the text
|
|
part:
|
|
|
|
* local path → ``[Image attached at: <path>]``
|
|
* URL → ``[Image attached: <url>]``
|
|
|
|
The hint gives the model a string handle so MCP/skill tools that take
|
|
an image path or URL argument can be invoked on the same image without
|
|
an extra round-trip. This parallels the text-mode hint produced by
|
|
``Runner._enrich_message_with_vision`` (``vision_analyze using image_url:
|
|
<path>``) so behaviour is consistent across both image input modes.
|
|
|
|
Images are attached at their native size. If a provider rejects the
|
|
request because an image is too large (e.g. Anthropic's 5 MB per-image
|
|
ceiling), the agent's retry loop transparently shrinks and retries
|
|
once — see ``run_agent._try_shrink_image_parts_in_messages``.
|
|
|
|
Returns (content_parts, skipped). Skipped entries are local paths
|
|
that couldn't be read from disk; URLs are never skipped (they're
|
|
not validated here).
|
|
"""
|
|
skipped: List[str] = []
|
|
image_parts: List[Dict[str, Any]] = []
|
|
attached_paths: List[str] = []
|
|
attached_urls: List[str] = []
|
|
|
|
for raw_path in image_paths:
|
|
p = Path(raw_path)
|
|
if not p.exists() or not p.is_file():
|
|
skipped.append(str(raw_path))
|
|
continue
|
|
data_url = _file_to_data_url(p)
|
|
if not data_url:
|
|
skipped.append(str(raw_path))
|
|
continue
|
|
image_parts.append({
|
|
"type": "image_url",
|
|
"image_url": {"url": data_url},
|
|
})
|
|
attached_paths.append(str(raw_path))
|
|
|
|
for url in image_urls or []:
|
|
url = (url or "").strip()
|
|
if not url:
|
|
continue
|
|
image_parts.append({
|
|
"type": "image_url",
|
|
"image_url": {"url": url},
|
|
})
|
|
attached_urls.append(url)
|
|
|
|
text = (user_text or "").strip()
|
|
|
|
# If at least one image attached, build a single text part that combines
|
|
# the user's caption (or a neutral default) with one hint per image.
|
|
if attached_paths or attached_urls:
|
|
base_text = text or "What do you see in this image?"
|
|
hint_lines: List[str] = []
|
|
hint_lines.extend(f"[Image attached at: {p}]" for p in attached_paths)
|
|
hint_lines.extend(f"[Image attached: {u}]" for u in attached_urls)
|
|
combined_text = f"{base_text}\n\n" + "\n".join(hint_lines)
|
|
parts: List[Dict[str, Any]] = [{"type": "text", "text": combined_text}]
|
|
parts.extend(image_parts)
|
|
return parts, skipped
|
|
|
|
# No images successfully attached — fall back to plain text-only behaviour.
|
|
parts = []
|
|
if text:
|
|
parts.append({"type": "text", "text": text})
|
|
return parts, skipped
|
|
|
|
|
|
__all__ = [
|
|
"decide_image_input_mode",
|
|
"build_native_content_parts",
|
|
"extract_image_refs",
|
|
]
|