mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
feat(kanban): attach images referenced in task bodies to worker vision (#34210)
Kanban workers now scan the task body for local image paths and http(s) image URLs and attach them to the worker's first user turn — matching the CLI/gateway behaviour for inbound images. Before, a user pasting `/home/me/screenshot.png` or `https://example.com/img.png` into a kanban task description had it sent to the model as plain text and the pixels were never seen. How it works: * agent/image_routing.py gains extract_image_refs(text) → (paths, urls) that mirrors gateway/platforms/base.py:extract_local_files (absolute / ~-relative paths, image extensions only, ignores fenced/inline code). * build_native_content_parts() accepts an optional image_urls= kwarg and emits passthrough image_url parts for remote URLs alongside the base64 data: URLs used for local paths. * cli.py (single-query/quiet branch — the path every dispatcher-spawned worker takes) detects HERMES_KANBAN_TASK, reads the task body via kanban_db.get_task, runs extract_image_refs, and threads the results into the existing image-routing decision (native vs text). Best-effort: enrichment failures never block worker startup. Tested: * tests/agent/test_image_routing.py — 22 new tests for extract_image_refs and URL pass-through in build_native_content_parts. * tests/hermes_cli/test_kanban_worker_image_extraction.py — 10 new tests driving real kanban_db round-trip (create task → read body → extract refs → build parts). * E2E: created a fake kanban task with a body referencing both a local PNG and an https URL; verified the worker pipeline produces a multimodal user turn with 1 text part + 2 image_url parts (data URL for the local file, passthrough URL for the remote).
This commit is contained in:
parent
1b1e30510a
commit
769ee86cd2
4 changed files with 610 additions and 20 deletions
|
|
@ -37,6 +37,8 @@ from __future__ import annotations
|
|||
import base64
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
|
@ -46,6 +48,102 @@ logger = logging.getLogger(__name__)
|
|||
_VALID_MODES = frozenset({"auto", "native", "text"})
|
||||
|
||||
|
||||
# Image extensions used by extract_image_refs(). Kept tight on purpose — we
# only auto-attach things the model can actually see. Documents/archives are
# excluded because the gateway's broader extract_local_files() also routes
# them differently (send_document), and we don't want to attach a PDF as a
# vision part.
_IMAGE_EXTS = (
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".heic",
)
_IMAGE_EXT_PATTERN = "|".join(e.lstrip(".") for e in _IMAGE_EXTS)

# Absolute / home-relative local image path. Matches the same shape gateway's
# extract_local_files() uses: anchors to ``~/`` or ``/``, ignores matches inside
# URLs (the ``(?<![/:\w.])`` lookbehind), and case-insensitive on the extension.
_LOCAL_IMAGE_PATH_RE = re.compile(
    r"(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:" + _IMAGE_EXT_PATTERN + r")\b",
    re.IGNORECASE,
)

# http(s) URL ending in an image extension (optionally followed by a
# query string). Case-insensitive on the extension. Strict ``http(s)://``
# scheme so we don't accidentally grab ``file://`` URLs or other shapes.
#
# The negative lookahead right after the extension makes sure the extension
# really terminates the URL path. Without it the non-greedy prefix stops at
# the first ``.<ext>`` it sees, so ``https://x.com/a.png.html`` would be
# truncated to ``…/a.png`` and ``https://x.com/my.giftshop/`` to ``…/my.gif``.
# The lookahead is deliberately ASCII-only so that non-ASCII prose glued to
# the URL (no separating space) does not suppress a legitimate match.
_IMAGE_URL_RE = re.compile(
    r"https?://[^\s<>\"']+?\.(?:" + _IMAGE_EXT_PATTERN + r")"
    r"(?![a-z0-9_/%\-]|\.[a-z0-9])"
    r"(?:\?[^\s<>\"']*)?",
    re.IGNORECASE,
)

# Markdown code regions whose contents must never be treated as attachments.
_FENCED_CODE_RE = re.compile(r"```[^\n]*\n.*?```", re.DOTALL)
_INLINE_CODE_RE = re.compile(r"`[^`\n]+`")


def extract_image_refs(text: str) -> Tuple[List[str], List[str]]:
    """Scan free-form text for image references the model should see.

    Args:
        text: Arbitrary user text (e.g. a kanban task body). Non-string or
            empty input yields two empty lists.

    Returns:
        ``(local_paths, urls)``:

        * ``local_paths`` — absolute (``/``) or home-relative (``~/``) paths
          whose suffix is an image extension AND whose expanded form exists
          on disk as a file. Returned in ``~``-expanded form,
          order-preserving, deduplicated.
        * ``urls`` — ``http(s)://…`` URLs whose path ends in an image
          extension (a ``?query`` is allowed after the extension). URLs
          where the extension is only a prefix of a longer path component
          (``a.png.html``, ``my.giftshop/``) are not returned.
          Order-preserving, deduplicated.

    Matches that start inside a fenced code block or an inline backtick
    span are skipped so that snippets pasted into a task body for reference
    aren't mistaken for live attachments.

    Local paths are validated against the filesystem; URLs are not.
    """
    if not isinstance(text, str) or not text:
        return [], []

    # Spans covered by fenced code blocks and inline code, so references the
    # author embedded purely as example text can be ignored.
    code_spans = [
        m.span()
        for pattern in (_FENCED_CODE_RE, _INLINE_CODE_RE)
        for m in pattern.finditer(text)
    ]

    def _in_code(pos: int) -> bool:
        return any(start <= pos < end for start, end in code_spans)

    local_paths: List[str] = []
    seen_paths = set()
    for match in _LOCAL_IMAGE_PATH_RE.finditer(text):
        if _in_code(match.start()):
            continue
        expanded = os.path.expanduser(match.group(0))
        try:
            if not os.path.isfile(expanded):
                continue
        except OSError:
            # ENAMETOOLONG / EINVAL on pathological inputs — skip rather than crash.
            continue
        # Dedupe on the expanded form so ``~/a.png`` and its absolute
        # spelling collapse into a single attachment.
        if expanded in seen_paths:
            continue
        seen_paths.add(expanded)
        local_paths.append(expanded)

    urls: List[str] = []
    seen_urls = set()
    for match in _IMAGE_URL_RE.finditer(text):
        if _in_code(match.start()):
            continue
        # Strip trailing punctuation that's almost certainly prose, not part
        # of the URL. Only relevant when a query string was captured — a
        # bare match always ends on the extension itself.
        url = match.group(0).rstrip(".,;:!?)]>")
        if url in seen_urls:
            continue
        seen_urls.add(url)
        urls.append(url)

    return local_paths, urls
|
||||
|
||||
|
||||
# Strict YAML/JSON boolean coercion for capability overrides.
|
||||
#
|
||||
# ``bool("false")`` is True in Python because non-empty strings are truthy, so
|
||||
|
|
@ -320,20 +418,29 @@ def _file_to_data_url(path: Path) -> Optional[str]:
|
|||
def build_native_content_parts(
|
||||
user_text: str,
|
||||
image_paths: List[str],
|
||||
image_urls: Optional[List[str]] = None,
|
||||
) -> Tuple[List[Dict[str, Any]], List[str]]:
|
||||
"""Build an OpenAI-style ``content`` list for a user turn.
|
||||
|
||||
Shape:
|
||||
[{"type": "text", "text": "...\\n\\n[Image attached at: /local/path]"},
|
||||
{"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
|
||||
{"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
|
||||
...]
|
||||
|
||||
The local path of each successfully attached image is appended to the
|
||||
text part as ``[Image attached at: <path>]``. The model still sees the
|
||||
pixels via the ``image_url`` part (full native vision); the path note
|
||||
just gives it a string handle so MCP/skill tools that take an image
|
||||
path or URL argument can be invoked on the same image without an
|
||||
extra round-trip. This parallels the text-mode hint produced by
|
||||
Local paths are read from disk and embedded as base64 ``data:`` URLs.
|
||||
Remote URLs (``http(s)://``) are passed through verbatim — the provider
|
||||
fetches them server-side. The model still sees the pixels either way.
|
||||
|
||||
For each successfully attached image, a hint is appended to the text
|
||||
part:
|
||||
|
||||
* local path → ``[Image attached at: <path>]``
|
||||
* URL → ``[Image attached: <url>]``
|
||||
|
||||
The hint gives the model a string handle so MCP/skill tools that take
|
||||
an image path or URL argument can be invoked on the same image without
|
||||
an extra round-trip. This parallels the text-mode hint produced by
|
||||
``Runner._enrich_message_with_vision`` (``vision_analyze using image_url:
|
||||
<path>``) so behaviour is consistent across both image input modes.
|
||||
|
||||
|
|
@ -342,12 +449,14 @@ def build_native_content_parts(
|
|||
ceiling), the agent's retry loop transparently shrinks and retries
|
||||
once — see ``run_agent._try_shrink_image_parts_in_messages``.
|
||||
|
||||
Returns (content_parts, skipped_paths). Skipped paths are files that
|
||||
couldn't be read from disk and are NOT advertised in the path hints.
|
||||
Returns (content_parts, skipped). Skipped entries are local paths
|
||||
that couldn't be read from disk; URLs are never skipped (they're
|
||||
not validated here).
|
||||
"""
|
||||
skipped: List[str] = []
|
||||
image_parts: List[Dict[str, Any]] = []
|
||||
attached_paths: List[str] = []
|
||||
attached_urls: List[str] = []
|
||||
|
||||
for raw_path in image_paths:
|
||||
p = Path(raw_path)
|
||||
|
|
@ -364,16 +473,26 @@ def build_native_content_parts(
|
|||
})
|
||||
attached_paths.append(str(raw_path))
|
||||
|
||||
for url in image_urls or []:
|
||||
url = (url or "").strip()
|
||||
if not url:
|
||||
continue
|
||||
image_parts.append({
|
||||
"type": "image_url",
|
||||
"image_url": {"url": url},
|
||||
})
|
||||
attached_urls.append(url)
|
||||
|
||||
text = (user_text or "").strip()
|
||||
|
||||
# If at least one image attached, build a single text part that combines
|
||||
# the user's caption (or a neutral default) with one path hint per image.
|
||||
if attached_paths:
|
||||
# the user's caption (or a neutral default) with one hint per image.
|
||||
if attached_paths or attached_urls:
|
||||
base_text = text or "What do you see in this image?"
|
||||
path_hints = "\n".join(
|
||||
f"[Image attached at: {p}]" for p in attached_paths
|
||||
)
|
||||
combined_text = f"{base_text}\n\n{path_hints}"
|
||||
hint_lines: List[str] = []
|
||||
hint_lines.extend(f"[Image attached at: {p}]" for p in attached_paths)
|
||||
hint_lines.extend(f"[Image attached: {u}]" for u in attached_urls)
|
||||
combined_text = f"{base_text}\n\n" + "\n".join(hint_lines)
|
||||
parts: List[Dict[str, Any]] = [{"type": "text", "text": combined_text}]
|
||||
parts.extend(image_parts)
|
||||
return parts, skipped
|
||||
|
|
@ -388,4 +507,5 @@ def build_native_content_parts(
|
|||
__all__ = [
|
||||
"decide_image_input_mode",
|
||||
"build_native_content_parts",
|
||||
"extract_image_refs",
|
||||
]
|
||||
|
|
|
|||
56
cli.py
56
cli.py
|
|
@ -15125,13 +15125,50 @@ def main(
|
|||
# Handle single query mode
|
||||
if query or image:
|
||||
query, single_query_images = _collect_query_images(query, image)
|
||||
# Kanban workers spawn with ``hermes chat -q "work kanban task <id>"``;
|
||||
# the actual task description lives in the task body. Mirror the
|
||||
# gateway/CLI behaviour for inbound images by scanning the body for
|
||||
# local image paths and http(s) image URLs and attaching them to the
|
||||
# worker's first turn. Without this, users who paste a screenshot
|
||||
# path or URL into a kanban task body never get it routed to the
|
||||
# model's vision input.
|
||||
single_query_image_urls: list[str] = []
|
||||
_kanban_task_id = os.environ.get("HERMES_KANBAN_TASK", "").strip()
|
||||
if _kanban_task_id:
|
||||
try:
|
||||
from hermes_cli import kanban_db as _kb
|
||||
from agent.image_routing import extract_image_refs as _extract_refs
|
||||
|
||||
_conn = _kb.connect()
|
||||
try:
|
||||
_task = _kb.get_task(_conn, _kanban_task_id)
|
||||
finally:
|
||||
try:
|
||||
_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
_body = getattr(_task, "body", "") if _task is not None else ""
|
||||
if _body:
|
||||
_kb_paths, _kb_urls = _extract_refs(_body)
|
||||
if _kb_paths:
|
||||
# Dedupe against any --image the user already passed.
|
||||
_seen = {str(p) for p in single_query_images}
|
||||
for _p in _kb_paths:
|
||||
if _p not in _seen:
|
||||
_seen.add(_p)
|
||||
single_query_images.append(Path(_p))
|
||||
if _kb_urls:
|
||||
single_query_image_urls.extend(_kb_urls)
|
||||
except Exception as _exc:
|
||||
# Best-effort enrichment; never block worker startup on it.
|
||||
logger.debug("kanban image-ref extraction failed: %s", _exc)
|
||||
if quiet:
|
||||
# Quiet mode: suppress banner, spinner, tool previews.
|
||||
# Only print the final response and parseable session info.
|
||||
cli.tool_progress_mode = "off"
|
||||
if cli._ensure_runtime_credentials():
|
||||
effective_query: Any = query
|
||||
if single_query_images:
|
||||
if single_query_images or single_query_image_urls:
|
||||
# Honour the same image-routing decision used by the
|
||||
# interactive path. With a vision-capable model (incl.
|
||||
# custom-provider models declared via
|
||||
|
|
@ -15160,19 +15197,26 @@ def main(
|
|||
_parts, _skipped = _build_parts(
|
||||
query if isinstance(query, str) else "",
|
||||
[str(p) for p in single_query_images],
|
||||
image_urls=list(single_query_image_urls) or None,
|
||||
)
|
||||
if any(p.get("type") == "image_url" for p in _parts):
|
||||
effective_query = _parts
|
||||
else:
|
||||
# All images unreadable — text fallback.
|
||||
# ``_preprocess_images_with_vision`` only knows
|
||||
# about local files; URLs would be lost there,
|
||||
# so keep the original query text intact when
|
||||
# only URLs were supplied.
|
||||
if single_query_images:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query, single_query_images, announce=False,
|
||||
)
|
||||
except Exception:
|
||||
if single_query_images:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query, single_query_images, announce=False,
|
||||
)
|
||||
except Exception:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query, single_query_images, announce=False,
|
||||
)
|
||||
else:
|
||||
elif single_query_images:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query,
|
||||
single_query_images,
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ from agent.image_routing import (
|
|||
_supports_vision_override,
|
||||
build_native_content_parts,
|
||||
decide_image_input_mode,
|
||||
extract_image_refs,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -449,3 +450,190 @@ class TestLargeImageHandling:
|
|||
assert len(parts) == 2
|
||||
assert parts[0]["type"] == "text"
|
||||
assert parts[1]["type"] == "image_url"
|
||||
|
||||
|
||||
# ─── extract_image_refs ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestExtractImageRefs:
    """Exercise ``extract_image_refs`` on task-body style text.

    Covers local paths (absolute and ``~/``), remote image URLs, markdown
    code spans, de-duplication and case-insensitive extensions.
    """

    def test_empty_or_none_returns_empty(self):
        # Both the empty string and a non-string must degrade to "nothing found".
        for blank in ("", None):
            assert extract_image_refs(blank) == ([], [])  # type: ignore[arg-type]

    def test_finds_absolute_path(self, tmp_path: Path):
        shot = tmp_path / "screenshot.png"
        shot.write_bytes(_png_bytes())
        found = extract_image_refs(f"Look at {shot} and tell me what's wrong.")
        assert found == ([str(shot)], [])

    def test_finds_home_relative_path(self, tmp_path: Path, monkeypatch):
        # Make ``~`` resolve to tmp_path so ``~/foo.png`` points at a real file.
        target = tmp_path / "foo.png"
        target.write_bytes(_png_bytes())
        monkeypatch.setenv("HOME", str(tmp_path))
        found = extract_image_refs("see ~/foo.png please")
        assert found == ([str(target)], [])

    def test_skips_nonexistent_paths(self, tmp_path: Path):
        # Looks like an image path, but nothing exists on disk at that location.
        found = extract_image_refs(f"What's at {tmp_path}/never_created.png ?")
        assert found == ([], [])

    def test_finds_http_image_url(self):
        found = extract_image_refs(
            "Check out https://example.com/photos/cat.png — cute right?"
        )
        assert found == ([], ["https://example.com/photos/cat.png"])

    def test_finds_https_url_with_query_string(self):
        _, urls = extract_image_refs(
            "Diagram: https://cdn.example.com/img.jpeg?size=large&v=2 here"
        )
        assert urls == ["https://cdn.example.com/img.jpeg?size=large&v=2"]

    def test_url_trailing_punctuation_stripped(self):
        # The sentence-ending period is prose, not part of the URL.
        _, urls = extract_image_refs("See https://example.com/a.png.")
        assert urls == ["https://example.com/a.png"]

    def test_ignores_non_image_urls(self):
        _, urls = extract_image_refs(
            "See https://example.com/page.html and https://x.com/y.pdf"
        )
        assert urls == []

    def test_dedupes_paths_and_urls(self, tmp_path: Path):
        dup = tmp_path / "dup.png"
        dup.write_bytes(_png_bytes())
        text = "".join([
            f"First {dup} then again {dup}. ",
            "Also https://example.com/x.png and https://example.com/x.png again.",
        ])
        assert extract_image_refs(text) == (
            [str(dup)],
            ["https://example.com/x.png"],
        )

    def test_ignores_paths_in_fenced_code_block(self, tmp_path: Path):
        real = tmp_path / "real.png"
        real.write_bytes(_png_bytes())
        lines = [
            "Outside the block, attach this:",
            f"{real}",
            "But not these examples:",
            "```",
            "some_other_image: /tmp/example.png",
            "url: https://example.com/example.png",
            "```",
        ]
        text = "\n".join(lines) + "\n"
        assert extract_image_refs(text) == ([str(real)], [])

    def test_ignores_paths_in_inline_code(self, tmp_path: Path):
        real = tmp_path / "real.jpg"
        real.write_bytes(_png_bytes())
        text = (
            f"Attach {real}, but ignore the example "
            + "`https://example.com/skip.png` in backticks."
        )
        assert extract_image_refs(text) == ([str(real)], [])

    def test_does_not_match_paths_inside_urls(self, tmp_path: Path):
        # The path component of a URL must not be reported as a local path;
        # only the URL itself should come back.
        found = extract_image_refs(
            "Just the URL: https://example.com/some/dir/image.png"
        )
        assert found == ([], ["https://example.com/some/dir/image.png"])

    def test_mixed_paths_and_urls(self, tmp_path: Path):
        local = tmp_path / "local.png"
        local.write_bytes(_png_bytes())
        text = (
            f"Compare local {local} against the design at "
            + "https://example.com/design/v2.png — does it match?"
        )
        assert extract_image_refs(text) == (
            [str(local)],
            ["https://example.com/design/v2.png"],
        )

    def test_case_insensitive_extension(self, tmp_path: Path):
        loud = tmp_path / "shouty.PNG"
        loud.write_bytes(_png_bytes())
        paths, _ = extract_image_refs(f"see {loud}")
        assert paths == [str(loud)]
|
||||
|
||||
|
||||
# ─── build_native_content_parts with URLs ────────────────────────────────────
|
||||
|
||||
|
||||
class TestBuildNativeContentPartsURLs:
    """``build_native_content_parts`` with the ``image_urls=`` keyword.

    Remote URLs are expected to appear as pass-through ``image_url`` parts
    next to any data-URL parts built from local files.
    """

    def test_url_only_no_local_paths(self):
        remote = "https://example.com/diagram.png"
        parts, skipped = build_native_content_parts(
            "what is this?", [], image_urls=[remote],
        )
        assert skipped == []
        assert len(parts) == 2
        text_part, image_part = parts
        assert text_part["type"] == "text"
        assert f"[Image attached: {remote}]" in text_part["text"]
        assert text_part["text"].startswith("what is this?")
        assert image_part == {"type": "image_url", "image_url": {"url": remote}}

    def test_mixed_path_and_url(self, tmp_path: Path):
        local = tmp_path / "local.png"
        local.write_bytes(_png_bytes())
        remote = "https://example.com/remote.jpg"
        parts, skipped = build_native_content_parts(
            "compare these", [str(local)], image_urls=[remote],
        )
        assert skipped == []
        # Expect the local data URL first, then the remote pass-through URL.
        image_urls = [
            part["image_url"]["url"]
            for part in parts
            if part.get("type") == "image_url"
        ]
        assert len(image_urls) == 2
        assert image_urls[0].startswith("data:image/png;base64,")
        assert image_urls[1] == remote
        caption = parts[0]["text"]
        assert "[Image attached at:" in caption
        assert f"[Image attached: {remote}]" in caption

    def test_empty_url_list_is_no_op(self, tmp_path: Path):
        local = tmp_path / "x.png"
        local.write_bytes(_png_bytes())
        # Passing an empty list must be indistinguishable from omitting it.
        without_kwarg, _ = build_native_content_parts("hi", [str(local)])
        with_empty, _ = build_native_content_parts("hi", [str(local)], image_urls=[])
        assert without_kwarg == with_empty

    def test_blank_url_strings_are_dropped(self):
        candidates = ["", " ", "https://example.com/a.png"]
        parts, _ = build_native_content_parts("x", [], image_urls=candidates)
        kept = [part for part in parts if part.get("type") == "image_url"]
        assert len(kept) == 1
        assert kept[0]["image_url"]["url"] == "https://example.com/a.png"

    def test_url_only_inserts_default_prompt_when_text_empty(self):
        parts, _ = build_native_content_parts(
            "", [], image_urls=["https://example.com/a.png"],
        )
        first = parts[0]
        assert first["type"] == "text"
        assert first["text"].startswith("What do you see in this image?")
|
||||
|
|
|
|||
238
tests/hermes_cli/test_kanban_worker_image_extraction.py
Normal file
238
tests/hermes_cli/test_kanban_worker_image_extraction.py
Normal file
|
|
@ -0,0 +1,238 @@
|
|||
"""Worker-side image enrichment for kanban tasks.
|
||||
|
||||
When a kanban task body contains a local image path or an ``http(s)://``
|
||||
image URL, the worker must surface that image to the model on its first
|
||||
user turn — matching the CLI/gateway behaviour for inbound images.
|
||||
|
||||
The dispatcher spawns the worker as
|
||||
``hermes -p <profile> chat -q "work kanban task <id>"``. The task body
|
||||
itself never appears in argv; the worker has to read it from the kanban
|
||||
DB during startup. These tests cover the round-trip:
|
||||
|
||||
task body → kanban_db.get_task → extract_image_refs →
|
||||
build_native_content_parts → multimodal user turn
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from hermes_cli import kanban_db as kb
|
||||
from agent.image_routing import (
|
||||
build_native_content_parts,
|
||||
extract_image_refs,
|
||||
)
|
||||
|
||||
|
||||
# Tiny 1×1 transparent PNG used to back any path the tests stick into a
|
||||
# task body. extract_image_refs validates the path exists on disk, so the
|
||||
# byte content has to be a real readable file (any image bytes will do).
|
||||
_PNG = base64.b64decode(
|
||||
"iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGNgYGBgAAAABQABpfZFQAAAAABJRU5ErkJggg=="
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def kanban_home(tmp_path: Path, monkeypatch):
    """Give each test a private ``HERMES_HOME`` with a freshly initialised kanban DB.

    Creates ``<tmp_path>/.hermes``, exports it as ``HERMES_HOME``, redirects
    ``Path.home()`` to ``tmp_path`` and calls ``kb.init_db()``. Returns the
    ``.hermes`` directory.
    """
    hermes_dir = tmp_path / ".hermes"
    hermes_dir.mkdir()
    # Environment and home directory must be redirected before the DB is created.
    monkeypatch.setenv("HERMES_HOME", str(hermes_dir))
    monkeypatch.setattr(Path, "home", lambda: tmp_path)
    kb.init_db()
    return hermes_dir
|
||||
|
||||
|
||||
def _add_task_with_body(body: str, *, title: str = "Look at this") -> str:
    """Create a kanban task carrying *body* and return the new task's id.

    The task is assigned to ``worker-a`` with no tenant. The connection is
    closed whether or not creation succeeds.
    """
    connection = kb.connect()
    try:
        return kb.create_task(
            connection,
            title=title,
            body=body,
            assignee="worker-a",
            tenant=None,
        )
    finally:
        connection.close()
|
||||
|
||||
|
||||
def _read_body(task_id: str) -> str:
    """Fetch the body of task *task_id*; ``""`` if the task or its body is missing."""
    connection = kb.connect()
    try:
        record = kb.get_task(connection, task_id)
        if record is None:
            return ""
        # A falsy body (e.g. None) is normalised to the empty string.
        return record.body or ""
    finally:
        connection.close()
|
||||
|
||||
|
||||
class TestExtractFromTaskBody:
    """Store a body in the kanban DB, read it back, and extract image refs from it."""

    def test_local_path_in_body_round_trips(self, kanban_home, tmp_path):
        shot = tmp_path / "screenshot.png"
        shot.write_bytes(_PNG)
        task_id = _add_task_with_body(
            f"Please review the screenshot at {shot} and confirm "
            + "the alignment is right."
        )

        found = extract_image_refs(_read_body(task_id))
        assert found == ([str(shot)], [])

    def test_url_in_body_round_trips(self, kanban_home):
        task_id = _add_task_with_body(
            "The design lives at https://example.com/mock/v3.png — "
            + "make the implementation match it."
        )

        found = extract_image_refs(_read_body(task_id))
        assert found == ([], ["https://example.com/mock/v3.png"])

    def test_mixed_path_and_url_in_body(self, kanban_home, tmp_path):
        current = tmp_path / "current.png"
        current.write_bytes(_PNG)
        task_id = _add_task_with_body(
            f"Compare the current screenshot {current} against the design at "
            + "https://example.com/target.png and write a diff."
        )

        found = extract_image_refs(_read_body(task_id))
        assert found == ([str(current)], ["https://example.com/target.png"])

    def test_body_without_images_yields_nothing(self, kanban_home):
        task_id = _add_task_with_body(
            "Refactor the auth module to use the new session helper."
        )

        assert extract_image_refs(_read_body(task_id)) == ([], [])

    def test_empty_body_is_safe(self, kanban_home):
        task_id = _add_task_with_body("")

        assert extract_image_refs(_read_body(task_id)) == ([], [])
|
||||
|
||||
|
||||
class TestBuildPartsFromTaskBody:
    """End-to-end: task body → extracted refs → multimodal content parts."""

    def test_local_path_becomes_native_image_part(self, kanban_home, tmp_path):
        design = tmp_path / "design.png"
        design.write_bytes(_PNG)
        task_id = _add_task_with_body(f"Check out {design} — what's broken?")
        local_refs, remote_refs = extract_image_refs(_read_body(task_id))

        # Same call shape as the worker wiring: the literal ``-q`` prompt
        # ("work kanban task <id>") plus whatever the body scan produced.
        prompt = f"work kanban task {task_id}"
        parts, skipped = build_native_content_parts(
            prompt, local_refs, image_urls=remote_refs or None,
        )

        assert skipped == []
        # One text part followed by exactly one image part.
        assert len(parts) == 2
        text_part, image_part = parts
        assert text_part["type"] == "text"
        assert text_part["text"].startswith(f"work kanban task {task_id}")
        assert f"[Image attached at: {design}]" in text_part["text"]
        assert image_part["type"] == "image_url"
        assert image_part["image_url"]["url"].startswith("data:image/png;base64,")

    def test_url_becomes_image_url_part(self, kanban_home):
        task_id = _add_task_with_body(
            "Reference: https://example.com/target.jpg — match it."
        )
        local_refs, remote_refs = extract_image_refs(_read_body(task_id))

        parts, skipped = build_native_content_parts(
            f"work kanban task {task_id}",
            local_refs,
            image_urls=remote_refs or None,
        )

        assert skipped == []
        assert len(parts) == 2
        text_part, image_part = parts
        assert text_part["type"] == "text"
        assert "[Image attached: https://example.com/target.jpg]" in text_part["text"]
        assert image_part == {
            "type": "image_url",
            "image_url": {"url": "https://example.com/target.jpg"},
        }

    def test_body_with_both_yields_two_image_parts(self, kanban_home, tmp_path):
        local = tmp_path / "local.png"
        local.write_bytes(_PNG)
        task_id = _add_task_with_body(
            f"Diff {local} vs https://example.com/target.png — explain it."
        )
        local_refs, remote_refs = extract_image_refs(_read_body(task_id))

        parts, skipped = build_native_content_parts(
            f"work kanban task {task_id}",
            local_refs,
            image_urls=remote_refs or None,
        )

        assert skipped == []
        attached = [
            part["image_url"]["url"]
            for part in parts
            if part.get("type") == "image_url"
        ]
        assert len(attached) == 2
        # The local file arrives as a data URL; the remote URL is untouched.
        assert attached[0].startswith("data:image/png;base64,")
        assert attached[1] == "https://example.com/target.png"

    def test_body_with_no_images_leaves_query_untouched(self, kanban_home):
        task_id = _add_task_with_body(
            "Rewrite the README intro paragraph to focus on use cases."
        )
        local_refs, remote_refs = extract_image_refs(_read_body(task_id))

        parts, skipped = build_native_content_parts(
            f"work kanban task {task_id}",
            local_refs,
            image_urls=remote_refs or None,
        )

        # Nothing to attach: a single text part carrying the prompt verbatim.
        assert skipped == []
        assert len(parts) == 1
        only_part = parts[0]
        assert only_part["type"] == "text"
        assert only_part["text"] == f"work kanban task {task_id}"

    def test_code_block_example_is_not_attached(self, kanban_home, tmp_path):
        # The image outside the fence attaches; the fenced examples do not.
        real = tmp_path / "real.png"
        real.write_bytes(_PNG)
        task_id = _add_task_with_body(
            f"Real screenshot:\n{real}\n\n"
            + "Example we DON'T want attached:\n"
            + "```\n"
            + "image: /tmp/example_only.png\n"
            + "url: https://example.com/example.png\n"
            + "```\n"
        )

        found = extract_image_refs(_read_body(task_id))
        assert found == ([str(real)], [])
|
||||
Loading…
Add table
Add a link
Reference in a new issue