mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-31 06:51:29 +00:00
feat(kanban): attach images referenced in task bodies to worker vision (#34210)
Kanban workers now scan the task body for local image paths and http(s) image URLs and attach them to the worker's first user turn — matching the CLI/gateway behaviour for inbound images. Before, a user pasting `/home/me/screenshot.png` or `https://example.com/img.png` into a kanban task description had it sent to the model as plain text and the pixels were never seen. How it works: * agent/image_routing.py gains extract_image_refs(text) → (paths, urls) that mirrors gateway/platforms/base.py:extract_local_files (absolute / ~-relative paths, image extensions only, ignores fenced/inline code). * build_native_content_parts() accepts an optional image_urls= kwarg and emits passthrough image_url parts for remote URLs alongside the base64 data: URLs used for local paths. * cli.py (single-query/quiet branch — the path every dispatcher-spawned worker takes) detects HERMES_KANBAN_TASK, reads the task body via kanban_db.get_task, runs extract_image_refs, and threads the results into the existing image-routing decision (native vs text). Best-effort: enrichment failures never block worker startup. Tested: * tests/agent/test_image_routing.py — 22 new tests for extract_image_refs and URL pass-through in build_native_content_parts. * tests/hermes_cli/test_kanban_worker_image_extraction.py — 10 new tests driving real kanban_db round-trip (create task → read body → extract refs → build parts). * E2E: created a fake kanban task with a body referencing both a local PNG and an https URL; verified the worker pipeline produces a multimodal user turn with 1 text part + 2 image_url parts (data URL for the local file, passthrough URL for the remote).
This commit is contained in:
parent
1b1e30510a
commit
769ee86cd2
4 changed files with 610 additions and 20 deletions
56
cli.py
56
cli.py
|
|
@ -15125,13 +15125,50 @@ def main(
|
|||
# Handle single query mode
|
||||
if query or image:
|
||||
query, single_query_images = _collect_query_images(query, image)
|
||||
# Kanban workers spawn with ``hermes chat -q "work kanban task <id>"``;
|
||||
# the actual task description lives in the task body. Mirror the
|
||||
# gateway/CLI behaviour for inbound images by scanning the body for
|
||||
# local image paths and http(s) image URLs and attaching them to the
|
||||
# worker's first turn. Without this, users who paste a screenshot
|
||||
# path or URL into a kanban task body never get it routed to the
|
||||
# model's vision input.
|
||||
single_query_image_urls: list[str] = []
|
||||
_kanban_task_id = os.environ.get("HERMES_KANBAN_TASK", "").strip()
|
||||
if _kanban_task_id:
|
||||
try:
|
||||
from hermes_cli import kanban_db as _kb
|
||||
from agent.image_routing import extract_image_refs as _extract_refs
|
||||
|
||||
_conn = _kb.connect()
|
||||
try:
|
||||
_task = _kb.get_task(_conn, _kanban_task_id)
|
||||
finally:
|
||||
try:
|
||||
_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
_body = getattr(_task, "body", "") if _task is not None else ""
|
||||
if _body:
|
||||
_kb_paths, _kb_urls = _extract_refs(_body)
|
||||
if _kb_paths:
|
||||
# Dedupe against any --image the user already passed.
|
||||
_seen = {str(p) for p in single_query_images}
|
||||
for _p in _kb_paths:
|
||||
if _p not in _seen:
|
||||
_seen.add(_p)
|
||||
single_query_images.append(Path(_p))
|
||||
if _kb_urls:
|
||||
single_query_image_urls.extend(_kb_urls)
|
||||
except Exception as _exc:
|
||||
# Best-effort enrichment; never block worker startup on it.
|
||||
logger.debug("kanban image-ref extraction failed: %s", _exc)
|
||||
if quiet:
|
||||
# Quiet mode: suppress banner, spinner, tool previews.
|
||||
# Only print the final response and parseable session info.
|
||||
cli.tool_progress_mode = "off"
|
||||
if cli._ensure_runtime_credentials():
|
||||
effective_query: Any = query
|
||||
if single_query_images:
|
||||
if single_query_images or single_query_image_urls:
|
||||
# Honour the same image-routing decision used by the
|
||||
# interactive path. With a vision-capable model (incl.
|
||||
# custom-provider models declared via
|
||||
|
|
@ -15160,19 +15197,26 @@ def main(
|
|||
_parts, _skipped = _build_parts(
|
||||
query if isinstance(query, str) else "",
|
||||
[str(p) for p in single_query_images],
|
||||
image_urls=list(single_query_image_urls) or None,
|
||||
)
|
||||
if any(p.get("type") == "image_url" for p in _parts):
|
||||
effective_query = _parts
|
||||
else:
|
||||
# All images unreadable — text fallback.
|
||||
# ``_preprocess_images_with_vision`` only knows
|
||||
# about local files; URLs would be lost there,
|
||||
# so keep the original query text intact when
|
||||
# only URLs were supplied.
|
||||
if single_query_images:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query, single_query_images, announce=False,
|
||||
)
|
||||
except Exception:
|
||||
if single_query_images:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query, single_query_images, announce=False,
|
||||
)
|
||||
except Exception:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query, single_query_images, announce=False,
|
||||
)
|
||||
else:
|
||||
elif single_query_images:
|
||||
effective_query = cli._preprocess_images_with_vision(
|
||||
query,
|
||||
single_query_images,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue