From 975e13091e9562011254fba1dbc3b4245589bb74 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 20 May 2026 23:05:46 -0700 Subject: [PATCH] fix(cli): honour image-routing decision in quiet-mode -q --image path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interactive CLI input path consults decide_image_input_mode() to pick between native image_url attachment and the vision_analyze text pipeline, but the non-interactive 'hermes chat -Q -q ... --image FOO' path unconditionally called _preprocess_images_with_vision() — so even with `model.supports_vision: true` set, --image always went through the text-pipeline. Symptom: vision_analyze runs 4-5s per image and the model sees a lossy text summary instead of the actual pixels. Mirror the interactive path: load config, call decide_image_input_mode, branch on native vs text. Falls back to the text-pipeline on any import or build error (Pyright-clean: _build_parts guarded with `is not None`). Live E2E (provider=custom, base_url=openrouter, anthropic/claude-haiku-4.5, red 64x64 PNG): baseline (no override): vision_analyze called (8 log lines), 5.8s with supports_vision: vision_analyze NOT called (0 log lines), 3.9s Same model, same image, single knob flips text→native routing. --- cli.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 6 deletions(-) diff --git a/cli.py b/cli.py index 9e1b0a628e5..2783ca31bf2 100644 --- a/cli.py +++ b/cli.py @@ -14423,13 +14423,54 @@ def main( # Only print the final response and parseable session info. cli.tool_progress_mode = "off" if cli._ensure_runtime_credentials(): - effective_query = query + effective_query: Any = query if single_query_images: - effective_query = cli._preprocess_images_with_vision( - query, - single_query_images, - announce=False, - ) + # Honour the same image-routing decision used by the + # interactive path. With a vision-capable model (incl. + # custom-provider models declared via + # `model.supports_vision: true`), attach images natively + # as image_url content parts. Otherwise fall back to the + # text-pipeline (vision_analyze pre-description). + _img_mode = "text" + _build_parts = None + try: + from agent.image_routing import ( + build_native_content_parts as _build_parts, # noqa: F811 + ) + from agent.image_routing import decide_image_input_mode + from hermes_cli.config import load_config + + _img_mode = decide_image_input_mode( + (cli.provider or "").strip(), + (cli.model or "").strip(), + load_config(), + ) + except Exception: + _img_mode = "text" + + if _img_mode == "native" and _build_parts is not None: + try: + _parts, _skipped = _build_parts( + query if isinstance(query, str) else "", + [str(p) for p in single_query_images], + ) + if any(p.get("type") == "image_url" for p in _parts): + effective_query = _parts + else: + # All images unreadable — text fallback. + effective_query = cli._preprocess_images_with_vision( + query, single_query_images, announce=False, + ) + except Exception: + effective_query = cli._preprocess_images_with_vision( + query, single_query_images, announce=False, + ) + else: + effective_query = cli._preprocess_images_with_vision( + query, + single_query_images, + announce=False, + ) turn_route = cli._resolve_turn_agent_config(effective_query) if turn_route["signature"] != cli._active_agent_route_signature: cli.agent = None