mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
OpenAI-compatible clients (Open WebUI, LobeChat, etc.) can now send vision
requests to the API server. Both endpoints accept the canonical OpenAI
multimodal shape:
Chat Completions: {type: text|image_url, image_url: {url, detail?}}
Responses: {type: input_text|input_image, image_url: <str>, detail?}
The server validates and converts both into a single internal shape that the
existing agent pipeline already handles (Anthropic adapter converts,
OpenAI-wire providers pass through). Remote http(s) URLs and data:image/*
URLs are supported.
Uploaded files (file, input_file, file_id) and non-image data: URLs are
rejected with 400 unsupported_content_type.
Changes:
- gateway/platforms/api_server.py
- _normalize_multimodal_content(): validates + normalizes both Chat and
Responses content shapes. Returns a plain string for text-only content
(preserves prompt-cache behavior on existing callers) or a canonical
[{type:text|image_url,...}] list when images are present.
- _content_has_visible_payload(): replaces the bare truthy check so a
user turn with only an image no longer rejects as 'No user message'.
- _handle_chat_completions and _handle_responses both call the new helper
for user/assistant content; system messages continue to flatten to text.
- Codex conversation_history, input[], and inline history paths all share
the same validator. No duplicated normalizers.
- run_agent.py
- _summarize_user_message_for_log(): produces a short string summary
('[1 image] describe this') from list content for logging, spinner
previews, and trajectory writes. Fixes AttributeError when list
user_message hit user_message[:80] + '...' / .replace().
- _chat_content_to_responses_parts(): module-level helper that converts
chat-style multimodal content to Responses 'input_text'/'input_image'
parts. Used in _chat_messages_to_responses_input for Codex routing.
- _preflight_codex_input_items() now validates and passes through list
content parts for user/assistant messages instead of stringifying.
- tests/gateway/test_api_server_multimodal.py (new, 38 tests)
- Unit coverage for _normalize_multimodal_content, including both part
formats, data URL gating, and all reject paths.
- Real aiohttp HTTP integration on /v1/chat/completions and /v1/responses
verifying multimodal payloads reach _run_agent intact.
- 400 coverage for file / input_file / non-image data URL.
- tests/run_agent/test_run_agent_multimodal_prologue.py (new)
- Regression coverage for the prologue no-crash contract.
- _chat_content_to_responses_parts round-trip coverage.
- website/docs/user-guide/features/api-server.md
- Inline image examples for both endpoints.
- Updated Limitations: files still unsupported, images now supported.
Validated live against openrouter/anthropic/claude-opus-4.6:
POST /v1/chat/completions → 200, vision-accurate description
POST /v1/responses → 200, same image, clean output_text
POST /v1/chat/completions [file] → 400 unsupported_content_type
POST /v1/responses [input_file] → 400 unsupported_content_type
POST /v1/responses [non-image data URL] → 400 unsupported_content_type
Closes #5621, #8253, #4046, #6632.
Co-authored-by: Paul Bergeron <paul@gamma.app>
Co-authored-by: zhangxicen <zhangxicen@example.com>
Co-authored-by: Manuel Schipper <manuelschipper@users.noreply.github.com>
Co-authored-by: pradeep7127 <pradeep7127@users.noreply.github.com>
103 lines
4.6 KiB
Python
103 lines
4.6 KiB
Python
"""Regression tests for run_conversation's prologue handling of multimodal content.
|
|
|
|
PR #5621 and earlier multimodal PRs hit an ``AttributeError`` in
|
|
``run_agent.run_conversation`` because the prologue unconditionally called
|
|
``user_message[:80] + "..."`` / ``.replace()`` / ``_safe_print(f"...{user_message[:60]}")``
|
|
on what was now a list. These tests cover the two fixes:
|
|
|
|
1. ``_summarize_user_message_for_log`` accepts strings, lists, and ``None``.
|
|
2. ``_chat_content_to_responses_parts`` converts chat-style content to the
|
|
Responses API ``input_text`` / ``input_image`` shape.
|
|
|
|
They do NOT boot the full AIAgent — the prologue-fix guarantees are pure
|
|
function contracts at module scope.
|
|
"""
|
|
|
|
from run_agent import _chat_content_to_responses_parts, _summarize_user_message_for_log
|
|
|
|
|
|
class TestSummarizeUserMessageForLog:
    """Behavioral contract for ``_summarize_user_message_for_log``.

    Whatever shape the user message arrives in — plain string, multimodal
    content list, ``None``, or an arbitrary scalar — the helper must hand
    back a plain ``str`` that the run_conversation prologue can safely
    slice and ``.replace()``.
    """

    def test_plain_string_passthrough(self):
        # Strings travel through untouched.
        assert _summarize_user_message_for_log("hello world") == "hello world"

    def test_none_returns_empty_string(self):
        # None collapses to the empty string, not the text "None".
        assert _summarize_user_message_for_log(None) == ""

    def test_text_only_list(self):
        # Multiple text parts are joined with a single space.
        parts = [{"type": "text", "text": "hi"}, {"type": "text", "text": "there"}]
        assert _summarize_user_message_for_log(parts) == "hi there"

    def test_list_with_image_only(self):
        parts = [{"type": "image_url", "image_url": {"url": "https://x"}}]
        # Image-only: "[1 image]" marker, no trailing space.
        assert _summarize_user_message_for_log(parts) == "[1 image]"

    def test_list_with_text_and_image(self):
        parts = [
            {"type": "text", "text": "describe this"},
            {"type": "image_url", "image_url": {"url": "https://x"}},
        ]
        result = _summarize_user_message_for_log(parts)
        # Both the image marker and the text must survive summarization.
        assert "[1 image]" in result
        assert "describe this" in result

    def test_list_with_multiple_images(self):
        parts = [
            {"type": "text", "text": "compare these"},
            {"type": "image_url", "image_url": {"url": "a"}},
            {"type": "image_url", "image_url": {"url": "b"}},
        ]
        # Marker pluralizes with the image count.
        assert "[2 images]" in _summarize_user_message_for_log(parts)

    def test_scalar_fallback(self):
        # Non-str, non-list scalars are stringified.
        assert _summarize_user_message_for_log(42) == "42"

    def test_list_supports_slice_and_replace(self):
        """The whole point of this helper: its output must be a plain str."""
        parts = [
            {"type": "text", "text": "x" * 200},
            {"type": "image_url", "image_url": {"url": "y"}},
        ]
        result = _summarize_user_message_for_log(parts)
        # These are the operations the run_conversation prologue performs.
        _ = result[:80] + "..."
        _ = result.replace("\n", " ")
class TestChatContentToResponsesParts:
    """Shape-conversion contract for ``_chat_content_to_responses_parts``.

    Chat Completions style ``text`` / ``image_url`` parts must come out as
    Responses API ``input_text`` / ``input_image`` parts; everything else
    is filtered, never raised on.
    """

    def test_non_list_returns_empty(self):
        # Anything that isn't a list yields no parts.
        for not_a_list in ("hi", None):
            assert _chat_content_to_responses_parts(not_a_list) == []

    def test_text_parts_become_input_text(self):
        converted = _chat_content_to_responses_parts([{"type": "text", "text": "hello"}])
        assert converted == [{"type": "input_text", "text": "hello"}]

    def test_image_url_object_becomes_input_image(self):
        # The nested {url, detail} object is flattened onto the part itself.
        source = [{"type": "image_url", "image_url": {"url": "https://x", "detail": "high"}}]
        expected = [{"type": "input_image", "image_url": "https://x", "detail": "high"}]
        assert _chat_content_to_responses_parts(source) == expected

    def test_bare_string_image_url(self):
        # image_url given directly as a string, not wrapped in an object.
        source = [{"type": "image_url", "image_url": "https://x"}]
        assert _chat_content_to_responses_parts(source) == [
            {"type": "input_image", "image_url": "https://x"},
        ]

    def test_responses_format_passthrough(self):
        """Input already in Responses format should round-trip cleanly."""
        source = [
            {"type": "input_text", "text": "hi"},
            {"type": "input_image", "image_url": "https://x"},
        ]
        expected = [
            {"type": "input_text", "text": "hi"},
            {"type": "input_image", "image_url": "https://x"},
        ]
        assert _chat_content_to_responses_parts(source) == expected

    def test_unknown_parts_skipped(self):
        """Unknown types shouldn't crash — filtered silently at this level
        (the API server's normalizer rejects them earlier)."""
        mixed = [{"type": "text", "text": "ok"}, {"type": "audio", "x": "y"}]
        assert _chat_content_to_responses_parts(mixed) == [{"type": "input_text", "text": "ok"}]

    def test_empty_url_image_skipped(self):
        # An image part whose URL is empty is dropped entirely.
        source = [{"type": "image_url", "image_url": {"url": ""}}]
        assert _chat_content_to_responses_parts(source) == []