mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(api-server): inline image inputs on /v1/chat/completions and /v1/responses (#12969)
OpenAI-compatible clients (Open WebUI, LobeChat, etc.) can now send vision
requests to the API server. Both endpoints accept the canonical OpenAI
multimodal shape:
Chat Completions: {type: text|image_url, image_url: {url, detail?}}
Responses: {type: input_text|input_image, image_url: <str>, detail?}
The server validates and converts both into a single internal shape that the
existing agent pipeline already handles (Anthropic adapter converts,
OpenAI-wire providers pass through). Remote http(s) URLs and data:image/*
URLs are supported.
Uploaded files (file, input_file, file_id) and non-image data: URLs are
rejected with 400 unsupported_content_type.
Changes:
- gateway/platforms/api_server.py
- _normalize_multimodal_content(): validates + normalizes both Chat and
Responses content shapes. Returns a plain string for text-only content
(preserves prompt-cache behavior on existing callers) or a canonical
[{type:text|image_url,...}] list when images are present.
- _content_has_visible_payload(): replaces the bare truthy check so a
user turn containing only an image is no longer rejected with 'No user message'.
- _handle_chat_completions and _handle_responses both call the new helper
for user/assistant content; system messages continue to flatten to text.
- Codex conversation_history, input[], and inline history paths all share
the same validator. No duplicated normalizers.
- run_agent.py
- _summarize_user_message_for_log(): produces a short string summary
('[1 image] describe this') from list content for logging, spinner
previews, and trajectory writes. Fixes the AttributeError raised when a
list-valued user_message reached user_message[:80] + '...' / .replace().
- _chat_content_to_responses_parts(): module-level helper that converts
chat-style multimodal content to Responses 'input_text'/'input_image'
parts. Used in _chat_messages_to_responses_input for Codex routing.
- _preflight_codex_input_items() now validates and passes through list
content parts for user/assistant messages instead of stringifying.
- tests/gateway/test_api_server_multimodal.py (new, 38 tests)
- Unit coverage for _normalize_multimodal_content, including both part
formats, data URL gating, and all reject paths.
- Real aiohttp HTTP integration on /v1/chat/completions and /v1/responses
verifying multimodal payloads reach _run_agent intact.
- 400 coverage for file / input_file / non-image data URL.
- tests/run_agent/test_run_agent_multimodal_prologue.py (new)
- Regression coverage for the prologue no-crash contract.
- _chat_content_to_responses_parts round-trip coverage.
- website/docs/user-guide/features/api-server.md
- Inline image examples for both endpoints.
- Updated Limitations: files still unsupported, images now supported.
Validated live against openrouter/anthropic/claude-opus-4.6:
POST /v1/chat/completions → 200, vision-accurate description
POST /v1/responses → 200, same image, clean output_text
POST /v1/chat/completions [file] → 400 unsupported_content_type
POST /v1/responses [input_file] → 400 unsupported_content_type
POST /v1/responses [non-image data URL] → 400 unsupported_content_type
Closes #5621, #8253, #4046, #6632.
Co-authored-by: Paul Bergeron <paul@gamma.app>
Co-authored-by: zhangxicen <zhangxicen@example.com>
Co-authored-by: Manuel Schipper <manuelschipper@users.noreply.github.com>
Co-authored-by: pradeep7127 <pradeep7127@users.noreply.github.com>
This commit is contained in:
parent
3218d58fc5
commit
f683132c1d
5 changed files with 776 additions and 20 deletions
|
|
@ -117,6 +117,160 @@ def _normalize_chat_content(
|
|||
return ""
|
||||
|
||||
|
||||
# Content part type aliases used by the OpenAI Chat Completions and Responses
|
||||
# APIs. We accept both spellings on input and emit a single canonical internal
|
||||
# shape (``{"type": "text", ...}`` / ``{"type": "image_url", ...}``) that the
|
||||
# rest of the agent pipeline already understands.
|
||||
_TEXT_PART_TYPES = frozenset({"text", "input_text", "output_text"})
|
||||
_IMAGE_PART_TYPES = frozenset({"image_url", "input_image"})
|
||||
_FILE_PART_TYPES = frozenset({"file", "input_file"})
|
||||
|
||||
|
||||
def _normalize_multimodal_content(content: Any) -> Any:
|
||||
"""Validate and normalize multimodal content for the API server.
|
||||
|
||||
Returns a plain string when the content is text-only, or a list of
|
||||
``{"type": "text"|"image_url", ...}`` parts when images are present.
|
||||
The output shape is the native OpenAI Chat Completions vision format,
|
||||
which the agent pipeline accepts verbatim (OpenAI-wire providers) or
|
||||
converts (``_preprocess_anthropic_content`` for Anthropic).
|
||||
|
||||
Raises ``ValueError`` with an OpenAI-style code on invalid input:
|
||||
* ``unsupported_content_type`` — file/input_file/file_id parts, or
|
||||
non-image ``data:`` URLs.
|
||||
* ``invalid_image_url`` — missing URL or unsupported scheme.
|
||||
* ``invalid_content_part`` — malformed text/image objects.
|
||||
|
||||
Callers translate the ValueError into a 400 response.
|
||||
"""
|
||||
# Scalar passthrough mirrors ``_normalize_chat_content``.
|
||||
if content is None:
|
||||
return ""
|
||||
if isinstance(content, str):
|
||||
return content[:MAX_NORMALIZED_TEXT_LENGTH] if len(content) > MAX_NORMALIZED_TEXT_LENGTH else content
|
||||
if not isinstance(content, list):
|
||||
# Mirror the legacy text-normalizer's fallback so callers that
|
||||
# pre-existed image support still get a string back.
|
||||
return _normalize_chat_content(content)
|
||||
|
||||
items = content[:MAX_CONTENT_LIST_SIZE] if len(content) > MAX_CONTENT_LIST_SIZE else content
|
||||
normalized_parts: List[Dict[str, Any]] = []
|
||||
text_accum_len = 0
|
||||
|
||||
for part in items:
|
||||
if isinstance(part, str):
|
||||
if part:
|
||||
trimmed = part[:MAX_NORMALIZED_TEXT_LENGTH]
|
||||
normalized_parts.append({"type": "text", "text": trimmed})
|
||||
text_accum_len += len(trimmed)
|
||||
continue
|
||||
|
||||
if not isinstance(part, dict):
|
||||
# Ignore unknown scalars for forward compatibility with future
|
||||
# Responses API additions (e.g. ``refusal``). The same policy
|
||||
# the text normalizer applies.
|
||||
continue
|
||||
|
||||
raw_type = part.get("type")
|
||||
part_type = str(raw_type or "").strip().lower()
|
||||
|
||||
if part_type in _TEXT_PART_TYPES:
|
||||
text = part.get("text")
|
||||
if text is None:
|
||||
continue
|
||||
if not isinstance(text, str):
|
||||
text = str(text)
|
||||
if text:
|
||||
trimmed = text[:MAX_NORMALIZED_TEXT_LENGTH]
|
||||
normalized_parts.append({"type": "text", "text": trimmed})
|
||||
text_accum_len += len(trimmed)
|
||||
continue
|
||||
|
||||
if part_type in _IMAGE_PART_TYPES:
|
||||
detail = part.get("detail")
|
||||
image_ref = part.get("image_url")
|
||||
# OpenAI Responses sends ``input_image`` with a top-level
|
||||
# ``image_url`` string; Chat Completions sends ``image_url`` as
|
||||
# ``{"url": "...", "detail": "..."}``. Support both.
|
||||
if isinstance(image_ref, dict):
|
||||
url_value = image_ref.get("url")
|
||||
detail = image_ref.get("detail", detail)
|
||||
else:
|
||||
url_value = image_ref
|
||||
if not isinstance(url_value, str) or not url_value.strip():
|
||||
raise ValueError("invalid_image_url:Image parts must include a non-empty image URL.")
|
||||
url_value = url_value.strip()
|
||||
lowered = url_value.lower()
|
||||
if lowered.startswith("data:"):
|
||||
if not lowered.startswith("data:image/") or "," not in url_value:
|
||||
raise ValueError(
|
||||
"unsupported_content_type:Only image data URLs are supported. "
|
||||
"Non-image data payloads are not supported."
|
||||
)
|
||||
elif not (lowered.startswith("http://") or lowered.startswith("https://")):
|
||||
raise ValueError(
|
||||
"invalid_image_url:Image inputs must use http(s) URLs or data:image/... URLs."
|
||||
)
|
||||
image_part: Dict[str, Any] = {"type": "image_url", "image_url": {"url": url_value}}
|
||||
if detail is not None:
|
||||
if not isinstance(detail, str) or not detail.strip():
|
||||
raise ValueError("invalid_content_part:Image detail must be a non-empty string when provided.")
|
||||
image_part["image_url"]["detail"] = detail.strip()
|
||||
normalized_parts.append(image_part)
|
||||
continue
|
||||
|
||||
if part_type in _FILE_PART_TYPES:
|
||||
raise ValueError(
|
||||
"unsupported_content_type:Inline image inputs are supported, "
|
||||
"but uploaded files and document inputs are not supported on this endpoint."
|
||||
)
|
||||
|
||||
# Unknown part type — reject explicitly so clients get a clear error
|
||||
# instead of a silently dropped turn.
|
||||
raise ValueError(
|
||||
f"unsupported_content_type:Unsupported content part type {raw_type!r}. "
|
||||
"Only text and image_url/input_image parts are supported."
|
||||
)
|
||||
|
||||
if not normalized_parts:
|
||||
return ""
|
||||
|
||||
# Text-only: collapse to a plain string so downstream logging/trajectory
|
||||
# code sees the native shape and prompt caching on text-only turns is
|
||||
# unaffected.
|
||||
if all(p.get("type") == "text" for p in normalized_parts):
|
||||
return "\n".join(p["text"] for p in normalized_parts if p.get("text"))
|
||||
|
||||
return normalized_parts
|
||||
|
||||
|
||||
def _content_has_visible_payload(content: Any) -> bool:
|
||||
"""True when content has any text or image attachment. Used to reject empty turns."""
|
||||
if isinstance(content, str):
|
||||
return bool(content.strip())
|
||||
if isinstance(content, list):
|
||||
for part in content:
|
||||
if isinstance(part, dict):
|
||||
ptype = str(part.get("type") or "").strip().lower()
|
||||
if ptype in _TEXT_PART_TYPES and str(part.get("text") or "").strip():
|
||||
return True
|
||||
if ptype in _IMAGE_PART_TYPES:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _multimodal_validation_error(exc: ValueError, *, param: str) -> "web.Response":
    """Translate a ``_normalize_multimodal_content`` ValueError into a 400 response.

    The exception message is expected to be ``"<code>:<message>"``; when no
    code prefix is present the whole message is reported under the generic
    ``invalid_content_part`` code.
    """
    detail = str(exc)
    code, _sep, message = detail.partition(":")
    if not message:
        code, message = "invalid_content_part", detail
    body = _openai_error(message, code=code, param=param)
    return web.json_response(body, status=400)
|
||||
|
||||
|
||||
def check_api_server_requirements() -> bool:
    """Report whether the optional dependencies for the API server are installed."""
    # aiohttp is the only hard requirement; availability is probed at import time.
    available = AIOHTTP_AVAILABLE
    return available
|
||||
|
|
@ -637,26 +791,32 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
system_prompt = None
|
||||
conversation_messages: List[Dict[str, str]] = []
|
||||
|
||||
for msg in messages:
|
||||
for idx, msg in enumerate(messages):
|
||||
role = msg.get("role", "")
|
||||
content = _normalize_chat_content(msg.get("content", ""))
|
||||
raw_content = msg.get("content", "")
|
||||
if role == "system":
|
||||
# Accumulate system messages
|
||||
# System messages don't support images (Anthropic rejects, OpenAI
|
||||
# text-model systems don't render them). Flatten to text.
|
||||
content = _normalize_chat_content(raw_content)
|
||||
if system_prompt is None:
|
||||
system_prompt = content
|
||||
else:
|
||||
system_prompt = system_prompt + "\n" + content
|
||||
elif role in ("user", "assistant"):
|
||||
try:
|
||||
content = _normalize_multimodal_content(raw_content)
|
||||
except ValueError as exc:
|
||||
return _multimodal_validation_error(exc, param=f"messages[{idx}].content")
|
||||
conversation_messages.append({"role": role, "content": content})
|
||||
|
||||
# Extract the last user message as the primary input
|
||||
user_message = ""
|
||||
user_message: Any = ""
|
||||
history = []
|
||||
if conversation_messages:
|
||||
user_message = conversation_messages[-1].get("content", "")
|
||||
history = conversation_messages[:-1]
|
||||
|
||||
if not user_message:
|
||||
if not _content_has_visible_payload(user_message):
|
||||
return web.json_response(
|
||||
{"error": {"message": "No user message found in messages", "type": "invalid_request_error"}},
|
||||
status=400,
|
||||
|
|
@ -1424,16 +1584,19 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
# No error if conversation doesn't exist yet — it's a new conversation
|
||||
|
||||
# Normalize input to message list
|
||||
input_messages: List[Dict[str, str]] = []
|
||||
input_messages: List[Dict[str, Any]] = []
|
||||
if isinstance(raw_input, str):
|
||||
input_messages = [{"role": "user", "content": raw_input}]
|
||||
elif isinstance(raw_input, list):
|
||||
for item in raw_input:
|
||||
for idx, item in enumerate(raw_input):
|
||||
if isinstance(item, str):
|
||||
input_messages.append({"role": "user", "content": item})
|
||||
elif isinstance(item, dict):
|
||||
role = item.get("role", "user")
|
||||
content = _normalize_chat_content(item.get("content", ""))
|
||||
try:
|
||||
content = _normalize_multimodal_content(item.get("content", ""))
|
||||
except ValueError as exc:
|
||||
return _multimodal_validation_error(exc, param=f"input[{idx}].content")
|
||||
input_messages.append({"role": role, "content": content})
|
||||
else:
|
||||
return web.json_response(_openai_error("'input' must be a string or array"), status=400)
|
||||
|
|
@ -1442,7 +1605,7 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
# This lets stateless clients supply their own history instead of
|
||||
# relying on server-side response chaining via previous_response_id.
|
||||
# Precedence: explicit conversation_history > previous_response_id.
|
||||
conversation_history: List[Dict[str, str]] = []
|
||||
conversation_history: List[Dict[str, Any]] = []
|
||||
raw_history = body.get("conversation_history")
|
||||
if raw_history:
|
||||
if not isinstance(raw_history, list):
|
||||
|
|
@ -1456,7 +1619,11 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
_openai_error(f"conversation_history[{i}] must have 'role' and 'content' fields"),
|
||||
status=400,
|
||||
)
|
||||
conversation_history.append({"role": str(entry["role"]), "content": str(entry["content"])})
|
||||
try:
|
||||
entry_content = _normalize_multimodal_content(entry["content"])
|
||||
except ValueError as exc:
|
||||
return _multimodal_validation_error(exc, param=f"conversation_history[{i}].content")
|
||||
conversation_history.append({"role": str(entry["role"]), "content": entry_content})
|
||||
if previous_response_id:
|
||||
logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")
|
||||
|
||||
|
|
@ -1476,8 +1643,8 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
conversation_history.append(msg)
|
||||
|
||||
# Last input message is the user_message
|
||||
user_message = input_messages[-1].get("content", "") if input_messages else ""
|
||||
if not user_message:
|
||||
user_message: Any = input_messages[-1].get("content", "") if input_messages else ""
|
||||
if not _content_has_visible_payload(user_message):
|
||||
return web.json_response(_openai_error("No user message found in input"), status=400)
|
||||
|
||||
# Truncation support
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue