diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py
index 9687472f57..7efb756c9c 100644
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@@ -117,6 +117,160 @@ def _normalize_chat_content(
     return ""


+# Content part type aliases used by the OpenAI Chat Completions and Responses
+# APIs. We accept both spellings on input and emit a single canonical internal
+# shape (``{"type": "text", ...}`` / ``{"type": "image_url", ...}``) that the
+# rest of the agent pipeline already understands.
+_TEXT_PART_TYPES = frozenset({"text", "input_text", "output_text"})
+_IMAGE_PART_TYPES = frozenset({"image_url", "input_image"})
+_FILE_PART_TYPES = frozenset({"file", "input_file"})
+
+
+def _normalize_multimodal_content(content: Any) -> Any:
+    """Validate and normalize multimodal content for the API server.
+
+    Returns a plain string when the content is text-only, or a list of
+    ``{"type": "text"|"image_url", ...}`` parts when images are present.
+    The output shape is the native OpenAI Chat Completions vision format,
+    which the agent pipeline accepts verbatim (OpenAI-wire providers) or
+    converts (``_preprocess_anthropic_content`` for Anthropic).
+
+    Raises ``ValueError`` with an OpenAI-style code on invalid input:
+
+    * ``unsupported_content_type`` — file/input_file/file_id parts, or
+      non-image ``data:`` URLs.
+    * ``invalid_image_url`` — missing URL or unsupported scheme.
+    * ``invalid_content_part`` — malformed text/image objects.
+
+    Callers translate the ValueError into a 400 response.
+    """
+    # Scalar passthrough mirrors ``_normalize_chat_content``.
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content[:MAX_NORMALIZED_TEXT_LENGTH] if len(content) > MAX_NORMALIZED_TEXT_LENGTH else content
+    if not isinstance(content, list):
+        # Mirror the legacy text normalizer's fallback so callers that
+        # predate image support still get a string back.
+        return _normalize_chat_content(content)
+
+    items = content[:MAX_CONTENT_LIST_SIZE] if len(content) > MAX_CONTENT_LIST_SIZE else content
+    normalized_parts: List[Dict[str, Any]] = []
+
+    for part in items:
+        if isinstance(part, str):
+            if part:
+                normalized_parts.append({"type": "text", "text": part[:MAX_NORMALIZED_TEXT_LENGTH]})
+            continue
+
+        if not isinstance(part, dict):
+            # Ignore unknown scalars for forward compatibility with future
+            # Responses API additions (e.g. ``refusal``). This matches the
+            # policy the text normalizer applies.
+            continue
+
+        raw_type = part.get("type")
+        part_type = str(raw_type or "").strip().lower()
+
+        if part_type in _TEXT_PART_TYPES:
+            text = part.get("text")
+            if text is None:
+                continue
+            if not isinstance(text, str):
+                text = str(text)
+            if text:
+                normalized_parts.append({"type": "text", "text": text[:MAX_NORMALIZED_TEXT_LENGTH]})
+            continue
+
+        if part_type in _IMAGE_PART_TYPES:
+            detail = part.get("detail")
+            image_ref = part.get("image_url")
+            # OpenAI Responses sends ``input_image`` with a top-level
+            # ``image_url`` string; Chat Completions sends ``image_url`` as
+            # ``{"url": "...", "detail": "..."}``. Support both.
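+            # Illustrative accepted shapes (URLs borrowed from the tests in
+            # this PR; any http(s) or data:image/... URL works):
+            #   {"type": "image_url", "image_url": {"url": "https://example.com/cat.png", "detail": "high"}}
+            #   {"type": "input_image", "image_url": "data:image/png;base64,AAAA"}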
+            if isinstance(image_ref, dict):
+                url_value = image_ref.get("url")
+                detail = image_ref.get("detail", detail)
+            else:
+                url_value = image_ref
+            if not isinstance(url_value, str) or not url_value.strip():
+                raise ValueError("invalid_image_url:Image parts must include a non-empty image URL.")
+            url_value = url_value.strip()
+            lowered = url_value.lower()
+            if lowered.startswith("data:"):
+                if not lowered.startswith("data:image/") or "," not in url_value:
+                    raise ValueError(
+                        "unsupported_content_type:Only image data URLs are supported. "
+                        "Non-image data payloads are not supported."
+                    )
+            elif not (lowered.startswith("http://") or lowered.startswith("https://")):
+                raise ValueError(
+                    "invalid_image_url:Image inputs must use http(s) URLs or data:image/... URLs."
+                )
+            image_part: Dict[str, Any] = {"type": "image_url", "image_url": {"url": url_value}}
+            if detail is not None:
+                if not isinstance(detail, str) or not detail.strip():
+                    raise ValueError("invalid_content_part:Image detail must be a non-empty string when provided.")
+                image_part["image_url"]["detail"] = detail.strip()
+            normalized_parts.append(image_part)
+            continue
+
+        if part_type in _FILE_PART_TYPES:
+            raise ValueError(
+                "unsupported_content_type:Inline image inputs are supported, "
+                "but uploaded files and document inputs are not supported on this endpoint."
+            )
+
+        # Unknown part type — reject explicitly so clients get a clear error
+        # instead of a silently dropped turn.
+        raise ValueError(
+            f"unsupported_content_type:Unsupported content part type {raw_type!r}. "
+            "Only text and image_url/input_image parts are supported."
+        )
+
+    if not normalized_parts:
+        return ""
+
+    # Text-only: collapse to a plain string so downstream logging/trajectory
+    # code sees the native shape and prompt caching on text-only turns is
+    # unaffected.
+    if all(p.get("type") == "text" for p in normalized_parts):
+        return "\n".join(p["text"] for p in normalized_parts if p.get("text"))
+
+    return normalized_parts
+
+
+def _content_has_visible_payload(content: Any) -> bool:
+    """True when content has any text or image attachment. Used to reject empty turns."""
+    if isinstance(content, str):
+        return bool(content.strip())
+    if isinstance(content, list):
+        for part in content:
+            if isinstance(part, dict):
+                ptype = str(part.get("type") or "").strip().lower()
+                if ptype in _TEXT_PART_TYPES and str(part.get("text") or "").strip():
+                    return True
+                if ptype in _IMAGE_PART_TYPES:
+                    return True
+    return False
+
+
+def _multimodal_validation_error(exc: ValueError, *, param: str) -> "web.Response":
+    """Translate a ``_normalize_multimodal_content`` ValueError into a 400 response."""
+    raw = str(exc)
+    code, _, message = raw.partition(":")
+    if not message:
+        code, message = "invalid_content_part", raw
+    return web.json_response(
+        _openai_error(message, code=code, param=param),
+        status=400,
+    )
+
+
 def check_api_server_requirements() -> bool:
     """Check if API server dependencies are available."""
     return AIOHTTP_AVAILABLE
@@ -637,26 +791,32 @@ class APIServerAdapter(BasePlatformAdapter):

         system_prompt = None
-        conversation_messages: List[Dict[str, str]] = []
+        conversation_messages: List[Dict[str, Any]] = []

-        for msg in messages:
+        for idx, msg in enumerate(messages):
             role = msg.get("role", "")
-            content = _normalize_chat_content(msg.get("content", ""))
+            raw_content = msg.get("content", "")

             if role == "system":
-                # Accumulate system messages
+                # System messages don't support images (Anthropic rejects them;
+                # OpenAI text-model system prompts don't render them). Flatten
+                # to text.
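+                # e.g. two system messages "A" and "B" accumulate to "A\nB".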
+                content = _normalize_chat_content(raw_content)
                 if system_prompt is None:
                     system_prompt = content
                 else:
                     system_prompt = system_prompt + "\n" + content
             elif role in ("user", "assistant"):
+                try:
+                    content = _normalize_multimodal_content(raw_content)
+                except ValueError as exc:
+                    return _multimodal_validation_error(exc, param=f"messages[{idx}].content")
                 conversation_messages.append({"role": role, "content": content})

         # Extract the last user message as the primary input
-        user_message = ""
+        user_message: Any = ""
         history = []
         if conversation_messages:
             user_message = conversation_messages[-1].get("content", "")
             history = conversation_messages[:-1]

-        if not user_message:
+        if not _content_has_visible_payload(user_message):
             return web.json_response(
                 {"error": {"message": "No user message found in messages", "type": "invalid_request_error"}},
                 status=400,
@@ -1424,16 +1584,19 @@ class APIServerAdapter(BasePlatformAdapter):
             # No error if conversation doesn't exist yet — it's a new conversation

         # Normalize input to message list
-        input_messages: List[Dict[str, str]] = []
+        input_messages: List[Dict[str, Any]] = []
         if isinstance(raw_input, str):
             input_messages = [{"role": "user", "content": raw_input}]
         elif isinstance(raw_input, list):
-            for item in raw_input:
+            for idx, item in enumerate(raw_input):
                 if isinstance(item, str):
                     input_messages.append({"role": "user", "content": item})
                 elif isinstance(item, dict):
                     role = item.get("role", "user")
-                    content = _normalize_chat_content(item.get("content", ""))
+                    try:
+                        content = _normalize_multimodal_content(item.get("content", ""))
+                    except ValueError as exc:
+                        return _multimodal_validation_error(exc, param=f"input[{idx}].content")
                     input_messages.append({"role": role, "content": content})
                 else:
                     return web.json_response(_openai_error("'input' must be a string or array"), status=400)
@@ -1442,7 +1605,7 @@ class APIServerAdapter(BasePlatformAdapter):
         # This lets stateless clients supply their own history instead of
         # relying on server-side response chaining via previous_response_id.
         # Precedence: explicit conversation_history > previous_response_id.
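+        # Illustrative entry shape ('content' may be a plain string or an
+        # array of text/image parts; both are normalized below):
+        #   {"role": "user", "content": [{"type": "text", "text": "hi"}]}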
-        conversation_history: List[Dict[str, str]] = []
+        conversation_history: List[Dict[str, Any]] = []
         raw_history = body.get("conversation_history")
         if raw_history:
             if not isinstance(raw_history, list):
@@ -1456,7 +1619,11 @@ class APIServerAdapter(BasePlatformAdapter):
                         _openai_error(f"conversation_history[{i}] must have 'role' and 'content' fields"),
                         status=400,
                     )
-                conversation_history.append({"role": str(entry["role"]), "content": str(entry["content"])})
+                try:
+                    entry_content = _normalize_multimodal_content(entry["content"])
+                except ValueError as exc:
+                    return _multimodal_validation_error(exc, param=f"conversation_history[{i}].content")
+                conversation_history.append({"role": str(entry["role"]), "content": entry_content})

             if previous_response_id:
                 logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")
@@ -1476,8 +1643,8 @@ class APIServerAdapter(BasePlatformAdapter):
                 conversation_history.append(msg)

         # Last input message is the user_message
-        user_message = input_messages[-1].get("content", "") if input_messages else ""
-        if not user_message:
+        user_message: Any = input_messages[-1].get("content", "") if input_messages else ""
+        if not _content_has_visible_payload(user_message):
             return web.json_response(_openai_error("No user message found in input"), status=400)

         # Truncation support
diff --git a/run_agent.py b/run_agent.py
index 007cb1a652..73231183b8 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -371,6 +371,89 @@ def _sanitize_surrogates(text: str) -> str:
     return text


+def _chat_content_to_responses_parts(content: Any) -> List[Dict[str, Any]]:
+    """Convert chat-style multimodal content to Responses API input parts.
+
+    Input:  ``[{"type": "text"|"image_url", ...}]`` (native OpenAI Chat format)
+    Output: ``[{"type": "input_text"|"input_image", ...}]`` (Responses format)
+
+    Returns an empty list when ``content`` is not a list or contains no
+    recognized parts — callers fall back to the string path.
+    """
+    if not isinstance(content, list):
+        return []
+    converted: List[Dict[str, Any]] = []
+    for part in content:
+        if isinstance(part, str):
+            if part:
+                converted.append({"type": "input_text", "text": part})
+            continue
+        if not isinstance(part, dict):
+            continue
+        ptype = str(part.get("type") or "").strip().lower()
+        if ptype in {"text", "input_text", "output_text"}:
+            text = part.get("text")
+            if isinstance(text, str) and text:
+                converted.append({"type": "input_text", "text": text})
+            continue
+        if ptype in {"image_url", "input_image"}:
+            image_ref = part.get("image_url")
+            detail = part.get("detail")
+            if isinstance(image_ref, dict):
+                url = image_ref.get("url")
+                detail = image_ref.get("detail", detail)
+            else:
+                url = image_ref
+            if not isinstance(url, str) or not url:
+                continue
+            image_part: Dict[str, Any] = {"type": "input_image", "image_url": url}
+            if isinstance(detail, str) and detail.strip():
+                image_part["detail"] = detail.strip()
+            converted.append(image_part)
+    return converted
+
+
+def _summarize_user_message_for_log(content: Any) -> str:
+    """Return a short text summary of a user message for logging/trajectory.
+
+    Multimodal messages arrive as a list of ``{type: "text"|"image_url", ...}``
+    parts from the API server. Logging, spinner previews, and trajectory
+    files all want a plain string — this helper joins the text parts and
+    notes any attached images. Returns an empty string for empty lists and
+    ``str(content)`` for unexpected scalar types.
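+
+    Illustrative example::
+
+        [{"type": "text", "text": "hi"},
+         {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}]
+        -> "[1 image] hi"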
+ """ + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, list): + text_bits: List[str] = [] + image_count = 0 + for part in content: + if isinstance(part, str): + if part: + text_bits.append(part) + continue + if not isinstance(part, dict): + continue + ptype = str(part.get("type") or "").strip().lower() + if ptype in {"text", "input_text", "output_text"}: + text = part.get("text") + if isinstance(text, str) and text: + text_bits.append(text) + elif ptype in {"image_url", "input_image"}: + image_count += 1 + summary = " ".join(text_bits).strip() + if image_count: + note = f"[{image_count} image{'s' if image_count != 1 else ''}]" + summary = f"{note} {summary}" if summary else note + return summary + try: + return str(content) + except Exception: + return "" + + def _sanitize_structure_surrogates(payload: Any) -> bool: """Replace surrogate code points in nested dict/list payloads in-place. @@ -4274,7 +4357,14 @@ class AIAgent: if role in {"user", "assistant"}: content = msg.get("content", "") - content_text = str(content) if content is not None else "" + if isinstance(content, list): + content_parts = _chat_content_to_responses_parts(content) + content_text = "".join( + p.get("text", "") for p in content_parts if p.get("type") == "input_text" + ) + else: + content_parts = [] + content_text = str(content) if content is not None else "" if role == "assistant": # Replay encrypted reasoning items from previous turns @@ -4297,7 +4387,9 @@ class AIAgent: seen_item_ids.add(item_id) has_codex_reasoning = True - if content_text.strip(): + if content_parts: + items.append({"role": "assistant", "content": content_parts}) + elif content_text.strip(): items.append({"role": "assistant", "content": content_text}) elif has_codex_reasoning: # The Responses API requires a following item after each @@ -4350,7 +4442,12 @@ class AIAgent: }) continue - items.append({"role": role, "content": content_text}) + # Non-assistant (user) role: emit multimodal parts when present, + # otherwise fall back to the text payload. + if content_parts: + items.append({"role": role, "content": content_parts}) + else: + items.append({"role": role, "content": content_text}) continue if role == "tool": @@ -4450,6 +4547,46 @@ class AIAgent: content = item.get("content", "") if content is None: content = "" + if isinstance(content, list): + # Multimodal content from ``_chat_messages_to_responses_input`` + # is already in Responses format (``input_text`` / ``input_image``). + # Validate each part and pass through. + validated: List[Dict[str, Any]] = [] + for part_idx, part in enumerate(content): + if isinstance(part, str): + if part: + validated.append({"type": "input_text", "text": part}) + continue + if not isinstance(part, dict): + raise ValueError( + f"Codex Responses input[{idx}].content[{part_idx}] must be an object or string." 
+                        )
+                    ptype = str(part.get("type") or "").strip().lower()
+                    if ptype in {"input_text", "text", "output_text"}:
+                        text = part.get("text", "")
+                        if not isinstance(text, str):
+                            text = str(text or "")
+                        validated.append({"type": "input_text", "text": text})
+                    elif ptype in {"input_image", "image_url"}:
+                        image_ref = part.get("image_url", "")
+                        detail = part.get("detail")
+                        if isinstance(image_ref, dict):
+                            url = image_ref.get("url", "")
+                            detail = image_ref.get("detail", detail)
+                        else:
+                            url = image_ref
+                        if not isinstance(url, str):
+                            url = str(url or "")
+                        image_part: Dict[str, Any] = {"type": "input_image", "image_url": url}
+                        if isinstance(detail, str) and detail.strip():
+                            image_part["detail"] = detail.strip()
+                        validated.append(image_part)
+                    else:
+                        raise ValueError(
+                            f"Codex Responses input[{idx}].content[{part_idx}] has unsupported type {part.get('type')!r}."
+                        )
+                normalized.append({"role": role, "content": validated})
+                continue
             if not isinstance(content, str):
                 content = str(content)
@@ -9085,7 +9222,8 @@ class AIAgent:
         self.iteration_budget = IterationBudget(self.max_iterations)

         # Log conversation turn start for debugging/observability
-        _msg_preview = (user_message[:80] + "...") if len(user_message) > 80 else user_message
+        _preview_text = _summarize_user_message_for_log(user_message)
+        _msg_preview = (_preview_text[:80] + "...") if len(_preview_text) > 80 else _preview_text
         _msg_preview = _msg_preview.replace("\n", " ")
         logger.info(
             "conversation turn: session=%s model=%s provider=%s platform=%s history=%d msg=%r",
@@ -9133,7 +9271,8 @@ class AIAgent:
             self._persist_user_message_idx = current_turn_user_idx

         if not self.quiet_mode:
-            self._safe_print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
+            _print_preview = _summarize_user_message_for_log(user_message)
+            self._safe_print(f"💬 Starting conversation: '{_print_preview[:60]}{'...' if len(_print_preview) > 60 else ''}'")

         # ── System prompt (cached per session for prefix caching) ──
         # Built once on first call, reused for all subsequent calls.
@@ -11999,8 +12138,9 @@ class AIAgent:
         # Determine if conversation completed successfully
         completed = final_response is not None and api_call_count < self.max_iterations

-        # Save trajectory if enabled
-        self._save_trajectory(messages, user_message, completed)
+        # Save trajectory if enabled. ``user_message`` may be a multimodal
+        # list of parts; the trajectory format wants a plain string.
+        self._save_trajectory(messages, _summarize_user_message_for_log(user_message), completed)

         # Clean up VM and browser for this task after conversation completes
         self._cleanup_task_resources(effective_task_id)
diff --git a/tests/gateway/test_api_server_multimodal.py b/tests/gateway/test_api_server_multimodal.py
new file mode 100644
index 0000000000..299a050303
--- /dev/null
+++ b/tests/gateway/test_api_server_multimodal.py
@@ -0,0 +1,308 @@
+"""End-to-end tests for inline image inputs on /v1/chat/completions and /v1/responses.
+
+Covers the multimodal normalization path added to the API server. These tests
+drive the adapter's full HTTP request-handling path against a real aiohttp
+app, patching only the adapter's ``_run_agent`` boundary to capture exactly
+what the handlers forward to the agent.
+""" + +from unittest.mock import MagicMock, patch + +import pytest +from aiohttp import web +from aiohttp.test_utils import TestClient, TestServer + +from gateway.config import PlatformConfig +from gateway.platforms.api_server import ( + APIServerAdapter, + _content_has_visible_payload, + _normalize_multimodal_content, + cors_middleware, + security_headers_middleware, +) + + +# --------------------------------------------------------------------------- +# Pure-function tests for _normalize_multimodal_content +# --------------------------------------------------------------------------- + + +class TestNormalizeMultimodalContent: + def test_string_passthrough(self): + assert _normalize_multimodal_content("hello") == "hello" + + def test_none_returns_empty_string(self): + assert _normalize_multimodal_content(None) == "" + + def test_text_only_list_collapses_to_string(self): + content = [{"type": "text", "text": "hi"}, {"type": "text", "text": "there"}] + assert _normalize_multimodal_content(content) == "hi\nthere" + + def test_responses_input_text_canonicalized(self): + content = [{"type": "input_text", "text": "hello"}] + assert _normalize_multimodal_content(content) == "hello" + + def test_image_url_preserved_with_text(self): + content = [ + {"type": "text", "text": "describe this"}, + {"type": "image_url", "image_url": {"url": "https://example.com/cat.png", "detail": "high"}}, + ] + out = _normalize_multimodal_content(content) + assert isinstance(out, list) + assert out == [ + {"type": "text", "text": "describe this"}, + {"type": "image_url", "image_url": {"url": "https://example.com/cat.png", "detail": "high"}}, + ] + + def test_input_image_converted_to_canonical_shape(self): + content = [ + {"type": "input_text", "text": "hi"}, + {"type": "input_image", "image_url": "https://example.com/cat.png"}, + ] + out = _normalize_multimodal_content(content) + assert out == [ + {"type": "text", "text": "hi"}, + {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}, + ] + + def test_data_image_url_accepted(self): + content = [{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}] + out = _normalize_multimodal_content(content) + assert out == [{"type": "image_url", "image_url": {"url": "data:image/png;base64,AAAA"}}] + + def test_non_image_data_url_rejected(self): + content = [{"type": "image_url", "image_url": {"url": "data:text/plain;base64,SGVsbG8="}}] + with pytest.raises(ValueError) as exc: + _normalize_multimodal_content(content) + assert str(exc.value).startswith("unsupported_content_type:") + + def test_file_part_rejected(self): + with pytest.raises(ValueError) as exc: + _normalize_multimodal_content([{"type": "file", "file": {"file_id": "f_1"}}]) + assert str(exc.value).startswith("unsupported_content_type:") + + def test_input_file_part_rejected(self): + with pytest.raises(ValueError) as exc: + _normalize_multimodal_content([{"type": "input_file", "file_id": "f_1"}]) + assert str(exc.value).startswith("unsupported_content_type:") + + def test_missing_url_rejected(self): + with pytest.raises(ValueError) as exc: + _normalize_multimodal_content([{"type": "image_url", "image_url": {}}]) + assert str(exc.value).startswith("invalid_image_url:") + + def test_bad_scheme_rejected(self): + with pytest.raises(ValueError) as exc: + _normalize_multimodal_content([{"type": "image_url", "image_url": {"url": "ftp://example.com/x.png"}}]) + assert str(exc.value).startswith("invalid_image_url:") + + def test_unknown_part_type_rejected(self): + with 
+            _normalize_multimodal_content([{"type": "audio", "audio": {}}])
+        assert str(exc.value).startswith("unsupported_content_type:")
+
+
+class TestContentHasVisiblePayload:
+    def test_non_empty_string(self):
+        assert _content_has_visible_payload("hello")
+
+    def test_whitespace_only_string(self):
+        assert not _content_has_visible_payload(" ")
+
+    def test_list_with_image_only(self):
+        assert _content_has_visible_payload([{"type": "image_url", "image_url": {"url": "x"}}])
+
+    def test_list_with_only_empty_text(self):
+        assert not _content_has_visible_payload([{"type": "text", "text": ""}])
+
+
+# ---------------------------------------------------------------------------
+# HTTP integration — real aiohttp client hitting the adapter handlers
+# ---------------------------------------------------------------------------
+
+
+def _make_adapter() -> APIServerAdapter:
+    return APIServerAdapter(PlatformConfig(enabled=True))
+
+
+def _create_app(adapter: APIServerAdapter) -> web.Application:
+    mws = [mw for mw in (cors_middleware, security_headers_middleware) if mw is not None]
+    app = web.Application(middlewares=mws)
+    app["api_server_adapter"] = adapter
+    app.router.add_post("/v1/chat/completions", adapter._handle_chat_completions)
+    app.router.add_post("/v1/responses", adapter._handle_responses)
+    app.router.add_get("/v1/responses/{response_id}", adapter._handle_get_response)
+    return app
+
+
+@pytest.fixture
+def adapter():
+    return _make_adapter()
+
+
+class TestChatCompletionsMultimodalHTTP:
+    @pytest.mark.asyncio
+    async def test_inline_image_preserved_to_run_agent(self, adapter):
+        """Multimodal user content reaches _run_agent as a list of parts."""
+        image_payload = [
+            {"type": "text", "text": "What's in this image?"},
+            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png", "detail": "high"}},
+        ]
+
+        app = _create_app(adapter)
+        async with TestClient(TestServer(app)) as cli:
+            with patch.object(
+                adapter,
+                "_run_agent",
+                new=MagicMock(),
+            ) as mock_run:
+                async def _stub(**kwargs):
+                    mock_run.captured = kwargs
+                    return (
+                        {"final_response": "A cat.", "messages": [], "api_calls": 1},
+                        {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
+                    )
+                mock_run.side_effect = _stub
+
+                resp = await cli.post(
+                    "/v1/chat/completions",
+                    json={
+                        "model": "hermes-agent",
+                        "messages": [{"role": "user", "content": image_payload}],
+                    },
+                )
+
+                assert resp.status == 200, await resp.text()
+                assert mock_run.captured["user_message"] == image_payload
+
+    @pytest.mark.asyncio
+    async def test_text_only_array_collapses_to_string(self, adapter):
+        """Text-only array becomes a plain string so logging stays unchanged."""
+        app = _create_app(adapter)
+        async with TestClient(TestServer(app)) as cli:
+            with patch.object(adapter, "_run_agent", new=MagicMock()) as mock_run:
+                async def _stub(**kwargs):
+                    mock_run.captured = kwargs
+                    return (
+                        {"final_response": "ok", "messages": [], "api_calls": 1},
+                        {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
+                    )
+                mock_run.side_effect = _stub
+
+                resp = await cli.post(
+                    "/v1/chat/completions",
+                    json={
+                        "model": "hermes-agent",
+                        "messages": [
+                            {"role": "user", "content": [{"type": "text", "text": "hello"}]},
+                        ],
+                    },
+                )
+
+                assert resp.status == 200, await resp.text()
+                assert mock_run.captured["user_message"] == "hello"
+
+    @pytest.mark.asyncio
+    async def test_file_part_returns_400(self, adapter):
+        app = _create_app(adapter)
+        async with TestClient(TestServer(app)) as cli:
+            resp = await cli.post(
+                "/v1/chat/completions",
+                json={
+                    "model": "hermes-agent",
+                    "messages": [
+                        {"role": "user", "content": [{"type": "file", "file": {"file_id": "f_1"}}]},
+                    ],
+                },
+            )
+            assert resp.status == 400
+            body = await resp.json()
+            assert body["error"]["code"] == "unsupported_content_type"
+            assert body["error"]["param"] == "messages[0].content"
+
+    @pytest.mark.asyncio
+    async def test_non_image_data_url_returns_400(self, adapter):
+        app = _create_app(adapter)
+        async with TestClient(TestServer(app)) as cli:
+            resp = await cli.post(
+                "/v1/chat/completions",
+                json={
+                    "model": "hermes-agent",
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": "data:text/plain;base64,SGVsbG8="},
+                                },
+                            ],
+                        },
+                    ],
+                },
+            )
+            assert resp.status == 400
+            body = await resp.json()
+            assert body["error"]["code"] == "unsupported_content_type"
+
+
+class TestResponsesMultimodalHTTP:
+    @pytest.mark.asyncio
+    async def test_input_image_canonicalized_and_forwarded(self, adapter):
+        app = _create_app(adapter)
+        async with TestClient(TestServer(app)) as cli:
+            with patch.object(adapter, "_run_agent", new=MagicMock()) as mock_run:
+                async def _stub(**kwargs):
+                    mock_run.captured = kwargs
+                    return (
+                        {"final_response": "ok", "messages": [], "api_calls": 1},
+                        {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0},
+                    )
+                mock_run.side_effect = _stub
+
+                resp = await cli.post(
+                    "/v1/responses",
+                    json={
+                        "model": "hermes-agent",
+                        "input": [
+                            {
+                                "role": "user",
+                                "content": [
+                                    {"type": "input_text", "text": "Describe."},
+                                    {
+                                        "type": "input_image",
+                                        "image_url": "https://example.com/cat.png",
+                                    },
+                                ],
+                            }
+                        ],
+                    },
+                )
+
+                assert resp.status == 200, await resp.text()
+                expected = [
+                    {"type": "text", "text": "Describe."},
+                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
+                ]
+                assert mock_run.captured["user_message"] == expected
+
+    @pytest.mark.asyncio
+    async def test_input_file_returns_400(self, adapter):
+        app = _create_app(adapter)
+        async with TestClient(TestServer(app)) as cli:
+            resp = await cli.post(
+                "/v1/responses",
+                json={
+                    "model": "hermes-agent",
+                    "input": [
+                        {
+                            "role": "user",
+                            "content": [{"type": "input_file", "file_id": "f_1"}],
+                        }
+                    ],
+                },
+            )
+            assert resp.status == 400
+            body = await resp.json()
+            assert body["error"]["code"] == "unsupported_content_type"
diff --git a/tests/run_agent/test_run_agent_multimodal_prologue.py b/tests/run_agent/test_run_agent_multimodal_prologue.py
new file mode 100644
index 0000000000..1d470d0609
--- /dev/null
+++ b/tests/run_agent/test_run_agent_multimodal_prologue.py
@@ -0,0 +1,103 @@
+"""Regression tests for run_conversation's prologue handling of multimodal content.
+
+PR #5621 and earlier multimodal PRs hit an ``AttributeError`` in
+``run_agent.run_conversation`` because the prologue unconditionally applied
+``user_message[:80] + "..."``, ``.replace()``, and
+``_safe_print(f"...{user_message[:60]}")`` to content that can now be a list.
+These tests cover the two fixes:
+
+  1. ``_summarize_user_message_for_log`` accepts strings, lists, and ``None``.
+  2. ``_chat_content_to_responses_parts`` converts chat-style content to the
+     Responses API ``input_text`` / ``input_image`` shape.
+
+They do NOT boot the full AIAgent — the guarantees under test are pure
+function contracts at module scope.
+""" + +from run_agent import _chat_content_to_responses_parts, _summarize_user_message_for_log + + +class TestSummarizeUserMessageForLog: + def test_plain_string_passthrough(self): + assert _summarize_user_message_for_log("hello world") == "hello world" + + def test_none_returns_empty_string(self): + assert _summarize_user_message_for_log(None) == "" + + def test_text_only_list(self): + content = [{"type": "text", "text": "hi"}, {"type": "text", "text": "there"}] + assert _summarize_user_message_for_log(content) == "hi there" + + def test_list_with_image_only(self): + content = [{"type": "image_url", "image_url": {"url": "https://x"}}] + # Image-only: "[1 image]" marker, no trailing space. + assert _summarize_user_message_for_log(content) == "[1 image]" + + def test_list_with_text_and_image(self): + content = [ + {"type": "text", "text": "describe this"}, + {"type": "image_url", "image_url": {"url": "https://x"}}, + ] + summary = _summarize_user_message_for_log(content) + assert "[1 image]" in summary + assert "describe this" in summary + + def test_list_with_multiple_images(self): + content = [ + {"type": "text", "text": "compare these"}, + {"type": "image_url", "image_url": {"url": "a"}}, + {"type": "image_url", "image_url": {"url": "b"}}, + ] + summary = _summarize_user_message_for_log(content) + assert "[2 images]" in summary + + def test_scalar_fallback(self): + assert _summarize_user_message_for_log(42) == "42" + + def test_list_supports_slice_and_replace(self): + """The whole point of this helper: its output must be a plain str.""" + content = [{"type": "text", "text": "x" * 200}, {"type": "image_url", "image_url": {"url": "y"}}] + summary = _summarize_user_message_for_log(content) + # These are the operations the run_conversation prologue performs. + _ = summary[:80] + "..." 
+        _ = summary.replace("\n", " ")
+
+
+class TestChatContentToResponsesParts:
+    def test_non_list_returns_empty(self):
+        assert _chat_content_to_responses_parts("hi") == []
+        assert _chat_content_to_responses_parts(None) == []
+
+    def test_text_parts_become_input_text(self):
+        content = [{"type": "text", "text": "hello"}]
+        assert _chat_content_to_responses_parts(content) == [{"type": "input_text", "text": "hello"}]
+
+    def test_image_url_object_becomes_input_image(self):
+        content = [{"type": "image_url", "image_url": {"url": "https://x", "detail": "high"}}]
+        assert _chat_content_to_responses_parts(content) == [
+            {"type": "input_image", "image_url": "https://x", "detail": "high"},
+        ]
+
+    def test_bare_string_image_url(self):
+        content = [{"type": "image_url", "image_url": "https://x"}]
+        assert _chat_content_to_responses_parts(content) == [{"type": "input_image", "image_url": "https://x"}]
+
+    def test_responses_format_passthrough(self):
+        """Input already in Responses format should round-trip cleanly."""
+        content = [
+            {"type": "input_text", "text": "hi"},
+            {"type": "input_image", "image_url": "https://x"},
+        ]
+        assert _chat_content_to_responses_parts(content) == [
+            {"type": "input_text", "text": "hi"},
+            {"type": "input_image", "image_url": "https://x"},
+        ]
+
+    def test_unknown_parts_skipped(self):
+        """Unknown types shouldn't crash — they are filtered silently at this
+        level (the API server's normalizer rejects them earlier)."""
+        content = [{"type": "text", "text": "ok"}, {"type": "audio", "x": "y"}]
+        assert _chat_content_to_responses_parts(content) == [{"type": "input_text", "text": "ok"}]
+
+    def test_empty_url_image_skipped(self):
+        content = [{"type": "image_url", "image_url": {"url": ""}}]
+        assert _chat_content_to_responses_parts(content) == []
Both remote URLs and `data:image/...` URLs are supported: + +```json +{ + "model": "hermes-agent", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Describe this screenshot."}, + {"type": "input_image", "image_url": "data:image/png;base64,iVBORw0K..."} + ] + } + ] +} +``` + +Uploaded files (`input_file` / `file_id`) and non-image `data:` URLs return `400 unsupported_content_type`. + #### Multi-turn with previous_response_id Chain responses to maintain full context (including tool calls) across turns: @@ -330,7 +368,7 @@ In Open WebUI, add each as a separate connection. The model dropdown shows `alic ## Limitations - **Response storage** — stored responses (for `previous_response_id`) are persisted in SQLite and survive gateway restarts. Max 100 stored responses (LRU eviction). -- **No file upload** — vision/document analysis via uploaded files is not yet supported through the API. +- **No file upload** — inline images are supported on both `/v1/chat/completions` and `/v1/responses`, but uploaded files (`file`, `input_file`, `file_id`) and non-image document inputs are not supported through the API. - **Model field is cosmetic** — the `model` field in requests is accepted but the actual LLM model used is configured server-side in config.yaml. ## Proxy Mode