fix(acp): advertise and forward image prompts

2026-05-02 02:01:47 +00:00 · 2026-04-30 13:21:23 +01:00 · 2026-04-30 13:21:23 +01:00 · cdf9793d6d
commit cdf9793d6d
parent 29bcd2f6e9
2 changed files with 104 additions and 6 deletions
--- a/acp_adapter/server.py
+++ b/acp_adapter/server.py
@ -31,6 +31,7 @@ from acp.schema import (
    McpServerStdio,
    ModelInfo,
    NewSessionResponse,
+    PromptCapabilities,
    PromptResponse,
    ResumeSessionResponse,
    SetSessionConfigOptionResponse,
@ -90,17 +91,69 @@ def _extract_text(
        | EmbeddedResourceContentBlock
    ],
 ) -> str:
-    """Extract plain text from ACP content blocks."""
+    """Extract plain text from ACP content blocks for display/commands."""
    parts: list[str] = []
    for block in prompt:
        if isinstance(block, TextContentBlock):
            parts.append(block.text)
        elif hasattr(block, "text"):
            parts.append(str(block.text))
-        # Non-text blocks are ignored for now.
    return "\n".join(parts)


+def _image_block_to_openai_part(block: ImageContentBlock) -> dict[str, Any] | None:
+    """Map an ACP image block to an OpenAI ``image_url`` part (None if it has no source)."""
+    # Missing or None attributes collapse to "" (the MIME type falls back to image/png).
+    payload, link, mime = (
+        str(getattr(block, attr, "") or fallback).strip()
+        for attr, fallback in (("data", ""), ("uri", ""), ("mime_type", "image/png"))
+    )
+    if payload and not payload.startswith("data:"):
+        # Raw base64 gets wrapped in a data URL; ready-made data URLs pass through as-is.
+        payload = f"data:{mime or 'image/png'};base64,{payload}"
+    target = payload or link  # inline data takes precedence over a URI
+    if not target:
+        return None
+    return {"type": "image_url", "image_url": {"url": target}}
+
+
+def _content_blocks_to_openai_user_content(
+    prompt: list[
+        TextContentBlock
+        | ImageContentBlock
+        | AudioContentBlock
+        | ResourceContentBlock
+        | EmbeddedResourceContentBlock
+    ],
+) -> str | list[dict[str, Any]]:
+    """Translate ACP prompt blocks into the user-message payload for Hermes.
+
+    Returns a plain string when no usable image block is present, otherwise
+    an ordered list of OpenAI-style ``text`` / ``image_url`` content parts.
+    Audio and resource blocks are not turned into structured parts here.
+    """
+    converted: list[dict[str, Any]] = []
+    saw_image = False
+
+    for item in prompt:
+        if isinstance(item, TextContentBlock):
+            if item.text:  # empty text blocks contribute nothing
+                converted.append({"type": "text", "text": item.text})
+        elif isinstance(item, ImageContentBlock):
+            image = _image_block_to_openai_part(item)
+            if image is not None:  # images without data or uri are dropped
+                converted.append(image)
+                saw_image = True
+
+    if not converted:
+        # Nothing convertible: defer to the plain-text extractor's result.
+        return _extract_text(prompt)
+    if saw_image:
+        return converted
+    # Text-only prompts stay strings so slash commands and text-only providers work.
+    return "\n".join(part["text"] for part in converted)
+
+
 class HermesACPAgent(acp.Agent):
    """ACP Agent implementation wrapping Hermes AIAgent."""

@ -354,6 +407,7 @@ class HermesACPAgent(acp.Agent):
            agent_info=Implementation(name="hermes-agent", version=HERMES_VERSION),
            agent_capabilities=AgentCapabilities(
                load_session=True,
+                prompt_capabilities=PromptCapabilities(image=True),
                session_capabilities=SessionCapabilities(
                    fork=SessionForkCapabilities(),
                    list=SessionListCapabilities(),
@ -593,11 +647,18 @@ class HermesACPAgent(acp.Agent):
            return PromptResponse(stop_reason="refusal")

        user_text = _extract_text(prompt).strip()
-        if not user_text:
+        user_content = _content_blocks_to_openai_user_content(prompt)
+        has_content = bool(user_text) or (
+            isinstance(user_content, list) and bool(user_content)
+        )
+        if not has_content:
            return PromptResponse(stop_reason="end_turn")

-        # Intercept slash commands — handle locally without calling the LLM
-        if user_text.startswith("/"):
+        # Intercept slash commands — handle locally without calling the LLM.
+        # Slash commands are text-only; if the client included images/resources,
+        # send the whole multimodal prompt to the agent instead of treating it as
+        # an ACP command.
+        if isinstance(user_content, str) and user_text.startswith("/"):
            response_text = self._handle_slash_command(user_text, state)
            if response_text is not None:
                if self._conn:
@ -680,9 +741,10 @@ class HermesACPAgent(acp.Agent):
            os.environ["HERMES_INTERACTIVE"] = "1"
            try:
                result = agent.run_conversation(
-                    user_message=user_text,
+                    user_message=user_content,
                    conversation_history=state.history,
                    task_id=session_id,
+                    persist_user_message=user_text or "[Image attachment]",
                )
                return result
            except Exception as e:
--- a/tests/acp_adapter/test_acp_images.py
+++ b/tests/acp_adapter/test_acp_images.py
@ -0,0 +1,36 @@
+import pytest
+from acp.schema import ImageContentBlock, TextContentBlock
+
+from acp_adapter.server import HermesACPAgent, _content_blocks_to_openai_user_content
+
+
+def test_acp_image_blocks_convert_to_openai_multimodal_content():
+    """A text block followed by a base64 image becomes text + image_url parts."""
+    question = "What is in this image?"
+    blocks = [
+        TextContentBlock(type="text", text=question),
+        ImageContentBlock(type="image", data="aGVsbG8=", mimeType="image/png"),
+    ]
+    expected = [
+        {"type": "text", "text": question},
+        {"type": "image_url", "image_url": {"url": "data:image/png;base64,aGVsbG8="}},
+    ]
+
+    assert _content_blocks_to_openai_user_content(blocks) == expected
+
+
+def test_text_only_acp_blocks_stay_string_for_legacy_prompt_path():
+    """A lone text block is returned as a plain string, not a parts list."""
+    slash_command = "/help"
+    only_text = [TextContentBlock(type="text", text=slash_command)]
+
+    assert _content_blocks_to_openai_user_content(only_text) == slash_command
+
+
+@pytest.mark.asyncio
+async def test_initialize_advertises_image_prompt_capability():
+    """initialize() must report image support in its prompt capabilities."""
+    capabilities = (await HermesACPAgent().initialize()).agent_capabilities
+    assert capabilities is not None
+    assert capabilities.prompt_capabilities is not None
+    assert capabilities.prompt_capabilities.image is True