diff --git a/acp_adapter/server.py b/acp_adapter/server.py
index 64a31063eb..862e9c5866 100644
--- a/acp_adapter/server.py
+++ b/acp_adapter/server.py
@@ -31,6 +31,7 @@ from acp.schema import (
     McpServerStdio,
     ModelInfo,
     NewSessionResponse,
+    PromptCapabilities,
     PromptResponse,
     ResumeSessionResponse,
     SetSessionConfigOptionResponse,
@@ -90,17 +91,79 @@ def _extract_text(
         | EmbeddedResourceContentBlock
     ],
 ) -> str:
-    """Extract plain text from ACP content blocks."""
+    """Extract plain text from ACP content blocks for display/commands."""
     parts: list[str] = []
     for block in prompt:
         if isinstance(block, TextContentBlock):
             parts.append(block.text)
         elif hasattr(block, "text"):
             parts.append(str(block.text))
-        # Non-text blocks are ignored for now.
     return "\n".join(parts)
 
 
+def _image_block_to_openai_part(block: ImageContentBlock) -> dict[str, Any] | None:
+    """Convert an ACP image content block to an OpenAI-style ``image_url`` part.
+
+    Inline ``data`` wins over ``uri``; bare base64 is wrapped in a ``data:`` URL
+    using the block's MIME type (``image/png`` if unset). Returns ``None`` when
+    the block carries neither ``data`` nor ``uri``.
+    """
+    data = str(getattr(block, "data", "") or "").strip()
+    uri = str(getattr(block, "uri", "") or "").strip()
+    mime_type = str(getattr(block, "mime_type", "") or "image/png").strip() or "image/png"
+
+    if data:
+        url = data if data.startswith("data:") else f"data:{mime_type};base64,{data}"
+    elif uri:
+        url = uri
+    else:
+        return None
+
+    return {"type": "image_url", "image_url": {"url": url}}
+
+
+def _content_blocks_to_openai_user_content(
+    prompt: list[
+        TextContentBlock
+        | ImageContentBlock
+        | AudioContentBlock
+        | ResourceContentBlock
+        | EmbeddedResourceContentBlock
+    ],
+) -> str | list[dict[str, Any]]:
+    """Convert ACP prompt blocks into a Hermes/OpenAI-compatible user content payload.
+
+    Returns a plain string when the prompt yields no image parts, otherwise a
+    list of ``text`` / ``image_url`` part dicts in prompt order. Blocks that are
+    neither text nor image are skipped in the structured form.
+    """
+    parts: list[dict[str, Any]] = []
+    text_parts: list[str] = []
+
+    for block in prompt:
+        if isinstance(block, TextContentBlock):
+            if block.text:
+                parts.append({"type": "text", "text": block.text})
+                text_parts.append(block.text)
+            continue
+        if isinstance(block, ImageContentBlock):
+            image_part = _image_block_to_openai_part(block)
+            if image_part is not None:
+                parts.append(image_part)
+            continue
+
+    if not parts:
+        return _extract_text(prompt)
+
+    # Keep pure text prompts as strings so slash-command handling and text-only
+    # providers keep the exact legacy path. Switch to structured content only
+    # when an actual non-text block is present.
+    if all(part.get("type") == "text" for part in parts):
+        return "\n".join(text_parts)
+
+    return parts
+
+
 class HermesACPAgent(acp.Agent):
     """ACP Agent implementation wrapping Hermes AIAgent."""
 
@@ -354,6 +417,7 @@ class HermesACPAgent(acp.Agent):
             agent_info=Implementation(name="hermes-agent", version=HERMES_VERSION),
             agent_capabilities=AgentCapabilities(
                 load_session=True,
+                prompt_capabilities=PromptCapabilities(image=True),
                 session_capabilities=SessionCapabilities(
                     fork=SessionForkCapabilities(),
                     list=SessionListCapabilities(),
@@ -593,11 +657,21 @@ class HermesACPAgent(acp.Agent):
             return PromptResponse(stop_reason="refusal")
 
         user_text = _extract_text(prompt).strip()
-        if not user_text:
+        user_content = _content_blocks_to_openai_user_content(prompt)
+        if isinstance(user_content, str):
+            # Text-only: keep the stripped legacy text as the agent's input.
+            user_content = user_text
+        has_content = bool(user_text) or (
+            isinstance(user_content, list) and bool(user_content)
+        )
+        if not has_content:
             return PromptResponse(stop_reason="end_turn")
 
-        # Intercept slash commands — handle locally without calling the LLM
-        if user_text.startswith("/"):
+        # Intercept slash commands — handle locally without calling the LLM.
+        # Slash commands are text-only; if the client included images/resources,
+        # send the whole multimodal prompt to the agent instead of treating it as
+        # an ACP command.
+        if isinstance(user_content, str) and user_text.startswith("/"):
             response_text = self._handle_slash_command(user_text, state)
             if response_text is not None:
                 if self._conn:
@@ -680,9 +754,10 @@ class HermesACPAgent(acp.Agent):
             os.environ["HERMES_INTERACTIVE"] = "1"
             try:
                 result = agent.run_conversation(
-                    user_message=user_text,
+                    user_message=user_content,
                     conversation_history=state.history,
                     task_id=session_id,
+                    persist_user_message=user_text or "[Image attachment]",
                 )
                 return result
             except Exception as e:
diff --git a/tests/acp_adapter/test_acp_images.py b/tests/acp_adapter/test_acp_images.py
new file mode 100644
index 0000000000..03d37840f3
--- /dev/null
+++ b/tests/acp_adapter/test_acp_images.py
@@ -0,0 +1,36 @@
+import pytest
+from acp.schema import ImageContentBlock, TextContentBlock
+
+from acp_adapter.server import HermesACPAgent, _content_blocks_to_openai_user_content
+
+
+def test_acp_image_blocks_convert_to_openai_multimodal_content():
+    content = _content_blocks_to_openai_user_content([
+        TextContentBlock(type="text", text="What is in this image?"),
+        ImageContentBlock(type="image", data="aGVsbG8=", mimeType="image/png"),
+    ])
+
+    assert content == [
+        {"type": "text", "text": "What is in this image?"},
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,aGVsbG8="},
+        },
+    ]
+
+
+def test_text_only_acp_blocks_stay_string_for_legacy_prompt_path():
+    content = _content_blocks_to_openai_user_content([
+        TextContentBlock(type="text", text="/help"),
+    ])
+
+    assert content == "/help"
+
+
+@pytest.mark.asyncio
+async def test_initialize_advertises_image_prompt_capability():
+    response = await HermesACPAgent().initialize()
+
+    assert response.agent_capabilities is not None
+    assert response.agent_capabilities.prompt_capabilities is not None
+    assert response.agent_capabilities.prompt_capabilities.image is True