diff --git a/agent/image_routing.py b/agent/image_routing.py index bd2ba83c87..49eaed2f9d 100644 --- a/agent/image_routing.py +++ b/agent/image_routing.py @@ -190,24 +190,30 @@ def build_native_content_parts( """Build an OpenAI-style ``content`` list for a user turn. Shape: - [{"type": "text", "text": "..."}, + [{"type": "text", "text": "...\\n\\n[Image attached at: /local/path]"}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, ...] + The local path of each successfully attached image is appended to the + text part as ``[Image attached at: <path>]``. The model still sees the + pixels via the ``image_url`` part (full native vision); the path note + just gives it a string handle so MCP/skill tools that take an image + path or URL argument can be invoked on the same image without an + extra round-trip. This parallels the text-mode hint produced by + ``Runner._enrich_message_with_vision`` (``vision_analyze using image_url: + <path>``) so behaviour is consistent across both image input modes. + Images are attached at their native size. If a provider rejects the request because an image is too large (e.g. Anthropic's 5 MB per-image ceiling), the agent's retry loop transparently shrinks and retries once — see ``run_agent._try_shrink_image_parts_in_messages``. Returns (content_parts, skipped_paths). Skipped paths are files that - couldn't be read from disk. + couldn't be read from disk and are NOT advertised in the path hints. 
""" - parts: List[Dict[str, Any]] = [] skipped: List[str] = [] - - text = (user_text or "").strip() - if text: - parts.append({"type": "text", "text": text}) + image_parts: List[Dict[str, Any]] = [] + attached_paths: List[str] = [] for raw_path in image_paths: p = Path(raw_path) @@ -218,15 +224,30 @@ def build_native_content_parts( if not data_url: skipped.append(str(raw_path)) continue - parts.append({ + image_parts.append({ "type": "image_url", "image_url": {"url": data_url}, }) + attached_paths.append(str(raw_path)) - # If the text was empty, add a neutral prompt so the turn isn't just images. - if not text and any(p.get("type") == "image_url" for p in parts): - parts.insert(0, {"type": "text", "text": "What do you see in this image?"}) + text = (user_text or "").strip() + # If at least one image attached, build a single text part that combines + # the user's caption (or a neutral default) with one path hint per image. + if attached_paths: + base_text = text or "What do you see in this image?" + path_hints = "\n".join( + f"[Image attached at: {p}]" for p in attached_paths + ) + combined_text = f"{base_text}\n\n{path_hints}" + parts: List[Dict[str, Any]] = [{"type": "text", "text": combined_text}] + parts.extend(image_parts) + return parts, skipped + + # No images successfully attached — fall back to plain text-only behaviour. 
+ parts = [] + if text: + parts.append({"type": "text", "text": text}) return parts, skipped diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py index 9fd02eeecc..aef7bbda65 100644 --- a/tests/agent/test_image_routing.py +++ b/tests/agent/test_image_routing.py @@ -127,7 +127,11 @@ class TestBuildNativeContentParts: parts, skipped = build_native_content_parts("hello", [str(img)]) assert skipped == [] assert len(parts) == 2 - assert parts[0] == {"type": "text", "text": "hello"} + assert parts[0]["type"] == "text" + # User caption is preserved and a per-image path hint is appended so + # the model can use the local path as a string argument for tools + # that take ``image_url: str`` (issue #18960). + assert parts[0]["text"] == f"hello\n\n[Image attached at: {img}]" assert parts[1]["type"] == "image_url" assert parts[1]["image_url"]["url"].startswith("data:image/png;base64,") @@ -137,17 +141,51 @@ class TestBuildNativeContentParts: parts, skipped = build_native_content_parts("", [str(img)]) assert skipped == [] # Even with empty user text, we insert a neutral prompt so the turn - # isn't just pixels. + # isn't just pixels, and the path hint is appended after. assert parts[0]["type"] == "text" - assert parts[0]["text"] == "What do you see in this image?" + assert parts[0]["text"] == ( + f"What do you see in this image?\n\n[Image attached at: {img}]" + ) assert parts[1]["type"] == "image_url" def test_missing_file_is_skipped(self, tmp_path: Path): parts, skipped = build_native_content_parts("hi", [str(tmp_path / "missing.png")]) assert skipped == [str(tmp_path / "missing.png")] - # Only text remains. + # Skipped paths are NOT advertised in the path hints — the model + # would otherwise be told a non-existent file is attached. 
assert parts == [{"type": "text", "text": "hi"}] + def test_path_hint_appended(self, tmp_path: Path): + """The local path of each attached image is appended to the user + text part so MCP/skill tools that take ``image_url: str`` can be + invoked on the same image (issue #18960). Mirrors text-mode + behaviour (`Runner._enrich_message_with_vision`). + """ + img = tmp_path / "scan.png" + img.write_bytes(_png_bytes()) + parts, _ = build_native_content_parts("attach this", [str(img)]) + text_part = next(p for p in parts if p.get("type") == "text") + assert "[Image attached at:" in text_part["text"] + assert str(img) in text_part["text"] + # User caption is preserved verbatim ahead of the hint. + assert text_part["text"].startswith("attach this") + + def test_path_hint_one_per_attached_image(self, tmp_path: Path): + """Each successfully attached image gets its own path hint line; + skipped images do NOT appear in the hints. + """ + good = tmp_path / "good.png" + good.write_bytes(_png_bytes()) + missing = tmp_path / "missing.png" # never created + parts, skipped = build_native_content_parts( + "see attached", [str(good), str(missing)] + ) + assert skipped == [str(missing)] + text_part = next(p for p in parts if p.get("type") == "text") + assert text_part["text"].count("[Image attached at:") == 1 + assert str(good) in text_part["text"] + assert str(missing) not in text_part["text"] + def test_multiple_images(self, tmp_path: Path): img1 = tmp_path / "a.png" img2 = tmp_path / "b.png" @@ -157,6 +195,11 @@ class TestBuildNativeContentParts: assert skipped == [] image_parts = [p for p in parts if p.get("type") == "image_url"] assert len(image_parts) == 2 + # Both paths surface in the text part, one per line. 
+ text_part = next(p for p in parts if p.get("type") == "text") + assert text_part["text"].count("[Image attached at:") == 2 + assert str(img1) in text_part["text"] + assert str(img2) in text_part["text"] def test_mime_inference_jpg(self, tmp_path: Path): img = tmp_path / "photo.jpg"