From 11b9b146f111e45c9349c622c7a65ea3e7629518 Mon Sep 17 00:00:00 2001 From: briandevans <252620095+briandevans@users.noreply.github.com> Date: Sat, 2 May 2026 17:12:46 -0700 Subject: [PATCH] fix(image-routing): expose attached image paths in native multimodal text part In native image mode (vision-capable models like gpt-4o, claude-sonnet-4), build_native_content_parts() previously emitted only the user's caption plus image_url parts. The local file path of each attached image never appeared in the conversation text, so the model could see the pixels but had no string handle for tools that take image_url: str (custom MCP tools, vision_analyze on a re-look, attach-to-tracker workflows). The text-mode path already injects an equivalent hint via Runner._enrich_message_with_vision ("...vision_analyze using image_url: ..."). This brings native mode to parity by appending one "[Image attached at: ]" line per successfully attached image to the user-text part of the multimodal turn. Skipped (unreadable) paths are NOT advertised, so the model is never told a non-existent file is attached. Co-Authored-By: Claude Opus 4.7 (1M context) --- agent/image_routing.py | 43 +++++++++++++++++++------- tests/agent/test_image_routing.py | 51 ++++++++++++++++++++++++++++--- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/agent/image_routing.py b/agent/image_routing.py index bd2ba83c87..49eaed2f9d 100644 --- a/agent/image_routing.py +++ b/agent/image_routing.py @@ -190,24 +190,30 @@ def build_native_content_parts( """Build an OpenAI-style ``content`` list for a user turn. Shape: - [{"type": "text", "text": "..."}, + [{"type": "text", "text": "...\\n\\n[Image attached at: /local/path]"}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}, ...] + The local path of each successfully attached image is appended to the + text part as ``[Image attached at: ]``. The model still sees the + pixels via the ``image_url`` part (full native vision); the path note + just gives it a string handle so MCP/skill tools that take an image + path or URL argument can be invoked on the same image without an + extra round-trip. This parallels the text-mode hint produced by + ``Runner._enrich_message_with_vision`` (``vision_analyze using image_url: + ``) so behaviour is consistent across both image input modes. + Images are attached at their native size. If a provider rejects the request because an image is too large (e.g. Anthropic's 5 MB per-image ceiling), the agent's retry loop transparently shrinks and retries once — see ``run_agent._try_shrink_image_parts_in_messages``. Returns (content_parts, skipped_paths). Skipped paths are files that - couldn't be read from disk. + couldn't be read from disk and are NOT advertised in the path hints. """ - parts: List[Dict[str, Any]] = [] skipped: List[str] = [] - - text = (user_text or "").strip() - if text: - parts.append({"type": "text", "text": text}) + image_parts: List[Dict[str, Any]] = [] + attached_paths: List[str] = [] for raw_path in image_paths: p = Path(raw_path) @@ -218,15 +224,30 @@ def build_native_content_parts( if not data_url: skipped.append(str(raw_path)) continue - parts.append({ + image_parts.append({ "type": "image_url", "image_url": {"url": data_url}, }) + attached_paths.append(str(raw_path)) - # If the text was empty, add a neutral prompt so the turn isn't just images. - if not text and any(p.get("type") == "image_url" for p in parts): - parts.insert(0, {"type": "text", "text": "What do you see in this image?"}) + text = (user_text or "").strip() + # If at least one image attached, build a single text part that combines + # the user's caption (or a neutral default) with one path hint per image. + if attached_paths: + base_text = text or "What do you see in this image?" + path_hints = "\n".join( + f"[Image attached at: {p}]" for p in attached_paths + ) + combined_text = f"{base_text}\n\n{path_hints}" + parts: List[Dict[str, Any]] = [{"type": "text", "text": combined_text}] + parts.extend(image_parts) + return parts, skipped + + # No images successfully attached — fall back to plain text-only behaviour. + parts = [] + if text: + parts.append({"type": "text", "text": text}) return parts, skipped diff --git a/tests/agent/test_image_routing.py b/tests/agent/test_image_routing.py index 9fd02eeecc..aef7bbda65 100644 --- a/tests/agent/test_image_routing.py +++ b/tests/agent/test_image_routing.py @@ -127,7 +127,11 @@ class TestBuildNativeContentParts: parts, skipped = build_native_content_parts("hello", [str(img)]) assert skipped == [] assert len(parts) == 2 - assert parts[0] == {"type": "text", "text": "hello"} + assert parts[0]["type"] == "text" + # User caption is preserved and a per-image path hint is appended so + # the model can use the local path as a string argument for tools + # that take ``image_url: str`` (issue #18960). + assert parts[0]["text"] == f"hello\n\n[Image attached at: {img}]" assert parts[1]["type"] == "image_url" assert parts[1]["image_url"]["url"].startswith("data:image/png;base64,") @@ -137,17 +141,51 @@ class TestBuildNativeContentParts: parts, skipped = build_native_content_parts("", [str(img)]) assert skipped == [] # Even with empty user text, we insert a neutral prompt so the turn - # isn't just pixels. + # isn't just pixels, and the path hint is appended after. assert parts[0]["type"] == "text" - assert parts[0]["text"] == "What do you see in this image?" + assert parts[0]["text"] == ( + f"What do you see in this image?\n\n[Image attached at: {img}]" + ) assert parts[1]["type"] == "image_url" def test_missing_file_is_skipped(self, tmp_path: Path): parts, skipped = build_native_content_parts("hi", [str(tmp_path / "missing.png")]) assert skipped == [str(tmp_path / "missing.png")] - # Only text remains. + # Skipped paths are NOT advertised in the path hints — the model + # would otherwise be told a non-existent file is attached. assert parts == [{"type": "text", "text": "hi"}] + def test_path_hint_appended(self, tmp_path: Path): + """The local path of each attached image is appended to the user + text part so MCP/skill tools that take ``image_url: str`` can be + invoked on the same image (issue #18960). Mirrors text-mode + behaviour (`Runner._enrich_message_with_vision`). + """ + img = tmp_path / "scan.png" + img.write_bytes(_png_bytes()) + parts, _ = build_native_content_parts("attach this", [str(img)]) + text_part = next(p for p in parts if p.get("type") == "text") + assert "[Image attached at:" in text_part["text"] + assert str(img) in text_part["text"] + # User caption is preserved verbatim ahead of the hint. + assert text_part["text"].startswith("attach this") + + def test_path_hint_one_per_attached_image(self, tmp_path: Path): + """Each successfully attached image gets its own path hint line; + skipped images do NOT appear in the hints. + """ + good = tmp_path / "good.png" + good.write_bytes(_png_bytes()) + missing = tmp_path / "missing.png" # never created + parts, skipped = build_native_content_parts( + "see attached", [str(good), str(missing)] + ) + assert skipped == [str(missing)] + text_part = next(p for p in parts if p.get("type") == "text") + assert text_part["text"].count("[Image attached at:") == 1 + assert str(good) in text_part["text"] + assert str(missing) not in text_part["text"] + def test_multiple_images(self, tmp_path: Path): img1 = tmp_path / "a.png" img2 = tmp_path / "b.png" @@ -157,6 +195,11 @@ class TestBuildNativeContentParts: assert skipped == [] image_parts = [p for p in parts if p.get("type") == "image_url"] assert len(image_parts) == 2 + # Both paths surface in the text part, one per line. + text_part = next(p for p in parts if p.get("type") == "text") + assert text_part["text"].count("[Image attached at:") == 2 + assert str(img1) in text_part["text"] + assert str(img2) in text_part["text"] def test_mime_inference_jpg(self, tmp_path: Path): img = tmp_path / "photo.jpg"