diff --git a/gateway/run.py b/gateway/run.py
index 1a97457d9eb..26368cd394e 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -688,7 +688,19 @@ def _last_transcript_timestamp(history: Optional[List[Dict[str, Any]]]) -> Any:
 # ordinary outputs. Only tools that intentionally create deliverable media
 # artifacts should be eligible for automatic append when the model omits them
 # from the final gateway reply.
-_AUTO_APPEND_MEDIA_TOOL_NAMES = {"text_to_speech", "text_to_speech_tool"}
+_AUTO_APPEND_MEDIA_TOOL_NAMES = {
+    "text_to_speech",
+    "text_to_speech_tool",
+    "image_generate",
+}
+
+# Some tools return their deliverable artifact as a JSON payload with a
+# local-file path field rather than a literal ``MEDIA:`` tag (e.g. image_generate
+# returns ``{"success": true, "image": "/abs/path.png"}``). These are the payload
+# fields the auto-append path checks, in priority order (host-deliverable path
+# first), so delivery is deterministic and does not depend on the model
+# restating the path in its final reply.
+_JSON_MEDIA_TOOL_PATH_FIELDS = ("host_image", "image", "agent_visible_image")
 
 
 # Extension-anchored MEDIA: matcher for tool results. Mirrors the dispatch-site
@@ -755,10 +767,36 @@ def _collect_auto_append_media_tags(
         if tool_name_by_call_id.get(call_id) not in _AUTO_APPEND_MEDIA_TOOL_NAMES:
             continue
         content = str(msg.get("content") or "")
+        tool_name = tool_name_by_call_id.get(call_id)
+        # JSON-payload tools (image_generate) return a local-file path in a
+        # known field rather than a MEDIA: tag. Extract it so delivery is
+        # deterministic even when the model omits the path from its reply.
+        if tool_name == "image_generate" and "MEDIA:" not in content:
+            try:
+                payload = json.loads(content)
+            except (ValueError, RecursionError):
+                # Not JSON (or pathologically nested): nothing to deliver.
+                payload = None
+            if isinstance(payload, dict) and payload.get("success"):
+                for field in _JSON_MEDIA_TOOL_PATH_FIELDS:
+                    path = payload.get(field)
+                    if not (
+                        isinstance(path, str)
+                        and _TOOL_MEDIA_RE.fullmatch(f"MEDIA:{path}")
+                    ):
+                        continue
+                    # The first local path is the artifact. If it was already
+                    # delivered, stop here rather than falling through to a
+                    # lower-priority alias (e.g. the sandbox path) of the
+                    # same image, which would re-send it.
+                    if path not in history_media_paths:
+                        media_tags.append(f"MEDIA:{path}")
+                    break
+            continue
         if "MEDIA:" not in content:
             continue
         for match in _TOOL_MEDIA_RE.finditer(content):
-            path = match.group(1).strip().rstrip('\",}')
+            path = match.group(1).strip().rstrip('",}')
             if path and path not in history_media_paths:
                 media_tags.append(f"MEDIA:{path}")
         if "[[audio_as_voice]]" in content:
diff --git a/tests/gateway/test_media_extraction.py b/tests/gateway/test_media_extraction.py
index 11a44f629ff..74b4c877f67 100644
--- a/tests/gateway/test_media_extraction.py
+++ b/tests/gateway/test_media_extraction.py
@@ -159,7 +159,129 @@ caption
         tags, voice = _collect_auto_append_media_tags(messages, history_offset=0)
         assert tags == ["MEDIA:/tmp/voice.ogg"]
         assert voice is True
-    
+
+    def test_gateway_auto_append_image_generate_json_path(self):
+        """image_generate returns a local path in JSON (no MEDIA: tag); it is
+        auto-appended so delivery doesn't depend on the model restating it."""
+        from gateway.run import _collect_auto_append_media_tags
+
+        messages = [
+            {"role": "user", "content": "Make me a cat"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {"id": "call_img", "function": {"name": "image_generate"}}
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_img",
+                "content": '{"success": true, "image": "/tmp/gen/cat.png", "agent_visible_image": "/tmp/gen/cat.png"}',
+            },
+            {"role": "assistant", "content": "Here's your cat."},
+        ]
+
+        tags, voice = _collect_auto_append_media_tags(messages, history_offset=0)
+        assert tags == ["MEDIA:/tmp/gen/cat.png"]
+        assert voice is False
+
+    def test_gateway_auto_append_image_generate_prefers_host_path(self):
+        """When host and sandbox paths differ, the host-deliverable path wins."""
+        from gateway.run import _collect_auto_append_media_tags
+
+        messages = [
+            {"role": "user", "content": "Make me a dog"},
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {"id": "call_img", "function": {"name": "image_generate"}}
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "call_img",
+                "content": '{"success": true, "host_image": "/host/dog.jpg", "image": "/host/dog.jpg", "agent_visible_image": "/sandbox/dog.jpg"}',
+            },
+        ]
+
+        tags, _ = _collect_auto_append_media_tags(messages, history_offset=0)
+        assert tags == ["MEDIA:/host/dog.jpg"]
+
+    def test_gateway_auto_append_image_generate_failure_and_url_ignored(self):
+        """Failed generations and remote URLs are not auto-delivered."""
+        from gateway.run import _collect_auto_append_media_tags
+
+        def _img_msgs(content):
+            return [
+                {
+                    "role": "assistant",
+                    "tool_calls": [
+                        {"id": "c", "function": {"name": "image_generate"}}
+                    ],
+                },
+                {"role": "tool", "tool_call_id": "c", "content": content},
+            ]
+
+        # Failed generation
+        tags, _ = _collect_auto_append_media_tags(
+            _img_msgs('{"success": false, "image": null, "error": "boom"}'),
+            history_offset=0,
+        )
+        assert tags == []
+
+        # Remote URL is not a local file path
+        tags, _ = _collect_auto_append_media_tags(
+            _img_msgs('{"success": true, "image": "https://fal.media/x/cat.png"}'),
+            history_offset=0,
+        )
+        assert tags == []
+
+    def test_gateway_auto_append_image_generate_dedupes_history(self):
+        """A generated image path already in history is not re-sent."""
+        from gateway.run import _collect_auto_append_media_tags
+
+        messages = [
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {"id": "c", "function": {"name": "image_generate"}}
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "c",
+                "content": '{"success": true, "image": "/tmp/gen/cat.png"}',
+            },
+        ]
+
+        tags, _ = _collect_auto_append_media_tags(
+            messages, history_offset=0, history_media_paths={"/tmp/gen/cat.png"}
+        )
+        assert tags == []
+
+    def test_gateway_auto_append_image_generate_no_alias_fallback(self):
+        """An already-delivered host path is not re-sent via its sandbox alias."""
+        from gateway.run import _collect_auto_append_media_tags
+
+        messages = [
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {"id": "c", "function": {"name": "image_generate"}}
+                ],
+            },
+            {
+                "role": "tool",
+                "tool_call_id": "c",
+                "content": '{"success": true, "host_image": "/host/dog.jpg", "image": "/host/dog.jpg", "agent_visible_image": "/sandbox/dog.jpg"}',
+            },
+        ]
+
+        tags, _ = _collect_auto_append_media_tags(
+            messages, history_offset=0, history_media_paths={"/host/dog.jpg"}
+        )
+        assert tags == []
+
     def test_media_tags_not_extracted_from_history(self):
         """MEDIA tags from previous turns should NOT be extracted again."""
         # Simulate conversation history with a TTS call from a previous turn