fix(gateway): scope tool-result MEDIA scan to current turn

The post-run scan that appends tool-emitted MEDIA: tags to the final
response iterated every tool/function message in the full conversation
and relied solely on path-based dedup against paths reconstructed from
the replayable transcript. When that reconstruction does not byte-match
the in-memory tool content (timestamp stripping, observed-context
withholding, compression rewrites), a stale path emitted several turns
earlier is absent from the dedup set and leaks onto a later text-only
reply (Telegram 'Sending media group of 1 photo(s)' with no MEDIA
directive present).

Scope the scan to this turn's new messages by slicing result['messages']
at len(agent_history) (agent_history is passed as conversation_history
into run_conversation, so the returned list is history + this turn).
Retain path-based dedup as a secondary guard and as the sole guard on
the compression-shrink fallback, preserving the #160 behaviour.

Closes #34608
This commit is contained in:
Bartok9 2026-05-29 08:39:04 -04:00 committed by Teknium
parent 38c4f8c371
commit 08c0b22417
2 changed files with 158 additions and 4 deletions

View file

@ -17478,13 +17478,33 @@ class GatewayRunner:
# append any that aren't already present in the final response, so the
# adapter's extract_media() can find and deliver the files exactly once.
#
# Uses path-based deduplication against _history_media_paths (collected
# before run_conversation) instead of index slicing. This is safe even
# when context compression shrinks the message list. (Fixes #160)
# Scope the scan to THIS turn's tool results only. ``agent_history``
# was passed into run_conversation as ``conversation_history``, so the
# agent's returned ``messages`` list is ``agent_history`` followed by
# the messages produced this turn. Slicing at ``len(agent_history)``
# isolates the current turn precisely, so a stale MEDIA: path emitted
# by a tool several turns earlier (still present in the full message
# list) can never leak onto a later text-only reply. (Fixes #34608)
#
# Path-based deduplication against _history_media_paths (collected
# before run_conversation) is retained as a secondary guard. It is
# also the sole guard on the fallback branch taken when mid-run
# context compression shrinks the message list below the original
# history length, preserving the compression-safe behaviour of #160.
if "MEDIA:" not in final_response:
media_tags = []
has_voice_directive = False
for msg in result.get("messages", []):
_all_msgs = result.get("messages", [])
_history_len = len(agent_history)
# Only trust the slice boundary when the message list still
# contains the full history prefix. Mid-run compression can
# rewrite/shrink the list; in that case fall back to scanning
# everything and rely on _history_media_paths for dedup.
if _history_len and len(_all_msgs) >= _history_len:
_scan_msgs = _all_msgs[_history_len:]
else:
_scan_msgs = _all_msgs
for msg in _scan_msgs:
if msg.get("role") in {"tool", "function"}:
content = msg.get("content", "")
if "MEDIA:" in content:

View file

@ -5,6 +5,10 @@ Verifies that MEDIA tags (e.g., from TTS tool) are only extracted from
messages in the CURRENT turn, not from the full conversation history.
This prevents voice messages from accumulating and being sent multiple
times per reply. (Regression test for #160)
Also covers #34608: a stale MEDIA: path emitted by an execute_code /
make_image tool several turns earlier must not leak onto a later
text-only reply, even when the path-based dedup set fails to capture it.
"""
import pytest
@ -43,6 +47,37 @@ def extract_media_tags_fixed(result_messages, history_len):
return media_tags, has_voice_directive
def extract_media_tags_production(result_messages, history_len, history_media_paths):
"""Mirror of the production scan in gateway/run.py after the #34608 fix.
Primary guard: scope the scan to the current turn via ``history_len``
slicing (matching how ``agent_history`` is passed as
``conversation_history`` into ``run_conversation``). Secondary guard:
path-based dedup against ``history_media_paths`` (the #160 compression-safe
fallback, also used when compression shrinks the list below history_len).
"""
media_tags = []
has_voice_directive = False
if len(result_messages) >= history_len and history_len:
scan_msgs = result_messages[history_len:]
else:
scan_msgs = result_messages
for msg in scan_msgs:
if msg.get("role") == "tool" or msg.get("role") == "function":
content = msg.get("content", "")
if "MEDIA:" in content:
for match in re.finditer(r'MEDIA:(\S+)', content):
path = match.group(1).strip().rstrip('",}')
if path and path not in history_media_paths:
media_tags.append(f"MEDIA:{path}")
if "[[audio_as_voice]]" in content:
has_voice_directive = True
return media_tags, has_voice_directive
def extract_media_tags_broken(result_messages):
"""
The BROKEN behavior: extract MEDIA tags from ALL messages including history.
@ -180,5 +215,104 @@ class TestMediaExtraction:
assert len(unique) == 2 # After dedup: same.ogg and different.ogg
class TestStaleToolMediaLeak:
"""Regression tests for #34608.
A MEDIA: path emitted by an execute_code / make_image tool several turns
earlier remains in the full conversation message list. A later text-only
reply (zero MEDIA directives) must NOT attach that stale image.
The production code previously relied solely on path-based dedup against
paths reconstructed from the replayable transcript. When that
reconstruction does not byte-match the in-memory tool content (timestamp
stripping, observed-context withholding, compression rewrites), the stale
path is absent from the dedup set and leaks. Turn-scoped slicing closes
this class of bug deterministically.
"""
def test_stale_execute_code_media_not_attached_to_text_only_reply(self):
"""The exact #34608 scenario: make_image cover from an earlier turn."""
# Prior turn generated an image via execute_code stdout.
history = [
{"role": "user", "content": "Make a cover image"},
{"role": "assistant", "content": None,
"tool_calls": [{"id": "1", "function": {"name": "execute_code"}}]},
{"role": "tool", "tool_call_id": "1",
"content": "Generating cover...\nMEDIA:/tmp/seosmi_cover.png\nDone."},
{"role": "assistant", "content": "Here is your cover."},
]
# Current turn: plain text status update, zero MEDIA directives.
new_messages = [
{"role": "user", "content": "What skill version am I on?"},
{"role": "assistant", "content": "You're on v0.15.1."},
]
all_messages = history + new_messages
history_len = len(history)
# Simulate the dedup set FAILING to capture the stale path (the real
# #34608 condition: replayable-history reconstruction diverged from
# the in-memory tool content, so the path is not in the set).
history_media_paths = set()
tags, voice = extract_media_tags_production(
all_messages, history_len, history_media_paths
)
assert tags == [], (
"Stale tool MEDIA from a prior turn must not leak onto a "
f"later text-only reply, got {tags}"
)
assert voice is False
# The pre-fix production behaviour (scan everything, dedup only) would
# have leaked the stale path when the dedup set missed it.
broken_tags, _ = extract_media_tags_broken(all_messages)
assert any("seosmi_cover.png" in t for t in broken_tags), (
"Sanity: the unscoped scan does surface the stale path"
)
def test_current_turn_media_still_attached_when_dedup_set_empty(self):
"""Turn-scoping must not suppress genuinely new media."""
history = [
{"role": "user", "content": "hi"},
{"role": "assistant", "content": "hello"},
]
new_messages = [
{"role": "user", "content": "Make me a cover image"},
{"role": "assistant", "content": None,
"tool_calls": [{"id": "9", "function": {"name": "execute_code"}}]},
{"role": "tool", "tool_call_id": "9",
"content": "MEDIA:/tmp/fresh_cover.png"},
{"role": "assistant", "content": "Here it is."},
]
all_messages = history + new_messages
tags, _ = extract_media_tags_production(
all_messages, len(history), set()
)
assert len(tags) == 1 and "fresh_cover.png" in tags[0]
def test_compression_shrink_falls_back_to_path_dedup(self):
"""When the list is shorter than history_len (mid-run compression),
fall back to scanning everything with path-based dedup so the #160
compression-safe guarantee is preserved."""
# Post-compression list is shorter than the original history length.
compressed_messages = [
{"role": "user", "content": "summary so far..."},
{"role": "tool", "tool_call_id": "7",
"content": "MEDIA:/tmp/old_from_history.png"},
{"role": "assistant", "content": "ok"},
]
original_history_len = 12 # larger than the compressed list
# The old path IS captured in the dedup set here (history scan ran
# before compression), so it must still be excluded.
history_media_paths = {"/tmp/old_from_history.png"}
tags, _ = extract_media_tags_production(
compressed_messages, original_history_len, history_media_paths
)
assert tags == [], (
"On the compression fallback path, path-dedup must still exclude "
f"known-old media, got {tags}"
)
if __name__ == "__main__":
pytest.main([__file__, "-v"])