fix(gateway): support Windows absolute paths in MEDIA tag regex and extract_local_files (#34632)

The MEDIA_TAG_CLEANUP_RE and extract_local_files path regex both used
(?:~/|/) to anchor paths, which only matches Unix-style absolute and
home-relative paths. Two additional _TOOL_MEDIA_RE patterns in run.py
had the same limitation. Windows absolute paths (C:\Users\..., D:/...)
were silently ignored, causing MEDIA directive delivery to fail.

Add [A-Za-z]:[/\\] as a third anchor alternative in all four regex
locations (base.py x2, run.py x2). Also update path separators in
extract_local_files from / to [/\\] so it can traverse Windows
directory trees.

Revert accidental + quantifier in MEDIA_TAG_CLEANUP_RE lookahead
that changed match-one to match-one-or-more (unrelated to fix).

Fixes: #34632
This commit is contained in:
Tranquil-Flow 2026-05-30 03:15:30 +02:00 committed by Teknium
parent 45465b0d5d
commit 51d165a8e7
4 changed files with 194 additions and 5 deletions

View file

@ -1191,10 +1191,12 @@ _MEDIA_EXT_ALTERNATION = "|".join(
# bare-path detector (extract_local_files) downstream rather than silently
# deleted. Shared by the non-streaming dispatch path and the streaming
# consumer so both behave identically.
# Path anchors: ``~/`` (Unix home-relative), ``/`` (Unix absolute),
# ``X:\\`` or ``X:/`` (Windows drive-letter absolute — #34632).
MEDIA_TAG_CLEANUP_RE = re.compile(
r'''[`"']?MEDIA:\s*'''
r'''(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|'''
r'''(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
r'''(?:~/|/|[A-Za-z]:[/\\])\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
r'''(?=[\s`"',;:)\]}]|$)[`"']?''',
re.IGNORECASE,
)
@ -2665,9 +2667,10 @@ class BasePlatformAdapter(ABC):
# (?<![/:\w.]) prevents matching inside URLs (e.g. https://…/img.png)
# and relative paths (./foo.png)
# (?:~/|/) anchors to absolute or home-relative paths
# (?:~/|/) anchors to absolute or home-relative Unix paths
# (?:[A-Za-z]:[/\\]) anchors to Windows drive-letter paths (#34632)
path_re = re.compile(
r'(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:' + ext_part + r')\b',
r'(?<![/:\w.])(?:~/|/|[A-Za-z]:[/\\])(?:[\w.\-]+[/\\])*[\w.\-]+\.(?:' + ext_part + r')\b',
re.IGNORECASE,
)

View file

@ -17320,7 +17320,7 @@ class GatewayRunner:
_hc = _hm.get("content", "")
if "MEDIA:" in _hc:
_TOOL_MEDIA_RE = re.compile(
r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
r'txt|csv|apk|ipa))',
@ -17646,7 +17646,7 @@ class GatewayRunner:
content = msg.get("content", "")
if "MEDIA:" in content:
_TOOL_MEDIA_RE = re.compile(
r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
r'txt|csv|apk|ipa))',

View file

@ -361,6 +361,45 @@ class TestExtractMedia:
assert "[[audio_as_voice]]" not in cleaned
assert "[[as_document]]" not in cleaned
# Windows path support — regression coverage for #34632
def test_media_tag_windows_backslash_path(self):
"""extract_media should recognise Windows backslash paths."""
media, cleaned = BasePlatformAdapter.extract_media(
r"MEDIA:C:\Users\kotsu\file.pdf"
)
assert len(media) == 1
assert media[0][0].endswith("file.pdf")
def test_media_tag_windows_forward_slash_path(self):
"""extract_media should recognise Windows forward-slash paths."""
media, cleaned = BasePlatformAdapter.extract_media(
"MEDIA:C:/Users/kotsu/file.pdf"
)
assert len(media) == 1
assert media[0][0].endswith("file.pdf")
def test_media_tag_windows_drive_root(self):
"""extract_media should recognise a path at the drive root."""
media, cleaned = BasePlatformAdapter.extract_media(
r"MEDIA:D:\report.md"
)
assert len(media) == 1
assert media[0][0].endswith("report.md")
def test_media_tag_unix_paths_still_work(self):
"""Unix absolute and tilde paths must still extract after Windows change."""
for content in ["MEDIA:/tmp/audio.ogg", r"MEDIA:~/docs/notes.md"]:
media, _ = BasePlatformAdapter.extract_media(content)
assert len(media) == 1, f"Failed for: {content}"
def test_relative_path_still_ignored(self):
"""Relative Windows-style paths (no drive letter) must not match."""
media, _ = BasePlatformAdapter.extract_media(
r"MEDIA:Users\kotsu\file.pdf"
)
assert media == []
class TestMediaExtensionAllowlistParity:
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.

View file

@ -0,0 +1,147 @@
"""Tests for _TOOL_MEDIA_RE regex patterns in gateway/run.py.
Issue #34632: The _TOOL_MEDIA_RE patterns in GatewayRunner used (?:/|~\/) to
anchor paths, which only matched Unix-style absolute and home-relative paths.
Windows absolute paths (C:\\Users\\..., D:/...) were silently ignored, causing
MEDIA directive delivery to fail on Windows.
Fix: Add [A-Za-z]:[/\\\\] as a third anchor alternative in both patterns.
Two identical _TOOL_MEDIA_RE patterns exist in run.py:
1. History scanning (~L17223): collects already-seen media paths
2. Result scanning (~L17549): extracts new media tags from agent output
This test file validates that both equivalent regex patterns correctly match
Windows paths while preserving existing Unix path matching behavior.
"""
import re
import pytest
# Reconstruct the exact _TOOL_MEDIA_RE pattern from gateway/run.py
# The pattern is built by concatenating raw string parts:
# r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|...))'
_TOOL_MEDIA_RE = re.compile(
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
r'txt|csv|apk|ipa))',
re.IGNORECASE,
)
# Reconstruct the pre-fix pattern (without Windows anchor) for regression proof
_TOOL_MEDIA_RE_PRE_FIX = re.compile(
r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
r'txt|csv|apk|ipa))',
re.IGNORECASE,
)
class TestToolMediaReWindowsPaths:
"""Issue #34632: _TOOL_MEDIA_RE must match Windows absolute paths."""
# ── Positive: Windows paths now match ──────────────────────────
@pytest.mark.parametrize("media_tag, expected_path", [
# Windows backslash paths
("MEDIA:C:\\Users\\test\\image.png", "C:\\Users\\test\\image.png"),
("MEDIA:D:\\data\\report.pdf", "D:\\data\\report.pdf"),
("MEDIA:E:\\Photos\\vacation.jpg", "E:\\Photos\\vacation.jpg"),
# Windows forward-slash paths
("MEDIA:C:/Users/test/image.png", "C:/Users/test/image.png"),
("MEDIA:D:/data/report.pdf", "D:/data/report.pdf"),
# Mixed separators
("MEDIA:C:\\Users/test\\image.webp", "C:\\Users/test\\image.webp"),
# Various extensions
("MEDIA:F:\\videos\\clip.mp4", "F:\\videos\\clip.mp4"),
("MEDIA:G:\\audio\\song.mp3", "G:\\audio\\song.mp3"),
("MEDIA:H:\\docs\\sheet.xlsx", "H:\\docs\\sheet.xlsx"),
("MEDIA:Z:\\archive\\backup.zip", "Z:\\archive\\backup.zip"),
])
def test_windows_paths_match(self, media_tag, expected_path):
"""Windows absolute paths with drive letters are matched."""
match = _TOOL_MEDIA_RE.search(media_tag)
assert match is not None, f"Should match: {media_tag}"
assert match.group(1) == expected_path
# ── Positive: Unix paths still match ───────────────────────────
@pytest.mark.parametrize("media_tag, expected_path", [
("MEDIA:/tmp/output.png", "/tmp/output.png"),
("MEDIA:/var/log/report.pdf", "/var/log/report.pdf"),
("MEDIA:/home/user/docs/file.txt", "/home/user/docs/file.txt"),
# Home-relative
("MEDIA:~/Downloads/image.jpg", "~/Downloads/image.jpg"),
("MEDIA:~/Documents/report.pdf", "~/Documents/report.pdf"),
])
def test_unix_paths_still_match(self, media_tag, expected_path):
"""Unix-style absolute and home-relative paths still match."""
match = _TOOL_MEDIA_RE.search(media_tag)
assert match is not None, f"Should match: {media_tag}"
assert match.group(1) == expected_path
# ── Negative: invalid paths don't match ────────────────────────
@pytest.mark.parametrize("text", [
"No MEDIA tag here",
"MEDIA:relative/path/file.png", # relative path, no anchor
"MEDIA:file.png", # no directory
"MEDIA:C:file.png", # drive letter but no separator
"MEDIA:/path/to/file.unknown", # unsupported extension
"MEDIA:/path/to/file", # no extension
"MEDIA:", # empty path
])
def test_invalid_paths_dont_match(self, text):
"""Non-MEDIA text, relative paths, and unsupported extensions are ignored."""
match = _TOOL_MEDIA_RE.search(text)
assert match is None, f"Should NOT match: {text}"
# ── Negative/preserved: old pattern rejects Windows paths ──────
@pytest.mark.parametrize("media_tag", [
"MEDIA:C:\\Users\\test\\image.png",
"MEDIA:D:/data/report.pdf",
"MEDIA:C:\\path\\file.jpg",
])
def test_pre_fix_pattern_rejects_windows(self, media_tag):
"""The pre-fix pattern (without Windows anchor) does NOT match Windows paths.
This proves the fix is necessary without it, these paths are silently ignored."""
match = _TOOL_MEDIA_RE_PRE_FIX.search(media_tag)
assert match is None, f"Pre-fix pattern should NOT match: {media_tag}"
# ── Edge cases ─────────────────────────────────────────────────
def test_multiple_media_tags_in_content(self):
"""Multiple MEDIA tags in the same content are all found."""
content = (
"Some text MEDIA:C:\\path\\img.png and more MEDIA:/tmp/out.pdf trailing"
)
matches = list(_TOOL_MEDIA_RE.finditer(content))
assert len(matches) == 2
paths = [m.group(1) for m in matches]
assert "C:\\path\\img.png" in paths
assert "/tmp/out.pdf" in paths
def test_case_insensitive_drive_letter(self):
"""Drive letters are case-insensitive due to re.IGNORECASE."""
match_lower = _TOOL_MEDIA_RE.search("MEDIA:c:\\path\\file.png")
match_upper = _TOOL_MEDIA_RE.search("MEDIA:C:\\path\\file.png")
assert match_lower is not None
assert match_upper is not None
assert match_lower.group(1).lower() == match_upper.group(1).lower()
@pytest.mark.parametrize("media_tag", [
"MEDIA:C:\\path\\file.jpeg",
"MEDIA:C:\\path\\file.JPG",
"MEDIA:C:\\path\\file.GIF",
"MEDIA:C:\\path\\file.MP4",
])
def test_case_insensitive_extensions(self, media_tag):
"""File extensions are matched case-insensitively."""
match = _TOOL_MEDIA_RE.search(media_tag)
assert match is not None, f"Should match: {media_tag}"