mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(gateway): support Windows absolute paths in MEDIA tag regex and extract_local_files (#34632)
The MEDIA_TAG_CLEANUP_RE and extract_local_files path regex both used (?:~/|/) to anchor paths, which only matches Unix-style absolute and home-relative paths. Two additional _TOOL_MEDIA_RE patterns in run.py had the same limitation. Windows absolute paths (C:\Users\..., D:/...) were silently ignored, causing MEDIA directive delivery to fail. Add [A-Za-z]:[/\\] as a third anchor alternative in all four regex locations (base.py x2, run.py x2). Also update path separators in extract_local_files from / to [/\\] so it can traverse Windows directory trees. Revert accidental + quantifier in MEDIA_TAG_CLEANUP_RE lookahead that changed match-one to match-one-or-more (unrelated to fix). Fixes: #34632
This commit is contained in:
parent
45465b0d5d
commit
51d165a8e7
4 changed files with 194 additions and 5 deletions
|
|
@ -1191,10 +1191,12 @@ _MEDIA_EXT_ALTERNATION = "|".join(
|
|||
# bare-path detector (extract_local_files) downstream rather than silently
|
||||
# deleted. Shared by the non-streaming dispatch path and the streaming
|
||||
# consumer so both behave identically.
|
||||
# Path anchors: ``~/`` (Unix home-relative), ``/`` (Unix absolute),
|
||||
# ``X:\\`` or ``X:/`` (Windows drive-letter absolute — #34632).
|
||||
MEDIA_TAG_CLEANUP_RE = re.compile(
|
||||
r'''[`"']?MEDIA:\s*'''
|
||||
r'''(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|'''
|
||||
r'''(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
|
||||
r'''(?:~/|/|[A-Za-z]:[/\\])\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
|
||||
r'''(?=[\s`"',;:)\]}]|$)[`"']?''',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
|
@ -2665,9 +2667,10 @@ class BasePlatformAdapter(ABC):
|
|||
|
||||
# (?<![/:\w.]) prevents matching inside URLs (e.g. https://…/img.png)
|
||||
# and relative paths (./foo.png)
|
||||
# (?:~/|/) anchors to absolute or home-relative paths
|
||||
# (?:~/|/) anchors to absolute or home-relative Unix paths
|
||||
# (?:[A-Za-z]:[/\\]) anchors to Windows drive-letter paths (#34632)
|
||||
path_re = re.compile(
|
||||
r'(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:' + ext_part + r')\b',
|
||||
r'(?<![/:\w.])(?:~/|/|[A-Za-z]:[/\\])(?:[\w.\-]+[/\\])*[\w.\-]+\.(?:' + ext_part + r')\b',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -17320,7 +17320,7 @@ class GatewayRunner:
|
|||
_hc = _hm.get("content", "")
|
||||
if "MEDIA:" in _hc:
|
||||
_TOOL_MEDIA_RE = re.compile(
|
||||
r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
|
||||
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
|
||||
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
|
||||
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
|
||||
r'txt|csv|apk|ipa))',
|
||||
|
|
@ -17646,7 +17646,7 @@ class GatewayRunner:
|
|||
content = msg.get("content", "")
|
||||
if "MEDIA:" in content:
|
||||
_TOOL_MEDIA_RE = re.compile(
|
||||
r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
|
||||
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
|
||||
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
|
||||
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
|
||||
r'txt|csv|apk|ipa))',
|
||||
|
|
|
|||
|
|
@ -361,6 +361,45 @@ class TestExtractMedia:
|
|||
assert "[[audio_as_voice]]" not in cleaned
|
||||
assert "[[as_document]]" not in cleaned
|
||||
|
||||
# Windows path support — regression coverage for #34632
|
||||
|
||||
def test_media_tag_windows_backslash_path(self):
|
||||
"""extract_media should recognise Windows backslash paths."""
|
||||
media, cleaned = BasePlatformAdapter.extract_media(
|
||||
r"MEDIA:C:\Users\kotsu\file.pdf"
|
||||
)
|
||||
assert len(media) == 1
|
||||
assert media[0][0].endswith("file.pdf")
|
||||
|
||||
def test_media_tag_windows_forward_slash_path(self):
|
||||
"""extract_media should recognise Windows forward-slash paths."""
|
||||
media, cleaned = BasePlatformAdapter.extract_media(
|
||||
"MEDIA:C:/Users/kotsu/file.pdf"
|
||||
)
|
||||
assert len(media) == 1
|
||||
assert media[0][0].endswith("file.pdf")
|
||||
|
||||
def test_media_tag_windows_drive_root(self):
|
||||
"""extract_media should recognise a path at the drive root."""
|
||||
media, cleaned = BasePlatformAdapter.extract_media(
|
||||
r"MEDIA:D:\report.md"
|
||||
)
|
||||
assert len(media) == 1
|
||||
assert media[0][0].endswith("report.md")
|
||||
|
||||
def test_media_tag_unix_paths_still_work(self):
|
||||
"""Unix absolute and tilde paths must still extract after Windows change."""
|
||||
for content in ["MEDIA:/tmp/audio.ogg", r"MEDIA:~/docs/notes.md"]:
|
||||
media, _ = BasePlatformAdapter.extract_media(content)
|
||||
assert len(media) == 1, f"Failed for: {content}"
|
||||
|
||||
def test_relative_path_still_ignored(self):
|
||||
"""Relative Windows-style paths (no drive letter) must not match."""
|
||||
media, _ = BasePlatformAdapter.extract_media(
|
||||
r"MEDIA:Users\kotsu\file.pdf"
|
||||
)
|
||||
assert media == []
|
||||
|
||||
|
||||
class TestMediaExtensionAllowlistParity:
|
||||
"""Regression coverage for issue #34517 — the MEDIA: extension black hole.
|
||||
|
|
|
|||
147
tests/gateway/test_run_tool_media_re.py
Normal file
147
tests/gateway/test_run_tool_media_re.py
Normal file
|
|
@ -0,0 +1,147 @@
|
|||
"""Tests for _TOOL_MEDIA_RE regex patterns in gateway/run.py.
|
||||
|
||||
Issue #34632: The _TOOL_MEDIA_RE patterns in GatewayRunner used (?:/|~\/) to
|
||||
anchor paths, which only matched Unix-style absolute and home-relative paths.
|
||||
Windows absolute paths (C:\\Users\\..., D:/...) were silently ignored, causing
|
||||
MEDIA directive delivery to fail on Windows.
|
||||
|
||||
Fix: Add [A-Za-z]:[/\\\\] as a third anchor alternative in both patterns.
|
||||
|
||||
Two identical _TOOL_MEDIA_RE patterns exist in run.py:
|
||||
1. History scanning (~L17223): collects already-seen media paths
|
||||
2. Result scanning (~L17549): extracts new media tags from agent output
|
||||
|
||||
This test file validates that both equivalent regex patterns correctly match
|
||||
Windows paths while preserving existing Unix path matching behavior.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Reconstruct the exact _TOOL_MEDIA_RE pattern from gateway/run.py
|
||||
# The pattern is built by concatenating raw string parts:
|
||||
# r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|...))'
|
||||
_TOOL_MEDIA_RE = re.compile(
|
||||
r'MEDIA:((?:[A-Za-z]:[/\\]|/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
|
||||
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
|
||||
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
|
||||
r'txt|csv|apk|ipa))',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
# Reconstruct the pre-fix pattern (without Windows anchor) for regression proof
|
||||
_TOOL_MEDIA_RE_PRE_FIX = re.compile(
|
||||
r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
|
||||
r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
|
||||
r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
|
||||
r'txt|csv|apk|ipa))',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
class TestToolMediaReWindowsPaths:
|
||||
"""Issue #34632: _TOOL_MEDIA_RE must match Windows absolute paths."""
|
||||
|
||||
# ── Positive: Windows paths now match ──────────────────────────
|
||||
|
||||
@pytest.mark.parametrize("media_tag, expected_path", [
|
||||
# Windows backslash paths
|
||||
("MEDIA:C:\\Users\\test\\image.png", "C:\\Users\\test\\image.png"),
|
||||
("MEDIA:D:\\data\\report.pdf", "D:\\data\\report.pdf"),
|
||||
("MEDIA:E:\\Photos\\vacation.jpg", "E:\\Photos\\vacation.jpg"),
|
||||
# Windows forward-slash paths
|
||||
("MEDIA:C:/Users/test/image.png", "C:/Users/test/image.png"),
|
||||
("MEDIA:D:/data/report.pdf", "D:/data/report.pdf"),
|
||||
# Mixed separators
|
||||
("MEDIA:C:\\Users/test\\image.webp", "C:\\Users/test\\image.webp"),
|
||||
# Various extensions
|
||||
("MEDIA:F:\\videos\\clip.mp4", "F:\\videos\\clip.mp4"),
|
||||
("MEDIA:G:\\audio\\song.mp3", "G:\\audio\\song.mp3"),
|
||||
("MEDIA:H:\\docs\\sheet.xlsx", "H:\\docs\\sheet.xlsx"),
|
||||
("MEDIA:Z:\\archive\\backup.zip", "Z:\\archive\\backup.zip"),
|
||||
])
|
||||
def test_windows_paths_match(self, media_tag, expected_path):
|
||||
"""Windows absolute paths with drive letters are matched."""
|
||||
match = _TOOL_MEDIA_RE.search(media_tag)
|
||||
assert match is not None, f"Should match: {media_tag}"
|
||||
assert match.group(1) == expected_path
|
||||
|
||||
# ── Positive: Unix paths still match ───────────────────────────
|
||||
|
||||
@pytest.mark.parametrize("media_tag, expected_path", [
|
||||
("MEDIA:/tmp/output.png", "/tmp/output.png"),
|
||||
("MEDIA:/var/log/report.pdf", "/var/log/report.pdf"),
|
||||
("MEDIA:/home/user/docs/file.txt", "/home/user/docs/file.txt"),
|
||||
# Home-relative
|
||||
("MEDIA:~/Downloads/image.jpg", "~/Downloads/image.jpg"),
|
||||
("MEDIA:~/Documents/report.pdf", "~/Documents/report.pdf"),
|
||||
])
|
||||
def test_unix_paths_still_match(self, media_tag, expected_path):
|
||||
"""Unix-style absolute and home-relative paths still match."""
|
||||
match = _TOOL_MEDIA_RE.search(media_tag)
|
||||
assert match is not None, f"Should match: {media_tag}"
|
||||
assert match.group(1) == expected_path
|
||||
|
||||
# ── Negative: invalid paths don't match ────────────────────────
|
||||
|
||||
@pytest.mark.parametrize("text", [
|
||||
"No MEDIA tag here",
|
||||
"MEDIA:relative/path/file.png", # relative path, no anchor
|
||||
"MEDIA:file.png", # no directory
|
||||
"MEDIA:C:file.png", # drive letter but no separator
|
||||
"MEDIA:/path/to/file.unknown", # unsupported extension
|
||||
"MEDIA:/path/to/file", # no extension
|
||||
"MEDIA:", # empty path
|
||||
])
|
||||
def test_invalid_paths_dont_match(self, text):
|
||||
"""Non-MEDIA text, relative paths, and unsupported extensions are ignored."""
|
||||
match = _TOOL_MEDIA_RE.search(text)
|
||||
assert match is None, f"Should NOT match: {text}"
|
||||
|
||||
# ── Negative/preserved: old pattern rejects Windows paths ──────
|
||||
|
||||
@pytest.mark.parametrize("media_tag", [
|
||||
"MEDIA:C:\\Users\\test\\image.png",
|
||||
"MEDIA:D:/data/report.pdf",
|
||||
"MEDIA:C:\\path\\file.jpg",
|
||||
])
|
||||
def test_pre_fix_pattern_rejects_windows(self, media_tag):
|
||||
"""The pre-fix pattern (without Windows anchor) does NOT match Windows paths.
|
||||
This proves the fix is necessary — without it, these paths are silently ignored."""
|
||||
match = _TOOL_MEDIA_RE_PRE_FIX.search(media_tag)
|
||||
assert match is None, f"Pre-fix pattern should NOT match: {media_tag}"
|
||||
|
||||
# ── Edge cases ─────────────────────────────────────────────────
|
||||
|
||||
def test_multiple_media_tags_in_content(self):
|
||||
"""Multiple MEDIA tags in the same content are all found."""
|
||||
content = (
|
||||
"Some text MEDIA:C:\\path\\img.png and more MEDIA:/tmp/out.pdf trailing"
|
||||
)
|
||||
matches = list(_TOOL_MEDIA_RE.finditer(content))
|
||||
assert len(matches) == 2
|
||||
paths = [m.group(1) for m in matches]
|
||||
assert "C:\\path\\img.png" in paths
|
||||
assert "/tmp/out.pdf" in paths
|
||||
|
||||
def test_case_insensitive_drive_letter(self):
|
||||
"""Drive letters are case-insensitive due to re.IGNORECASE."""
|
||||
match_lower = _TOOL_MEDIA_RE.search("MEDIA:c:\\path\\file.png")
|
||||
match_upper = _TOOL_MEDIA_RE.search("MEDIA:C:\\path\\file.png")
|
||||
assert match_lower is not None
|
||||
assert match_upper is not None
|
||||
assert match_lower.group(1).lower() == match_upper.group(1).lower()
|
||||
|
||||
@pytest.mark.parametrize("media_tag", [
|
||||
"MEDIA:C:\\path\\file.jpeg",
|
||||
"MEDIA:C:\\path\\file.JPG",
|
||||
"MEDIA:C:\\path\\file.GIF",
|
||||
"MEDIA:C:\\path\\file.MP4",
|
||||
])
|
||||
def test_case_insensitive_extensions(self, media_tag):
|
||||
"""File extensions are matched case-insensitively."""
|
||||
match = _TOOL_MEDIA_RE.search(media_tag)
|
||||
assert match is not None, f"Should match: {media_tag}"
|
||||
Loading…
Add table
Add a link
Reference in a new issue