From ea49b38625b4c8ee9f8aedf6c7c3e9ec36acfe69 Mon Sep 17 00:00:00 2001 From: colin-chang <24368158+colin-chang@users.noreply.github.com> Date: Mon, 18 May 2026 20:07:36 -0700 Subject: [PATCH] fix(gateway): tighten MEDIA extraction regex + silent skip on file-not-found MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related fixes for the MEDIA: extraction pipeline that caused 'file not found' noise in platform channels: 1. run.py — tighten tool-result MEDIA regex from \S+ (any non-whitespace) to require a path pattern with known extensions. Prevents LLM-generated placeholder paths like 'MEDIA:/path/to/example.mp4' from being captured as real media. 2. base.py — remove the |\S+ fallback in extract_media() that catches anything non-whitespace as a potential MEDIA path. This was the primary cause of false positives — strings like '<path>' in tool output were captured as MEDIA: paths. 3. mattermost.py — replace the file-not-found error message sent to the channel with a silent logger.warning() skip. When a path extracted by MEDIA doesn't exist on disk, the channel no longer gets a noisy '(file not found: ...)' message. Impact: eliminates the persistent 'file not found' spam in Mattermost channels caused by over-broad MEDIA regex patterns matching non-path text in tool output. --- gateway/platforms/base.py | 2 +- gateway/platforms/mattermost.py | 5 +++-- gateway/run.py | 18 ++++++++++++++++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 34ebc385fa9..5899843aa1e 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -2137,7 +2137,7 @@ class BasePlatformAdapter(ABC): # Extract MEDIA: tags, allowing optional whitespace after the colon # and quoted/backticked paths for LLM-formatted outputs. 
media_pattern = re.compile( - r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$)|\S+)[`"']?''' + r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?''' ) for match in media_pattern.finditer(content): path = match.group("path").strip() diff --git a/gateway/platforms/mattermost.py b/gateway/platforms/mattermost.py index 9487f8a1edf..574f465b1dd 100644 --- a/gateway/platforms/mattermost.py +++ b/gateway/platforms/mattermost.py @@ -471,9 +471,10 @@ class MattermostAdapter(BasePlatformAdapter): p = Path(file_path) if not p.exists(): - return await self.send( - chat_id, f"{caption or ''}\n(file not found: {file_path})", reply_to + logger.warning( + "Mattermost: local file not found, skipping: %s", file_path ) + return SendResult(success=True, message_id=None) fname = file_name or p.name ct = mimetypes.guess_type(fname)[0] or "application/octet-stream" diff --git a/gateway/run.py b/gateway/run.py index 08805076714..8a25c7a441b 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -15778,7 +15778,14 @@ class GatewayRunner: if _hm.get("role") in {"tool", "function"}: _hc = _hm.get("content", "") if "MEDIA:" in _hc: - for _match in re.finditer(r'MEDIA:(\S+)', _hc): + _TOOL_MEDIA_RE = re.compile( + r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|' + r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|' + r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|' + r'txt|csv|apk|ipa))', + re.IGNORECASE + ) + for _match in _TOOL_MEDIA_RE.finditer(_hc): _p = _match.group(1).strip().rstrip('",}') if _p: _history_media_paths.add(_p) @@ -16067,7 +16074,14 @@ class GatewayRunner: if msg.get("role") in {"tool", "function"}: content = msg.get("content", "") if 
"MEDIA:" in content: - for match in re.finditer(r'MEDIA:(\S+)', content): + _TOOL_MEDIA_RE = re.compile( + r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|' + r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|' + r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|' + r'txt|csv|apk|ipa))', + re.IGNORECASE + ) + for match in _TOOL_MEDIA_RE.finditer(content): path = match.group(1).strip().rstrip('",}') if path and path not in _history_media_paths: media_tags.append(f"MEDIA:{path}")