From ea49b38625b4c8ee9f8aedf6c7c3e9ec36acfe69 Mon Sep 17 00:00:00 2001
From: colin-chang <24368158+colin-chang@users.noreply.github.com>
Date: Mon, 18 May 2026 20:07:36 -0700
Subject: [PATCH] fix(gateway): tighten MEDIA extraction regex + silent skip on
 file-not-found
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three related fixes for the MEDIA:<path> extraction pipeline that
caused 'file not found' noise in platform channels:

1. run.py — tighten the tool-result MEDIA regex from \S+ (any
   non-whitespace) to require a path starting with / or ~/ and
   ending in a known extension. Prevents non-path text following
   'MEDIA:' (e.g. 'MEDIA:<path>') from being captured as real
   media. Note that path-shaped placeholders such as
   'MEDIA:/path/to/example.mp4' still match the pattern.

2. base.py — remove the |\S+ fallback in extract_media() that
   catches anything non-whitespace as a potential MEDIA path.
   This was the primary cause of false positives — any token
   following 'MEDIA:' in tool output (e.g. 'MEDIA:<path>') was
   captured as a MEDIA path.

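3. mattermost.py — replace the file-not-found error message sent
   to the channel with a logger.warning() and a silent skip. When
   a path extracted by MEDIA doesn't exist on disk, the channel
   no longer gets a noisy '(file not found: ...)' message; the
   send is reported as successful and any caption that would have
   accompanied the file is dropped along with it.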

Impact: eliminates the persistent 'file not found' spam in
Mattermost channels caused by over-broad MEDIA regex patterns
matching non-path text in tool output.
---
 gateway/platforms/base.py       |  2 +-
 gateway/platforms/mattermost.py |  5 +++--
 gateway/run.py                  | 18 ++++++++++++++++--
 3 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index 34ebc385fa9..5899843aa1e 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -2137,7 +2137,7 @@ class BasePlatformAdapter(ABC):
         # Extract MEDIA:<path> tags, allowing optional whitespace after the colon
         # and quoted/backticked paths for LLM-formatted outputs.
         media_pattern = re.compile(
-            r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$)|\S+)[`"']?'''
+            r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?'''
         )
         for match in media_pattern.finditer(content):
             path = match.group("path").strip()
diff --git a/gateway/platforms/mattermost.py b/gateway/platforms/mattermost.py
index 9487f8a1edf..574f465b1dd 100644
--- a/gateway/platforms/mattermost.py
+++ b/gateway/platforms/mattermost.py
@@ -471,9 +471,10 @@ class MattermostAdapter(BasePlatformAdapter):
 
         p = Path(file_path)
         if not p.exists():
-            return await self.send(
-                chat_id, f"{caption or ''}\n(file not found: {file_path})", reply_to
+            logger.warning(
+                "Mattermost: local file not found, skipping: %s", file_path
             )
+            return SendResult(success=True, message_id=None)
 
         fname = file_name or p.name
         ct = mimetypes.guess_type(fname)[0] or "application/octet-stream"
diff --git a/gateway/run.py b/gateway/run.py
index 08805076714..8a25c7a441b 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -15778,7 +15778,14 @@ class GatewayRunner:
                 if _hm.get("role") in {"tool", "function"}:
                     _hc = _hm.get("content", "")
                     if "MEDIA:" in _hc:
-                        for _match in re.finditer(r'MEDIA:(\S+)', _hc):
+                        _TOOL_MEDIA_RE = re.compile(
+                            r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
+                            r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
+                            r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
+                            r'txt|csv|apk|ipa))',
+                            re.IGNORECASE
+                        )
+                        for _match in _TOOL_MEDIA_RE.finditer(_hc):
                             _p = _match.group(1).strip().rstrip('",}')
                             if _p:
                                 _history_media_paths.add(_p)
@@ -16067,7 +16074,14 @@ class GatewayRunner:
                     if msg.get("role") in {"tool", "function"}:
                         content = msg.get("content", "")
                         if "MEDIA:" in content:
-                            for match in re.finditer(r'MEDIA:(\S+)', content):
+                            _TOOL_MEDIA_RE = re.compile(
+                                r'MEDIA:((?:/|~\/)\S+\.(?:png|jpe?g|gif|webp|'
+                                r'mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|'
+                                r'flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|'
+                                r'txt|csv|apk|ipa))',
+                                re.IGNORECASE
+                            )
+                            for match in _TOOL_MEDIA_RE.finditer(content):
                                 path = match.group(1).strip().rstrip('",}')
                                 if path and path not in _history_media_paths:
                                     media_tags.append(f"MEDIA:{path}")