From d34f03c32a28b786f2a385d9c29342bb42814210 Mon Sep 17 00:00:00 2001
From: leon7609 <leosma@gmail.com>
Date: Sun, 3 May 2026 11:20:00 +0800
Subject: [PATCH] feat(gateway): support [[as_document]] directive for skill
 media routing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skills that produce large/lossless images (e.g. info-graph, where a
rendered JPG is 1-2 MB) currently lose quality in Telegram delivery
because `_IMAGE_EXTS` membership routes the file through
`send_multiple_images` → `sendMediaGroup`, which Telegram's server
re-encodes to JPEG @ 1280px max edge. The original bytes only survive
when the file goes through `send_document`, but the dispatch code in
three places (`_process_message_background`,
`_deliver_media_from_response`, and the `send_message` tool's Telegram
path) only takes that route for files whose extension is NOT in
`_IMAGE_EXTS`.

This commit adds an `[[as_document]]` directive that mirrors the
existing `[[audio_as_voice]]` shape: a skill emits the directive once
in its response, and every image-extension file in that response
(`MEDIA:` tags, plus `local_files` entries in the two gateway dispatch
paths) is delivered via `send_document` instead of
`send_multiple_images` / `sendPhoto`. The directive is detected at the
dispatch sites (which see the raw response) and the directive string is
stripped from the user-visible cleaned text in `extract_media` so it
never leaks.

Granularity is intentionally all-or-nothing per response, matching
[[audio_as_voice]]'s scope. Skills that need fine control can split into
two responses.

Verified the targeted use case: info-graph emits

    信息图已生成（...）
    [[as_document]]
    MEDIA:/tmp/info-graph-x/infographic.jpg

→ Telegram receives `infographic.jpg` via sendDocument, original 1MB
JPEG bytes preserved, no recompression. Forwarding and download
filenames stay clean (`infographic.jpg`).

Tests: +3 cases in TestExtractMedia covering directive strip, isolation
from voice flag, and coexistence with [[audio_as_voice]]. All
113 pre-existing media/extract/send tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 gateway/platforms/base.py           | 46 ++++++++++++++++++++++++-----
 gateway/run.py                      | 17 +++++++++--
 tests/gateway/test_platform_base.py | 31 +++++++++++++++++++
 tools/send_message_tool.py          | 14 +++++++--
 4 files changed, 94 insertions(+), 14 deletions(-)
diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index 5abbef808d..80e5e66526 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -1874,23 +1874,38 @@ class BasePlatformAdapter(ABC):
     def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
         """
         Extract MEDIA:<path> tags and [[audio_as_voice]] directives from response text.
-        
+
         The TTS tool returns responses like:
             [[audio_as_voice]]
             MEDIA:/path/to/audio.ogg
-        
+
+        Skills that produce large/lossless images (e.g. info-graph, where a
+        rendered JPG is 1-2 MB but Telegram's sendPhoto recompresses to
+        ~200 KB at 1280px) can use ``[[as_document]]`` to request unmodified
+        delivery via sendDocument instead of sendPhoto/sendMediaGroup. The
+        directive is detected at the dispatch sites (which have access to the
+        original response); this method just strips it so it never leaks into
+        user-visible text. Per-file granularity is intentionally not exposed —
+        when an agent emits ``[[as_document]]`` once, every image path in the
+        same response is delivered as a document, mirroring the all-or-nothing
+        scope of ``[[audio_as_voice]]``.
+
         Args:
             content: The response text to scan.
-        
+
         Returns:
             Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed).
         """
         media = []
         cleaned = content
-        
+
         # Check for [[audio_as_voice]] directive
         has_voice_tag = "[[audio_as_voice]]" in content
         cleaned = cleaned.replace("[[audio_as_voice]]", "")
+        # Strip [[as_document]] directive — callers inspect the original
+        # ``content`` for it (so they can still react to it); here we just
+        # keep it out of the user-visible cleaned text.
+        cleaned = cleaned.replace("[[as_document]]", "")
         
         # Extract MEDIA:<path> tags, allowing optional whitespace after the colon
         # and quoted/backticked paths for LLM-formatted outputs.
@@ -2815,13 +2830,21 @@ class BasePlatformAdapter(ABC):
             if not response:
                 logger.debug("[%s] Handler returned empty/None response for %s", self.name, event.source.chat_id)
             if response:
+                # Capture [[as_document]] before extract_media strips it, so the
+                # dispatch partition below can route image-extension files
+                # through send_document instead of send_multiple_images. Used
+                # by skills that produce large/lossless images (e.g. info-graph)
+                # where Telegram's sendPhoto recompression destroys legibility.
+                force_document_attachments = "[[as_document]]" in response
+
                 # Extract MEDIA:<path> tags (from TTS tool) before other processing
                 media_files, response = self.extract_media(response)
-                
+
                 # Extract image URLs and send them as native platform attachments
                 images, text_content = self.extract_images(response)
                 # Strip any remaining internal directives from message body (fixes #1561)
                 text_content = text_content.replace("[[audio_as_voice]]", "").strip()
+                text_content = text_content.replace("[[as_document]]", "").strip()
                 text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip()
                 if images:
                     logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))
@@ -2923,19 +2946,26 @@ class BasePlatformAdapter(ABC):
                 _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
 
                 # Partition images out of media_files + local_files so they
-                # can be sent as a single batch (Signal RPC)
+                # can be sent as a single batch (Signal RPC). When
+                # ``[[as_document]]`` was set on the original response, image
+                # files skip the photo path and route to send_document below
+                # so they're delivered with original bytes (no Telegram
+                # sendPhoto recompression).
                 from urllib.parse import quote as _quote
                 _image_paths: list = []
                 _non_image_media: list = []
                 for media_path, is_voice in media_files:
                     _ext = Path(media_path).suffix.lower()
-                    if _ext in _IMAGE_EXTS and not is_voice:
+                    if (_ext in _IMAGE_EXTS
+                            and not is_voice
+                            and not force_document_attachments):
                         _image_paths.append(media_path)
                     else:
                         _non_image_media.append((media_path, is_voice))
                 _non_image_local: list = []
                 for file_path in local_files:
-                    if Path(file_path).suffix.lower() in _IMAGE_EXTS:
+                    if (Path(file_path).suffix.lower() in _IMAGE_EXTS
+                            and not force_document_attachments):
                         _image_paths.append(file_path)
                     else:
                         _non_image_local.append(file_path)
diff --git a/gateway/run.py b/gateway/run.py
index 91b80d6741..7fda24614b 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -8961,6 +8961,12 @@ class GatewayRunner:
         from urllib.parse import quote as _quote
 
         try:
+            # Capture [[as_document]] before extract_media strips it, so the
+            # dispatch partition below can route image-extension files
+            # through send_document (preserving bytes) instead of
+            # send_multiple_images (Telegram sendPhoto recompresses to ~1280px).
+            force_document_attachments = "[[as_document]]" in response
+
             media_files, _ = adapter.extract_media(response)
             _, cleaned = adapter.extract_images(response)
             local_files, _ = adapter.extract_local_files(cleaned)
@@ -8973,19 +8979,24 @@ class GatewayRunner:
             _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
 
             # Partition out images so they can be sent as a single batch
-            # (e.g. Signal's multi-attachment RPC)
+            # (e.g. Signal's multi-attachment RPC). When [[as_document]] was
+            # set, image-extension files skip the photo path and route to
+            # send_document below — preserving original bytes.
             image_paths: list = []
             non_image_media: list = []
             for media_path, is_voice in media_files:
                 ext = Path(media_path).suffix.lower()
-                if ext in _IMAGE_EXTS and not is_voice:
+                if (ext in _IMAGE_EXTS
+                        and not is_voice
+                        and not force_document_attachments):
                     image_paths.append(media_path)
                 else:
                     non_image_media.append((media_path, is_voice))
 
             non_image_local: list = []
             for file_path in local_files:
-                if Path(file_path).suffix.lower() in _IMAGE_EXTS:
+                if (Path(file_path).suffix.lower() in _IMAGE_EXTS
+                        and not force_document_attachments):
                     image_paths.append(file_path)
                 else:
                     non_image_local.append(file_path)
diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py
index 84f3b7239f..23646545bf 100644
--- a/tests/gateway/test_platform_base.py
+++ b/tests/gateway/test_platform_base.py
@@ -329,6 +329,37 @@ class TestExtractMedia:
         assert media == [("/tmp/Jane Doe/speech.flac", False)]
         assert cleaned == ""
 
+    def test_as_document_directive_stripped_from_cleaned_text(self):
+        """[[as_document]] is a routing directive — strip it from
+        user-visible text just like [[audio_as_voice]]. Callers detect the
+        directive on the original content (before extract_media)."""
+        content = "Here is your infographic:\n[[as_document]]\nMEDIA:/tmp/x.jpg"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert media == [("/tmp/x.jpg", False)]
+        assert "[[as_document]]" not in cleaned
+        assert "Here is your infographic" in cleaned
+
+    def test_as_document_directive_alone_does_not_attach_voice_flag(self):
+        """[[as_document]] is independent of [[audio_as_voice]] — emitting
+        it alone must not set the voice flag on any media tuple."""
+        content = "[[as_document]]\nMEDIA:/tmp/x.jpg"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert media == [("/tmp/x.jpg", False)]  # voice flag stays False
+        assert "[[as_document]]" not in cleaned
+
+    def test_both_directives_can_coexist(self):
+        """A response could (rarely) contain both [[audio_as_voice]] for an
+        ogg file AND [[as_document]] for an attached image. The voice flag
+        propagates per-tuple; [[as_document]] is detected at dispatch."""
+        content = "[[audio_as_voice]]\n[[as_document]]\nMEDIA:/tmp/x.ogg"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        # Voice flag is propagated to every media tuple (this matches the
+        # existing extract_media contract)
+        assert media == [("/tmp/x.ogg", True)]
+        # Both directives stripped from cleaned text
+        assert "[[audio_as_voice]]" not in cleaned
+        assert "[[as_document]]" not in cleaned
+
 
 # ---------------------------------------------------------------------------
 # should_send_media_as_audio
diff --git a/tools/send_message_tool.py b/tools/send_message_tool.py
index 938cb977b6..380208d429 100644
--- a/tools/send_message_tool.py
+++ b/tools/send_message_tool.py
@@ -242,6 +242,12 @@ def _handle_send(args):
 
     from gateway.platforms.base import BasePlatformAdapter
 
+    # Capture [[as_document]] directive before extract_media strips it.
+    # Image-extension files in this batch will route through send_document
+    # instead of send_photo so the original bytes survive (e.g. info-graph
+    # JPGs where Telegram's sendPhoto recompresses to 1280px).
+    force_document_attachments = "[[as_document]]" in message
+
     media_files, cleaned_message = BasePlatformAdapter.extract_media(message)
     mirror_text = cleaned_message.strip() or _describe_media_for_mirror(media_files)
 
@@ -277,6 +283,7 @@ def _handle_send(args):
                 cleaned_message,
                 thread_id=thread_id,
                 media_files=media_files,
+                force_document=force_document_attachments,
             )
         )
         if used_home_channel and isinstance(result, dict) and result.get("success"):
@@ -437,7 +444,7 @@ async def _send_via_adapter(platform, pconfig, chat_id, chunk):
     return {"error": f"No live adapter for platform '{platform.value}'. Is the gateway running with this platform connected?"}
 
 
-async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None):
+async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None, force_document=False):
     """Route a message to the appropriate platform sender.
 
     Long messages are automatically chunked to fit within platform limits
@@ -514,6 +521,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
                 media_files=media_files if is_last else [],
                 thread_id=thread_id,
                 disable_link_previews=disable_link_previews,
+                force_document=force_document,
             )
             if isinstance(result, dict) and result.get("error"):
                 return result
@@ -667,7 +675,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
     return last_result
 
 
-async def _send_telegram(token, chat_id, message, media_files=None, thread_id=None, disable_link_previews=False):
+async def _send_telegram(token, chat_id, message, media_files=None, thread_id=None, disable_link_previews=False, force_document=False):
     """Send via Telegram Bot API (one-shot, no polling needed).
 
     Applies markdown→MarkdownV2 formatting (same as the gateway adapter)
@@ -750,7 +758,7 @@ async def _send_telegram(token, chat_id, message, media_files=None, thread_id=No
             ext = os.path.splitext(media_path)[1].lower()
             try:
                 with open(media_path, "rb") as f:
-                    if ext in _IMAGE_EXTS:
+                    if ext in _IMAGE_EXTS and not force_document:
                         last_msg = await bot.send_photo(
                             chat_id=int_chat_id, photo=f, **thread_kwargs
                         )