From d34f03c32a28b786f2a385d9c29342bb42814210 Mon Sep 17 00:00:00 2001 From: leon7609 Date: Sun, 3 May 2026 11:20:00 +0800 Subject: [PATCH] feat(gateway): support [[as_document]] directive for skill media routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skills that produce large/lossless images (e.g. info-graph, where a rendered JPG is 1-2 MB) currently lose quality in Telegram delivery because `_IMAGE_EXTS` membership routes the file through `send_multiple_images` → `sendMediaGroup`, which Telegram's server re-encodes to JPEG @ 1280px max edge. The original bytes only survive when the file goes through `send_document`, which the dispatch tables in three places (`_process_message_background`, `_deliver_media_from_response`, and the `send_message` tool's telegram path) only reach for files whose extension is NOT in `_IMAGE_EXTS`. This commit adds an `[[as_document]]` directive that mirrors the existing `[[audio_as_voice]]` shape: a skill emits the directive once in its response, and every image-extension MEDIA: file in that response is delivered via `send_document` instead of `send_multiple_images` / `sendPhoto`. The directive is detected at the dispatch sites (which see the raw response) and the directive string is stripped from the user-visible cleaned text in `extract_media` so it never leaks. Granularity is intentionally all-or-nothing per response, matching [[audio_as_voice]]'s scope. Skills that need fine control can split into two responses. Verified the targeted use case: info-graph emits 信息图已生成(...) [[as_document]] MEDIA:/tmp/info-graph-x/infographic.jpg → Telegram receives `infographic.jpg` via sendDocument, original 1MB JPEG bytes preserved, no recompression. Forwarding and download filenames stay clean (`infographic.jpg`). Tests: +3 cases in TestExtractMedia covering directive strip, isolation from voice flag, and coexistence with [[audio_as_voice]]. All 113 pre-existing media/extract/send tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- gateway/platforms/base.py | 46 ++++++++++++++++++++++++----- gateway/run.py | 17 +++++++++-- tests/gateway/test_platform_base.py | 31 +++++++++++++++++++ tools/send_message_tool.py | 14 +++++++-- 4 files changed, 94 insertions(+), 14 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 5abbef808d..80e5e66526 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -1874,23 +1874,38 @@ class BasePlatformAdapter(ABC): def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]: """ Extract MEDIA: tags and [[audio_as_voice]] directives from response text. - + The TTS tool returns responses like: [[audio_as_voice]] MEDIA:/path/to/audio.ogg - + + Skills that produce large/lossless images (e.g. info-graph, where a + rendered JPG is 1-2 MB but Telegram's sendPhoto recompresses to + ~200 KB at 1280px) can use ``[[as_document]]`` to request unmodified + delivery via sendDocument instead of sendPhoto/sendMediaGroup. The + directive is detected at the dispatch sites (which have access to the + original response); this method just strips it so it never leaks into + user-visible text. Per-file granularity is intentionally not exposed — + when an agent emits ``[[as_document]]`` once, every image path in the + same response is delivered as a document, mirroring the all-or-nothing + scope of ``[[audio_as_voice]]``. + Args: content: The response text to scan. - + Returns: Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed). """ media = [] cleaned = content - + # Check for [[audio_as_voice]] directive has_voice_tag = "[[audio_as_voice]]" in content cleaned = cleaned.replace("[[audio_as_voice]]", "") + # Strip [[as_document]] directive — callers inspect the original + # ``content`` for it (so they can still react to it); here we just + # keep it out of the user-visible cleaned text. 
+ cleaned = cleaned.replace("[[as_document]]", "") # Extract MEDIA: tags, allowing optional whitespace after the colon # and quoted/backticked paths for LLM-formatted outputs. @@ -2815,13 +2830,21 @@ class BasePlatformAdapter(ABC): if not response: logger.debug("[%s] Handler returned empty/None response for %s", self.name, event.source.chat_id) if response: + # Capture [[as_document]] before extract_media strips it, so the + # dispatch partition below can route image-extension files + # through send_document instead of send_multiple_images. Used + # by skills that produce large/lossless images (e.g. info-graph) + # where Telegram's sendPhoto recompression destroys legibility. + force_document_attachments = "[[as_document]]" in response + # Extract MEDIA: tags (from TTS tool) before other processing media_files, response = self.extract_media(response) - + # Extract image URLs and send them as native platform attachments images, text_content = self.extract_images(response) # Strip any remaining internal directives from message body (fixes #1561) text_content = text_content.replace("[[audio_as_voice]]", "").strip() + text_content = text_content.replace("[[as_document]]", "").strip() text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip() if images: logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response)) @@ -2923,19 +2946,26 @@ class BasePlatformAdapter(ABC): _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'} # Partition images out of media_files + local_files so they - # can be sent as a single batch (Signal RPC) + # can be sent as a single batch (Signal RPC). When + # ``[[as_document]]`` was set on the original response, image + # files skip the photo path and route to send_document below + # so they're delivered with original bytes (no Telegram + # sendPhoto recompression). 
from urllib.parse import quote as _quote _image_paths: list = [] _non_image_media: list = [] for media_path, is_voice in media_files: _ext = Path(media_path).suffix.lower() - if _ext in _IMAGE_EXTS and not is_voice: + if (_ext in _IMAGE_EXTS + and not is_voice + and not force_document_attachments): _image_paths.append(media_path) else: _non_image_media.append((media_path, is_voice)) _non_image_local: list = [] for file_path in local_files: - if Path(file_path).suffix.lower() in _IMAGE_EXTS: + if (Path(file_path).suffix.lower() in _IMAGE_EXTS + and not force_document_attachments): _image_paths.append(file_path) else: _non_image_local.append(file_path) diff --git a/gateway/run.py b/gateway/run.py index 91b80d6741..7fda24614b 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -8961,6 +8961,12 @@ class GatewayRunner: from urllib.parse import quote as _quote try: + # Capture [[as_document]] before extract_media strips it, so the + # dispatch partition below can route image-extension files + # through send_document (preserving bytes) instead of + # send_multiple_images (Telegram sendPhoto recompresses to ~1280px). + force_document_attachments = "[[as_document]]" in response + media_files, _ = adapter.extract_media(response) _, cleaned = adapter.extract_images(response) local_files, _ = adapter.extract_local_files(cleaned) @@ -8973,19 +8979,24 @@ class GatewayRunner: _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'} # Partition out images so they can be sent as a single batch - # (e.g. Signal's multi-attachment RPC) + # (e.g. Signal's multi-attachment RPC). When [[as_document]] was + # set, image-extension files skip the photo path and route to + # send_document below — preserving original bytes. 
image_paths: list = [] non_image_media: list = [] for media_path, is_voice in media_files: ext = Path(media_path).suffix.lower() - if ext in _IMAGE_EXTS and not is_voice: + if (ext in _IMAGE_EXTS + and not is_voice + and not force_document_attachments): image_paths.append(media_path) else: non_image_media.append((media_path, is_voice)) non_image_local: list = [] for file_path in local_files: - if Path(file_path).suffix.lower() in _IMAGE_EXTS: + if (Path(file_path).suffix.lower() in _IMAGE_EXTS + and not force_document_attachments): image_paths.append(file_path) else: non_image_local.append(file_path) diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index 84f3b7239f..23646545bf 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -329,6 +329,37 @@ class TestExtractMedia: assert media == [("/tmp/Jane Doe/speech.flac", False)] assert cleaned == "" + def test_as_document_directive_stripped_from_cleaned_text(self): + """[[as_document]] is a routing directive — strip it from + user-visible text just like [[audio_as_voice]]. 
Callers detect the + directive on the original content (before extract_media).""" + content = "Here is your infographic:\n[[as_document]]\nMEDIA:/tmp/x.jpg" + media, cleaned = BasePlatformAdapter.extract_media(content) + assert media == [("/tmp/x.jpg", False)] + assert "[[as_document]]" not in cleaned + assert "Here is your infographic" in cleaned + + def test_as_document_directive_alone_does_not_attach_voice_flag(self): + """[[as_document]] on its own must not set the voice flag — it is + independent of [[audio_as_voice]], so is_voice stays False.""" + content = "[[as_document]]\nMEDIA:/tmp/x.jpg" + media, cleaned = BasePlatformAdapter.extract_media(content) + assert media == [("/tmp/x.jpg", False)] # voice flag stays False + assert "[[as_document]]" not in cleaned + + def test_both_directives_can_coexist(self): + """A response could (rarely) carry both [[audio_as_voice]] and + [[as_document]]. The voice flag still applies to every media tuple; + [[as_document]] is stripped here and detected at dispatch.""" + content = "[[audio_as_voice]]\n[[as_document]]\nMEDIA:/tmp/x.ogg" + media, cleaned = BasePlatformAdapter.extract_media(content) + # Voice flag is propagated to every media tuple (this matches the + # existing extract_media contract) + assert media == [("/tmp/x.ogg", True)] + # Both directives stripped from cleaned text + assert "[[audio_as_voice]]" not in cleaned + assert "[[as_document]]" not in cleaned + # --------------------------------------------------------------------------- # should_send_media_as_audio diff --git a/tools/send_message_tool.py b/tools/send_message_tool.py index 938cb977b6..380208d429 100644 --- a/tools/send_message_tool.py +++ b/tools/send_message_tool.py @@ -242,6 +242,12 @@ def _handle_send(args): from gateway.platforms.base import BasePlatformAdapter + # Capture [[as_document]] directive before extract_media strips it.
+ # Image-extension files in this batch will route through send_document + # instead of send_photo so the original bytes survive (e.g. info-graph + # JPGs where Telegram's sendPhoto recompresses to 1280px). + force_document_attachments = "[[as_document]]" in message + media_files, cleaned_message = BasePlatformAdapter.extract_media(message) mirror_text = cleaned_message.strip() or _describe_media_for_mirror(media_files) @@ -277,6 +283,7 @@ def _handle_send(args): cleaned_message, thread_id=thread_id, media_files=media_files, + force_document=force_document_attachments, ) ) if used_home_channel and isinstance(result, dict) and result.get("success"): @@ -437,7 +444,7 @@ async def _send_via_adapter(platform, pconfig, chat_id, chunk): return {"error": f"No live adapter for platform '{platform.value}'. Is the gateway running with this platform connected?"} -async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None): +async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None, force_document=False): """Route a message to the appropriate platform sender. Long messages are automatically chunked to fit within platform limits @@ -514,6 +521,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=media_files if is_last else [], thread_id=thread_id, disable_link_previews=disable_link_previews, + force_document=force_document, ) if isinstance(result, dict) and result.get("error"): return result @@ -667,7 +675,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, return last_result -async def _send_telegram(token, chat_id, message, media_files=None, thread_id=None, disable_link_previews=False): +async def _send_telegram(token, chat_id, message, media_files=None, thread_id=None, disable_link_previews=False, force_document=False): """Send via Telegram Bot API (one-shot, no polling needed). 
Applies markdown→MarkdownV2 formatting (same as the gateway adapter) @@ -750,7 +758,7 @@ async def _send_telegram(token, chat_id, message, media_files=None, thread_id=No ext = os.path.splitext(media_path)[1].lower() try: with open(media_path, "rb") as f: - if ext in _IMAGE_EXTS: + if ext in _IMAGE_EXTS and not force_document: last_msg = await bot.send_photo( chat_id=int_chat_id, photo=f, **thread_kwargs )