feat(gateway): support [[as_document]] directive for skill media routing

Skills that produce large/lossless images (e.g. info-graph, where a
rendered JPG is 1-2 MB) currently lose quality in Telegram delivery
because `_IMAGE_EXTS` membership routes the file through
`send_multiple_images` → `sendMediaGroup`, and Telegram's servers
re-encode photos sent that way to JPEG with a 1280px max edge. The
original bytes only survive when the file goes through `send_document`,
which the three dispatch sites (`_process_message_background`,
`_deliver_media_from_response`, and the `send_message` tool's Telegram
path) only use for files whose extension is NOT in `_IMAGE_EXTS`.

This commit adds an `[[as_document]]` directive that mirrors the
existing `[[audio_as_voice]]` shape: a skill emits the directive once
in its response, and every image-extension MEDIA: file in that response
is delivered via `send_document` instead of `send_multiple_images` /
`sendPhoto`. The directive is detected at the dispatch sites (which see
the raw response) and the directive string is stripped from the
user-visible cleaned text in `extract_media` so it never leaks.

Granularity is intentionally all-or-nothing per response, matching
[[audio_as_voice]]'s scope. Skills that need per-file control can split
their output into two responses.

Verified the targeted use case: info-graph emits

    信息图已生成(...)
    [[as_document]]
    MEDIA:/tmp/info-graph-x/infographic.jpg

→ Telegram receives `infographic.jpg` via sendDocument, original 1MB
JPEG bytes preserved, no recompression. Forwarding and download
filenames stay clean (`infographic.jpg`).

Tests: +3 cases in TestExtractMedia covering directive strip, isolation
from voice flag, and coexistence with [[audio_as_voice]]. All
113 pre-existing media/extract/send tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
leon7609 2026-05-03 11:20:00 +08:00 committed by Teknium
parent 8d363f8d54
commit d34f03c32a
4 changed files with 94 additions and 14 deletions

View file

@ -1874,23 +1874,38 @@ class BasePlatformAdapter(ABC):
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
"""
Extract MEDIA:<path> tags and [[audio_as_voice]] directives from response text.
The TTS tool returns responses like:
[[audio_as_voice]]
MEDIA:/path/to/audio.ogg
Skills that produce large/lossless images (e.g. info-graph, where a
rendered JPG is 1-2 MB but Telegram's sendPhoto recompresses to
~200 KB at 1280px) can use ``[[as_document]]`` to request unmodified
delivery via sendDocument instead of sendPhoto/sendMediaGroup. The
directive is detected at the dispatch sites (which have access to the
original response); this method just strips it so it never leaks into
user-visible text. Per-file granularity is intentionally not exposed:
when an agent emits ``[[as_document]]`` once, every image path in the
same response is delivered as a document, mirroring the all-or-nothing
scope of ``[[audio_as_voice]]``.
Args:
content: The response text to scan.
Returns:
Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed).
"""
media = []
cleaned = content
# Check for [[audio_as_voice]] directive
has_voice_tag = "[[audio_as_voice]]" in content
cleaned = cleaned.replace("[[audio_as_voice]]", "")
# Strip [[as_document]] directive — callers inspect the original
# ``content`` for it (so they can still react to it); here we just
# keep it out of the user-visible cleaned text.
cleaned = cleaned.replace("[[as_document]]", "")
# Extract MEDIA:<path> tags, allowing optional whitespace after the colon
# and quoted/backticked paths for LLM-formatted outputs.
@ -2815,13 +2830,21 @@ class BasePlatformAdapter(ABC):
if not response:
logger.debug("[%s] Handler returned empty/None response for %s", self.name, event.source.chat_id)
if response:
# Capture [[as_document]] before extract_media strips it, so the
# dispatch partition below can route image-extension files
# through send_document instead of send_multiple_images. Used
# by skills that produce large/lossless images (e.g. info-graph)
# where Telegram's sendPhoto recompression destroys legibility.
force_document_attachments = "[[as_document]]" in response
# Extract MEDIA:<path> tags (from TTS tool) before other processing
media_files, response = self.extract_media(response)
# Extract image URLs and send them as native platform attachments
images, text_content = self.extract_images(response)
# Strip any remaining internal directives from message body (fixes #1561)
text_content = text_content.replace("[[audio_as_voice]]", "").strip()
text_content = text_content.replace("[[as_document]]", "").strip()
text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip()
if images:
logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))
@ -2923,19 +2946,26 @@ class BasePlatformAdapter(ABC):
_IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
# Partition images out of media_files + local_files so they
# can be sent as a single batch (Signal RPC)
# can be sent as a single batch (Signal RPC). When
# ``[[as_document]]`` was set on the original response, image
# files skip the photo path and route to send_document below
# so they're delivered with original bytes (no Telegram
# sendPhoto recompression).
from urllib.parse import quote as _quote
_image_paths: list = []
_non_image_media: list = []
for media_path, is_voice in media_files:
_ext = Path(media_path).suffix.lower()
if _ext in _IMAGE_EXTS and not is_voice:
if (_ext in _IMAGE_EXTS
and not is_voice
and not force_document_attachments):
_image_paths.append(media_path)
else:
_non_image_media.append((media_path, is_voice))
_non_image_local: list = []
for file_path in local_files:
if Path(file_path).suffix.lower() in _IMAGE_EXTS:
if (Path(file_path).suffix.lower() in _IMAGE_EXTS
and not force_document_attachments):
_image_paths.append(file_path)
else:
_non_image_local.append(file_path)

View file

@ -8961,6 +8961,12 @@ class GatewayRunner:
from urllib.parse import quote as _quote
try:
# Capture [[as_document]] before extract_media strips it, so the
# dispatch partition below can route image-extension files
# through send_document (preserving bytes) instead of
# send_multiple_images (Telegram re-encodes photos to a 1280px max edge).
force_document_attachments = "[[as_document]]" in response
media_files, _ = adapter.extract_media(response)
_, cleaned = adapter.extract_images(response)
local_files, _ = adapter.extract_local_files(cleaned)
@ -8973,19 +8979,24 @@ class GatewayRunner:
_IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
# Partition out images so they can be sent as a single batch
# (e.g. Signal's multi-attachment RPC)
# (e.g. Signal's multi-attachment RPC). When [[as_document]] was
# set, image-extension files skip the photo path and route to
# send_document below — preserving original bytes.
image_paths: list = []
non_image_media: list = []
for media_path, is_voice in media_files:
ext = Path(media_path).suffix.lower()
if ext in _IMAGE_EXTS and not is_voice:
if (ext in _IMAGE_EXTS
and not is_voice
and not force_document_attachments):
image_paths.append(media_path)
else:
non_image_media.append((media_path, is_voice))
non_image_local: list = []
for file_path in local_files:
if Path(file_path).suffix.lower() in _IMAGE_EXTS:
if (Path(file_path).suffix.lower() in _IMAGE_EXTS
and not force_document_attachments):
image_paths.append(file_path)
else:
non_image_local.append(file_path)