mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-13 03:52:00 +00:00
feat(gateway): support [[as_document]] directive for skill media routing
Skills that produce large/lossless images (e.g. info-graph, where a
rendered JPG is 1-2 MB) currently lose quality in Telegram delivery
because `_IMAGE_EXTS` membership routes the file through
`send_multiple_images` → `sendMediaGroup`, where Telegram's server
re-encodes it to JPEG with a 1280px max edge. The original bytes only
survive when the file goes through `send_document`, which the dispatch
logic in three places (`_process_message_background`,
`_deliver_media_from_response`, and the `send_message` tool's Telegram
path) reaches only for files whose extension is NOT in `_IMAGE_EXTS`.
This commit adds an `[[as_document]]` directive that mirrors the
existing `[[audio_as_voice]]` shape: a skill emits the directive once
in its response, and every image-extension MEDIA: file in that response
is delivered via `send_document` instead of `send_multiple_images` /
`sendPhoto`. The directive is detected at the dispatch sites (which see
the raw response) and the directive string is stripped from the
user-visible cleaned text in `extract_media` so it never leaks.
Granularity is intentionally all-or-nothing per response, matching the
scope of `[[audio_as_voice]]`. Skills that need finer control can split
their output into two responses.
Verified the targeted use case: info-graph emits
信息图已生成(...)
[[as_document]]
MEDIA:/tmp/info-graph-x/infographic.jpg
→ Telegram receives `infographic.jpg` via sendDocument, original 1MB
JPEG bytes preserved, no recompression. Forwarding and download
filenames stay clean (`infographic.jpg`).
Tests: +3 cases in TestExtractMedia covering directive strip, isolation
from voice flag, and coexistence with [[audio_as_voice]]. All
113 pre-existing media/extract/send tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8d363f8d54
commit
d34f03c32a
4 changed files with 94 additions and 14 deletions
|
|
@@ -8961,6 +8961,12 @@ class GatewayRunner:
|
|||
from urllib.parse import quote as _quote
|
||||
|
||||
try:
|
||||
# Capture [[as_document]] before extract_media strips it, so the
|
||||
# dispatch partition below can route image-extension files
|
||||
# through send_document (preserving bytes) instead of
|
||||
# send_multiple_images (Telegram sendPhoto recompresses to ~1280px).
|
||||
force_document_attachments = "[[as_document]]" in response
|
||||
|
||||
media_files, _ = adapter.extract_media(response)
|
||||
_, cleaned = adapter.extract_images(response)
|
||||
local_files, _ = adapter.extract_local_files(cleaned)
|
||||
|
|
@@ -8973,19 +8979,24 @@ class GatewayRunner:
|
|||
_IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
|
||||
|
||||
# Partition out images so they can be sent as a single batch
|
||||
# (e.g. Signal's multi-attachment RPC)
|
||||
# (e.g. Signal's multi-attachment RPC). When [[as_document]] was
|
||||
# set, image-extension files skip the photo path and route to
|
||||
# send_document below — preserving original bytes.
|
||||
image_paths: list = []
|
||||
non_image_media: list = []
|
||||
for media_path, is_voice in media_files:
|
||||
ext = Path(media_path).suffix.lower()
|
||||
if ext in _IMAGE_EXTS and not is_voice:
|
||||
if (ext in _IMAGE_EXTS
|
||||
and not is_voice
|
||||
and not force_document_attachments):
|
||||
image_paths.append(media_path)
|
||||
else:
|
||||
non_image_media.append((media_path, is_voice))
|
||||
|
||||
non_image_local: list = []
|
||||
for file_path in local_files:
|
||||
if Path(file_path).suffix.lower() in _IMAGE_EXTS:
|
||||
if (Path(file_path).suffix.lower() in _IMAGE_EXTS
|
||||
and not force_document_attachments):
|
||||
image_paths.append(file_path)
|
||||
else:
|
||||
non_image_local.append(file_path)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue