fix(telegram): chunk formatted messages with UTF-16 length accounting

The standalone send path (_send_telegram, used by the send_message tool,
cron delivery, and out-of-process callers) chunked the *raw* message on
UTF-16 length, then formatted each chunk and sent it without re-chunking.
MarkdownV2 escaping inflates the text (`!`/`.`/`-` -> `\!`/`\.`/`\-`), so a
raw chunk of 4096 UTF-16 units can grow to ~8192 units once formatted and be
rejected by Telegram as 'Message is too long'.

Move all text chunking into _send_telegram, after formatting: split the
formatted MarkdownV2/HTML text on UTF-16 length so every send is <=4096
UTF-16 units, with the per-chunk plain-text fallback and thread-not-found
retry preserved. Media is attached after all text chunks are sent. (#28557)
This commit is contained in:
Jeffgithub0029 2026-06-30 03:37:40 -07:00 committed by Teknium
parent af5cea04ab
commit b7c4369ca0
2 changed files with 128 additions and 70 deletions

View file

@ -783,27 +783,75 @@ class TestSendToPlatformChunking:
sent_text = send.await_args.args[2]
assert "<https://en.wikipedia.org/wiki/Foo_(bar)|Foo>" in sent_text
def test_telegram_media_attaches_to_last_chunk(self):
def test_telegram_markdown_expansion_is_chunked_before_send(self, monkeypatch):
"""Telegram chunking must account for MarkdownV2 escaping expansion.
sent_calls = []
A raw message under 4096 UTF-16 units can inflate past the limit once
MarkdownV2-escaped (each `!`/`.`/`-` becomes `\\!`/`\\.`/`\\-`). The
send path must chunk the *formatted* text so no single send exceeds
4096 (issue #28557).
"""
from gateway.platforms.base import utf16_len
async def fake_send(token, chat_id, message, media_files=None, thread_id=None, disable_link_previews=False, force_document=False):
sent_calls.append(media_files or [])
return {"success": True, "platform": "telegram", "chat_id": chat_id, "message_id": str(len(sent_calls))}
send_lengths = []
long_msg = "word " * 2000 # ~10000 chars, well over 4096
media = [("/tmp/photo.png", False)]
with patch("tools.send_message_tool._send_telegram", fake_send):
asyncio.run(
_send_to_platform(
Platform.TELEGRAM,
SimpleNamespace(enabled=True, token="tok", extra={}),
"123", long_msg, media_files=media,
)
async def fake_send_message(**kwargs):
text = kwargs["text"]
send_lengths.append(utf16_len(text))
if utf16_len(text) > 4096:
raise Exception("Message is too long")
return SimpleNamespace(message_id=len(send_lengths))
bot = MagicMock()
bot.send_message = AsyncMock(side_effect=fake_send_message)
bot.send_photo = AsyncMock()
bot.send_video = AsyncMock()
bot.send_voice = AsyncMock()
bot.send_audio = AsyncMock()
bot.send_document = AsyncMock()
_install_telegram_mock(monkeypatch, bot)
result = asyncio.run(
_send_to_platform(
Platform.TELEGRAM,
SimpleNamespace(enabled=True, token="tok", extra={}),
"123",
"!" * 4096, # raw 4096 -> ~8192 after MarkdownV2 escaping
)
assert len(sent_calls) >= 3
assert all(call == [] for call in sent_calls[:-1])
assert sent_calls[-1] == media
)
assert result["success"] is True
assert bot.send_message.await_count >= 2
assert max(send_lengths) <= 4096
def test_telegram_media_attaches_after_long_text_chunks(self, tmp_path, monkeypatch):
"""Long text is split into multiple chunks, then media is attached."""
image_path = tmp_path / "photo.png"
image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 32)
bot = MagicMock()
bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=1))
bot.send_photo = AsyncMock(return_value=SimpleNamespace(message_id=2))
bot.send_video = AsyncMock()
bot.send_voice = AsyncMock()
bot.send_audio = AsyncMock()
bot.send_document = AsyncMock()
_install_telegram_mock(monkeypatch, bot)
long_msg = "word " * 2000 # ~10000 chars, well over Telegram's 4096 limit
result = asyncio.run(
_send_to_platform(
Platform.TELEGRAM,
SimpleNamespace(enabled=True, token="tok", extra={}),
"123",
long_msg,
media_files=[(str(image_path), False)],
)
)
assert result["success"] is True
assert bot.send_message.await_count >= 3
bot.send_photo.assert_awaited_once()
def test_matrix_media_uses_native_adapter_helper(self, tmp_path):
doc_path = tmp_path / "test-send-message-matrix.pdf"

View file

@ -785,24 +785,22 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
chunks = [message]
# --- Telegram: special handling for media attachments ---
# _send_telegram now owns text chunking internally — it formats the full
# message (MarkdownV2/HTML) and then splits the *formatted* text on UTF-16
# length so escaping inflation can't push a chunk over Telegram's 4096
# limit (issue #28557). Pass the whole message in one call; media attaches
# after all text chunks.
if platform == Platform.TELEGRAM:
last_result = None
disable_link_previews = bool(getattr(pconfig, "extra", {}) and pconfig.extra.get("disable_link_previews"))
for i, chunk in enumerate(chunks):
is_last = (i == len(chunks) - 1)
result = await _send_telegram(
pconfig.token,
chat_id,
chunk,
media_files=media_files if is_last else [],
thread_id=thread_id,
disable_link_previews=disable_link_previews,
force_document=force_document,
)
if isinstance(result, dict) and result.get("error"):
return result
last_result = result
return last_result
return await _send_telegram(
pconfig.token,
chat_id,
message,
media_files=media_files,
thread_id=thread_id,
disable_link_previews=disable_link_previews,
force_document=force_document,
)
# --- Discord: chunked delivery via the registry's standalone_sender_fn.
# The plugin's ``_standalone_send`` (registered in
@ -1110,48 +1108,60 @@ async def _send_telegram(token, chat_id, message, media_files=None, thread_id=No
warnings = []
if formatted.strip():
try:
last_msg = await _send_telegram_message_with_retry(
bot,
chat_id=int_chat_id, text=formatted,
parse_mode=send_parse_mode, **text_kwargs
)
except Exception as md_error:
# Thread not found — retry without message_thread_id so the
# message still delivers (matching the gateway adapter's
# fallback behaviour, issue #27012).
if _is_telegram_thread_not_found(md_error) and thread_kwargs:
logger.warning(
"Thread %s not found in _send_telegram, retrying without message_thread_id",
thread_kwargs.get("message_thread_id"),
)
text_kwargs.pop("message_thread_id", None)
# Chunk *after* formatting: MarkdownV2/HTML escaping inflates the
# text (each escaped char like `!`/`.`/`-` becomes `\!`/`\.`/`\-`),
# so a message that fit under 4096 UTF-16 units raw can exceed the
# Telegram limit once formatted and get rejected as "Message is too
# long". Sizing on the formatted text in UTF-16 units guarantees
# every chunk is deliverable. (issue #28557)
from gateway.platforms.base import BasePlatformAdapter, utf16_len
text_chunks = BasePlatformAdapter.truncate_message(
formatted, 4096, len_fn=utf16_len
)
for chunk in text_chunks:
try:
last_msg = await _send_telegram_message_with_retry(
bot,
chat_id=int_chat_id, text=formatted,
chat_id=int_chat_id, text=chunk,
parse_mode=send_parse_mode, **text_kwargs
)
elif "parse" in str(md_error).lower() or "markdown" in str(md_error).lower() or "html" in str(md_error).lower():
logger.warning(
"Parse mode %s failed in _send_telegram, falling back to plain text: %s",
send_parse_mode,
_sanitize_error_text(md_error),
)
if not _has_html:
try:
from plugins.platforms.telegram.adapter import _strip_mdv2
plain = _strip_mdv2(formatted)
except Exception:
plain = message
except Exception as md_error:
# Thread not found — retry without message_thread_id so the
# message still delivers (matching the gateway adapter's
# fallback behaviour, issue #27012).
if _is_telegram_thread_not_found(md_error) and text_kwargs.get("message_thread_id") is not None:
logger.warning(
"Thread %s not found in _send_telegram, retrying without message_thread_id",
text_kwargs.get("message_thread_id"),
)
text_kwargs.pop("message_thread_id", None)
last_msg = await _send_telegram_message_with_retry(
bot,
chat_id=int_chat_id, text=chunk,
parse_mode=send_parse_mode, **text_kwargs
)
elif "parse" in str(md_error).lower() or "markdown" in str(md_error).lower() or "html" in str(md_error).lower():
logger.warning(
"Parse mode %s failed in _send_telegram, falling back to plain text: %s",
send_parse_mode,
_sanitize_error_text(md_error),
)
if not _has_html:
try:
from plugins.platforms.telegram.adapter import _strip_mdv2
plain = _strip_mdv2(chunk)
except Exception:
plain = chunk
else:
plain = chunk
last_msg = await _send_telegram_message_with_retry(
bot,
chat_id=int_chat_id, text=plain,
parse_mode=None, **text_kwargs
)
else:
plain = message
last_msg = await _send_telegram_message_with_retry(
bot,
chat_id=int_chat_id, text=plain,
parse_mode=None, **text_kwargs
)
else:
raise
raise
for media_path, is_voice in media_files:
if not os.path.exists(media_path):