mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(telegram): use UTF-16 code units for message length splitting (#8725)
Port from nearai/ironclaw#2304: Telegram's 4096 character limit is measured in UTF-16 code units, not Unicode codepoints. Characters outside the Basic Multilingual Plane (emoji like 😀, CJK Extension B, musical symbols) are surrogate pairs: 1 Python char but 2 UTF-16 units. Previously, truncate_message() used Python's len() which counts codepoints. This could produce chunks exceeding Telegram's actual limit when messages contain many astral-plane characters. Changes: - Add utf16_len() helper and _prefix_within_utf16_limit() for UTF-16-aware string measurement and truncation - Add _custom_unit_to_cp() binary-search helper that maps a custom-unit budget to the largest safe codepoint slice position - Update truncate_message() to accept optional len_fn parameter - Telegram adapter now passes len_fn=utf16_len when splitting messages - Fix fallback truncation in Telegram error handler to use _prefix_within_utf16_limit instead of codepoint slicing - Update send_message_tool.py to use utf16_len for Telegram platform - Add comprehensive tests: utf16_len, _prefix_within_utf16_limit, truncate_message with len_fn (emoji splitting, content preservation, code block handling) - Update mock lambdas in reply_mode tests to accept **kw for len_fn
This commit is contained in:
parent
3cd6cbee5f
commit
9e992df8ae
6 changed files with 240 additions and 25 deletions
|
|
@ -322,7 +322,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
|
|||
(preserves code-block boundaries, adds part indicators).
|
||||
"""
|
||||
from gateway.config import Platform
|
||||
from gateway.platforms.base import BasePlatformAdapter
|
||||
from gateway.platforms.base import BasePlatformAdapter, utf16_len
|
||||
from gateway.platforms.telegram import TelegramAdapter
|
||||
from gateway.platforms.discord import DiscordAdapter
|
||||
from gateway.platforms.slack import SlackAdapter
|
||||
|
|
@ -354,9 +354,11 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
|
|||
|
||||
# Smart-chunk the message to fit within platform limits.
|
||||
# For short messages or platforms without a known limit this is a no-op.
|
||||
# Telegram measures length in UTF-16 code units, not Unicode codepoints.
|
||||
max_len = _MAX_LENGTHS.get(platform)
|
||||
if max_len:
|
||||
chunks = BasePlatformAdapter.truncate_message(message, max_len)
|
||||
_len_fn = utf16_len if platform == Platform.TELEGRAM else None
|
||||
chunks = BasePlatformAdapter.truncate_message(message, max_len, len_fn=_len_fn)
|
||||
else:
|
||||
chunks = [message]
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue