From 45735e71a273cfb335adfe4350750789ad4458dc Mon Sep 17 00:00:00 2001 From: Teknium Date: Sun, 12 Apr 2026 17:43:05 -0700 Subject: [PATCH] fix(telegram): use UTF-16 code units for message length splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port from nearai/ironclaw#2304: Telegram's 4096 character limit is measured in UTF-16 code units, not Unicode codepoints. Characters outside the Basic Multilingual Plane (emoji like 😀, CJK Extension B, musical symbols) are surrogate pairs: 1 Python char but 2 UTF-16 units. Previously, truncate_message() used Python's len() which counts codepoints. This could produce chunks exceeding Telegram's actual limit when messages contain many astral-plane characters. Changes: - Add utf16_len() helper and _prefix_within_utf16_limit() for UTF-16-aware string measurement and truncation - Add _custom_unit_to_cp() binary-search helper that maps a custom-unit budget to the largest safe codepoint slice position - Update truncate_message() to accept optional len_fn parameter - Telegram adapter now passes len_fn=utf16_len when splitting messages - Fix fallback truncation in Telegram error handler to use _prefix_within_utf16_limit instead of codepoint slicing - Update send_message_tool.py to use utf16_len for Telegram platform - Add comprehensive tests: utf16_len, _prefix_within_utf16_limit, truncate_message with len_fn (emoji splitting, content preservation, code block handling) - Update mock lambdas in reply_mode tests to accept **kw for len_fn --- gateway/platforms/base.py | 91 +++++++++++++-- gateway/platforms/telegram.py | 10 +- tests/gateway/test_discord_reply_mode.py | 14 +-- tests/gateway/test_platform_base.py | 134 ++++++++++++++++++++++ tests/gateway/test_telegram_reply_mode.py | 10 +- tools/send_message_tool.py | 6 +- 6 files changed, 240 insertions(+), 25 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 352aecb33..f7943da47 100644 --- 
a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -21,6 +21,59 @@ from urllib.parse import urlsplit logger = logging.getLogger(__name__) +def utf16_len(s: str) -> int: + """Count UTF-16 code units in *s*. + + Telegram's message-length limit (4 096) is measured in UTF-16 code units, + **not** Unicode code-points. Characters outside the Basic Multilingual + Plane (emoji like πŸ˜€, CJK Extension B, musical symbols, …) are encoded as + surrogate pairs and therefore consume **two** UTF-16 code units each, even + though Python's ``len()`` counts them as one. + + Ported from nearai/ironclaw#2304 which discovered the same discrepancy in + Rust's ``chars().count()``. + """ + return len(s.encode("utf-16-le")) // 2 + + +def _prefix_within_utf16_limit(s: str, limit: int) -> str: + """Return the longest prefix of *s* whose UTF-16 length ≀ *limit*. + + Unlike a plain ``s[:limit]``, this respects surrogate-pair boundaries so + we never slice a multi-code-unit character in half. + """ + if utf16_len(s) <= limit: + return s + # Binary search for the longest safe prefix + lo, hi = 0, len(s) + while lo < hi: + mid = (lo + hi + 1) // 2 + if utf16_len(s[:mid]) <= limit: + lo = mid + else: + hi = mid - 1 + return s[:lo] + + +def _custom_unit_to_cp(s: str, budget: int, len_fn) -> int: + """Return the largest codepoint offset *n* such that ``len_fn(s[:n]) <= budget``. + + Used by :meth:`BasePlatformAdapter.truncate_message` when *len_fn* measures + length in units different from Python codepoints (e.g. UTF-16 code units). + Falls back to binary search which is O(log n) calls to *len_fn*. + """ + if len_fn(s) <= budget: + return len(s) + lo, hi = 0, len(s) + while lo < hi: + mid = (lo + hi + 1) // 2 + if len_fn(s[:mid]) <= budget: + lo = mid + else: + hi = mid - 1 + return lo + + def is_network_accessible(host: str) -> bool: """Return True if *host* would expose the server beyond loopback. 
@@ -1886,7 +1939,11 @@ class BasePlatformAdapter(ABC): return content @staticmethod - def truncate_message(content: str, max_length: int = 4096) -> List[str]: + def truncate_message( + content: str, + max_length: int = 4096, + len_fn: Optional["Callable[[str], int]"] = None, + ) -> List[str]: """ Split a long message into chunks, preserving code block boundaries. @@ -1898,11 +1955,16 @@ class BasePlatformAdapter(ABC): Args: content: The full message content max_length: Maximum length per chunk (platform-specific) + len_fn: Optional length function for measuring string length. + Defaults to ``len`` (Unicode code-points). Pass + ``utf16_len`` for platforms that measure message + length in UTF-16 code units (e.g. Telegram). Returns: List of message chunks """ - if len(content) <= max_length: + _len = len_fn or len + if _len(content) <= max_length: return [content] INDICATOR_RESERVE = 10 # room for " (XX/XX)" @@ -1921,22 +1983,33 @@ class BasePlatformAdapter(ABC): # How much body text we can fit after accounting for the prefix, # a potential closing fence, and the chunk indicator. - headroom = max_length - INDICATOR_RESERVE - len(prefix) - len(FENCE_CLOSE) + headroom = max_length - INDICATOR_RESERVE - _len(prefix) - _len(FENCE_CLOSE) if headroom < 1: headroom = max_length // 2 # Everything remaining fits in one final chunk - if len(prefix) + len(remaining) <= max_length - INDICATOR_RESERVE: + if _len(prefix) + _len(remaining) <= max_length - INDICATOR_RESERVE: chunks.append(prefix + remaining) break - # Find a natural split point (prefer newlines, then spaces) - region = remaining[:headroom] + # Find a natural split point (prefer newlines, then spaces). + # When _len != len (e.g. utf16_len for Telegram), headroom is + # measured in the custom unit. We need codepoint-based slice + # positions that stay within the custom-unit budget. + # + # _safe_slice_pos() maps a custom-unit budget to the largest + # codepoint offset whose custom length ≀ budget. 
+ if _len is not len: + # Map headroom (custom units) β†’ codepoint slice length + _cp_limit = _custom_unit_to_cp(remaining, headroom, _len) + else: + _cp_limit = headroom + region = remaining[:_cp_limit] split_at = region.rfind("\n") - if split_at < headroom // 2: + if split_at < _cp_limit // 2: split_at = region.rfind(" ") if split_at < 1: - split_at = headroom + split_at = _cp_limit # Avoid splitting inside an inline code span (`...`). # If the text before split_at has an odd number of unescaped @@ -1956,7 +2029,7 @@ class BasePlatformAdapter(ABC): safe_split = candidate.rfind(" ", 0, last_bt) nl_split = candidate.rfind("\n", 0, last_bt) safe_split = max(safe_split, nl_split) - if safe_split > headroom // 4: + if safe_split > _cp_limit // 4: split_at = safe_split chunk_body = remaining[:split_at] diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 265329602..5262e388b 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -66,6 +66,8 @@ from gateway.platforms.base import ( cache_audio_from_bytes, cache_document_from_bytes, SUPPORTED_DOCUMENT_TYPES, + utf16_len, + _prefix_within_utf16_limit, ) from gateway.platforms.telegram_network import ( TelegramFallbackTransport, @@ -799,7 +801,9 @@ class TelegramAdapter(BasePlatformAdapter): try: # Format and split message if needed formatted = self.format_message(content) - chunks = self.truncate_message(formatted, self.MAX_MESSAGE_LENGTH) + chunks = self.truncate_message( + formatted, self.MAX_MESSAGE_LENGTH, len_fn=utf16_len, + ) if len(chunks) > 1: # truncate_message appends a raw " (1/2)" suffix. Escape the # MarkdownV2-special parentheses so Telegram doesn't reject the @@ -970,7 +974,9 @@ class TelegramAdapter(BasePlatformAdapter): # streaming). Truncate and succeed so the stream consumer can # split the overflow into a new message instead of dying. 
if "message_too_long" in err_str or "too long" in err_str: - truncated = content[: self.MAX_MESSAGE_LENGTH - 20] + "…" + truncated = _prefix_within_utf16_limit( + content, self.MAX_MESSAGE_LENGTH - 20 + ) + "…" try: await self._bot.edit_message_text( chat_id=int(chat_id), diff --git a/tests/gateway/test_discord_reply_mode.py b/tests/gateway/test_discord_reply_mode.py index 5a9bb9cd1..2346d086f 100644 --- a/tests/gateway/test_discord_reply_mode.py +++ b/tests/gateway/test_discord_reply_mode.py @@ -124,7 +124,7 @@ class TestSendWithReplyToMode: @pytest.mark.asyncio async def test_off_mode_no_reply_reference(self): adapter, channel, ref_msg = _make_discord_adapter("off") - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"] await adapter.send("12345", "test content", reply_to="999") @@ -137,7 +137,7 @@ class TestSendWithReplyToMode: @pytest.mark.asyncio async def test_first_mode_only_first_chunk_references(self): adapter, channel, ref_msg = _make_discord_adapter("first") - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"] await adapter.send("12345", "test content", reply_to="999") @@ -152,7 +152,7 @@ class TestSendWithReplyToMode: @pytest.mark.asyncio async def test_all_mode_all_chunks_reference(self): adapter, channel, ref_msg = _make_discord_adapter("all") - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"] await adapter.send("12345", "test content", reply_to="999") @@ -165,7 +165,7 @@ class TestSendWithReplyToMode: @pytest.mark.asyncio async def test_no_reply_to_param_no_reference(self): adapter, channel, ref_msg = _make_discord_adapter("all") - adapter.truncate_message = lambda content, max_len: 
["chunk1", "chunk2"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2"] await adapter.send("12345", "test content", reply_to=None) @@ -176,7 +176,7 @@ class TestSendWithReplyToMode: @pytest.mark.asyncio async def test_single_chunk_respects_first_mode(self): adapter, channel, ref_msg = _make_discord_adapter("first") - adapter.truncate_message = lambda content, max_len: ["single chunk"] + adapter.truncate_message = lambda content, max_len, **kw: ["single chunk"] await adapter.send("12345", "test", reply_to="999") @@ -187,7 +187,7 @@ class TestSendWithReplyToMode: @pytest.mark.asyncio async def test_single_chunk_off_mode(self): adapter, channel, ref_msg = _make_discord_adapter("off") - adapter.truncate_message = lambda content, max_len: ["single chunk"] + adapter.truncate_message = lambda content, max_len, **kw: ["single chunk"] await adapter.send("12345", "test", reply_to="999") @@ -200,7 +200,7 @@ class TestSendWithReplyToMode: async def test_invalid_mode_falls_back_to_first_behavior(self): """Invalid mode behaves like 'first' β€” only first chunk gets reference.""" adapter, channel, ref_msg = _make_discord_adapter("banana") - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2"] await adapter.send("12345", "test", reply_to="999") diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index f2d133ea2..690a82095 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -9,6 +9,8 @@ from gateway.platforms.base import ( MessageEvent, MessageType, safe_url_for_log, + utf16_len, + _prefix_within_utf16_limit, ) @@ -448,3 +450,135 @@ class TestGetHumanDelay: with patch.dict(os.environ, env): delay = BasePlatformAdapter._get_human_delay() assert 0.1 <= delay <= 0.2 + + +# --------------------------------------------------------------------------- +# utf16_len / 
# _prefix_within_utf16_limit / truncate_message with len_fn
# ---------------------------------------------------------------------------
# Ported from nearai/ironclaw#2304 — Telegram counts message length in UTF-16
# code units, not Unicode code-points.  Astral-plane characters (emoji, CJK
# Extension B) are surrogate pairs: 1 Python char but 2 UTF-16 units.


class TestUtf16Len:
    """Verify the UTF-16 length helper."""

    def test_ascii(self):
        assert utf16_len("hello") == 5

    def test_bmp_cjk(self):
        # CJK ideographs in the BMP are 1 code unit each
        assert utf16_len("你好") == 2

    def test_emoji_surrogate_pair(self):
        # 😀 (U+1F600) is outside BMP → 2 UTF-16 code units
        assert utf16_len("😀") == 2

    def test_mixed(self):
        # "hi😀" = 2 + 2 = 4 UTF-16 units
        assert utf16_len("hi😀") == 4

    def test_musical_symbol(self):
        # 𝄞 (U+1D11E) — Musical Symbol G Clef, surrogate pair
        assert utf16_len("𝄞") == 2

    def test_empty(self):
        assert utf16_len("") == 0


class TestPrefixWithinUtf16Limit:
    """Verify UTF-16-aware prefix truncation."""

    def test_fits_entirely(self):
        assert _prefix_within_utf16_limit("hello", 10) == "hello"

    def test_ascii_truncation(self):
        result = _prefix_within_utf16_limit("hello world", 5)
        assert result == "hello"
        assert utf16_len(result) <= 5

    def test_does_not_split_surrogate_pair(self):
        # "a😀b" = 1 + 2 + 1 = 4 UTF-16 units; limit 2 should give "a"
        result = _prefix_within_utf16_limit("a😀b", 2)
        assert result == "a"
        assert utf16_len(result) <= 2

    def test_emoji_at_limit(self):
        # "😀" = 2 UTF-16 units; limit 2 should include it
        result = _prefix_within_utf16_limit("😀x", 2)
        assert result == "😀"

    def test_all_emoji(self):
        msg = "😀" * 10  # 20 UTF-16 units
        result = _prefix_within_utf16_limit(msg, 6)
        assert result == "😀😀😀"
        assert utf16_len(result) == 6

    def test_empty(self):
        assert _prefix_within_utf16_limit("", 5) == ""


+class TestTruncateMessageUtf16: + """Verify truncate_message respects UTF-16 lengths when len_fn=utf16_len.""" + + def test_short_emoji_message_no_split(self): + """A short message under the UTF-16 limit should not be split.""" + msg = "Hello πŸ˜€ world" + chunks = BasePlatformAdapter.truncate_message(msg, 4096, len_fn=utf16_len) + assert len(chunks) == 1 + assert chunks[0] == msg + + def test_emoji_near_limit_triggers_split(self): + """A message at 4096 codepoints but >4096 UTF-16 units must split.""" + # 2049 emoji = 2049 codepoints but 4098 UTF-16 units β†’ exceeds 4096 + msg = "πŸ˜€" * 2049 + assert len(msg) == 2049 # Python len sees 2049 chars + assert utf16_len(msg) == 4098 # but it's 4098 UTF-16 units + + # Without UTF-16 awareness, this would NOT split (2049 < 4096) + chunks_naive = BasePlatformAdapter.truncate_message(msg, 4096) + assert len(chunks_naive) == 1, "Without len_fn, no split expected" + + # With UTF-16 awareness, it MUST split + chunks = BasePlatformAdapter.truncate_message(msg, 4096, len_fn=utf16_len) + assert len(chunks) > 1, "With utf16_len, message should be split" + + # Each chunk must fit within the UTF-16 limit + for i, chunk in enumerate(chunks): + assert utf16_len(chunk) <= 4096, ( + f"Chunk {i} exceeds 4096 UTF-16 units: {utf16_len(chunk)}" + ) + + def test_each_utf16_chunk_within_limit(self): + """All chunks produced with utf16_len must fit the limit.""" + # Mix of BMP and astral-plane characters + msg = ("Hello πŸ˜€ world 🎡 test π„ž " * 200).strip() + max_len = 200 + chunks = BasePlatformAdapter.truncate_message(msg, max_len, len_fn=utf16_len) + for i, chunk in enumerate(chunks): + u16_len = utf16_len(chunk) + assert u16_len <= max_len + 20, ( + f"Chunk {i} UTF-16 length {u16_len} exceeds {max_len}" + ) + + def test_all_content_preserved(self): + """Splitting with utf16_len must not lose content.""" + words = ["emojiπŸ˜€", "music🎡", "cjkδ½ ε₯½", "plain"] * 100 + msg = " ".join(words) + chunks = 
BasePlatformAdapter.truncate_message(msg, 200, len_fn=utf16_len) + reassembled = " ".join(chunks) + for word in words: + assert word in reassembled, f"Word '{word}' lost during UTF-16 split" + + def test_code_blocks_preserved_with_utf16(self): + """Code block fence handling should work with utf16_len too.""" + msg = "Before\n```python\n" + "x = 'πŸ˜€'\n" * 200 + "```\nAfter" + chunks = BasePlatformAdapter.truncate_message(msg, 300, len_fn=utf16_len) + assert len(chunks) > 1 + # Each chunk should have balanced fences + for i, chunk in enumerate(chunks): + fence_count = chunk.count("```") + assert fence_count % 2 == 0, ( + f"Chunk {i} has unbalanced fences ({fence_count})" + ) + diff --git a/tests/gateway/test_telegram_reply_mode.py b/tests/gateway/test_telegram_reply_mode.py index 1218afa0c..a433b1801 100644 --- a/tests/gateway/test_telegram_reply_mode.py +++ b/tests/gateway/test_telegram_reply_mode.py @@ -121,7 +121,7 @@ class TestSendWithReplyToMode: adapter = adapter_factory(reply_to_mode="off") adapter._bot = MagicMock() adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1)) - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"] await adapter.send("12345", "test content", reply_to="999") @@ -133,7 +133,7 @@ class TestSendWithReplyToMode: adapter = adapter_factory(reply_to_mode="first") adapter._bot = MagicMock() adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1)) - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"] await adapter.send("12345", "test content", reply_to="999") @@ -148,7 +148,7 @@ class TestSendWithReplyToMode: adapter = adapter_factory(reply_to_mode="all") adapter._bot = MagicMock() adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1)) - 
adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"] await adapter.send("12345", "test content", reply_to="999") @@ -162,7 +162,7 @@ class TestSendWithReplyToMode: adapter = adapter_factory(reply_to_mode="all") adapter._bot = MagicMock() adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1)) - adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2"] + adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2"] await adapter.send("12345", "test content", reply_to=None) @@ -175,7 +175,7 @@ class TestSendWithReplyToMode: adapter = adapter_factory(reply_to_mode="first") adapter._bot = MagicMock() adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1)) - adapter.truncate_message = lambda content, max_len: ["single chunk"] + adapter.truncate_message = lambda content, max_len, **kw: ["single chunk"] await adapter.send("12345", "test", reply_to="999") diff --git a/tools/send_message_tool.py b/tools/send_message_tool.py index 60503c0bc..a2b3e984c 100644 --- a/tools/send_message_tool.py +++ b/tools/send_message_tool.py @@ -322,7 +322,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, (preserves code-block boundaries, adds part indicators). """ from gateway.config import Platform - from gateway.platforms.base import BasePlatformAdapter + from gateway.platforms.base import BasePlatformAdapter, utf16_len from gateway.platforms.telegram import TelegramAdapter from gateway.platforms.discord import DiscordAdapter from gateway.platforms.slack import SlackAdapter @@ -354,9 +354,11 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, # Smart-chunk the message to fit within platform limits. # For short messages or platforms without a known limit this is a no-op. 
+ # Telegram measures length in UTF-16 code units, not Unicode codepoints. max_len = _MAX_LENGTHS.get(platform) if max_len: - chunks = BasePlatformAdapter.truncate_message(message, max_len) + _len_fn = utf16_len if platform == Platform.TELEGRAM else None + chunks = BasePlatformAdapter.truncate_message(message, max_len, len_fn=_len_fn) else: chunks = [message]