fix(telegram): use UTF-16 code units for message length splitting (#8725)

Port from nearai/ironclaw#2304: Telegram's 4096 character limit is
measured in UTF-16 code units, not Unicode codepoints. Characters
outside the Basic Multilingual Plane (emoji like 😀, CJK Extension B,
musical symbols) are surrogate pairs: 1 Python char but 2 UTF-16 units.

Previously, truncate_message() used Python's len() which counts
codepoints. This could produce chunks exceeding Telegram's actual limit
when messages contain many astral-plane characters.

Changes:
- Add utf16_len() helper and _prefix_within_utf16_limit() for
  UTF-16-aware string measurement and truncation
- Add _custom_unit_to_cp() binary-search helper that maps a custom-unit
  budget to the largest safe codepoint slice position
- Update truncate_message() to accept optional len_fn parameter
- Telegram adapter now passes len_fn=utf16_len when splitting messages
- Fix fallback truncation in Telegram error handler to use
  _prefix_within_utf16_limit instead of codepoint slicing
- Update send_message_tool.py to use utf16_len for Telegram platform
- Add comprehensive tests: utf16_len, _prefix_within_utf16_limit,
  truncate_message with len_fn (emoji splitting, content preservation,
  code block handling)
- Update mock lambdas in reply_mode tests to accept **kw for len_fn

This commit is contained in:
Teknium 2026-04-12 19:06:20 -07:00 committed by GitHub
parent 3cd6cbee5f
commit 9e992df8ae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 240 additions and 25 deletions

View file

@ -9,6 +9,8 @@ from gateway.platforms.base import (
MessageEvent,
MessageType,
safe_url_for_log,
utf16_len,
_prefix_within_utf16_limit,
)
@ -448,3 +450,135 @@ class TestGetHumanDelay:
with patch.dict(os.environ, env):
delay = BasePlatformAdapter._get_human_delay()
assert 0.1 <= delay <= 0.2
# ---------------------------------------------------------------------------
# utf16_len / _prefix_within_utf16_limit / truncate_message with len_fn
# ---------------------------------------------------------------------------
# Ported from nearai/ironclaw#2304 — Telegram counts message length in UTF-16
# code units, not Unicode code-points. Astral-plane characters (emoji, CJK
# Extension B) are surrogate pairs: 1 Python char but 2 UTF-16 units.
class TestUtf16Len:
    """Tests for the UTF-16 code-unit length helper."""

    def test_ascii(self):
        # Plain ASCII: one code unit per character.
        assert utf16_len("hello") == 5

    def test_bmp_cjk(self):
        # BMP CJK ideographs occupy a single UTF-16 code unit apiece.
        assert utf16_len("你好") == 2

    def test_emoji_surrogate_pair(self):
        # U+1F600 lies outside the BMP, so it encodes as a surrogate pair.
        assert utf16_len("😀") == 2

    def test_mixed(self):
        # Two ASCII characters plus one astral emoji: 2 + 2 = 4 units.
        assert utf16_len("hi😀") == 4

    def test_musical_symbol(self):
        # U+1D11E (Musical Symbol G Clef) also requires a surrogate pair.
        assert utf16_len("𝄞") == 2

    def test_empty(self):
        # The empty string measures zero code units.
        assert utf16_len("") == 0
class TestPrefixWithinUtf16Limit:
    """Tests for UTF-16-aware prefix truncation."""

    def test_fits_entirely(self):
        # A string already under the budget is returned untouched.
        assert _prefix_within_utf16_limit("hello", 10) == "hello"

    def test_ascii_truncation(self):
        prefix = _prefix_within_utf16_limit("hello world", 5)
        assert prefix == "hello"
        assert utf16_len(prefix) <= 5

    def test_does_not_split_surrogate_pair(self):
        # "a😀b" measures 1 + 2 + 1 = 4 units. A budget of 2 cannot keep
        # half of the emoji, so only "a" survives.
        prefix = _prefix_within_utf16_limit("a😀b", 2)
        assert prefix == "a"
        assert utf16_len(prefix) <= 2

    def test_emoji_at_limit(self):
        # The emoji is exactly 2 units, so a budget of 2 admits it whole.
        assert _prefix_within_utf16_limit("😀x", 2) == "😀"

    def test_all_emoji(self):
        # Ten emoji = 20 UTF-16 units; a budget of 6 keeps exactly three.
        prefix = _prefix_within_utf16_limit("😀" * 10, 6)
        assert prefix == "😀😀😀"
        assert utf16_len(prefix) == 6

    def test_empty(self):
        # Empty input stays empty regardless of the budget.
        assert _prefix_within_utf16_limit("", 5) == ""
class TestTruncateMessageUtf16:
    """Verify truncate_message respects UTF-16 lengths when len_fn=utf16_len."""

    def test_short_emoji_message_no_split(self):
        """A short message under the UTF-16 limit should not be split."""
        msg = "Hello 😀 world"
        chunks = BasePlatformAdapter.truncate_message(msg, 4096, len_fn=utf16_len)
        assert len(chunks) == 1
        assert chunks[0] == msg

    def test_emoji_near_limit_triggers_split(self):
        """A message at 4096 codepoints but >4096 UTF-16 units must split."""
        # 2049 emoji = 2049 codepoints but 4098 UTF-16 units → exceeds 4096
        msg = "😀" * 2049
        assert len(msg) == 2049  # Python len sees 2049 chars
        assert utf16_len(msg) == 4098  # but it's 4098 UTF-16 units
        # Without UTF-16 awareness, this would NOT split (2049 < 4096)
        chunks_naive = BasePlatformAdapter.truncate_message(msg, 4096)
        assert len(chunks_naive) == 1, "Without len_fn, no split expected"
        # With UTF-16 awareness, it MUST split
        chunks = BasePlatformAdapter.truncate_message(msg, 4096, len_fn=utf16_len)
        assert len(chunks) > 1, "With utf16_len, message should be split"
        # Each chunk must fit within the UTF-16 limit
        for i, chunk in enumerate(chunks):
            assert utf16_len(chunk) <= 4096, (
                f"Chunk {i} exceeds 4096 UTF-16 units: {utf16_len(chunk)}"
            )

    def test_each_utf16_chunk_within_limit(self):
        """All chunks produced with utf16_len must fit the limit (plus a
        small fixed allowance for any continuation markers the splitter
        appends to chunks)."""
        # Mix of BMP and astral-plane characters
        msg = ("Hello 😀 world 🎵 test 𝄞 " * 200).strip()
        max_len = 200
        chunks = BasePlatformAdapter.truncate_message(msg, max_len, len_fn=utf16_len)
        # NOTE(review): the +20 allowance presumably covers splitter-added
        # decoration (e.g. code-fence re-opening) — confirm against
        # truncate_message's implementation.
        budget = max_len + 20
        for i, chunk in enumerate(chunks):
            u16_len = utf16_len(chunk)
            # Fixed: the failure message previously claimed a bogus limit
            # ("12,590"); it now reports the actual budget being enforced.
            assert u16_len <= budget, (
                f"Chunk {i} UTF-16 length {u16_len} exceeds budget {budget}"
            )

    def test_all_content_preserved(self):
        """Splitting with utf16_len must not lose content."""
        words = ["emoji😀", "music🎵", "cjk你好", "plain"] * 100
        msg = " ".join(words)
        chunks = BasePlatformAdapter.truncate_message(msg, 200, len_fn=utf16_len)
        reassembled = " ".join(chunks)
        for word in words:
            assert word in reassembled, f"Word '{word}' lost during UTF-16 split"

    def test_code_blocks_preserved_with_utf16(self):
        """Code block fence handling should work with utf16_len too."""
        msg = "Before\n```python\n" + "x = '😀'\n" * 200 + "```\nAfter"
        chunks = BasePlatformAdapter.truncate_message(msg, 300, len_fn=utf16_len)
        assert len(chunks) > 1
        # Each chunk should have balanced fences
        for i, chunk in enumerate(chunks):
            fence_count = chunk.count("```")
            assert fence_count % 2 == 0, (
                f"Chunk {i} has unbalanced fences ({fence_count})"
            )