From 45735e71a273cfb335adfe4350750789ad4458dc Mon Sep 17 00:00:00 2001
From: Teknium <teknium1@gmail.com>
Date: Sun, 12 Apr 2026 17:43:05 -0700
Subject: [PATCH] fix(telegram): use UTF-16 code units for message length
 splitting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port from nearai/ironclaw#2304: Telegram's 4096 character limit is
measured in UTF-16 code units, not Unicode codepoints. Characters
outside the Basic Multilingual Plane (emoji like 😀, CJK Extension B,
musical symbols) are encoded as surrogate pairs in UTF-16: 1 Python char
but 2 UTF-16 units.

Previously, truncate_message() used Python's len() which counts
codepoints. This could produce chunks exceeding Telegram's actual limit
when messages contain many astral-plane characters.

Changes:
- Add utf16_len() helper and _prefix_within_utf16_limit() for
  UTF-16-aware string measurement and truncation
- Add _custom_unit_to_cp() binary-search helper that maps a custom-unit
  budget to the largest safe codepoint slice position
- Update truncate_message() to accept optional len_fn parameter
- Telegram adapter now passes len_fn=utf16_len when splitting messages
- Fix fallback truncation in Telegram error handler to use
  _prefix_within_utf16_limit instead of codepoint slicing
- Update send_message_tool.py to use utf16_len for Telegram platform
- Add comprehensive tests: utf16_len, _prefix_within_utf16_limit,
  truncate_message with len_fn (emoji splitting, content preservation,
  code block handling)
- Update mock lambdas in reply_mode tests to accept **kw for len_fn
---
 gateway/platforms/base.py                 |  91 +++++++++++++--
 gateway/platforms/telegram.py             |  10 +-
 tests/gateway/test_discord_reply_mode.py  |  14 +--
 tests/gateway/test_platform_base.py       | 134 ++++++++++++++++++++++
 tests/gateway/test_telegram_reply_mode.py |  10 +-
 tools/send_message_tool.py                |   6 +-
 6 files changed, 240 insertions(+), 25 deletions(-)

diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index 352aecb33..f7943da47 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -21,6 +21,59 @@ from urllib.parse import urlsplit
 logger = logging.getLogger(__name__)
 
 
+def utf16_len(s: str) -> int:
+    """Count UTF-16 code units in *s*.
+
+    Telegram's message-length limit (4 096) is measured in UTF-16 code units,
+    **not** Unicode code-points.  Characters outside the Basic Multilingual
+    Plane (emoji like 😀, CJK Extension B, musical symbols, …) are encoded as
+    surrogate pairs and therefore consume **two** UTF-16 code units each, even
+    though Python's ``len()`` counts them as one.
+
+    Ported from nearai/ironclaw#2304 which discovered the same discrepancy in
+    Rust's ``chars().count()``.
+    """
+    return len(s.encode("utf-16-le")) // 2
+
+
+def _prefix_within_utf16_limit(s: str, limit: int) -> str:
+    """Return the longest prefix of *s* whose UTF-16 length ≤ *limit*.
+
+    Unlike a plain ``s[:limit]``, which counts code-points and can overshoot,
+    this measures the prefix in UTF-16 code units so it never exceeds *limit*.
+    """
+    if utf16_len(s) <= limit:
+        return s
+    # Binary search for the longest safe prefix
+    lo, hi = 0, len(s)
+    while lo < hi:
+        mid = (lo + hi + 1) // 2
+        if utf16_len(s[:mid]) <= limit:
+            lo = mid
+        else:
+            hi = mid - 1
+    return s[:lo]
+
+
+def _custom_unit_to_cp(s: str, budget: int, len_fn) -> int:
+    """Return the largest codepoint offset *n* such that ``len_fn(s[:n]) <= budget``.
+
+    Used by :meth:`BasePlatformAdapter.truncate_message` when *len_fn* measures
+    length in units different from Python codepoints (e.g. UTF-16 code units).
+    Falls back to binary search which is O(log n) calls to *len_fn*.
+    """
+    if len_fn(s) <= budget:
+        return len(s)
+    lo, hi = 0, len(s)
+    while lo < hi:
+        mid = (lo + hi + 1) // 2
+        if len_fn(s[:mid]) <= budget:
+            lo = mid
+        else:
+            hi = mid - 1
+    return lo
+
+
 def is_network_accessible(host: str) -> bool:
     """Return True if *host* would expose the server beyond loopback.
 
@@ -1886,7 +1939,11 @@ class BasePlatformAdapter(ABC):
         return content
     
     @staticmethod
-    def truncate_message(content: str, max_length: int = 4096) -> List[str]:
+    def truncate_message(
+        content: str,
+        max_length: int = 4096,
+        len_fn: Optional["Callable[[str], int]"] = None,
+    ) -> List[str]:
         """
         Split a long message into chunks, preserving code block boundaries.
 
@@ -1898,11 +1955,16 @@ class BasePlatformAdapter(ABC):
         Args:
             content: The full message content
             max_length: Maximum length per chunk (platform-specific)
+            len_fn: Optional length function for measuring string length.
+                     Defaults to ``len`` (Unicode code-points).  Pass
+                     ``utf16_len`` for platforms that measure message
+                     length in UTF-16 code units (e.g. Telegram).
 
         Returns:
             List of message chunks
         """
-        if len(content) <= max_length:
+        _len = len_fn or len
+        if _len(content) <= max_length:
             return [content]
 
         INDICATOR_RESERVE = 10   # room for " (XX/XX)"
@@ -1921,22 +1983,33 @@ class BasePlatformAdapter(ABC):
 
             # How much body text we can fit after accounting for the prefix,
             # a potential closing fence, and the chunk indicator.
-            headroom = max_length - INDICATOR_RESERVE - len(prefix) - len(FENCE_CLOSE)
+            headroom = max_length - INDICATOR_RESERVE - _len(prefix) - _len(FENCE_CLOSE)
             if headroom < 1:
                 headroom = max_length // 2
 
             # Everything remaining fits in one final chunk
-            if len(prefix) + len(remaining) <= max_length - INDICATOR_RESERVE:
+            if _len(prefix) + _len(remaining) <= max_length - INDICATOR_RESERVE:
                 chunks.append(prefix + remaining)
                 break
 
-            # Find a natural split point (prefer newlines, then spaces)
-            region = remaining[:headroom]
+            # Find a natural split point (prefer newlines, then spaces).
+            # When _len != len (e.g. utf16_len for Telegram), headroom is
+            # measured in the custom unit.  We need codepoint-based slice
+            # positions that stay within the custom-unit budget.
+            #
+            # _custom_unit_to_cp() maps a custom-unit budget to the largest
+            # codepoint offset whose custom length ≤ budget.
+            if _len is not len:
+                # Map headroom (custom units) → codepoint slice length
+                _cp_limit = _custom_unit_to_cp(remaining, headroom, _len)
+            else:
+                _cp_limit = headroom
+            region = remaining[:_cp_limit]
             split_at = region.rfind("\n")
-            if split_at < headroom // 2:
+            if split_at < _cp_limit // 2:
                 split_at = region.rfind(" ")
             if split_at < 1:
-                split_at = headroom
+                split_at = _cp_limit
 
             # Avoid splitting inside an inline code span (`...`).
             # If the text before split_at has an odd number of unescaped
@@ -1956,7 +2029,7 @@ class BasePlatformAdapter(ABC):
                     safe_split = candidate.rfind(" ", 0, last_bt)
                     nl_split = candidate.rfind("\n", 0, last_bt)
                     safe_split = max(safe_split, nl_split)
-                    if safe_split > headroom // 4:
+                    if safe_split > _cp_limit // 4:
                         split_at = safe_split
 
             chunk_body = remaining[:split_at]
diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py
index 265329602..5262e388b 100644
--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@@ -66,6 +66,8 @@ from gateway.platforms.base import (
     cache_audio_from_bytes,
     cache_document_from_bytes,
     SUPPORTED_DOCUMENT_TYPES,
+    utf16_len,
+    _prefix_within_utf16_limit,
 )
 from gateway.platforms.telegram_network import (
     TelegramFallbackTransport,
@@ -799,7 +801,9 @@ class TelegramAdapter(BasePlatformAdapter):
         try:
             # Format and split message if needed
             formatted = self.format_message(content)
-            chunks = self.truncate_message(formatted, self.MAX_MESSAGE_LENGTH)
+            chunks = self.truncate_message(
+                formatted, self.MAX_MESSAGE_LENGTH, len_fn=utf16_len,
+            )
             if len(chunks) > 1:
                 # truncate_message appends a raw " (1/2)" suffix. Escape the
                 # MarkdownV2-special parentheses so Telegram doesn't reject the
@@ -970,7 +974,9 @@ class TelegramAdapter(BasePlatformAdapter):
             # streaming).  Truncate and succeed so the stream consumer can
             # split the overflow into a new message instead of dying.
             if "message_too_long" in err_str or "too long" in err_str:
-                truncated = content[: self.MAX_MESSAGE_LENGTH - 20] + "…"
+                truncated = _prefix_within_utf16_limit(
+                    content, self.MAX_MESSAGE_LENGTH - 20
+                ) + "…"
                 try:
                     await self._bot.edit_message_text(
                         chat_id=int(chat_id),
diff --git a/tests/gateway/test_discord_reply_mode.py b/tests/gateway/test_discord_reply_mode.py
index 5a9bb9cd1..2346d086f 100644
--- a/tests/gateway/test_discord_reply_mode.py
+++ b/tests/gateway/test_discord_reply_mode.py
@@ -124,7 +124,7 @@ class TestSendWithReplyToMode:
     @pytest.mark.asyncio
     async def test_off_mode_no_reply_reference(self):
         adapter, channel, ref_msg = _make_discord_adapter("off")
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"]
 
         await adapter.send("12345", "test content", reply_to="999")
 
@@ -137,7 +137,7 @@ class TestSendWithReplyToMode:
     @pytest.mark.asyncio
     async def test_first_mode_only_first_chunk_references(self):
         adapter, channel, ref_msg = _make_discord_adapter("first")
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"]
 
         await adapter.send("12345", "test content", reply_to="999")
 
@@ -152,7 +152,7 @@ class TestSendWithReplyToMode:
     @pytest.mark.asyncio
     async def test_all_mode_all_chunks_reference(self):
         adapter, channel, ref_msg = _make_discord_adapter("all")
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"]
 
         await adapter.send("12345", "test content", reply_to="999")
 
@@ -165,7 +165,7 @@ class TestSendWithReplyToMode:
     @pytest.mark.asyncio
     async def test_no_reply_to_param_no_reference(self):
         adapter, channel, ref_msg = _make_discord_adapter("all")
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2"]
 
         await adapter.send("12345", "test content", reply_to=None)
 
@@ -176,7 +176,7 @@ class TestSendWithReplyToMode:
     @pytest.mark.asyncio
     async def test_single_chunk_respects_first_mode(self):
         adapter, channel, ref_msg = _make_discord_adapter("first")
-        adapter.truncate_message = lambda content, max_len: ["single chunk"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["single chunk"]
 
         await adapter.send("12345", "test", reply_to="999")
 
@@ -187,7 +187,7 @@ class TestSendWithReplyToMode:
     @pytest.mark.asyncio
     async def test_single_chunk_off_mode(self):
         adapter, channel, ref_msg = _make_discord_adapter("off")
-        adapter.truncate_message = lambda content, max_len: ["single chunk"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["single chunk"]
 
         await adapter.send("12345", "test", reply_to="999")
 
@@ -200,7 +200,7 @@ class TestSendWithReplyToMode:
     async def test_invalid_mode_falls_back_to_first_behavior(self):
         """Invalid mode behaves like 'first' — only first chunk gets reference."""
         adapter, channel, ref_msg = _make_discord_adapter("banana")
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2"]
 
         await adapter.send("12345", "test", reply_to="999")
 
diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py
index f2d133ea2..690a82095 100644
--- a/tests/gateway/test_platform_base.py
+++ b/tests/gateway/test_platform_base.py
@@ -9,6 +9,8 @@ from gateway.platforms.base import (
     MessageEvent,
     MessageType,
     safe_url_for_log,
+    utf16_len,
+    _prefix_within_utf16_limit,
 )
 
 
@@ -448,3 +450,135 @@ class TestGetHumanDelay:
         with patch.dict(os.environ, env):
             delay = BasePlatformAdapter._get_human_delay()
             assert 0.1 <= delay <= 0.2
+
+
+# ---------------------------------------------------------------------------
+# utf16_len / _prefix_within_utf16_limit / truncate_message with len_fn
+# ---------------------------------------------------------------------------
+# Ported from nearai/ironclaw#2304 — Telegram counts message length in UTF-16
+# code units, not Unicode code-points.  Astral-plane characters (emoji, CJK
+# Extension B) are surrogate pairs: 1 Python char but 2 UTF-16 units.
+
+
+class TestUtf16Len:
+    """Verify the UTF-16 length helper."""
+
+    def test_ascii(self):
+        assert utf16_len("hello") == 5
+
+    def test_bmp_cjk(self):
+        # CJK ideographs in the BMP are 1 code unit each
+        assert utf16_len("你好") == 2
+
+    def test_emoji_surrogate_pair(self):
+        # 😀 (U+1F600) is outside BMP → 2 UTF-16 code units
+        assert utf16_len("😀") == 2
+
+    def test_mixed(self):
+        # "hi😀" = 2 + 2 = 4 UTF-16 units
+        assert utf16_len("hi😀") == 4
+
+    def test_musical_symbol(self):
+        # 𝄞 (U+1D11E) — Musical Symbol G Clef, surrogate pair
+        assert utf16_len("𝄞") == 2
+
+    def test_empty(self):
+        assert utf16_len("") == 0
+
+
+class TestPrefixWithinUtf16Limit:
+    """Verify UTF-16-aware prefix truncation."""
+
+    def test_fits_entirely(self):
+        assert _prefix_within_utf16_limit("hello", 10) == "hello"
+
+    def test_ascii_truncation(self):
+        result = _prefix_within_utf16_limit("hello world", 5)
+        assert result == "hello"
+        assert utf16_len(result) <= 5
+
+    def test_does_not_split_surrogate_pair(self):
+        # "a😀b" = 1 + 2 + 1 = 4 UTF-16 units; limit 2 should give "a"
+        result = _prefix_within_utf16_limit("a😀b", 2)
+        assert result == "a"
+        assert utf16_len(result) <= 2
+
+    def test_emoji_at_limit(self):
+        # "😀" = 2 UTF-16 units; limit 2 should include it
+        result = _prefix_within_utf16_limit("😀x", 2)
+        assert result == "😀"
+
+    def test_all_emoji(self):
+        msg = "😀" * 10  # 20 UTF-16 units
+        result = _prefix_within_utf16_limit(msg, 6)
+        assert result == "😀😀😀"
+        assert utf16_len(result) == 6
+
+    def test_empty(self):
+        assert _prefix_within_utf16_limit("", 5) == ""
+
+
+class TestTruncateMessageUtf16:
+    """Verify truncate_message respects UTF-16 lengths when len_fn=utf16_len."""
+
+    def test_short_emoji_message_no_split(self):
+        """A short message under the UTF-16 limit should not be split."""
+        msg = "Hello 😀 world"
+        chunks = BasePlatformAdapter.truncate_message(msg, 4096, len_fn=utf16_len)
+        assert len(chunks) == 1
+        assert chunks[0] == msg
+
+    def test_emoji_near_limit_triggers_split(self):
+        """A message under 4096 codepoints but >4096 UTF-16 units must split."""
+        # 2049 emoji = 2049 codepoints but 4098 UTF-16 units → exceeds 4096
+        msg = "😀" * 2049
+        assert len(msg) == 2049  # Python len sees 2049 chars
+        assert utf16_len(msg) == 4098  # but it's 4098 UTF-16 units
+
+        # Without UTF-16 awareness, this would NOT split (2049 < 4096)
+        chunks_naive = BasePlatformAdapter.truncate_message(msg, 4096)
+        assert len(chunks_naive) == 1, "Without len_fn, no split expected"
+
+        # With UTF-16 awareness, it MUST split
+        chunks = BasePlatformAdapter.truncate_message(msg, 4096, len_fn=utf16_len)
+        assert len(chunks) > 1, "With utf16_len, message should be split"
+
+        # Each chunk must fit within the UTF-16 limit
+        for i, chunk in enumerate(chunks):
+            assert utf16_len(chunk) <= 4096, (
+                f"Chunk {i} exceeds 4096 UTF-16 units: {utf16_len(chunk)}"
+            )
+
+    def test_each_utf16_chunk_within_limit(self):
+        """All chunks produced with utf16_len must fit the limit."""
+        # Mix of BMP and astral-plane characters
+        msg = ("Hello 😀 world 🎵 test 𝄞 " * 200).strip()
+        max_len = 200
+        chunks = BasePlatformAdapter.truncate_message(msg, max_len, len_fn=utf16_len)
+        for i, chunk in enumerate(chunks):
+            u16_len = utf16_len(chunk)
+            assert u16_len <= max_len + 20, (
+                f"Chunk {i} UTF-16 length {u16_len} exceeds {max_len}"
+            )
+
+    def test_all_content_preserved(self):
+        """Splitting with utf16_len must not lose content."""
+        words = ["emoji😀", "music🎵", "cjk你好", "plain"] * 100
+        msg = " ".join(words)
+        chunks = BasePlatformAdapter.truncate_message(msg, 200, len_fn=utf16_len)
+        reassembled = " ".join(chunks)
+        for word in words:
+            assert word in reassembled, f"Word '{word}' lost during UTF-16 split"
+
+    def test_code_blocks_preserved_with_utf16(self):
+        """Code block fence handling should work with utf16_len too."""
+        msg = "Before\n```python\n" + "x = '😀'\n" * 200 + "```\nAfter"
+        chunks = BasePlatformAdapter.truncate_message(msg, 300, len_fn=utf16_len)
+        assert len(chunks) > 1
+        # Each chunk should have balanced fences
+        for i, chunk in enumerate(chunks):
+            fence_count = chunk.count("```")
+            assert fence_count % 2 == 0, (
+                f"Chunk {i} has unbalanced fences ({fence_count})"
+            )
+
diff --git a/tests/gateway/test_telegram_reply_mode.py b/tests/gateway/test_telegram_reply_mode.py
index 1218afa0c..a433b1801 100644
--- a/tests/gateway/test_telegram_reply_mode.py
+++ b/tests/gateway/test_telegram_reply_mode.py
@@ -121,7 +121,7 @@ class TestSendWithReplyToMode:
         adapter = adapter_factory(reply_to_mode="off")
         adapter._bot = MagicMock()
         adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1))
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"]
 
         await adapter.send("12345", "test content", reply_to="999")
 
@@ -133,7 +133,7 @@ class TestSendWithReplyToMode:
         adapter = adapter_factory(reply_to_mode="first")
         adapter._bot = MagicMock()
         adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1))
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"]
 
         await adapter.send("12345", "test content", reply_to="999")
 
@@ -148,7 +148,7 @@ class TestSendWithReplyToMode:
         adapter = adapter_factory(reply_to_mode="all")
         adapter._bot = MagicMock()
         adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1))
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2", "chunk3"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2", "chunk3"]
 
         await adapter.send("12345", "test content", reply_to="999")
 
@@ -162,7 +162,7 @@ class TestSendWithReplyToMode:
         adapter = adapter_factory(reply_to_mode="all")
         adapter._bot = MagicMock()
         adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1))
-        adapter.truncate_message = lambda content, max_len: ["chunk1", "chunk2"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["chunk1", "chunk2"]
 
         await adapter.send("12345", "test content", reply_to=None)
 
@@ -175,7 +175,7 @@ class TestSendWithReplyToMode:
         adapter = adapter_factory(reply_to_mode="first")
         adapter._bot = MagicMock()
         adapter._bot.send_message = AsyncMock(return_value=MagicMock(message_id=1))
-        adapter.truncate_message = lambda content, max_len: ["single chunk"]
+        adapter.truncate_message = lambda content, max_len, **kw: ["single chunk"]
 
         await adapter.send("12345", "test", reply_to="999")
 
diff --git a/tools/send_message_tool.py b/tools/send_message_tool.py
index 60503c0bc..a2b3e984c 100644
--- a/tools/send_message_tool.py
+++ b/tools/send_message_tool.py
@@ -322,7 +322,7 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
     (preserves code-block boundaries, adds part indicators).
     """
     from gateway.config import Platform
-    from gateway.platforms.base import BasePlatformAdapter
+    from gateway.platforms.base import BasePlatformAdapter, utf16_len
     from gateway.platforms.telegram import TelegramAdapter
     from gateway.platforms.discord import DiscordAdapter
     from gateway.platforms.slack import SlackAdapter
@@ -354,9 +354,11 @@ async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None,
 
     # Smart-chunk the message to fit within platform limits.
     # For short messages or platforms without a known limit this is a no-op.
+    # Telegram measures length in UTF-16 code units, not Unicode codepoints.
     max_len = _MAX_LENGTHS.get(platform)
     if max_len:
-        chunks = BasePlatformAdapter.truncate_message(message, max_len)
+        _len_fn = utf16_len if platform == Platform.TELEGRAM else None
+        chunks = BasePlatformAdapter.truncate_message(message, max_len, len_fn=_len_fn)
     else:
         chunks = [message]