fix(telegram): use UTF-16 code units for message length splitting (#8725)

Port from nearai/ironclaw#2304: Telegram's 4096 character limit is
measured in UTF-16 code units, not Unicode codepoints. Characters
outside the Basic Multilingual Plane (emoji like 😀, CJK Extension B,
musical symbols) are surrogate pairs: 1 Python char but 2 UTF-16 units.

Previously, truncate_message() used Python's len() which counts
codepoints. This could produce chunks exceeding Telegram's actual limit
when messages contain many astral-plane characters.

Changes:
- Add utf16_len() helper and _prefix_within_utf16_limit() for
  UTF-16-aware string measurement and truncation
- Add _custom_unit_to_cp() binary-search helper that maps a custom-unit
  budget to the largest safe codepoint slice position
- Update truncate_message() to accept optional len_fn parameter
- Telegram adapter now passes len_fn=utf16_len when splitting messages
- Fix fallback truncation in Telegram error handler to use
  _prefix_within_utf16_limit instead of codepoint slicing
- Update send_message_tool.py to use utf16_len for Telegram platform
- Add comprehensive tests: utf16_len, _prefix_within_utf16_limit,
  truncate_message with len_fn (emoji splitting, content preservation,
  code block handling)
- Update mock lambdas in reply_mode tests to accept **kw for len_fn
This commit is contained in:
Teknium 2026-04-12 19:06:20 -07:00 committed by GitHub
parent 3cd6cbee5f
commit 9e992df8ae
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 240 additions and 25 deletions

View file

@ -21,6 +21,59 @@ from urllib.parse import urlsplit
logger = logging.getLogger(__name__)
def utf16_len(s: str) -> int:
    """Return the number of UTF-16 code units required to encode *s*.

    Telegram's message-length limit (4 096) is measured in UTF-16 code
    units, **not** Unicode code-points.  Characters outside the Basic
    Multilingual Plane (emoji such as 😀, CJK Extension B, musical
    symbols) become surrogate pairs in UTF-16 and therefore consume
    **two** code units each, even though Python's ``len()`` counts them
    as one.

    Ported from nearai/ironclaw#2304, which discovered the same
    discrepancy in Rust's ``chars().count()``.
    """
    # UTF-16-LE stores exactly one code unit per two bytes, so the byte
    # length halved is the code-unit count.
    encoded = s.encode("utf-16-le")
    return len(encoded) // 2
def _prefix_within_utf16_limit(s: str, limit: int) -> str:
"""Return the longest prefix of *s* whose UTF-16 length ≤ *limit*.
Unlike a plain ``s[:limit]``, this respects surrogate-pair boundaries so
we never slice a multi-code-unit character in half.
"""
if utf16_len(s) <= limit:
return s
# Binary search for the longest safe prefix
lo, hi = 0, len(s)
while lo < hi:
mid = (lo + hi + 1) // 2
if utf16_len(s[:mid]) <= limit:
lo = mid
else:
hi = mid - 1
return s[:lo]
def _custom_unit_to_cp(s: str, budget: int, len_fn) -> int:
"""Return the largest codepoint offset *n* such that ``len_fn(s[:n]) <= budget``.
Used by :meth:`BasePlatformAdapter.truncate_message` when *len_fn* measures
length in units different from Python codepoints (e.g. UTF-16 code units).
Falls back to binary search which is O(log n) calls to *len_fn*.
"""
if len_fn(s) <= budget:
return len(s)
lo, hi = 0, len(s)
while lo < hi:
mid = (lo + hi + 1) // 2
if len_fn(s[:mid]) <= budget:
lo = mid
else:
hi = mid - 1
return lo
def is_network_accessible(host: str) -> bool:
"""Return True if *host* would expose the server beyond loopback.
@ -1886,7 +1939,11 @@ class BasePlatformAdapter(ABC):
return content
@staticmethod
def truncate_message(content: str, max_length: int = 4096) -> List[str]:
def truncate_message(
content: str,
max_length: int = 4096,
len_fn: Optional["Callable[[str], int]"] = None,
) -> List[str]:
"""
Split a long message into chunks, preserving code block boundaries.
@ -1898,11 +1955,16 @@ class BasePlatformAdapter(ABC):
Args:
content: The full message content
max_length: Maximum length per chunk (platform-specific)
len_fn: Optional length function for measuring string length.
Defaults to ``len`` (Unicode code-points). Pass
``utf16_len`` for platforms that measure message
length in UTF-16 code units (e.g. Telegram).
Returns:
List of message chunks
"""
if len(content) <= max_length:
_len = len_fn or len
if _len(content) <= max_length:
return [content]
INDICATOR_RESERVE = 10 # room for " (XX/XX)"
@ -1921,22 +1983,33 @@ class BasePlatformAdapter(ABC):
# How much body text we can fit after accounting for the prefix,
# a potential closing fence, and the chunk indicator.
headroom = max_length - INDICATOR_RESERVE - len(prefix) - len(FENCE_CLOSE)
headroom = max_length - INDICATOR_RESERVE - _len(prefix) - _len(FENCE_CLOSE)
if headroom < 1:
headroom = max_length // 2
# Everything remaining fits in one final chunk
if len(prefix) + len(remaining) <= max_length - INDICATOR_RESERVE:
if _len(prefix) + _len(remaining) <= max_length - INDICATOR_RESERVE:
chunks.append(prefix + remaining)
break
# Find a natural split point (prefer newlines, then spaces)
region = remaining[:headroom]
# Find a natural split point (prefer newlines, then spaces).
# When _len != len (e.g. utf16_len for Telegram), headroom is
# measured in the custom unit. We need codepoint-based slice
# positions that stay within the custom-unit budget.
#
# _safe_slice_pos() maps a custom-unit budget to the largest
# codepoint offset whose custom length ≤ budget.
if _len is not len:
# Map headroom (custom units) → codepoint slice length
_cp_limit = _custom_unit_to_cp(remaining, headroom, _len)
else:
_cp_limit = headroom
region = remaining[:_cp_limit]
split_at = region.rfind("\n")
if split_at < headroom // 2:
if split_at < _cp_limit // 2:
split_at = region.rfind(" ")
if split_at < 1:
split_at = headroom
split_at = _cp_limit
# Avoid splitting inside an inline code span (`...`).
# If the text before split_at has an odd number of unescaped
@ -1956,7 +2029,7 @@ class BasePlatformAdapter(ABC):
safe_split = candidate.rfind(" ", 0, last_bt)
nl_split = candidate.rfind("\n", 0, last_bt)
safe_split = max(safe_split, nl_split)
if safe_split > headroom // 4:
if safe_split > _cp_limit // 4:
split_at = safe_split
chunk_body = remaining[:split_at]