mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(telegram): use UTF-16 code units for message length splitting (#8725)
Port from nearai/ironclaw#2304: Telegram's 4096 character limit is measured in UTF-16 code units, not Unicode codepoints. Characters outside the Basic Multilingual Plane (emoji like 😀, CJK Extension B, musical symbols) are surrogate pairs: 1 Python char but 2 UTF-16 units. Previously, truncate_message() used Python's len() which counts codepoints. This could produce chunks exceeding Telegram's actual limit when messages contain many astral-plane characters. Changes: - Add utf16_len() helper and _prefix_within_utf16_limit() for UTF-16-aware string measurement and truncation - Add _custom_unit_to_cp() binary-search helper that maps a custom-unit budget to the largest safe codepoint slice position - Update truncate_message() to accept optional len_fn parameter - Telegram adapter now passes len_fn=utf16_len when splitting messages - Fix fallback truncation in Telegram error handler to use _prefix_within_utf16_limit instead of codepoint slicing - Update send_message_tool.py to use utf16_len for Telegram platform - Add comprehensive tests: utf16_len, _prefix_within_utf16_limit, truncate_message with len_fn (emoji splitting, content preservation, code block handling) - Update mock lambdas in reply_mode tests to accept **kw for len_fn
This commit is contained in:
parent
3cd6cbee5f
commit
9e992df8ae
6 changed files with 240 additions and 25 deletions
|
|
@ -21,6 +21,59 @@ from urllib.parse import urlsplit
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def utf16_len(s: str) -> int:
|
||||
"""Count UTF-16 code units in *s*.
|
||||
|
||||
Telegram's message-length limit (4 096) is measured in UTF-16 code units,
|
||||
**not** Unicode code-points. Characters outside the Basic Multilingual
|
||||
Plane (emoji like 😀, CJK Extension B, musical symbols, …) are encoded as
|
||||
surrogate pairs and therefore consume **two** UTF-16 code units each, even
|
||||
though Python's ``len()`` counts them as one.
|
||||
|
||||
Ported from nearai/ironclaw#2304 which discovered the same discrepancy in
|
||||
Rust's ``chars().count()``.
|
||||
"""
|
||||
return len(s.encode("utf-16-le")) // 2
|
||||
|
||||
|
||||
def _prefix_within_utf16_limit(s: str, limit: int) -> str:
|
||||
"""Return the longest prefix of *s* whose UTF-16 length ≤ *limit*.
|
||||
|
||||
Unlike a plain ``s[:limit]``, this respects surrogate-pair boundaries so
|
||||
we never slice a multi-code-unit character in half.
|
||||
"""
|
||||
if utf16_len(s) <= limit:
|
||||
return s
|
||||
# Binary search for the longest safe prefix
|
||||
lo, hi = 0, len(s)
|
||||
while lo < hi:
|
||||
mid = (lo + hi + 1) // 2
|
||||
if utf16_len(s[:mid]) <= limit:
|
||||
lo = mid
|
||||
else:
|
||||
hi = mid - 1
|
||||
return s[:lo]
|
||||
|
||||
|
||||
def _custom_unit_to_cp(s: str, budget: int, len_fn) -> int:
|
||||
"""Return the largest codepoint offset *n* such that ``len_fn(s[:n]) <= budget``.
|
||||
|
||||
Used by :meth:`BasePlatformAdapter.truncate_message` when *len_fn* measures
|
||||
length in units different from Python codepoints (e.g. UTF-16 code units).
|
||||
Falls back to binary search which is O(log n) calls to *len_fn*.
|
||||
"""
|
||||
if len_fn(s) <= budget:
|
||||
return len(s)
|
||||
lo, hi = 0, len(s)
|
||||
while lo < hi:
|
||||
mid = (lo + hi + 1) // 2
|
||||
if len_fn(s[:mid]) <= budget:
|
||||
lo = mid
|
||||
else:
|
||||
hi = mid - 1
|
||||
return lo
|
||||
|
||||
|
||||
def is_network_accessible(host: str) -> bool:
|
||||
"""Return True if *host* would expose the server beyond loopback.
|
||||
|
||||
|
|
@ -1886,7 +1939,11 @@ class BasePlatformAdapter(ABC):
|
|||
return content
|
||||
|
||||
@staticmethod
|
||||
def truncate_message(content: str, max_length: int = 4096) -> List[str]:
|
||||
def truncate_message(
|
||||
content: str,
|
||||
max_length: int = 4096,
|
||||
len_fn: Optional["Callable[[str], int]"] = None,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Split a long message into chunks, preserving code block boundaries.
|
||||
|
||||
|
|
@ -1898,11 +1955,16 @@ class BasePlatformAdapter(ABC):
|
|||
Args:
|
||||
content: The full message content
|
||||
max_length: Maximum length per chunk (platform-specific)
|
||||
len_fn: Optional length function for measuring string length.
|
||||
Defaults to ``len`` (Unicode code-points). Pass
|
||||
``utf16_len`` for platforms that measure message
|
||||
length in UTF-16 code units (e.g. Telegram).
|
||||
|
||||
Returns:
|
||||
List of message chunks
|
||||
"""
|
||||
if len(content) <= max_length:
|
||||
_len = len_fn or len
|
||||
if _len(content) <= max_length:
|
||||
return [content]
|
||||
|
||||
INDICATOR_RESERVE = 10 # room for " (XX/XX)"
|
||||
|
|
@ -1921,22 +1983,33 @@ class BasePlatformAdapter(ABC):
|
|||
|
||||
# How much body text we can fit after accounting for the prefix,
|
||||
# a potential closing fence, and the chunk indicator.
|
||||
headroom = max_length - INDICATOR_RESERVE - len(prefix) - len(FENCE_CLOSE)
|
||||
headroom = max_length - INDICATOR_RESERVE - _len(prefix) - _len(FENCE_CLOSE)
|
||||
if headroom < 1:
|
||||
headroom = max_length // 2
|
||||
|
||||
# Everything remaining fits in one final chunk
|
||||
if len(prefix) + len(remaining) <= max_length - INDICATOR_RESERVE:
|
||||
if _len(prefix) + _len(remaining) <= max_length - INDICATOR_RESERVE:
|
||||
chunks.append(prefix + remaining)
|
||||
break
|
||||
|
||||
# Find a natural split point (prefer newlines, then spaces)
|
||||
region = remaining[:headroom]
|
||||
# Find a natural split point (prefer newlines, then spaces).
|
||||
# When _len != len (e.g. utf16_len for Telegram), headroom is
|
||||
# measured in the custom unit. We need codepoint-based slice
|
||||
# positions that stay within the custom-unit budget.
|
||||
#
|
||||
# _safe_slice_pos() maps a custom-unit budget to the largest
|
||||
# codepoint offset whose custom length ≤ budget.
|
||||
if _len is not len:
|
||||
# Map headroom (custom units) → codepoint slice length
|
||||
_cp_limit = _custom_unit_to_cp(remaining, headroom, _len)
|
||||
else:
|
||||
_cp_limit = headroom
|
||||
region = remaining[:_cp_limit]
|
||||
split_at = region.rfind("\n")
|
||||
if split_at < headroom // 2:
|
||||
if split_at < _cp_limit // 2:
|
||||
split_at = region.rfind(" ")
|
||||
if split_at < 1:
|
||||
split_at = headroom
|
||||
split_at = _cp_limit
|
||||
|
||||
# Avoid splitting inside an inline code span (`...`).
|
||||
# If the text before split_at has an odd number of unescaped
|
||||
|
|
@ -1956,7 +2029,7 @@ class BasePlatformAdapter(ABC):
|
|||
safe_split = candidate.rfind(" ", 0, last_bt)
|
||||
nl_split = candidate.rfind("\n", 0, last_bt)
|
||||
safe_split = max(safe_split, nl_split)
|
||||
if safe_split > headroom // 4:
|
||||
if safe_split > _cp_limit // 4:
|
||||
split_at = safe_split
|
||||
|
||||
chunk_body = remaining[:split_at]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue