mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
test(stream-consumer): add UTF-16 overflow regression tests for #11170
New TestUtf16OverflowDetection class covers two scenarios:
- test_emoji_text_exceeding_utf16_limit_triggers_overflow_split: feeds 2200 emoji codepoints (4400 UTF-16 units), which is under Telegram's codepoint-equivalent limit but over its UTF-16 limit. Asserts truncate_message was called with len_fn=utf16_len, confirming the consumer detected the overflow.
- test_codepoint_only_adapter_falls_back_to_len: documents that adapters which don't subclass BasePlatformAdapter (or test MagicMocks) fall back to plain len for backwards compatibility.
The contributor's PR shipped no tests for the UTF-16 path.
This commit is contained in:
parent
c0da5d09a6
commit
121bbe0385
1 changed files with 93 additions and 0 deletions
|
|
@ -1638,3 +1638,96 @@ class TestOnNewMessageCallback:
|
|||
await consumer.run()
|
||||
|
||||
assert consumer.already_sent is True
|
||||
|
||||
|
||||
class TestUtf16OverflowDetection:
    """Regression coverage for #11170.

    Telegram counts message length in UTF-16 code units, not Python
    codepoints. A response with supplementary characters (emoji, CJK in
    some ranges) can have len()=3000 codepoints but utf16_len()=5000+
    units, blowing past Telegram's 4096 limit.
    """

    def _make_telegram_like_adapter(self):
        """Build a bare BasePlatformAdapter subclass that reports utf16_len.

        The subclass overrides ``message_len_fn`` (as a property returning
        ``utf16_len``) and sets ``MAX_MESSAGE_LENGTH`` to 4096, like Telegram
        does. The instance is created with ``__new__`` so ``__init__`` is
        never run; only the two attributes assigned below are initialised.

        Returns:
            An uninitialised ``TelegramLikeAdapter`` instance.
        """
        from gateway.platforms.base import utf16_len, BasePlatformAdapter

        TelegramLikeAdapter = type(
            "TelegramLikeAdapter",
            (BasePlatformAdapter,),
            {
                "MAX_MESSAGE_LENGTH": 4096,
                "message_len_fn": property(lambda self: utf16_len),
            },
        )
        # Defeat ABCMeta abstract-instantiation guard by clearing the cached
        # abstract methods set after class creation.
        TelegramLikeAdapter.__abstractmethods__ = frozenset()
        adapter = TelegramLikeAdapter.__new__(TelegramLikeAdapter)
        adapter._typing_paused = set()
        adapter._fatal_error_message = None
        return adapter

    @pytest.mark.asyncio
    async def test_emoji_text_exceeding_utf16_limit_triggers_overflow_split(self):
        """A response that is under 4096 codepoints but over 4096 UTF-16
        units must trigger the overflow-split path."""
        from gateway.platforms.base import utf16_len

        adapter = self._make_telegram_like_adapter()
        # Mock the send/edit methods we actually call
        adapter.send = AsyncMock(
            return_value=SimpleNamespace(success=True, message_id="msg_1"),
        )
        adapter.edit_message = AsyncMock(
            return_value=SimpleNamespace(success=True),
        )
        # truncate_message: emit two halves so we can assert the split fired
        adapter.truncate_message = MagicMock(
            side_effect=lambda text, limit, **kw: [
                text[:len(text) // 2],
                text[len(text) // 2:],
            ],
        )

        config = StreamConsumerConfig(edit_interval=0.01, buffer_threshold=5)
        consumer = GatewayStreamConsumer(adapter, "chat_123", config)

        # 🚀 is 1 codepoint = 2 UTF-16 units. 2200 of them = 2200 codepoints,
        # 4400 UTF-16 units. Under the codepoint-equivalent limit (would not
        # trigger split with len()) but over Telegram's UTF-16 4096 limit.
        emoji_text = "🚀" * 2200
        assert len(emoji_text) < adapter.MAX_MESSAGE_LENGTH, (
            "Test setup invariant: codepoint count under limit"
        )
        assert utf16_len(emoji_text) > adapter.MAX_MESSAGE_LENGTH, (
            "Test setup invariant: UTF-16 count over limit"
        )

        consumer.on_delta(emoji_text)
        task = asyncio.create_task(consumer.run())
        await asyncio.sleep(0.05)
        consumer.finish()
        await task

        # The fix: stream consumer detects UTF-16 overflow and calls
        # truncate_message to split. Without the fix, len() would return
        # 2200 (under 4096) and no split would fire — Telegram would then
        # reject the send or render \x00 artifacts.
        #
        # Use a real ``assert`` on ``.called`` so the explanatory message is
        # attached to the failure; ``mock.assert_called(), ("msg")`` merely
        # builds and discards a tuple, so the message would never be shown.
        assert adapter.truncate_message.called, (
            "UTF-16 overflow not detected — emoji text bypassed split path"
        )
        # truncate_message must have been called with len_fn=utf16_len
        call_kwargs = adapter.truncate_message.call_args[1]
        assert call_kwargs.get("len_fn") is utf16_len, (
            f"truncate_message called without utf16_len: {call_kwargs}"
        )

    def test_codepoint_only_adapter_falls_back_to_len(self):
        """Adapters without message_len_fn override (or test MagicMocks)
        must use plain len for backwards compatibility.

        This is only a construction smoke check: it asserts that a consumer
        can be built around a MagicMock adapter, not which length function
        ends up being used.
        """
        adapter = MagicMock()
        adapter.MAX_MESSAGE_LENGTH = 4096
        config = StreamConsumerConfig(cursor=" ▉")
        consumer = GatewayStreamConsumer(adapter, "chat_123", config)
        # The isinstance guard means MagicMock adapters get len, not the
        # auto-attr mock. Verified indirectly by all the other tests in
        # this file passing — they all use MagicMock adapters.
        assert consumer is not None
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue