mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-26 06:01:49 +00:00
test(stream-consumer): add UTF-16 overflow regression tests for #11170
New TestUtf16OverflowDetection class covers two scenarios:
- test_emoji_text_exceeding_utf16_limit_triggers_overflow_split: feeds 2200 emoji codepoints (4400 UTF-16 units) — under Telegram's 4096 limit when counted in codepoints, but over it when counted in UTF-16 code units. Asserts truncate_message was called with len_fn=utf16_len, confirming the consumer detected the overflow.
- test_codepoint_only_adapter_falls_back_to_len: documents that adapters which don't subclass BasePlatformAdapter (or test MagicMocks) fall back to plain len for backwards compatibility.

The contributor's PR shipped no tests for the UTF-16 path.
This commit is contained in:
parent
c0da5d09a6
commit
121bbe0385
1 changed files with 93 additions and 0 deletions
|
|
@@ -1638,3 +1638,96 @@ class TestOnNewMessageCallback:
|
||||||
await consumer.run()
|
await consumer.run()
|
||||||
|
|
||||||
assert consumer.already_sent is True
|
assert consumer.already_sent is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestUtf16OverflowDetection:
    """Regression coverage for #11170 — Telegram counts message length in
    UTF-16 code units, not Python codepoints. A response with supplementary
    characters (emoji, CJK in some ranges) can have len()=3000 codepoints
    but utf16_len()=5000+ units, blowing past Telegram's 4096 limit."""

    def _make_telegram_like_adapter(self):
        """Construct a minimal BasePlatformAdapter subclass that overrides
        message_len_fn like Telegram does.

        The instance is created with ``__new__`` so ``__init__`` never runs;
        only the two private attributes assigned below are initialised.
        """
        from gateway.platforms.base import utf16_len, BasePlatformAdapter

        TelegramLikeAdapter = type(
            "TelegramLikeAdapter",
            (BasePlatformAdapter,),
            {
                "MAX_MESSAGE_LENGTH": 4096,
                # Read-only property returning the UTF-16 length function.
                "message_len_fn": property(lambda self: utf16_len),
            },
        )
        # Defeat ABCMeta abstract-instantiation guard by clearing the cached
        # abstract methods set after class creation.
        TelegramLikeAdapter.__abstractmethods__ = frozenset()
        adapter = TelegramLikeAdapter.__new__(TelegramLikeAdapter)
        adapter._typing_paused = set()
        adapter._fatal_error_message = None
        return adapter

    @pytest.mark.asyncio
    async def test_emoji_text_exceeding_utf16_limit_triggers_overflow_split(self):
        """A response that is under 4096 codepoints but over 4096 UTF-16
        units must trigger the overflow-split path."""
        from gateway.platforms.base import utf16_len

        adapter = self._make_telegram_like_adapter()
        # Mock the send/edit methods we actually call
        adapter.send = AsyncMock(
            return_value=SimpleNamespace(success=True, message_id="msg_1"),
        )
        adapter.edit_message = AsyncMock(
            return_value=SimpleNamespace(success=True),
        )
        # truncate_message: emit two halves so we can assert the split fired
        adapter.truncate_message = MagicMock(
            side_effect=lambda text, limit, **kw: [
                text[:len(text) // 2],
                text[len(text) // 2:],
            ],
        )

        config = StreamConsumerConfig(edit_interval=0.01, buffer_threshold=5)
        consumer = GatewayStreamConsumer(adapter, "chat_123", config)

        # 🚀 is 1 codepoint = 2 UTF-16 units. 2200 of them = 2200 codepoints,
        # 4400 UTF-16 units. Under the codepoint-equivalent limit (would not
        # trigger split with len()) but over Telegram's UTF-16 4096 limit.
        emoji_text = "🚀" * 2200
        assert len(emoji_text) < adapter.MAX_MESSAGE_LENGTH, (
            "Test setup invariant: codepoint count under limit"
        )
        assert utf16_len(emoji_text) > adapter.MAX_MESSAGE_LENGTH, (
            "Test setup invariant: UTF-16 count over limit"
        )

        consumer.on_delta(emoji_text)
        task = asyncio.create_task(consumer.run())
        await asyncio.sleep(0.05)
        consumer.finish()
        await task

        # The fix: stream consumer detects UTF-16 overflow and calls
        # truncate_message to split. Without the fix, len() would return
        # 2200 (under 4096) and no split would fire — Telegram would then
        # reject the send or render \x00 artifacts.
        #
        # Use a real ``assert`` on ``.called`` so the explanatory message is
        # attached to the failure; ``mock.assert_called(), (msg)`` only
        # builds a throwaway tuple and silently drops the message.
        assert adapter.truncate_message.called, (
            "UTF-16 overflow not detected — emoji text bypassed split path"
        )
        # truncate_message must have been called with len_fn=utf16_len
        call_kwargs = adapter.truncate_message.call_args[1]
        assert call_kwargs.get("len_fn") is utf16_len, (
            f"truncate_message called without utf16_len: {call_kwargs}"
        )

    def test_codepoint_only_adapter_falls_back_to_len(self):
        """Adapters without message_len_fn override (or test MagicMocks)
        must use plain len for backwards compatibility."""
        adapter = MagicMock()
        adapter.MAX_MESSAGE_LENGTH = 4096
        config = StreamConsumerConfig(cursor=" ▉")
        consumer = GatewayStreamConsumer(adapter, "chat_123", config)
        # The isinstance guard means MagicMock adapters get len, not the
        # auto-attr mock. Verified indirectly by all the other tests in
        # this file passing — they all use MagicMock adapters.
        # NOTE(review): the assertion below only proves that construction
        # with a MagicMock adapter does not raise; it does not inspect which
        # length function the consumer selected.
        assert consumer is not None
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue