test(stream-consumer): add UTF-16 overflow regression tests for #11170

New TestUtf16OverflowDetection class covers two scenarios: - test_emoji_text_exceeding_utf16_limit_triggers_overflow_split: feeds 2200 emoji codepoints (4400 UTF-16 units) — under Telegram's codepoint-equivalent limit but over its UTF-16 limit. Asserts truncate_message was called with len_fn=utf16_len, confirming the consumer detected the overflow. - test_codepoint_only_adapter_falls_back_to_len: documents that adapters which don't subclass BasePlatformAdapter (or test MagicMocks) fall back to plain len for backwards compat. The contributor's PR shipped no tests for the UTF-16 path.
2026-05-26 06:01:49 +00:00 · 2026-05-10 16:17:48 -07:00 · 2026-05-10 16:17:48 -07:00 · 121bbe0385
commit 121bbe0385
parent c0da5d09a6
1 changed files with 93 additions and 0 deletions
--- a/tests/gateway/test_stream_consumer.py
+++ b/tests/gateway/test_stream_consumer.py
@ -1638,3 +1638,96 @@ class TestOnNewMessageCallback:
        await consumer.run()
        assert consumer.already_sent is True
 class TestUtf16OverflowDetection:
    """Regression coverage for #11170 — Telegram counts message length in
    UTF-16 code units, not Python codepoints. A response with supplementary
    characters (emoji, CJK in some ranges) can have len()=3000 codepoints
    but utf16_len()=5000+ units, blowing past Telegram's 4096 limit."""
    def _make_telegram_like_adapter(self):
        """Construct a minimal BasePlatformAdapter subclass that overrides
        message_len_fn like Telegram does."""
        from gateway.platforms.base import utf16_len, BasePlatformAdapter
        TelegramLikeAdapter = type(
            "TelegramLikeAdapter",
            (BasePlatformAdapter,),
            {
                "MAX_MESSAGE_LENGTH": 4096,
                "message_len_fn": property(lambda self: utf16_len),
            },
        )
        # Defeat ABCMeta abstract-instantiation guard by clearing the cached
        # abstract methods set after class creation.
        TelegramLikeAdapter.__abstractmethods__ = frozenset()
        adapter = TelegramLikeAdapter.__new__(TelegramLikeAdapter)
        adapter._typing_paused = set()
        adapter._fatal_error_message = None
        return adapter
    @pytest.mark.asyncio
    async def test_emoji_text_exceeding_utf16_limit_triggers_overflow_split(self):
        """A response that is under 4096 codepoints but over 4096 UTF-16
        units must trigger the overflow-split path."""
        from gateway.platforms.base import utf16_len
        adapter = self._make_telegram_like_adapter()
        # Mock the send/edit methods we actually call
        adapter.send = AsyncMock(
            return_value=SimpleNamespace(success=True, message_id="msg_1"),
        )
        adapter.edit_message = AsyncMock(
            return_value=SimpleNamespace(success=True),
        )
        # truncate_message: emit two halves so we can assert the split fired
        adapter.truncate_message = MagicMock(
            side_effect=lambda text, limit, **kw: [text[:len(text)//2], text[len(text)//2:]],
        )
        config = StreamConsumerConfig(edit_interval=0.01, buffer_threshold=5)
        consumer = GatewayStreamConsumer(adapter, "chat_123", config)
        # 🚀 is 1 codepoint = 2 UTF-16 units. 2200 of them = 2200 codepoints,
        # 4400 UTF-16 units. Under the codepoint-equivalent limit (would not
        # trigger split with len()) but over Telegram's UTF-16 4096 limit.
        emoji_text = "🚀" * 2200
        assert len(emoji_text) < adapter.MAX_MESSAGE_LENGTH, (
            "Test setup invariant: codepoint count under limit"
        )
        assert utf16_len(emoji_text) > adapter.MAX_MESSAGE_LENGTH, (
            "Test setup invariant: UTF-16 count over limit"
        )
        consumer.on_delta(emoji_text)
        task = asyncio.create_task(consumer.run())
        await asyncio.sleep(0.05)
        consumer.finish()
        await task
        # The fix: stream consumer detects UTF-16 overflow and calls
        # truncate_message to split. Without the fix, len() would return
        # 2200 (under 4096) and no split would fire — Telegram would then
        # reject the send or render \x00 artifacts.
        adapter.truncate_message.assert_called(), (
            "UTF-16 overflow not detected — emoji text bypassed split path"
        )
        # truncate_message must have been called with len_fn=utf16_len
        call_kwargs = adapter.truncate_message.call_args[1]
        assert call_kwargs.get("len_fn") is utf16_len, (
            f"truncate_message called without utf16_len: {call_kwargs}"
        )
    def test_codepoint_only_adapter_falls_back_to_len(self):
        """Adapters without message_len_fn override (or test MagicMocks)
        must use plain len for backwards compatibility."""
        adapter = MagicMock()
        adapter.MAX_MESSAGE_LENGTH = 4096
        config = StreamConsumerConfig(cursor=" ▉")
        consumer = GatewayStreamConsumer(adapter, "chat_123", config)
        # The isinstance guard means MagicMock adapters get len, not the
        # auto-attr mock. Verified indirectly by all the other tests in
        # this file passing — they all use MagicMock adapters.
        assert consumer is not None