test(stream-consumer): add UTF-16 overflow regression tests for #11170

New TestUtf16OverflowDetection class covers two scenarios: - test_emoji_text_exceeding_utf16_limit_triggers_overflow_split: feeds 2200 emoji codepoints (4400 UTF-16 units) — under Telegram's codepoint-equivalent limit but over its UTF-16 limit. Asserts truncate_message was called with len_fn=utf16_len, confirming the consumer detected the overflow. - test_codepoint_only_adapter_falls_back_to_len: documents that adapters which don't subclass BasePlatformAdapter (or test MagicMocks) fall back to plain len for backwards compat. The contributor's PR shipped no tests for the UTF-16 path.
2026-07-14 14:12:44 +00:00 · 2026-05-10 16:17:48 -07:00 · 2026-05-10 16:17:48 -07:00 · 121bbe0385
commit 121bbe0385
parent c0da5d09a6
1 changed files with 93 additions and 0 deletions
--- a/tests/gateway/test_stream_consumer.py
+++ b/tests/gateway/test_stream_consumer.py
@ -1638,3 +1638,96 @@ class TestOnNewMessageCallback:
        await consumer.run()

        assert consumer.already_sent is True
+
+
+class TestUtf16OverflowDetection:
+    """Regression coverage for #11170 — Telegram counts message length in
+    UTF-16 code units, not Python codepoints. A response with supplementary
+    characters (emoji, CJK in some ranges) can have len()=3000 codepoints
+    but utf16_len()=5000+ units, blowing past Telegram's 4096 limit."""
+
+    def _make_telegram_like_adapter(self):
+        """Construct a minimal BasePlatformAdapter subclass that overrides
+        message_len_fn like Telegram does."""
+        from gateway.platforms.base import utf16_len, BasePlatformAdapter
+
+        TelegramLikeAdapter = type(
+            "TelegramLikeAdapter",
+            (BasePlatformAdapter,),
+            {
+                "MAX_MESSAGE_LENGTH": 4096,
+                "message_len_fn": property(lambda self: utf16_len),
+            },
+        )
+        # Defeat ABCMeta abstract-instantiation guard by clearing the cached
+        # abstract methods set after class creation.
+        TelegramLikeAdapter.__abstractmethods__ = frozenset()
+        adapter = TelegramLikeAdapter.__new__(TelegramLikeAdapter)
+        adapter._typing_paused = set()
+        adapter._fatal_error_message = None
+        return adapter
+
+    @pytest.mark.asyncio
+    async def test_emoji_text_exceeding_utf16_limit_triggers_overflow_split(self):
+        """A response that is under 4096 codepoints but over 4096 UTF-16
+        units must trigger the overflow-split path."""
+        from gateway.platforms.base import utf16_len
+
+        adapter = self._make_telegram_like_adapter()
+        # Mock the send/edit methods we actually call
+        adapter.send = AsyncMock(
+            return_value=SimpleNamespace(success=True, message_id="msg_1"),
+        )
+        adapter.edit_message = AsyncMock(
+            return_value=SimpleNamespace(success=True),
+        )
+        # truncate_message: emit two halves so we can assert the split fired
+        adapter.truncate_message = MagicMock(
+            side_effect=lambda text, limit, **kw: [text[:len(text)//2], text[len(text)//2:]],
+        )
+
+        config = StreamConsumerConfig(edit_interval=0.01, buffer_threshold=5)
+        consumer = GatewayStreamConsumer(adapter, "chat_123", config)
+
+        # 🚀 is 1 codepoint = 2 UTF-16 units. 2200 of them = 2200 codepoints,
+        # 4400 UTF-16 units. Under the codepoint-equivalent limit (would not
+        # trigger split with len()) but over Telegram's UTF-16 4096 limit.
+        emoji_text = "🚀" * 2200
+        assert len(emoji_text) < adapter.MAX_MESSAGE_LENGTH, (
+            "Test setup invariant: codepoint count under limit"
+        )
+        assert utf16_len(emoji_text) > adapter.MAX_MESSAGE_LENGTH, (
+            "Test setup invariant: UTF-16 count over limit"
+        )
+
+        consumer.on_delta(emoji_text)
+        task = asyncio.create_task(consumer.run())
+        await asyncio.sleep(0.05)
+        consumer.finish()
+        await task
+
+        # The fix: stream consumer detects UTF-16 overflow and calls
+        # truncate_message to split. Without the fix, len() would return
+        # 2200 (under 4096) and no split would fire — Telegram would then
+        # reject the send or render \x00 artifacts.
+        adapter.truncate_message.assert_called(), (
+            "UTF-16 overflow not detected — emoji text bypassed split path"
+        )
+        # truncate_message must have been called with len_fn=utf16_len
+        call_kwargs = adapter.truncate_message.call_args[1]
+        assert call_kwargs.get("len_fn") is utf16_len, (
+            f"truncate_message called without utf16_len: {call_kwargs}"
+        )
+
+    def test_codepoint_only_adapter_falls_back_to_len(self):
+        """Adapters without message_len_fn override (or test MagicMocks)
+        must use plain len for backwards compatibility."""
+        adapter = MagicMock()
+        adapter.MAX_MESSAGE_LENGTH = 4096
+        config = StreamConsumerConfig(cursor=" ▉")
+        consumer = GatewayStreamConsumer(adapter, "chat_123", config)
+        # The isinstance guard means MagicMock adapters get len, not the
+        # auto-attr mock. Verified indirectly by all the other tests in
+        # this file passing — they all use MagicMock adapters.
+        assert consumer is not None
+