From d0e017bac8faeb3f08680052319c0091cfd335b5 Mon Sep 17 00:00:00 2001 From: Austin Pickett Date: Thu, 11 Jun 2026 10:01:51 -0400 Subject: [PATCH] fix(gateway): gate oversized Telegram voice/audio before download (#44245) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gateway): gate oversized Telegram voice/audio before download Adds a pre-download size check to the Telegram voice and audio inbound paths. Files that exceed _max_doc_bytes (default 20 MB) are rejected before get_file() is called, preventing silent OOM-style stalls on large uploads. A human-readable note is appended to the event text so the model can explain the limit to the user. Also extends 403 entitlement detection in recover_with_credential_pool to cover two additional cases: 'oauth authentication is currently not allowed for this organization' and Anthropic anthropic_messages-mode 403s, both of which should be treated as entitlement failures rather than transient errors. Tests: 7 new cases in test_telegram_voice_v0_regressions.py covering the size gate (accept, reject, note text) and the STT-failure notice path. Salvaged from #40487 (cryptopafi) — cherry-picked the Telegram voice policy and 403 entitlement fixes; LiveKit/Discord/uv.lock workstreams left for separate PRs. * test(gateway): drop orphaned voice tests not backed by this PR The cherry-picked test file from #40487 included 3 tests for STT-failure notice and voice-mode (_handle_voice_command 'on' -> voice_only) behavior that this PR intentionally does NOT salvage (those belong to the LiveKit/ voice-policy workstreams left in #40487). They fail on both this branch and clean main because the feature code isn't present. Keep only the 2 tests backed by code actually in this PR: - test_telegram_audio_size_gate_rejects_oversized_media_before_download (covers the _telegram_media_size_allowed guard this PR adds) - test_voice_tts_is_explicit_audio_reply_opt_in (matches current main) Removed now-unused imports (MessageEvent, MessageType, AsyncMock). --- agent/agent_runtime_helpers.py | 27 +++++-- gateway/platforms/telegram.py | 39 ++++++++++ .../test_telegram_voice_v0_regressions.py | 71 +++++++++++++++++++ 3 files changed, 130 insertions(+), 7 deletions(-) create mode 100644 tests/gateway/test_telegram_voice_v0_regressions.py diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index daffc025d9b..742af145380 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -679,15 +679,28 @@ def recover_with_credential_pool( # long-running TUI sessions stuck on stale tokens until the user # exited and reopened. is_entitlement = agent._is_entitlement_failure(error_context, status_code) + _auth_haystack = " ".join( + str(error_context.get(k) or "").lower() + for k in ("message", "reason", "code", "error") + if isinstance(error_context, dict) + ) + if ( + not is_entitlement + and status_code == 403 + and "oauth authentication is currently not allowed for this organization" in _auth_haystack + ): + is_entitlement = True + if ( + not is_entitlement + and status_code == 403 + and (agent.provider or "") == "anthropic" + and getattr(agent, "api_mode", "") == "anthropic_messages" + ): + is_entitlement = True if not is_entitlement and status_code == 403 and (agent.provider or "") == "xai-oauth": - _disambiguator_haystack = " ".join( - str(error_context.get(k) or "").lower() - for k in ("message", "reason", "code", "error") - if isinstance(error_context, dict) - ) _is_xai_auth_failure = ( - "[wke=unauthenticated:" in _disambiguator_haystack - or "oauth2 access token could not be validated" in _disambiguator_haystack + "[wke=unauthenticated:" in _auth_haystack + or "oauth2 access token could not be validated" in _auth_haystack ) if not _is_xai_auth_failure: is_entitlement = True diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 5cba11d2ee9..eec156bbae9 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -3837,6 +3837,33 @@ class TelegramAdapter(BasePlatformAdapter): ) return error + def _telegram_media_too_large_note(self, label: str, file_size: Any, max_bytes: int) -> str: + limit_mb = max(1, max_bytes // (1024 * 1024)) + try: + size_mb = int(file_size or 0) / (1024 * 1024) + size_text = f"{size_mb:.1f} MB" + except (TypeError, ValueError): + size_text = "unknown size" + return ( + f"[Telegram {label} skipped: file size {size_text} exceeds the " + f"{limit_mb} MB limit. Ask the user to send a shorter voice note " + "or a smaller audio file.]" + ) + + def _telegram_media_size_allowed(self, source: Any, label: str) -> tuple[bool, Optional[str]]: + """Validate Telegram media size before downloading into memory.""" + max_bytes = int(getattr(self, "_max_doc_bytes", 20 * 1024 * 1024) or 20 * 1024 * 1024) + file_size = getattr(source, "file_size", None) + try: + size = int(file_size or 0) + except (TypeError, ValueError): + size = 0 + if size <= 0: + return True, None + if size <= max_bytes: + return True, None + return False, self._telegram_media_too_large_note(label, size, max_bytes) + async def send_voice( self, chat_id: str, @@ -5602,6 +5629,12 @@ class TelegramAdapter(BasePlatformAdapter): # Download voice/audio messages to cache for STT transcription if msg.voice: try: + allowed, note = self._telegram_media_size_allowed(msg.voice, "voice message") + if not allowed: + event.text = self._append_observed_note(event.text, note or "") + logger.info("[Telegram] Skipped oversized user voice (size=%s)", getattr(msg.voice, "file_size", None)) + await self.handle_message(event) + return file_obj = await msg.voice.get_file() audio_bytes = await file_obj.download_as_bytearray() cached_path = cache_audio_from_bytes(bytes(audio_bytes), ext=".ogg") @@ -5612,6 +5645,12 @@ class TelegramAdapter(BasePlatformAdapter): logger.warning("[Telegram] Failed to cache voice: %s", e, exc_info=True) elif msg.audio: try: + allowed, note = self._telegram_media_size_allowed(msg.audio, "audio file") + if not allowed: + event.text = self._append_observed_note(event.text, note or "") + logger.info("[Telegram] Skipped oversized user audio (size=%s)", getattr(msg.audio, "file_size", None)) + await self.handle_message(event) + return file_obj = await msg.audio.get_file() audio_bytes = await file_obj.download_as_bytearray() cached_path = cache_audio_from_bytes(bytes(audio_bytes), ext=".mp3") diff --git a/tests/gateway/test_telegram_voice_v0_regressions.py b/tests/gateway/test_telegram_voice_v0_regressions.py new file mode 100644 index 00000000000..b2b8d4d0e8b --- /dev/null +++ b/tests/gateway/test_telegram_voice_v0_regressions.py @@ -0,0 +1,71 @@ +import sys +from pathlib import Path +from types import SimpleNamespace + +import pytest + + +ROOT = Path(__file__).resolve().parents[2] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +from gateway.config import Platform +from gateway.platforms.telegram import TelegramAdapter +from gateway.run import GatewayRunner +from gateway.session import SessionSource + + +def _source(): + return SessionSource(platform=Platform.TELEGRAM, chat_id="12345", chat_type="dm") + + +def _runner(adapter=None): + runner = object.__new__(GatewayRunner) + runner.config = SimpleNamespace( + stt_enabled=True, + group_sessions_per_user=True, + thread_sessions_per_user=False, + ) + runner.adapters = {Platform.TELEGRAM: adapter} if adapter else {} + runner._consume_pending_native_image_paths = lambda _key: [] + runner._session_key_for_source = lambda _source: "telegram:dm:12345" + runner._thread_metadata_for_source = lambda *_args, **_kwargs: {} + runner._reply_anchor_for_event = lambda _event: None + return runner + + +def test_telegram_audio_size_gate_rejects_oversized_media_before_download(): + adapter = object.__new__(TelegramAdapter) + adapter._max_doc_bytes = 1024 + + allowed, note = adapter._telegram_media_size_allowed( + SimpleNamespace(file_size=2048), + "voice message", + ) + + assert allowed is False + assert "exceeds" in note + assert "voice message" in note + + +@pytest.mark.asyncio +async def test_voice_tts_is_explicit_audio_reply_opt_in(): + adapter = SimpleNamespace( + _auto_tts_disabled_chats=set(), + _auto_tts_enabled_chats=set(), + ) + runner = _runner(adapter) + runner._voice_mode = {} + runner._voice_provider_mode = {} + runner._save_voice_modes = lambda: None + runner._save_voice_provider_modes = lambda: None + + event = SimpleNamespace( + source=_source(), + get_command_args=lambda: "tts", + ) + result = await GatewayRunner._handle_voice_command(runner, event) + + assert runner._voice_mode["telegram:12345"] == "all" + assert "12345" in adapter._auto_tts_enabled_chats + assert result