diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 35c8d386c89..e69600dbbf2 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -439,6 +439,14 @@ class TelegramAdapter(BasePlatformAdapter): self._dm_topic_chat_ids: Set[str] = { str(e["chat_id"]) for e in self._dm_topics_config if "chat_id" in e } + # Document size cap. Telegram's public Bot API caps getFile at 20MB; a + # locally-hosted telegram-bot-api server (configured via extra.base_url) + # raises that to 2GB, so the presence of base_url is the opt-in. + self._max_doc_bytes: int = ( + 2 * 1024 * 1024 * 1024 + if self.config.extra.get("base_url") + else 20 * 1024 * 1024 + ) # Interactive model picker state per chat self._model_picker_state: Dict[str, dict] = {} # Approval button state: message_id → session_key @@ -1315,6 +1323,14 @@ class TelegramAdapter(BasePlatformAdapter): "[%s] Using custom Telegram base_url: %s", self.name, custom_base_url, ) + # In local-mode telegram-bot-api, file_path is an absolute path on the + # server's filesystem rather than a relative HTTP path. PTB needs + # local_mode=True so download_*() reads from disk instead of issuing + # an HTTP GET that would 404. Requires that the same path is + # readable by the Hermes process (shared mount, same machine, etc.). + if self.config.extra.get("local_mode"): + builder = builder.local_mode(True) + logger.info("[%s] Using Telegram local_mode (read files from disk)", self.name) # PTB defaults (pool_timeout=1s) are too aggressive on flaky networks and # can trigger "Pool timeout: All connections in the connection pool are occupied" @@ -4894,11 +4910,11 @@ class TelegramAdapter(BasePlatformAdapter): # Check file size early so image documents cannot bypass the # document size limit by taking the image path. 
def _format_duration(seconds: float) -> str:
    """Render a duration in seconds as ``M:SS`` or ``H:MM:SS``.

    The value is rounded to the nearest whole second and negative values are
    clamped to zero. The leading field is not zero-padded (``0:12``,
    ``1:01:01``).

    Raises:
        ValueError: If *seconds* is NaN (raised by ``round``).
        OverflowError: If *seconds* is infinite (raised by ``round``).
    """
    total = max(int(round(seconds)), 0)
    hours, rem = divmod(total, 3600)
    minutes, secs = divmod(rem, 60)
    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes}:{secs:02d}"


async def _probe_audio_duration(path: str) -> Optional[str]:
    """Best-effort probe of an audio file's duration.

    Probes are tried in order, each falling through to the next on any error:

    1. ``.wav`` files: the stdlib ``wave`` reader (frames / frame rate).
    2. ``.ogg`` / ``.opus`` / ``.oga`` files: mutagen's Ogg Opus parser.
    3. Everything else, or when the probe above failed: an ``ffprobe``
       subprocess with a 5 second budget.

    Blocking file parsing runs in a worker thread so the event loop is not
    stalled.

    Args:
        path: Filesystem path of the audio file.

    Returns:
        The duration formatted by ``_format_duration`` (``M:SS`` or
        ``H:MM:SS``), or ``None`` when every probe fails.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".wav":
        def _wav_duration() -> float:
            import wave
            with wave.open(path, "rb") as wf:
                frames = wf.getnframes()
                # Guard against a zero frame rate to avoid ZeroDivisionError.
                rate = wf.getframerate() or 1
                return frames / float(rate)

        try:
            return _format_duration(await asyncio.to_thread(_wav_duration))
        except Exception:
            # Deliberate best-effort: fall through to ffprobe.
            pass

    if ext in (".ogg", ".opus", ".oga"):
        def _ogg_duration() -> float:
            # Imported lazily so a missing mutagen install only disables
            # this probe (ImportError is swallowed below).
            from mutagen.oggopus import OggOpus
            return float(OggOpus(path).info.length)

        try:
            return _format_duration(await asyncio.to_thread(_ogg_duration))
        except Exception:
            # Deliberate best-effort: fall through to ffprobe.
            pass

    try:
        proc = await asyncio.create_subprocess_exec(
            "ffprobe", "-v", "error", "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1", path,
            stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
        )
        try:
            stdout, _ = await asyncio.wait_for(proc.communicate(), timeout=5.0)
        finally:
            # wait_for() only cancels communicate(); it does not stop the
            # child. Kill and reap it on timeout/cancellation so a hung
            # ffprobe is not leaked.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # child exited between the check and the kill
                await proc.wait()
        if proc.returncode == 0:
            # Non-numeric output such as "N/A" raises ValueError -> None.
            return _format_duration(float(stdout.decode().strip()))
    except Exception:
        # ffprobe missing, timed out, or produced unusable output.
        pass

    return None
- ) - disabled_note += "]" + notes = [] + for path in audio_paths: + abs_path = os.path.abspath(path) + duration_str = await _probe_audio_duration(abs_path) + if duration_str: + notes.append( + f"[The user sent a voice message: {abs_path} (duration: {duration_str})]" + ) + else: + notes.append(f"[The user sent a voice message: {abs_path}]") + if not notes: + return user_text + prefix = "\n\n".join(notes) + _placeholder = "(The user sent a message with no text content)" + if user_text and user_text.strip() == _placeholder: + return prefix if user_text: - return f"{disabled_note}\n\n{user_text}" - return disabled_note + return f"{prefix}\n\n{user_text}" + return prefix from tools.transcription_tools import transcribe_audio diff --git a/tests/gateway/test_stt_config.py b/tests/gateway/test_stt_config.py index 23ba06af226..44dd5950f3c 100644 --- a/tests/gateway/test_stt_config.py +++ b/tests/gateway/test_stt_config.py @@ -33,25 +33,51 @@ def test_load_gateway_config_bridges_stt_enabled_from_config_yaml(tmp_path, monk @pytest.mark.asyncio -async def test_enrich_message_with_transcription_skips_when_stt_disabled(): +async def test_enrich_message_with_transcription_surfaces_path_when_stt_disabled(): from gateway.run import GatewayRunner runner = GatewayRunner.__new__(GatewayRunner) runner.config = GatewayConfig(stt_enabled=False) + runner._has_setup_skill = lambda: True # Should NOT be consulted in disabled branch. 
with patch( "tools.transcription_tools.transcribe_audio", side_effect=AssertionError("transcribe_audio should not be called when STT is disabled"), + ), patch( + "gateway.run._probe_audio_duration", + new=AsyncMock(return_value="0:12"), ): result = await runner._enrich_message_with_transcription( "caption", ["/tmp/voice.ogg"], ) - assert "transcription is disabled" in result.lower() + assert "/tmp/voice.ogg" in result + assert "voice message" in result.lower() + assert "(duration: 0:12)" in result assert "caption" in result +@pytest.mark.asyncio +async def test_enrich_message_with_transcription_omits_duration_on_probe_failure(): + from gateway.run import GatewayRunner + + runner = GatewayRunner.__new__(GatewayRunner) + runner.config = GatewayConfig(stt_enabled=False) + + with patch( + "gateway.run._probe_audio_duration", + new=AsyncMock(return_value=None), + ): + result = await runner._enrich_message_with_transcription( + "", + ["/tmp/voice.ogg"], + ) + + assert "/tmp/voice.ogg" in result + assert "duration" not in result.lower() + + @pytest.mark.asyncio async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors(): from gateway.run import GatewayRunner diff --git a/tests/gateway/test_telegram_max_doc_bytes.py b/tests/gateway/test_telegram_max_doc_bytes.py new file mode 100644 index 00000000000..163dcc9f576 --- /dev/null +++ b/tests/gateway/test_telegram_max_doc_bytes.py @@ -0,0 +1,56 @@ +"""Tests for Telegram document-size cap. + +The public Telegram Bot API caps `getFile` at 20MB. A locally-hosted +`telegram-bot-api` server raises that ceiling to 2GB. We treat the presence +of `extra.base_url` as the explicit opt-in to the higher cap. 
+""" + +import sys +from unittest.mock import MagicMock + +from gateway.config import PlatformConfig + + +def _ensure_telegram_mock(): + if "telegram" in sys.modules and hasattr(sys.modules["telegram"], "__file__"): + return + + telegram_mod = MagicMock() + telegram_mod.ext.ContextTypes.DEFAULT_TYPE = type(None) + telegram_mod.constants.ParseMode.MARKDOWN_V2 = "MarkdownV2" + telegram_mod.constants.ChatType.GROUP = "group" + telegram_mod.constants.ChatType.SUPERGROUP = "supergroup" + telegram_mod.constants.ChatType.CHANNEL = "channel" + telegram_mod.constants.ChatType.PRIVATE = "private" + + for name in ("telegram", "telegram.ext", "telegram.constants", "telegram.request"): + sys.modules.setdefault(name, telegram_mod) + + +_ensure_telegram_mock() + +from gateway.platforms.telegram import TelegramAdapter # noqa: E402 + + +def test_max_doc_bytes_defaults_to_20mb_without_base_url(): + adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***", extra={})) + assert adapter._max_doc_bytes == 20 * 1024 * 1024 + + +def test_max_doc_bytes_raised_to_2gb_when_base_url_set(): + adapter = TelegramAdapter( + PlatformConfig( + enabled=True, + token="***", + extra={"base_url": "http://localhost:8081/bot"}, + ) + ) + assert adapter._max_doc_bytes == 2 * 1024 * 1024 * 1024 + + +def test_max_doc_bytes_empty_base_url_keeps_default(): + """An empty/falsy `base_url` should not flip the cap — only a real URL does.""" + adapter = TelegramAdapter( + PlatformConfig(enabled=True, token="***", extra={"base_url": ""}), + ) + assert adapter._max_doc_bytes == 20 * 1024 * 1024 diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md index 90997e09f6e..f163b291491 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -391,6 +391,11 @@ voice: # Speech-to-Text stt: + enabled: true # set to false to skip auto-transcription — + # the gateway still caches the audio file and + # 
passes its path to the agent as part of the + # inbound message, useful for custom pipelines + # (diarization, alignment, archival, etc.) provider: "local" # "local" (free) | "groq" | "openai" local: model: "base" # tiny, base, small, medium, large-v3 diff --git a/website/docs/user-guide/messaging/telegram.md b/website/docs/user-guide/messaging/telegram.md index 7525a57eec6..4a9dace7d8d 100644 --- a/website/docs/user-guide/messaging/telegram.md +++ b/website/docs/user-guide/messaging/telegram.md @@ -276,6 +276,25 @@ Voice messages you send on Telegram are automatically transcribed by Hermes's co - `groq` uses Groq Whisper and requires `GROQ_API_KEY` - `openai` uses OpenAI Whisper and requires `VOICE_TOOLS_OPENAI_KEY` +#### Skipping STT: pass the raw audio file to the agent + +If you'd rather have the **agent itself** handle audio — for diarization, a custom transcription tool, or just archiving the recording — set `stt.enabled: false` in `~/.hermes/config.yaml`: + +```yaml +stt: + enabled: false +``` + +With STT disabled, the gateway still downloads the voice/audio attachment into Hermes's audio cache, but **does not transcribe it**. The agent receives the message with a marker like the following (the `(duration: …)` suffix is omitted when the length cannot be probed): + +``` +[The user sent a voice message: /home/<user>/.hermes/cache/audio/<file>.ogg (duration: 0:12)] +``` + +Your tools or skills can then read that path directly (e.g., hand it off to a local diarization pipeline, a richer transcription model, or upload it to long-term storage). The file extension reflects the original format Telegram delivered (`.ogg` for voice notes, `.mp3`/`.m4a`/etc. for audio attachments). + +This pairs naturally with the [local Bot API server](#large-files-20mb-via-local-bot-api-server) section below, which lifts Telegram's 20MB getFile ceiling to 2GB — useful when the recordings you want to process are longer than a couple of minutes. 
+ ### Outgoing Voice (Text-to-Speech) When the agent generates audio via TTS, it's delivered as native Telegram **voice bubbles** — the round, inline-playable kind. @@ -295,6 +314,135 @@ Without ffmpeg, Edge TTS audio is sent as a regular audio file (still playable, Configure the TTS provider in your `config.yaml` under the `tts.provider` key. +## Large Files (>20MB) via Local Bot API Server + +Telegram's **public** Bot API caps `getFile` downloads at **20 MB**, so any voice note, audio file, video, or document larger than that is rejected by Hermes with a "too large" reply. The documented way around this is to run a **local** [telegram-bot-api](https://github.com/tdlib/telegram-bot-api) daemon — the same server software Telegram uses, but running on your network. A local server running in `--local` mode raises the file ceiling to **2 GB**, and Hermes auto-lifts its own internal cap when it sees a custom `base_url` configured. + +This unlocks workflows like: + +- Sending long voice memos (45-minute meetings, podcasts) to the bot +- Uploading large videos for vision-tool processing +- Archiving raw audio for offline pipelines like diarization, alignment, or training data + +### Step 1: Obtain Telegram API credentials + +The local server talks directly to Telegram's MTProto layer (not the public Bot API), so it needs **MTProto credentials**: + +1. Visit [my.telegram.org/apps](https://my.telegram.org/apps) and sign in with your Telegram account. +2. Create a new application (any name and short description will do). +3. Copy the `api_id` and `api_hash` — both are required. + +### Step 2: Run the telegram-bot-api server + +The community-maintained [`aiogram/telegram-bot-api`](https://hub.docker.com/r/aiogram/telegram-bot-api) Docker image is the easiest path. 
A minimal `docker-compose.yaml` (use `--local` mode to enable the higher limits): + +```yaml +services: + tg-bot-api: + image: aiogram/telegram-bot-api:latest + container_name: tg-bot-api + restart: unless-stopped + ports: + - "127.0.0.1:8081:8081" # bind to loopback only; see security note + environment: + TELEGRAM_API_ID: "12345" # your api_id from Step 1 + TELEGRAM_API_HASH: "abcdef..." # your api_hash from Step 1 + TELEGRAM_LOCAL: "1" # enable --local mode (raises 20MB → 2GB) + volumes: + - ./tg-bot-api-data:/var/lib/telegram-bot-api +``` + +Bring it up: + +```bash +docker compose up -d tg-bot-api +docker logs --tail 20 tg-bot-api +``` + +:::warning Security +The local Bot API server takes your bot token in the URL path (e.g. `/bot<TOKEN>/getMe`) with **no additional auth**. Anyone who can reach the port can fully control your bot — read every message it can see, send messages as it, etc. Bind the container to `127.0.0.1` and/or front it with a reverse proxy on a private network. **Never expose port 8081 to the public internet.** +::: + +### Step 3: Log the bot out of the public API (one-time) + +A bot can only be active on **one** Bot API server at a time. If your bot was already running against `api.telegram.org` (which it almost certainly was), you must explicitly log it out there before the local server will accept it: + +```bash +curl "https://api.telegram.org/bot<TOKEN>/logOut" +# expected response: {"ok":true,"result":true} +``` + +This is a one-shot migration step — you don't repeat it on every restart. Telegram delivers any messages received after `logOut` through the new server instead. 
+ +Verify the local server can talk to Telegram on the bot's behalf: + +```bash +curl "http://127.0.0.1:8081/bot<TOKEN>/getMe" +# expected response: {"ok":true,"result":{"id":...,"is_bot":true,...}} +``` + +### Step 4: Point Hermes at the local server + +Add the URLs under `platforms.telegram.extra` in `~/.hermes/config.yaml`: + +```yaml +platforms: + telegram: + extra: + base_url: "http://127.0.0.1:8081/bot" + base_file_url: "http://127.0.0.1:8081/file/bot" + local_mode: true # see Step 5 below — only set this if the bot's data + # directory is readable by the Hermes process +``` + +:::caution Use `platforms.telegram.extra`, not `telegram.extra` +At the moment only the `platforms.<name>.extra` form is deep-merged into the platform config. Keys placed directly under a top-level `telegram.extra` block are silently dropped. +::: + +When `base_url` is set, Hermes: + +- Builds the python-telegram-bot client against the local server +- Auto-lifts its internal document/audio size cap from 20 MB → 2 GB +- Reports the active limit in the "too large" error message (`Maximum: 2048 MB.`) so it's obvious which mode you're in + +Restart the gateway and look for a confirmation log line: + +```bash +hermes gateway restart +grep -E "Using custom Telegram base_url|Using Telegram local_mode" ~/.hermes/logs/gateway.log | tail +``` + +### Step 5: `local_mode` — file access on disk + +The local server has **two ways** to deliver files: + +1. **Without `--local`** (the default): files are served over HTTP at `/file/bot<TOKEN>/<file_path>`, same as the public Bot API. The 20MB ceiling stays in effect. Useful as a network-fix only (e.g. when `api.telegram.org` is unreachable but you can self-host); not what you want for the size lift. +2. **With `--local`** (set via `TELEGRAM_LOCAL=1` above): files are written to the server's filesystem and the `getFile` response returns an **absolute path** instead of an HTTP URL. The 20MB ceiling is lifted. Hermes must then read the bytes **from disk**, not over HTTP. 
+ +To make the disk-read path work, set `local_mode: true` in the config above **and** make sure the Hermes process can read the path the server returns. Two scenarios: + +- **Same machine** — telegram-bot-api and Hermes run on the same host. Bind-mount the data volume to a directory that Hermes can read (e.g., `/var/lib/telegram-bot-api`), and make sure the file ownership matches. The container drops privileges to its internal `telegram-bot-api` user (uid varies by image); the simplest fix is to add `user: "<uid>:<gid>"` to the compose service so files are owned by a uid Hermes already runs as. +- **Different machines** — the bot server runs on one host (e.g., a NAS, a separate VM) and Hermes on another. The server's data directory must be shared with the Hermes machine at the **same absolute path** the server reports (typically `/var/lib/telegram-bot-api`). NFS works well for this; CIFS/SMB with `uid=<uid>` mount remapping is friendlier if you don't want to deal with uid mismatches at the filesystem level. + +If `local_mode: true` is set but Hermes can't `stat` the returned file path (permissions or wrong mount), python-telegram-bot silently falls back to an HTTP download from the local server — which in `--local` mode responds with `404 Not Found`. The symptom shows up in `gateway.log` as: + +``` +[Telegram] Failed to cache voice: Not Found +telegram.error.InvalidToken: Not Found +``` + +If you see that, the cap-lift is working but the file-share isn't. Verify `ls -la /var/lib/telegram-bot-api/<TOKEN>/voice/` from the Hermes host as the user the gateway runs as, and confirm a single file is `cat`-able without a permission error. + +### Step 6: Test it + +Send the bot a voice note or audio file that's bigger than 20 MB. Tail the gateway log: + +```bash +tail -f ~/.hermes/logs/gateway.log | grep -iE "telegram|cache" +``` + +You should see a `[Telegram] Cached user voice at /home/<user>/.hermes/cache/audio/...` line and **no** "too large" rejection. 
Combined with `stt.enabled: false` (above), the path to the original audio file then lands in the agent's inbound message for downstream processing. + ## Group Chat Usage Hermes Agent works in Telegram group chats with a few considerations: