From 0afab4a32b3b371ac3b5ab17d745aab823444ae3 Mon Sep 17 00:00:00 2001 From: Franci Penov Date: Thu, 14 May 2026 22:37:51 -0700 Subject: [PATCH] feat(gateway): extract auto-TTS markdown strip into prepare_tts_text() hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the inlined `re.sub(...)[:4000].strip()` cleanup at the auto-TTS site in `_process_message_background` into an overridable method `BasePlatformAdapter.prepare_tts_text(text: str) -> str`. The default implementation is byte-identical to the previous inline expression — strip the characters `` * _ ` # [ ] ( ) ``, truncate to 4000 chars, and trim surrounding whitespace — so every existing adapter (Telegram, Discord, Slack, Matrix, IRC, etc.) gets exactly the same behaviour as before. Zero behaviour change for any consumer that doesn't override the method. Why add the hook: voice-first platform adapters need stricter cleanup than text-bubble platforms. The default strips a handful of markdown sigils, which is fine when the output goes into a Discord embed or a Telegram message bubble — but read aloud by a TTS engine, URLs (`https://example.com/foo`), fenced code blocks, file paths (`/Users/x/foo.py`), and `MEDIA:` tags turn into long sequences of unintelligible characters. With this hook an adapter can drop those spans before TTS while leaving the data-channel transcript intact for visual rendering. Without the hook, voice adapters have to either - duplicate the auto-TTS flow inside their own `handle_response` pipeline, which means re-implementing the entire `extract_media`, `extract_images`, `extract_local_files`, attachment routing and error-handling sequence in `_process_message_background`, or - live with TTS speaking URLs character-by-character. Both are worse than a 7-line method addition. Example consumer: https://github.com/kortexa-ai/hermes-livekit — LiveKit WebRTC voice gateway plugin. 
Its `LiveKitAdapter.prepare_tts_text()` additionally strips fenced code blocks, inline code, URLs, file paths, and `MEDIA:` tags before TTS synthesis, while the full response still reaches connected clients via the data channel. Drop-in installable via `pip install git+https://github.com/kortexa-ai/hermes-livekit.git`. Carved out of #3894 (LiveKit WebRTC gateway PR) so the generic hook can land independently of the LiveKit platform itself. --- gateway/platforms/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 7b3147e21f4..96b56d29cc7 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -2014,6 +2014,13 @@ class BasePlatformAdapter(ABC): text = f"{caption}\n{text}" return await self.send(chat_id=chat_id, content=text, reply_to=reply_to, metadata=metadata) + def prepare_tts_text(self, text: str) -> str: + """Prepare text for TTS. Override to filter tool output, code, etc. + + Default strips markdown formatting and truncates to 4000 chars. + """ + return re.sub(r'[*_`#\[\]()]', '', text)[:4000].strip() + async def play_tts( self, chat_id: str, @@ -3144,7 +3151,7 @@ class BasePlatformAdapter(ABC): from tools.tts_tool import text_to_speech_tool, check_tts_requirements if check_tts_requirements(): import json as _json - speech_text = re.sub(r'[*_`#\[\]()]', '', text_content)[:4000].strip() + speech_text = self.prepare_tts_text(text_content) if not speech_text: raise ValueError("Empty text after markdown cleanup") tts_result_str = await asyncio.to_thread(