diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 718f99795..71e97285e 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -537,6 +537,20 @@ class BasePlatformAdapter(ABC): text = f"{caption}\n{text}" return await self.send(chat_id=chat_id, content=text, reply_to=reply_to) + async def play_tts( + self, + chat_id: str, + audio_path: str, + **kwargs, + ) -> SendResult: + """ + Play auto-TTS audio for voice replies. + + Override in subclasses for invisible playback (e.g. Web UI). + Default falls back to send_voice (shows audio player). + """ + return await self.send_voice(chat_id=chat_id, audio_path=audio_path, **kwargs) + async def send_video( self, chat_id: str, @@ -718,7 +732,31 @@ class BasePlatformAdapter(ABC): if images: logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response)) - # Send the text portion first (if any remains after extractions) + # Auto-TTS: if voice message, generate audio FIRST (before sending text) + _tts_path = None + if event.message_type == MessageType.VOICE and text_content and not media_files: + try: + from tools.tts_tool import text_to_speech_tool, check_tts_requirements + if check_tts_requirements(): + import json as _json + speech_text = re.sub(r'[*_`#\[\]()]', '', text_content)[:4000] + tts_result_str = await asyncio.to_thread( + text_to_speech_tool, text=speech_text + ) + tts_data = _json.loads(tts_result_str) + _tts_path = tts_data.get("file_path") + except Exception as tts_err: + logger.warning("[%s] Auto-TTS failed: %s", self.name, tts_err) + + # Play TTS audio before text (voice-first experience) + if _tts_path and Path(_tts_path).exists(): + await self.play_tts( + chat_id=event.source.chat_id, + audio_path=_tts_path, + metadata=_thread_metadata, + ) + + # Send the text portion if text_content: logger.info("[%s] Sending response (%d chars) to %s", self.name, len(text_content), event.source.chat_id) result = await self.send( @@ -727,7 +765,7 @@ class BasePlatformAdapter(ABC): reply_to=event.message_id, metadata=_thread_metadata, ) - + # Log send failures (don't raise - user already saw tool progress) if not result.success: print(f"[{self.name}] Failed to send response: {result.error}") @@ -740,10 +778,10 @@ class BasePlatformAdapter(ABC): ) if not fallback_result.success: print(f"[{self.name}] Fallback send also failed: {fallback_result.error}") - + # Human-like pacing delay between text and media human_delay = self._get_human_delay() - + # Send extracted images as native attachments if images: logger.info("[%s] Extracted %d image(s) to send as attachments", self.name, len(images)) @@ -771,7 +809,7 @@ class BasePlatformAdapter(ABC): logger.error("[%s] Failed to send image: %s", self.name, img_result.error) except Exception as img_err: logger.error("[%s] Error sending image: %s", self.name, img_err, exc_info=True) - + # Send extracted media files — route by file type _AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'} _VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'} diff --git a/gateway/platforms/web.py b/gateway/platforms/web.py index e8a94892c..741ec62d9 100644 --- a/gateway/platforms/web.py +++ b/gateway/platforms/web.py @@ -218,6 +218,27 @@ class WebAdapter(BasePlatformAdapter): await self._broadcast(payload) return SendResult(success=True, message_id=msg_id) + async def play_tts( + self, + chat_id: str, + audio_path: str, + **kwargs, + ) -> SendResult: + """Play TTS audio invisibly — no bubble in chat, just audio playback.""" + filename = f"tts_{uuid.uuid4().hex[:8]}{Path(audio_path).suffix}" + dest = self._media_dir / filename + try: + shutil.copy2(audio_path, dest) + except Exception as e: + return SendResult(success=False, error=f"Failed to copy audio: {e}") + + payload = { + "type": "play_audio", + "url": f"/media/{filename}", + } + await self._broadcast(payload) + return SendResult(success=True) + async def send_image_file( self, chat_id: str, @@ -551,27 +572,36 @@ def _build_chat_html() -> str: