diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 0cd79733e..bcd9f09bf 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -2,12 +2,15 @@ """ Transcription Tools Module -Provides speech-to-text transcription with three providers: +Provides speech-to-text transcription with six providers: - **local** (default, free) — faster-whisper running locally, no API key needed. Auto-downloads the model (~150 MB for ``base``) on first use. - **groq** (free tier) — Groq Whisper API, requires ``GROQ_API_KEY``. - **openai** (paid) — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``. + - **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``. + - **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy, + Inverse Text Normalization, diarization, 26 languages. Used by the messaging gateway to automatically transcribe voice messages sent by users on Telegram, Discord, WhatsApp, Slack, and Signal. @@ -73,6 +76,7 @@ COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin") GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1") OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1") +XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1") SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"} LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"} @@ -242,6 +246,14 @@ def _get_provider(stt_config: dict) -> str: ) return "none" + if provider == "xai": + if os.getenv("XAI_API_KEY"): + return "xai" + logger.warning( + "STT provider 'xai' configured but XAI_API_KEY not set" + ) + return "none" + return provider # Unknown — let it fail downstream # --- Auto-detect (no explicit provider): local > groq > openai > mistral - @@ -259,6 +271,9 @@ def _get_provider(stt_config: dict) -> str: if _HAS_MISTRAL and os.getenv("MISTRAL_API_KEY"): logger.info("No local STT available, using Mistral Voxtral Transcribe API") return "mistral" + if os.getenv("XAI_API_KEY"): + logger.info("No local STT available, using xAI Grok STT API") + return "xai" return "none" # --------------------------------------------------------------------------- @@ -573,6 +588,103 @@ def _transcribe_mistral(file_path: str, model_name: str) -> Dict[str, Any]: return {"success": False, "transcript": "", "error": f"Mistral transcription failed: {type(e).__name__}"} +# --------------------------------------------------------------------------- +# Provider: xAI (Grok STT API) +# --------------------------------------------------------------------------- + + +def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]: + """Transcribe using xAI Grok STT API. + + Uses the ``POST /v1/stt`` REST endpoint with multipart/form-data. + Supports Inverse Text Normalization, diarization, and word-level timestamps. + Requires ``XAI_API_KEY`` environment variable. + """ + api_key = os.getenv("XAI_API_KEY") + if not api_key: + return {"success": False, "transcript": "", "error": "XAI_API_KEY not set"} + + stt_config = _load_stt_config() + xai_config = stt_config.get("xai", {}) + base_url = str( + xai_config.get("base_url") + or os.getenv("XAI_STT_BASE_URL") + or XAI_STT_BASE_URL + ).strip().rstrip("/") + language = str( + xai_config.get("language") + or os.getenv("HERMES_LOCAL_STT_LANGUAGE") + or "fr" + ).strip() + use_format = is_truthy_value(xai_config.get("format", True), default=True) + use_diarize = is_truthy_value(xai_config.get("diarize", False), default=False) + + try: + import requests + from tools.xai_http import hermes_xai_user_agent + + data: Dict[str, str] = {} + if language: + data["language"] = language + if use_format: + data["format"] = "true" + if use_diarize: + data["diarize"] = "true" + + with open(file_path, "rb") as audio_file: + response = requests.post( + f"{base_url}/stt", + headers={ + "Authorization": f"Bearer {api_key}", + "User-Agent": hermes_xai_user_agent(), + }, + files={ + "file": (Path(file_path).name, audio_file), + }, + data=data, + timeout=120, + ) + + if response.status_code != 200: + detail = "" + try: + err_body = response.json() + detail = err_body.get("error", {}).get("message", "") or response.text[:300] + except Exception: + detail = response.text[:300] + return { + "success": False, + "transcript": "", + "error": f"xAI STT API error (HTTP {response.status_code}): {detail}", + } + + result = response.json() + transcript_text = result.get("text", "").strip() + + if not transcript_text: + return { + "success": False, + "transcript": "", + "error": "xAI STT returned empty transcript", + } + + logger.info( + "Transcribed %s via xAI Grok STT (lang=%s, %.1fs audio, %d chars)", + Path(file_path).name, + result.get("language", language), + result.get("duration", 0), + len(transcript_text), + ) + + return {"success": True, "transcript": transcript_text, "provider": "xai"} + + except PermissionError: + return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} + except Exception as e: + logger.error("xAI STT transcription failed: %s", e, exc_info=True) + return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"} + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -641,6 +753,12 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A model_name = model or mistral_cfg.get("model", DEFAULT_MISTRAL_STT_MODEL) return _transcribe_mistral(file_path, model_name) + if provider == "xai": + xai_cfg = stt_config.get("xai", {}) + # xAI Grok STT doesn't use a model parameter — pass through for logging + model_name = model or "grok-stt" + return _transcribe_xai(file_path, model_name) + # No provider available return { "success": False, @@ -649,7 +767,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A "No STT provider available. Install faster-whisper for free local " f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, " "set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral " - "Voxtral Transcribe, or set VOICE_TOOLS_OPENAI_KEY " + "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY " "or OPENAI_API_KEY for the OpenAI Whisper API." ), }