feat(stt): add xAI Grok STT provider

Add xAI as a sixth STT provider using the POST /v1/stt endpoint.

Features:
- Multipart/form-data upload to api.x.ai/v1/stt
- Inverse Text Normalization (ITN) via format=true (default)
- Optional diarization via config (stt.xai.diarize)
- Language configuration (default: fr, overridable via config or env)
- Custom base_url support (XAI_STT_BASE_URL env or stt.xai.base_url)
- Full provider integration: explicit config + auto-detect fallback chain
- Consistent error handling matching existing provider patterns

Config (config.yaml):
  stt:
    provider: xai
    xai:
      language: fr
      format: true
      diarize: false
      base_url: https://api.x.ai/v1   # optional override

Auto-detect priority: local > groq > openai > mistral > xai > none
This commit is contained in:
Julien Talbot 2026-04-18 15:07:55 +04:00 committed by Teknium
parent bace220d29
commit a6ffa994cd

View file

@ -2,12 +2,15 @@
"""
Transcription Tools Module
Provides speech-to-text transcription with three providers:
Provides speech-to-text transcription with six providers:
- **local** (default, free) faster-whisper running locally, no API key needed.
Auto-downloads the model (~150 MB for ``base``) on first use.
- **groq** (free tier) Groq Whisper API, requires ``GROQ_API_KEY``.
- **openai** (paid) OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
- **mistral** Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``.
- **xai** xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy,
Inverse Text Normalization, diarization, 26 languages.
Used by the messaging gateway to automatically transcribe voice messages
sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
@ -73,6 +76,7 @@ COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1")
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"}
LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
@ -242,6 +246,14 @@ def _get_provider(stt_config: dict) -> str:
)
return "none"
if provider == "xai":
if os.getenv("XAI_API_KEY"):
return "xai"
logger.warning(
"STT provider 'xai' configured but XAI_API_KEY not set"
)
return "none"
return provider # Unknown — let it fail downstream
# --- Auto-detect (no explicit provider): local > groq > openai > mistral -
@ -259,6 +271,9 @@ def _get_provider(stt_config: dict) -> str:
if _HAS_MISTRAL and os.getenv("MISTRAL_API_KEY"):
logger.info("No local STT available, using Mistral Voxtral Transcribe API")
return "mistral"
if os.getenv("XAI_API_KEY"):
logger.info("No local STT available, using xAI Grok STT API")
return "xai"
return "none"
# ---------------------------------------------------------------------------
@ -573,6 +588,103 @@ def _transcribe_mistral(file_path: str, model_name: str) -> Dict[str, Any]:
return {"success": False, "transcript": "", "error": f"Mistral transcription failed: {type(e).__name__}"}
# ---------------------------------------------------------------------------
# Provider: xAI (Grok STT API)
# ---------------------------------------------------------------------------
def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]:
"""Transcribe using xAI Grok STT API.
Uses the ``POST /v1/stt`` REST endpoint with multipart/form-data.
Supports Inverse Text Normalization, diarization, and word-level timestamps.
Requires ``XAI_API_KEY`` environment variable.
"""
api_key = os.getenv("XAI_API_KEY")
if not api_key:
return {"success": False, "transcript": "", "error": "XAI_API_KEY not set"}
stt_config = _load_stt_config()
xai_config = stt_config.get("xai", {})
base_url = str(
xai_config.get("base_url")
or os.getenv("XAI_STT_BASE_URL")
or XAI_STT_BASE_URL
).strip().rstrip("/")
language = str(
xai_config.get("language")
or os.getenv("HERMES_LOCAL_STT_LANGUAGE")
or "fr"
).strip()
use_format = is_truthy_value(xai_config.get("format", True), default=True)
use_diarize = is_truthy_value(xai_config.get("diarize", False), default=False)
try:
import requests
from tools.xai_http import hermes_xai_user_agent
data: Dict[str, str] = {}
if language:
data["language"] = language
if use_format:
data["format"] = "true"
if use_diarize:
data["diarize"] = "true"
with open(file_path, "rb") as audio_file:
response = requests.post(
f"{base_url}/stt",
headers={
"Authorization": f"Bearer {api_key}",
"User-Agent": hermes_xai_user_agent(),
},
files={
"file": (Path(file_path).name, audio_file),
},
data=data,
timeout=120,
)
if response.status_code != 200:
detail = ""
try:
err_body = response.json()
detail = err_body.get("error", {}).get("message", "") or response.text[:300]
except Exception:
detail = response.text[:300]
return {
"success": False,
"transcript": "",
"error": f"xAI STT API error (HTTP {response.status_code}): {detail}",
}
result = response.json()
transcript_text = result.get("text", "").strip()
if not transcript_text:
return {
"success": False,
"transcript": "",
"error": "xAI STT returned empty transcript",
}
logger.info(
"Transcribed %s via xAI Grok STT (lang=%s, %.1fs audio, %d chars)",
Path(file_path).name,
result.get("language", language),
result.get("duration", 0),
len(transcript_text),
)
return {"success": True, "transcript": transcript_text, "provider": "xai"}
except PermissionError:
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
except Exception as e:
logger.error("xAI STT transcription failed: %s", e, exc_info=True)
return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"}
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
@ -641,6 +753,12 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
model_name = model or mistral_cfg.get("model", DEFAULT_MISTRAL_STT_MODEL)
return _transcribe_mistral(file_path, model_name)
if provider == "xai":
xai_cfg = stt_config.get("xai", {})
# xAI Grok STT doesn't use a model parameter — pass through for logging
model_name = model or "grok-stt"
return _transcribe_xai(file_path, model_name)
# No provider available
return {
"success": False,
@ -649,7 +767,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
"No STT provider available. Install faster-whisper for free local "
f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
"set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral "
"Voxtral Transcribe, or set VOICE_TOOLS_OPENAI_KEY "
"Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY "
"or OPENAI_API_KEY for the OpenAI Whisper API."
),
}