mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(stt): add xAI Grok STT provider
Add xAI as a sixth STT provider using the POST /v1/stt endpoint.
Features:
- Multipart/form-data upload to api.x.ai/v1/stt
- Inverse Text Normalization (ITN) via format=true (default)
- Optional diarization via config (stt.xai.diarize)
- Language configuration (default: fr, overridable via config or env)
- Custom base_url support (XAI_STT_BASE_URL env or stt.xai.base_url)
- Full provider integration: explicit config + auto-detect fallback chain
- Consistent error handling matching existing provider patterns
Config (config.yaml):
stt:
provider: xai
xai:
language: fr
format: true
diarize: false
base_url: https://api.x.ai/v1 # optional override
Auto-detect priority: local > groq > openai > mistral > xai > none
This commit is contained in:
parent
bace220d29
commit
a6ffa994cd
1 changed files with 120 additions and 2 deletions
|
|
@ -2,12 +2,15 @@
|
|||
"""
|
||||
Transcription Tools Module
|
||||
|
||||
Provides speech-to-text transcription with three providers:
|
||||
Provides speech-to-text transcription with six providers:
|
||||
|
||||
- **local** (default, free) — faster-whisper running locally, no API key needed.
|
||||
Auto-downloads the model (~150 MB for ``base``) on first use.
|
||||
- **groq** (free tier) — Groq Whisper API, requires ``GROQ_API_KEY``.
|
||||
- **openai** (paid) — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
|
||||
- **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``.
|
||||
- **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy,
|
||||
Inverse Text Normalization, diarization, 26 languages.
|
||||
|
||||
Used by the messaging gateway to automatically transcribe voice messages
|
||||
sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
|
||||
|
|
@ -73,6 +76,7 @@ COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
|
|||
|
||||
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1")
|
||||
|
||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"}
|
||||
LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
|
||||
|
|
@ -242,6 +246,14 @@ def _get_provider(stt_config: dict) -> str:
|
|||
)
|
||||
return "none"
|
||||
|
||||
if provider == "xai":
|
||||
if os.getenv("XAI_API_KEY"):
|
||||
return "xai"
|
||||
logger.warning(
|
||||
"STT provider 'xai' configured but XAI_API_KEY not set"
|
||||
)
|
||||
return "none"
|
||||
|
||||
return provider # Unknown — let it fail downstream
|
||||
|
||||
# --- Auto-detect (no explicit provider): local > groq > openai > mistral -
|
||||
|
|
@ -259,6 +271,9 @@ def _get_provider(stt_config: dict) -> str:
|
|||
if _HAS_MISTRAL and os.getenv("MISTRAL_API_KEY"):
|
||||
logger.info("No local STT available, using Mistral Voxtral Transcribe API")
|
||||
return "mistral"
|
||||
if os.getenv("XAI_API_KEY"):
|
||||
logger.info("No local STT available, using xAI Grok STT API")
|
||||
return "xai"
|
||||
return "none"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -573,6 +588,103 @@ def _transcribe_mistral(file_path: str, model_name: str) -> Dict[str, Any]:
|
|||
return {"success": False, "transcript": "", "error": f"Mistral transcription failed: {type(e).__name__}"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: xAI (Grok STT API)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
"""Transcribe using xAI Grok STT API.
|
||||
|
||||
Uses the ``POST /v1/stt`` REST endpoint with multipart/form-data.
|
||||
Supports Inverse Text Normalization, diarization, and word-level timestamps.
|
||||
Requires ``XAI_API_KEY`` environment variable.
|
||||
"""
|
||||
api_key = os.getenv("XAI_API_KEY")
|
||||
if not api_key:
|
||||
return {"success": False, "transcript": "", "error": "XAI_API_KEY not set"}
|
||||
|
||||
stt_config = _load_stt_config()
|
||||
xai_config = stt_config.get("xai", {})
|
||||
base_url = str(
|
||||
xai_config.get("base_url")
|
||||
or os.getenv("XAI_STT_BASE_URL")
|
||||
or XAI_STT_BASE_URL
|
||||
).strip().rstrip("/")
|
||||
language = str(
|
||||
xai_config.get("language")
|
||||
or os.getenv("HERMES_LOCAL_STT_LANGUAGE")
|
||||
or "fr"
|
||||
).strip()
|
||||
use_format = is_truthy_value(xai_config.get("format", True), default=True)
|
||||
use_diarize = is_truthy_value(xai_config.get("diarize", False), default=False)
|
||||
|
||||
try:
|
||||
import requests
|
||||
from tools.xai_http import hermes_xai_user_agent
|
||||
|
||||
data: Dict[str, str] = {}
|
||||
if language:
|
||||
data["language"] = language
|
||||
if use_format:
|
||||
data["format"] = "true"
|
||||
if use_diarize:
|
||||
data["diarize"] = "true"
|
||||
|
||||
with open(file_path, "rb") as audio_file:
|
||||
response = requests.post(
|
||||
f"{base_url}/stt",
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"User-Agent": hermes_xai_user_agent(),
|
||||
},
|
||||
files={
|
||||
"file": (Path(file_path).name, audio_file),
|
||||
},
|
||||
data=data,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
detail = ""
|
||||
try:
|
||||
err_body = response.json()
|
||||
detail = err_body.get("error", {}).get("message", "") or response.text[:300]
|
||||
except Exception:
|
||||
detail = response.text[:300]
|
||||
return {
|
||||
"success": False,
|
||||
"transcript": "",
|
||||
"error": f"xAI STT API error (HTTP {response.status_code}): {detail}",
|
||||
}
|
||||
|
||||
result = response.json()
|
||||
transcript_text = result.get("text", "").strip()
|
||||
|
||||
if not transcript_text:
|
||||
return {
|
||||
"success": False,
|
||||
"transcript": "",
|
||||
"error": "xAI STT returned empty transcript",
|
||||
}
|
||||
|
||||
logger.info(
|
||||
"Transcribed %s via xAI Grok STT (lang=%s, %.1fs audio, %d chars)",
|
||||
Path(file_path).name,
|
||||
result.get("language", language),
|
||||
result.get("duration", 0),
|
||||
len(transcript_text),
|
||||
)
|
||||
|
||||
return {"success": True, "transcript": transcript_text, "provider": "xai"}
|
||||
|
||||
except PermissionError:
|
||||
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
|
||||
except Exception as e:
|
||||
logger.error("xAI STT transcription failed: %s", e, exc_info=True)
|
||||
return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -641,6 +753,12 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
model_name = model or mistral_cfg.get("model", DEFAULT_MISTRAL_STT_MODEL)
|
||||
return _transcribe_mistral(file_path, model_name)
|
||||
|
||||
if provider == "xai":
|
||||
xai_cfg = stt_config.get("xai", {})
|
||||
# xAI Grok STT doesn't use a model parameter — pass through for logging
|
||||
model_name = model or "grok-stt"
|
||||
return _transcribe_xai(file_path, model_name)
|
||||
|
||||
# No provider available
|
||||
return {
|
||||
"success": False,
|
||||
|
|
@ -649,7 +767,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
"No STT provider available. Install faster-whisper for free local "
|
||||
f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
|
||||
"set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral "
|
||||
"Voxtral Transcribe, or set VOICE_TOOLS_OPENAI_KEY "
|
||||
"Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY "
|
||||
"or OPENAI_API_KEY for the OpenAI Whisper API."
|
||||
),
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue