mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat: add Groq STT support and fix voice mode keybinding
- Add multi-provider STT support (OpenAI preferred, Groq as fallback) in transcription_tools
- Auto-correct model selection when the resolved provider doesn't support the configured model
- Change voice record key from Ctrl+Space to Ctrl+R (macOS compatibility)
- Fix duplicate transcript echo in voice pipeline
- Add GROQ_API_KEY to .env.example
This commit is contained in:
parent
1a6fbef8a9
commit
ec32e9a540
5 changed files with 173 additions and 225 deletions
|
|
@ -275,3 +275,6 @@ WANDB_API_KEY=
|
||||||
# GITHUB_APP_ID=
|
# GITHUB_APP_ID=
|
||||||
# GITHUB_APP_PRIVATE_KEY_PATH=
|
# GITHUB_APP_PRIVATE_KEY_PATH=
|
||||||
# GITHUB_APP_INSTALLATION_ID=
|
# GITHUB_APP_INSTALLATION_ID=
|
||||||
|
|
||||||
|
# Groq API key (free tier — used for Whisper STT in voice mode)
|
||||||
|
# GROQ_API_KEY=
|
||||||
|
|
|
||||||
11
cli.py
11
cli.py
|
|
@ -3539,7 +3539,7 @@ class HermesCLI:
|
||||||
|
|
||||||
self._voice_recorder.start()
|
self._voice_recorder.start()
|
||||||
self._voice_recording = True
|
self._voice_recording = True
|
||||||
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+Space to stop, Ctrl+C to cancel){_RST}")
|
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}")
|
||||||
|
|
||||||
def _voice_stop_and_transcribe(self):
|
def _voice_stop_and_transcribe(self):
|
||||||
"""Stop recording, transcribe via STT, and queue the transcript as input."""
|
"""Stop recording, transcribe via STT, and queue the transcript as input."""
|
||||||
|
|
@ -3573,7 +3573,6 @@ class HermesCLI:
|
||||||
|
|
||||||
if result.get("success") and result.get("transcript", "").strip():
|
if result.get("success") and result.get("transcript", "").strip():
|
||||||
transcript = result["transcript"].strip()
|
transcript = result["transcript"].strip()
|
||||||
_cprint(f"\n{_GOLD}●{_RST} {_BOLD}{transcript}{_RST}")
|
|
||||||
self._pending_input.put(transcript)
|
self._pending_input.put(transcript)
|
||||||
elif result.get("success"):
|
elif result.get("success"):
|
||||||
_cprint(f"{_DIM}No speech detected.{_RST}")
|
_cprint(f"{_DIM}No speech detected.{_RST}")
|
||||||
|
|
@ -3663,7 +3662,7 @@ class HermesCLI:
|
||||||
|
|
||||||
tts_status = " (TTS enabled)" if self._voice_tts else ""
|
tts_status = " (TTS enabled)" if self._voice_tts else ""
|
||||||
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
|
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
|
||||||
_cprint(f" {_DIM}Ctrl+Space to start/stop recording{_RST}")
|
_cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}")
|
||||||
_cprint(f" {_DIM}/voice tts to toggle speech output{_RST}")
|
_cprint(f" {_DIM}/voice tts to toggle speech output{_RST}")
|
||||||
_cprint(f" {_DIM}/voice off to disable voice mode{_RST}")
|
_cprint(f" {_DIM}/voice off to disable voice mode{_RST}")
|
||||||
|
|
||||||
|
|
@ -3703,7 +3702,7 @@ class HermesCLI:
|
||||||
_cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}")
|
_cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}")
|
||||||
_cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}")
|
_cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}")
|
||||||
_cprint(f" Recording: {'YES' if self._voice_recording else 'no'}")
|
_cprint(f" Recording: {'YES' if self._voice_recording else 'no'}")
|
||||||
_cprint(f" Record key: Ctrl+Space")
|
_cprint(f" Record key: Ctrl+R")
|
||||||
_cprint(f"\n {_BOLD}Requirements:{_RST}")
|
_cprint(f"\n {_BOLD}Requirements:{_RST}")
|
||||||
for line in reqs["details"].split("\n"):
|
for line in reqs["details"].split("\n"):
|
||||||
_cprint(f" {line}")
|
_cprint(f" {line}")
|
||||||
|
|
@ -4715,7 +4714,7 @@ class HermesCLI:
|
||||||
|
|
||||||
def _get_placeholder():
|
def _get_placeholder():
|
||||||
if cli_ref._voice_recording:
|
if cli_ref._voice_recording:
|
||||||
return "recording... Ctrl+Space to stop, Ctrl+C to cancel"
|
return "recording... Ctrl+R to stop, Ctrl+C to cancel"
|
||||||
if cli_ref._voice_processing:
|
if cli_ref._voice_processing:
|
||||||
return "transcribing..."
|
return "transcribing..."
|
||||||
if cli_ref._sudo_state:
|
if cli_ref._sudo_state:
|
||||||
|
|
@ -4735,7 +4734,7 @@ class HermesCLI:
|
||||||
if cli_ref._agent_running:
|
if cli_ref._agent_running:
|
||||||
return "type a message + Enter to interrupt, Ctrl+C to cancel"
|
return "type a message + Enter to interrupt, Ctrl+C to cancel"
|
||||||
if cli_ref._voice_mode:
|
if cli_ref._voice_mode:
|
||||||
return "type or Ctrl+Space to record"
|
return "type or Ctrl+R to record"
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder))
|
input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder))
|
||||||
|
|
|
||||||
|
|
@ -204,7 +204,7 @@ DEFAULT_CONFIG = {
|
||||||
},
|
},
|
||||||
|
|
||||||
"voice": {
|
"voice": {
|
||||||
"record_key": "ctrl+space",
|
"record_key": "ctrl+r",
|
||||||
"max_recording_seconds": 120,
|
"max_recording_seconds": 120,
|
||||||
"auto_tts": False,
|
"auto_tts": False,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -2,19 +2,21 @@
|
||||||
"""
|
"""
|
||||||
Transcription Tools Module
|
Transcription Tools Module
|
||||||
|
|
||||||
Provides speech-to-text transcription with two providers:
|
Provides speech-to-text transcription using OpenAI-compatible Whisper APIs.
|
||||||
|
Supports multiple providers with automatic fallback:
|
||||||
- **local** (default, free) — faster-whisper running locally, no API key needed.
|
1. OpenAI (VOICE_TOOLS_OPENAI_KEY) -- paid
|
||||||
Auto-downloads the model (~150 MB for ``base``) on first use.
|
2. Groq (GROQ_API_KEY) -- free tier available
|
||||||
- **openai** — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``.
|
|
||||||
|
|
||||||
Used by the messaging gateway to automatically transcribe voice messages
|
Used by the messaging gateway to automatically transcribe voice messages
|
||||||
sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
|
sent by users on Telegram, Discord, WhatsApp, and Slack.
|
||||||
|
|
||||||
|
Supported models:
|
||||||
|
OpenAI: whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe
|
||||||
|
Groq: whisper-large-v3, whisper-large-v3-turbo, distil-whisper-large-v3-en
|
||||||
|
|
||||||
Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg
|
Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg
|
||||||
|
|
||||||
Usage::
|
Usage:
|
||||||
|
|
||||||
from tools.transcription_tools import transcribe_audio
|
from tools.transcription_tools import transcribe_audio
|
||||||
|
|
||||||
result = transcribe_audio("/path/to/audio.ogg")
|
result = transcribe_audio("/path/to/audio.ogg")
|
||||||
|
|
@ -25,241 +27,181 @@ Usage::
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Dict, Any
|
from typing import Optional, Dict, Any, Tuple
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Optional imports — graceful degradation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
try:
|
# Default STT models per provider
|
||||||
from faster_whisper import WhisperModel
|
DEFAULT_STT_MODEL = "whisper-1"
|
||||||
_HAS_FASTER_WHISPER = True
|
DEFAULT_GROQ_STT_MODEL = "whisper-large-v3-turbo"
|
||||||
except ImportError:
|
|
||||||
_HAS_FASTER_WHISPER = False
|
|
||||||
WhisperModel = None # type: ignore[assignment,misc]
|
|
||||||
|
|
||||||
try:
|
# Provider endpoints
|
||||||
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
GROQ_BASE_URL = "https://api.groq.com/openai/v1"
|
||||||
_HAS_OPENAI = True
|
OPENAI_BASE_URL = "https://api.openai.com/v1"
|
||||||
except ImportError:
|
|
||||||
_HAS_OPENAI = False
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Constants
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
DEFAULT_PROVIDER = "local"
|
|
||||||
DEFAULT_LOCAL_MODEL = "base"
|
|
||||||
DEFAULT_OPENAI_MODEL = "whisper-1"
|
|
||||||
|
|
||||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
|
||||||
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
|
|
||||||
|
|
||||||
# Singleton for the local model — loaded once, reused across calls
|
|
||||||
_local_model: Optional["WhisperModel"] = None
|
|
||||||
_local_model_name: Optional[str] = None
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Config helpers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _load_stt_config() -> dict:
|
def _resolve_stt_provider() -> Tuple[Optional[str], Optional[str], str]:
|
||||||
"""Load the ``stt`` section from user config, falling back to defaults."""
|
"""Resolve which STT provider to use based on available API keys.
|
||||||
try:
|
|
||||||
from hermes_cli.config import load_config
|
|
||||||
return load_config().get("stt", {})
|
|
||||||
except Exception:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
Returns:
|
||||||
def _get_provider(stt_config: dict) -> str:
|
Tuple of (api_key, base_url, provider_name).
|
||||||
"""Determine which STT provider to use.
|
api_key is None if no provider is available.
|
||||||
|
|
||||||
Priority:
|
|
||||||
1. Explicit config value (``stt.provider``)
|
|
||||||
2. Auto-detect: local if faster-whisper available, else openai if key set
|
|
||||||
3. Disabled (returns "none")
|
|
||||||
"""
|
"""
|
||||||
provider = stt_config.get("provider", DEFAULT_PROVIDER)
|
openai_key = os.getenv("VOICE_TOOLS_OPENAI_KEY")
|
||||||
|
if openai_key:
|
||||||
|
return openai_key, OPENAI_BASE_URL, "openai"
|
||||||
|
|
||||||
if provider == "local":
|
groq_key = os.getenv("GROQ_API_KEY")
|
||||||
if _HAS_FASTER_WHISPER:
|
if groq_key:
|
||||||
return "local"
|
return groq_key, GROQ_BASE_URL, "groq"
|
||||||
# Local requested but not available — fall back to openai if possible
|
|
||||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
|
||||||
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
|
|
||||||
return "openai"
|
|
||||||
return "none"
|
|
||||||
|
|
||||||
if provider == "openai":
|
return None, None, "none"
|
||||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
|
||||||
return "openai"
|
|
||||||
# OpenAI requested but no key — fall back to local if possible
|
|
||||||
if _HAS_FASTER_WHISPER:
|
|
||||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper")
|
|
||||||
return "local"
|
|
||||||
return "none"
|
|
||||||
|
|
||||||
return provider # Unknown — let it fail downstream
|
# Supported audio formats
|
||||||
|
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Maximum file size (25MB - OpenAI limit)
|
||||||
# Shared validation
|
MAX_FILE_SIZE = 25 * 1024 * 1024
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
|
|
||||||
"""Validate the audio file. Returns an error dict or None if OK."""
|
|
||||||
audio_path = Path(file_path)
|
|
||||||
|
|
||||||
if not audio_path.exists():
|
|
||||||
return {"success": False, "transcript": "", "error": f"Audio file not found: {file_path}"}
|
|
||||||
if not audio_path.is_file():
|
|
||||||
return {"success": False, "transcript": "", "error": f"Path is not a file: {file_path}"}
|
|
||||||
if audio_path.suffix.lower() not in SUPPORTED_FORMATS:
|
|
||||||
return {
|
|
||||||
"success": False,
|
|
||||||
"transcript": "",
|
|
||||||
"error": f"Unsupported format: {audio_path.suffix}. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}",
|
|
||||||
}
|
|
||||||
try:
|
|
||||||
file_size = audio_path.stat().st_size
|
|
||||||
if file_size > MAX_FILE_SIZE:
|
|
||||||
return {
|
|
||||||
"success": False,
|
|
||||||
"transcript": "",
|
|
||||||
"error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024):.0f}MB)",
|
|
||||||
}
|
|
||||||
except OSError as e:
|
|
||||||
return {"success": False, "transcript": "", "error": f"Failed to access file: {e}"}
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Provider: local (faster-whisper)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
|
||||||
"""Transcribe using faster-whisper (local, free)."""
|
|
||||||
global _local_model, _local_model_name
|
|
||||||
|
|
||||||
if not _HAS_FASTER_WHISPER:
|
|
||||||
return {"success": False, "transcript": "", "error": "faster-whisper not installed"}
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Lazy-load the model (downloads on first use, ~150 MB for 'base')
|
|
||||||
if _local_model is None or _local_model_name != model_name:
|
|
||||||
logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
|
|
||||||
_local_model = WhisperModel(model_name, device="auto", compute_type="auto")
|
|
||||||
_local_model_name = model_name
|
|
||||||
|
|
||||||
segments, info = _local_model.transcribe(file_path, beam_size=5)
|
|
||||||
transcript = " ".join(segment.text.strip() for segment in segments)
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)",
|
|
||||||
Path(file_path).name, model_name, info.language, info.duration,
|
|
||||||
)
|
|
||||||
|
|
||||||
return {"success": True, "transcript": transcript}
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error("Local transcription failed: %s", e, exc_info=True)
|
|
||||||
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Provider: openai (Whisper API)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
|
||||||
"""Transcribe using OpenAI Whisper API (paid)."""
|
|
||||||
api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY")
|
|
||||||
if not api_key:
|
|
||||||
return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"}
|
|
||||||
|
|
||||||
if not _HAS_OPENAI:
|
|
||||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
|
||||||
|
|
||||||
try:
|
|
||||||
client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1")
|
|
||||||
|
|
||||||
with open(file_path, "rb") as audio_file:
|
|
||||||
transcription = client.audio.transcriptions.create(
|
|
||||||
model=model_name,
|
|
||||||
file=audio_file,
|
|
||||||
response_format="text",
|
|
||||||
)
|
|
||||||
|
|
||||||
transcript_text = str(transcription).strip()
|
|
||||||
logger.info("Transcribed %s via OpenAI API (%s, %d chars)",
|
|
||||||
Path(file_path).name, model_name, len(transcript_text))
|
|
||||||
|
|
||||||
return {"success": True, "transcript": transcript_text}
|
|
||||||
|
|
||||||
except PermissionError:
|
|
||||||
return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
|
|
||||||
except APIConnectionError as e:
|
|
||||||
return {"success": False, "transcript": "", "error": f"Connection error: {e}"}
|
|
||||||
except APITimeoutError as e:
|
|
||||||
return {"success": False, "transcript": "", "error": f"Request timeout: {e}"}
|
|
||||||
except APIError as e:
|
|
||||||
return {"success": False, "transcript": "", "error": f"API error: {e}"}
|
|
||||||
except Exception as e:
|
|
||||||
logger.error("OpenAI transcription failed: %s", e, exc_info=True)
|
|
||||||
return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"}
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Public API
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]:
|
def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Transcribe an audio file using the configured STT provider.
|
Transcribe an audio file using an OpenAI-compatible Whisper API.
|
||||||
|
|
||||||
Provider priority:
|
Automatically selects the provider based on available API keys:
|
||||||
1. User config (``stt.provider`` in config.yaml)
|
VOICE_TOOLS_OPENAI_KEY (OpenAI) > GROQ_API_KEY (Groq).
|
||||||
2. Auto-detect: local faster-whisper if available, else OpenAI API
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: Absolute path to the audio file to transcribe.
|
file_path: Absolute path to the audio file to transcribe.
|
||||||
model: Override the model. If None, uses config or provider default.
|
model: Whisper model to use. Defaults per provider if not specified.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict with keys:
|
dict with keys:
|
||||||
- "success" (bool): Whether transcription succeeded
|
- "success" (bool): Whether transcription succeeded
|
||||||
- "transcript" (str): The transcribed text (empty on failure)
|
- "transcript" (str): The transcribed text (empty on failure)
|
||||||
- "error" (str, optional): Error message if success is False
|
- "error" (str, optional): Error message if success is False
|
||||||
|
- "provider" (str, optional): Which provider was used
|
||||||
"""
|
"""
|
||||||
# Validate input
|
api_key, base_url, provider = _resolve_stt_provider()
|
||||||
error = _validate_audio_file(file_path)
|
if not api_key:
|
||||||
if error:
|
return {
|
||||||
return error
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": "No STT API key set. Set VOICE_TOOLS_OPENAI_KEY or GROQ_API_KEY.",
|
||||||
|
}
|
||||||
|
|
||||||
# Load config and determine provider
|
audio_path = Path(file_path)
|
||||||
stt_config = _load_stt_config()
|
|
||||||
provider = _get_provider(stt_config)
|
# Validate file exists
|
||||||
|
if not audio_path.exists():
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Audio file not found: {file_path}",
|
||||||
|
}
|
||||||
|
|
||||||
|
if not audio_path.is_file():
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Path is not a file: {file_path}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Validate file extension
|
||||||
|
if audio_path.suffix.lower() not in SUPPORTED_FORMATS:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Unsupported file format: {audio_path.suffix}. Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Validate file size
|
||||||
|
try:
|
||||||
|
file_size = audio_path.stat().st_size
|
||||||
|
if file_size > MAX_FILE_SIZE:
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024)}MB)",
|
||||||
|
}
|
||||||
|
except OSError as e:
|
||||||
|
logger.error("Failed to get file size for %s: %s", file_path, e, exc_info=True)
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Failed to access file: {e}",
|
||||||
|
}
|
||||||
|
|
||||||
if provider == "local":
|
# Use provided model, or fall back to provider default.
|
||||||
local_cfg = stt_config.get("local", {})
|
# If the caller passed an OpenAI-only model but we resolved to Groq, override it.
|
||||||
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"}
|
||||||
return _transcribe_local(file_path, model_name)
|
GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"}
|
||||||
|
|
||||||
if provider == "openai":
|
if model is None:
|
||||||
openai_cfg = stt_config.get("openai", {})
|
model = DEFAULT_GROQ_STT_MODEL if provider == "groq" else DEFAULT_STT_MODEL
|
||||||
model_name = model or openai_cfg.get("model", DEFAULT_OPENAI_MODEL)
|
elif provider == "groq" and model in OPENAI_MODELS:
|
||||||
return _transcribe_openai(file_path, model_name)
|
logger.info("Model %s not available on Groq, using %s", model, DEFAULT_GROQ_STT_MODEL)
|
||||||
|
model = DEFAULT_GROQ_STT_MODEL
|
||||||
|
elif provider == "openai" and model in GROQ_MODELS:
|
||||||
|
logger.info("Model %s not available on OpenAI, using %s", model, DEFAULT_STT_MODEL)
|
||||||
|
model = DEFAULT_STT_MODEL
|
||||||
|
|
||||||
# No provider available
|
try:
|
||||||
return {
|
from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
|
||||||
"success": False,
|
|
||||||
"transcript": "",
|
client = OpenAI(api_key=api_key, base_url=base_url)
|
||||||
"error": (
|
|
||||||
"No STT provider available. Install faster-whisper for free local "
|
with open(file_path, "rb") as audio_file:
|
||||||
"transcription, or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
transcription = client.audio.transcriptions.create(
|
||||||
),
|
model=model,
|
||||||
}
|
file=audio_file,
|
||||||
|
response_format="text",
|
||||||
|
)
|
||||||
|
|
||||||
|
# The response is a plain string when response_format="text"
|
||||||
|
transcript_text = str(transcription).strip()
|
||||||
|
|
||||||
|
logger.info("Transcribed %s (%d chars, provider=%s)", audio_path.name, len(transcript_text), provider)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"success": True,
|
||||||
|
"transcript": transcript_text,
|
||||||
|
"provider": provider,
|
||||||
|
}
|
||||||
|
|
||||||
|
except PermissionError:
|
||||||
|
logger.error("Permission denied accessing file: %s", file_path, exc_info=True)
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Permission denied: {file_path}",
|
||||||
|
}
|
||||||
|
except APIConnectionError as e:
|
||||||
|
logger.error("API connection error during transcription: %s", e, exc_info=True)
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Connection error: {e}",
|
||||||
|
}
|
||||||
|
except APITimeoutError as e:
|
||||||
|
logger.error("API timeout during transcription: %s", e, exc_info=True)
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Request timeout: {e}",
|
||||||
|
}
|
||||||
|
except APIError as e:
|
||||||
|
logger.error("OpenAI API error during transcription: %s", e, exc_info=True)
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"API error: {e}",
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Unexpected error during transcription: %s", e, exc_info=True)
|
||||||
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": f"Transcription failed: {e}",
|
||||||
|
}
|
||||||
|
|
|
||||||
|
|
@ -283,7 +283,9 @@ def check_voice_requirements() -> Dict[str, Any]:
|
||||||
Dict with ``available``, ``audio_available``, ``stt_key_set``,
|
Dict with ``available``, ``audio_available``, ``stt_key_set``,
|
||||||
``missing_packages``, and ``details``.
|
``missing_packages``, and ``details``.
|
||||||
"""
|
"""
|
||||||
stt_key_set = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY"))
|
openai_key = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY"))
|
||||||
|
groq_key = bool(os.getenv("GROQ_API_KEY"))
|
||||||
|
stt_key_set = openai_key or groq_key
|
||||||
missing: List[str] = []
|
missing: List[str] = []
|
||||||
|
|
||||||
if not _HAS_AUDIO:
|
if not _HAS_AUDIO:
|
||||||
|
|
@ -297,10 +299,12 @@ def check_voice_requirements() -> Dict[str, Any]:
|
||||||
else:
|
else:
|
||||||
details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
|
details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
|
||||||
|
|
||||||
if stt_key_set:
|
if openai_key:
|
||||||
details_parts.append("STT API key: OK")
|
details_parts.append("STT API key: OK (OpenAI)")
|
||||||
|
elif groq_key:
|
||||||
|
details_parts.append("STT API key: OK (Groq)")
|
||||||
else:
|
else:
|
||||||
details_parts.append("STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)")
|
details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"available": available,
|
"available": available,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue