diff --git a/.env.example b/.env.example index a5153d1d0..3d3ad1de9 100644 --- a/.env.example +++ b/.env.example @@ -275,3 +275,6 @@ WANDB_API_KEY= # GITHUB_APP_ID= # GITHUB_APP_PRIVATE_KEY_PATH= # GITHUB_APP_INSTALLATION_ID= + +# Groq API key (free tier — used for Whisper STT in voice mode) +# GROQ_API_KEY= diff --git a/cli.py b/cli.py index b86e2bb82..1eb9e3510 100755 --- a/cli.py +++ b/cli.py @@ -3539,7 +3539,7 @@ class HermesCLI: self._voice_recorder.start() self._voice_recording = True - _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+Space to stop, Ctrl+C to cancel){_RST}") + _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}") def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" @@ -3573,7 +3573,6 @@ class HermesCLI: if result.get("success") and result.get("transcript", "").strip(): transcript = result["transcript"].strip() - _cprint(f"\n{_GOLD}●{_RST} {_BOLD}{transcript}{_RST}") self._pending_input.put(transcript) elif result.get("success"): _cprint(f"{_DIM}No speech detected.{_RST}") @@ -3663,7 +3662,7 @@ class HermesCLI: tts_status = " (TTS enabled)" if self._voice_tts else "" _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}") - _cprint(f" {_DIM}Ctrl+Space to start/stop recording{_RST}") + _cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}") _cprint(f" {_DIM}/voice tts to toggle speech output{_RST}") _cprint(f" {_DIM}/voice off to disable voice mode{_RST}") @@ -3703,7 +3702,7 @@ class HermesCLI: _cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}") _cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}") _cprint(f" Recording: {'YES' if self._voice_recording else 'no'}") - _cprint(f" Record key: Ctrl+Space") + _cprint(f" Record key: Ctrl+R") _cprint(f"\n {_BOLD}Requirements:{_RST}") for line in reqs["details"].split("\n"): _cprint(f" {line}") @@ -4715,7 +4714,7 @@ class HermesCLI: def _get_placeholder(): if cli_ref._voice_recording: - return 
"recording... Ctrl+Space to stop, Ctrl+C to cancel" + return "recording... Ctrl+R to stop, Ctrl+C to cancel" if cli_ref._voice_processing: return "transcribing..." if cli_ref._sudo_state: @@ -4735,7 +4734,7 @@ class HermesCLI: if cli_ref._agent_running: return "type a message + Enter to interrupt, Ctrl+C to cancel" if cli_ref._voice_mode: - return "type or Ctrl+Space to record" + return "type or Ctrl+R to record" return "" input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder)) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index dbbe41c10..174e4326e 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -204,7 +204,7 @@ DEFAULT_CONFIG = { }, "voice": { - "record_key": "ctrl+space", + "record_key": "ctrl+r", "max_recording_seconds": 120, "auto_tts": False, }, diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 96b7a95e2..7f217bc77 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -2,19 +2,21 @@ """ Transcription Tools Module -Provides speech-to-text transcription with two providers: - - - **local** (default, free) — faster-whisper running locally, no API key needed. - Auto-downloads the model (~150 MB for ``base``) on first use. - - **openai** — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``. +Provides speech-to-text transcription using OpenAI-compatible Whisper APIs. +Supports multiple providers with automatic fallback: + 1. OpenAI (VOICE_TOOLS_OPENAI_KEY) -- paid + 2. Groq (GROQ_API_KEY) -- free tier available Used by the messaging gateway to automatically transcribe voice messages -sent by users on Telegram, Discord, WhatsApp, Slack, and Signal. +sent by users on Telegram, Discord, WhatsApp, and Slack. 
+ +Supported models: + OpenAI: whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe + Groq: whisper-large-v3, whisper-large-v3-turbo, distil-whisper-large-v3-en Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg -Usage:: - +Usage: from tools.transcription_tools import transcribe_audio result = transcribe_audio("/path/to/audio.ogg") @@ -25,241 +27,181 @@ Usage:: import logging import os from pathlib import Path -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Tuple logger = logging.getLogger(__name__) -# --------------------------------------------------------------------------- -# Optional imports — graceful degradation -# --------------------------------------------------------------------------- -try: - from faster_whisper import WhisperModel - _HAS_FASTER_WHISPER = True -except ImportError: - _HAS_FASTER_WHISPER = False - WhisperModel = None # type: ignore[assignment,misc] +# Default STT models per provider +DEFAULT_STT_MODEL = "whisper-1" +DEFAULT_GROQ_STT_MODEL = "whisper-large-v3-turbo" -try: - from openai import OpenAI, APIError, APIConnectionError, APITimeoutError - _HAS_OPENAI = True -except ImportError: - _HAS_OPENAI = False - -# --------------------------------------------------------------------------- -# Constants -# --------------------------------------------------------------------------- - -DEFAULT_PROVIDER = "local" -DEFAULT_LOCAL_MODEL = "base" -DEFAULT_OPENAI_MODEL = "whisper-1" - -SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} -MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB - -# Singleton for the local model — loaded once, reused across calls -_local_model: Optional["WhisperModel"] = None -_local_model_name: Optional[str] = None - -# --------------------------------------------------------------------------- -# Config helpers -# --------------------------------------------------------------------------- +# Provider endpoints +GROQ_BASE_URL = 
"https://api.groq.com/openai/v1" +OPENAI_BASE_URL = "https://api.openai.com/v1" -def _load_stt_config() -> dict: - """Load the ``stt`` section from user config, falling back to defaults.""" - try: - from hermes_cli.config import load_config - return load_config().get("stt", {}) - except Exception: - return {} +def _resolve_stt_provider() -> Tuple[Optional[str], Optional[str], str]: + """Resolve which STT provider to use based on available API keys. - -def _get_provider(stt_config: dict) -> str: - """Determine which STT provider to use. - - Priority: - 1. Explicit config value (``stt.provider``) - 2. Auto-detect: local if faster-whisper available, else openai if key set - 3. Disabled (returns "none") + Returns: + Tuple of (api_key, base_url, provider_name). + api_key is None if no provider is available. """ - provider = stt_config.get("provider", DEFAULT_PROVIDER) + openai_key = os.getenv("VOICE_TOOLS_OPENAI_KEY") + if openai_key: + return openai_key, OPENAI_BASE_URL, "openai" - if provider == "local": - if _HAS_FASTER_WHISPER: - return "local" - # Local requested but not available — fall back to openai if possible - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): - logger.info("faster-whisper not installed, falling back to OpenAI Whisper API") - return "openai" - return "none" + groq_key = os.getenv("GROQ_API_KEY") + if groq_key: + return groq_key, GROQ_BASE_URL, "groq" - if provider == "openai": - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): - return "openai" - # OpenAI requested but no key — fall back to local if possible - if _HAS_FASTER_WHISPER: - logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper") - return "local" - return "none" + return None, None, "none" - return provider # Unknown — let it fail downstream +# Supported audio formats +SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} -# --------------------------------------------------------------------------- -# Shared 
validation -# --------------------------------------------------------------------------- - - -def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]: - """Validate the audio file. Returns an error dict or None if OK.""" - audio_path = Path(file_path) - - if not audio_path.exists(): - return {"success": False, "transcript": "", "error": f"Audio file not found: {file_path}"} - if not audio_path.is_file(): - return {"success": False, "transcript": "", "error": f"Path is not a file: {file_path}"} - if audio_path.suffix.lower() not in SUPPORTED_FORMATS: - return { - "success": False, - "transcript": "", - "error": f"Unsupported format: {audio_path.suffix}. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}", - } - try: - file_size = audio_path.stat().st_size - if file_size > MAX_FILE_SIZE: - return { - "success": False, - "transcript": "", - "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024):.0f}MB)", - } - except OSError as e: - return {"success": False, "transcript": "", "error": f"Failed to access file: {e}"} - - return None - -# --------------------------------------------------------------------------- -# Provider: local (faster-whisper) -# --------------------------------------------------------------------------- - - -def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: - """Transcribe using faster-whisper (local, free).""" - global _local_model, _local_model_name - - if not _HAS_FASTER_WHISPER: - return {"success": False, "transcript": "", "error": "faster-whisper not installed"} - - try: - # Lazy-load the model (downloads on first use, ~150 MB for 'base') - if _local_model is None or _local_model_name != model_name: - logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name) - _local_model = WhisperModel(model_name, device="auto", compute_type="auto") - _local_model_name = model_name - - segments, info = _local_model.transcribe(file_path, 
beam_size=5) - transcript = " ".join(segment.text.strip() for segment in segments) - - logger.info( - "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)", - Path(file_path).name, model_name, info.language, info.duration, - ) - - return {"success": True, "transcript": transcript} - - except Exception as e: - logger.error("Local transcription failed: %s", e, exc_info=True) - return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} - -# --------------------------------------------------------------------------- -# Provider: openai (Whisper API) -# --------------------------------------------------------------------------- - - -def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]: - """Transcribe using OpenAI Whisper API (paid).""" - api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY") - if not api_key: - return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"} - - if not _HAS_OPENAI: - return {"success": False, "transcript": "", "error": "openai package not installed"} - - try: - client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1") - - with open(file_path, "rb") as audio_file: - transcription = client.audio.transcriptions.create( - model=model_name, - file=audio_file, - response_format="text", - ) - - transcript_text = str(transcription).strip() - logger.info("Transcribed %s via OpenAI API (%s, %d chars)", - Path(file_path).name, model_name, len(transcript_text)) - - return {"success": True, "transcript": transcript_text} - - except PermissionError: - return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} - except APIConnectionError as e: - return {"success": False, "transcript": "", "error": f"Connection error: {e}"} - except APITimeoutError as e: - return {"success": False, "transcript": "", "error": f"Request timeout: {e}"} - except APIError as e: - return {"success": False, "transcript": "", "error": f"API error: {e}"} - except 
Exception as e: - logger.error("OpenAI transcription failed: %s", e, exc_info=True) - return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"} - -# --------------------------------------------------------------------------- -# Public API -# --------------------------------------------------------------------------- +# Maximum file size (25MB - OpenAI limit) +MAX_FILE_SIZE = 25 * 1024 * 1024 def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]: """ - Transcribe an audio file using the configured STT provider. + Transcribe an audio file using an OpenAI-compatible Whisper API. - Provider priority: - 1. User config (``stt.provider`` in config.yaml) - 2. Auto-detect: local faster-whisper if available, else OpenAI API + Automatically selects the provider based on available API keys: + VOICE_TOOLS_OPENAI_KEY (OpenAI) > GROQ_API_KEY (Groq). Args: file_path: Absolute path to the audio file to transcribe. - model: Override the model. If None, uses config or provider default. + model: Whisper model to use. Defaults per provider if not specified. Returns: dict with keys: - "success" (bool): Whether transcription succeeded - "transcript" (str): The transcribed text (empty on failure) - "error" (str, optional): Error message if success is False + - "provider" (str, optional): Which provider was used """ - # Validate input - error = _validate_audio_file(file_path) - if error: - return error + api_key, base_url, provider = _resolve_stt_provider() + if not api_key: + return { + "success": False, + "transcript": "", + "error": "No STT API key set. 
Set VOICE_TOOLS_OPENAI_KEY or GROQ_API_KEY.", + } - # Load config and determine provider - stt_config = _load_stt_config() - provider = _get_provider(stt_config) + audio_path = Path(file_path) + + # Validate file exists + if not audio_path.exists(): + return { + "success": False, + "transcript": "", + "error": f"Audio file not found: {file_path}", + } + + if not audio_path.is_file(): + return { + "success": False, + "transcript": "", + "error": f"Path is not a file: {file_path}", + } + + # Validate file extension + if audio_path.suffix.lower() not in SUPPORTED_FORMATS: + return { + "success": False, + "transcript": "", + "error": f"Unsupported file format: {audio_path.suffix}. Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}", + } + + # Validate file size + try: + file_size = audio_path.stat().st_size + if file_size > MAX_FILE_SIZE: + return { + "success": False, + "transcript": "", + "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024):.0f}MB)", + } + except OSError as e: + logger.error("Failed to get file size for %s: %s", file_path, e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Failed to access file: {e}", + } - if provider == "local": - local_cfg = stt_config.get("local", {}) - model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) - return _transcribe_local(file_path, model_name) + # Use provided model, or fall back to provider default. + # If the caller passed an OpenAI-only model but we resolved to Groq, override it. 
+ OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"} + GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"} - if provider == "openai": - openai_cfg = stt_config.get("openai", {}) - model_name = model or openai_cfg.get("model", DEFAULT_OPENAI_MODEL) - return _transcribe_openai(file_path, model_name) + if model is None: + model = DEFAULT_GROQ_STT_MODEL if provider == "groq" else DEFAULT_STT_MODEL + elif provider == "groq" and model in OPENAI_MODELS: + logger.info("Model %s not available on Groq, using %s", model, DEFAULT_GROQ_STT_MODEL) + model = DEFAULT_GROQ_STT_MODEL + elif provider == "openai" and model in GROQ_MODELS: + logger.info("Model %s not available on OpenAI, using %s", model, DEFAULT_STT_MODEL) + model = DEFAULT_STT_MODEL - # No provider available - return { - "success": False, - "transcript": "", - "error": ( - "No STT provider available. Install faster-whisper for free local " - "transcription, or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API." 
- ), - } + try: + from openai import OpenAI, APIError, APIConnectionError, APITimeoutError + + client = OpenAI(api_key=api_key, base_url=base_url) + + with open(file_path, "rb") as audio_file: + transcription = client.audio.transcriptions.create( + model=model, + file=audio_file, + response_format="text", + ) + + # The response is a plain string when response_format="text" + transcript_text = str(transcription).strip() + + logger.info("Transcribed %s (%d chars, provider=%s)", audio_path.name, len(transcript_text), provider) + + return { + "success": True, + "transcript": transcript_text, + "provider": provider, + } + + except PermissionError: + logger.error("Permission denied accessing file: %s", file_path, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Permission denied: {file_path}", + } + except APIConnectionError as e: + logger.error("API connection error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Connection error: {e}", + } + except APITimeoutError as e: + logger.error("API timeout during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Request timeout: {e}", + } + except APIError as e: + logger.error("OpenAI API error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"API error: {e}", + } + except Exception as e: + logger.error("Unexpected error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Transcription failed: {e}", + } diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 213802013..7a7bb6b05 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -283,7 +283,9 @@ def check_voice_requirements() -> Dict[str, Any]: Dict with ``available``, ``audio_available``, ``stt_key_set``, ``missing_packages``, and ``details``. 
""" - stt_key_set = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY")) + openai_key = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY")) + groq_key = bool(os.getenv("GROQ_API_KEY")) + stt_key_set = openai_key or groq_key missing: List[str] = [] if not _HAS_AUDIO: @@ -297,10 +299,12 @@ def check_voice_requirements() -> Dict[str, Any]: else: details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)") - if stt_key_set: - details_parts.append("STT API key: OK") + if openai_key: + details_parts.append("STT API key: OK (OpenAI)") + elif groq_key: + details_parts.append("STT API key: OK (Groq)") else: - details_parts.append("STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)") + details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)") return { "available": available,