class TestNonStringContent:
    """Regression: non-string LLM content must not crash ``_generate_summary``.

    Some backends (e.g., llama.cpp tool calls) return ``message.content`` as a
    dict, and others return ``None``. Both must come back as a string summary
    carrying the ``[CONTEXT SUMMARY]:`` prefix.
    """

    @staticmethod
    def _summarize(content):
        """Run ``_generate_summary`` against a mocked LLM reply holding *content*."""
        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = content

        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
            c = ContextCompressor(model="test", quiet_mode=True)

        messages = [
            {"role": "user", "content": "do something"},
            {"role": "assistant", "content": "ok"},
        ]

        with patch("agent.context_compressor.call_llm", return_value=mock_response):
            return c._generate_summary(messages)

    def test_dict_content_coerced_to_string(self):
        summary = self._summarize({"text": "some summary"})
        assert isinstance(summary, str)
        # The prefix must lead the summary, not merely appear somewhere in it.
        assert summary.startswith("[CONTEXT SUMMARY]:")
        # The dict payload must survive the coercion rather than be dropped.
        assert "some summary" in summary

    def test_none_content_coerced_to_empty(self):
        summary = self._summarize(None)
        # None content → empty string → "[CONTEXT SUMMARY]: " prefix added
        assert isinstance(summary, str)
        assert summary.startswith("[CONTEXT SUMMARY]:")
"""Tests for transcription_tools.py — local (faster-whisper) and OpenAI providers.

Tests cover provider selection, config loading, validation, and transcription
dispatch. All external dependencies (faster_whisper, openai) are mocked.
"""

import json
import os
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch, mock_open

import pytest


# ---------------------------------------------------------------------------
# Provider selection
# ---------------------------------------------------------------------------


class TestGetProvider:
    """_get_provider() picks the right backend based on config + availability."""

    def test_local_when_available(self):
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "local"}) == "local"

    def test_local_fallback_to_openai(self, monkeypatch):
        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test")
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "local"}) == "openai"

    def test_local_nothing_available(self, monkeypatch):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
             patch("tools.transcription_tools._HAS_OPENAI", False):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "local"}) == "none"

    def test_openai_when_key_set(self, monkeypatch):
        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test")
        with patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "openai"}) == "openai"

    def test_openai_fallback_to_local(self, monkeypatch):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
             patch("tools.transcription_tools._HAS_OPENAI", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({"provider": "openai"}) == "local"

    def test_default_provider_is_local(self):
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True):
            from tools.transcription_tools import _get_provider
            assert _get_provider({}) == "local"


# ---------------------------------------------------------------------------
# File validation
# ---------------------------------------------------------------------------


class TestValidateAudioFile:
    """_validate_audio_file() returns an error dict, or None for a usable file."""

    def test_missing_file(self, tmp_path):
        from tools.transcription_tools import _validate_audio_file
        result = _validate_audio_file(str(tmp_path / "nope.ogg"))
        assert result is not None
        assert "not found" in result["error"]

    def test_unsupported_format(self, tmp_path):
        f = tmp_path / "test.xyz"
        f.write_bytes(b"data")
        from tools.transcription_tools import _validate_audio_file
        result = _validate_audio_file(str(f))
        assert result is not None
        assert "Unsupported" in result["error"]

    def test_valid_file_returns_none(self, tmp_path):
        f = tmp_path / "test.ogg"
        f.write_bytes(b"fake audio data")
        from tools.transcription_tools import _validate_audio_file
        assert _validate_audio_file(str(f)) is None

    def test_too_large(self, tmp_path):
        f = tmp_path / "big.ogg"
        f.write_bytes(b"x")
        from tools.transcription_tools import _validate_audio_file, MAX_FILE_SIZE
        real_stat = f.stat()
        # Fake only the size; every other stat field comes from the real file so
        # the exists()/is_file() checks still see a regular file.
        with patch.object(type(f), "stat", return_value=os.stat_result((
            real_stat.st_mode, real_stat.st_ino, real_stat.st_dev,
            real_stat.st_nlink, real_stat.st_uid, real_stat.st_gid,
            MAX_FILE_SIZE + 1,  # st_size
            real_stat.st_atime, real_stat.st_mtime, real_stat.st_ctime,
        ))):
            result = _validate_audio_file(str(f))
        assert result is not None
        assert "too large" in result["error"]


# ---------------------------------------------------------------------------
# Local transcription
# ---------------------------------------------------------------------------


class TestTranscribeLocal:
    """_transcribe_local() drives faster-whisper through the cached model."""

    def test_successful_transcription(self, tmp_path):
        audio_file = tmp_path / "test.ogg"
        audio_file.write_bytes(b"fake audio")

        mock_segment = MagicMock()
        mock_segment.text = "Hello world"
        mock_info = MagicMock()
        mock_info.language = "en"
        mock_info.duration = 2.5

        mock_model = MagicMock()
        mock_model.transcribe.return_value = ([mock_segment], mock_info)

        # Patch BOTH halves of the model cache: _transcribe_local rebinds the
        # model and its name together, so restoring only one of them would leak
        # a stale name into later tests.
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \
             patch("tools.transcription_tools.WhisperModel", return_value=mock_model), \
             patch("tools.transcription_tools._local_model", None), \
             patch("tools.transcription_tools._local_model_name", None):
            from tools.transcription_tools import _transcribe_local
            result = _transcribe_local(str(audio_file), "base")

        assert result["success"] is True
        assert result["transcript"] == "Hello world"
        mock_model.transcribe.assert_called_once()

    def test_not_installed(self):
        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False):
            from tools.transcription_tools import _transcribe_local
            result = _transcribe_local("/tmp/test.ogg", "base")
        assert result["success"] is False
        assert "not installed" in result["error"]


# ---------------------------------------------------------------------------
# OpenAI transcription
# ---------------------------------------------------------------------------


class TestTranscribeOpenAI:
    """_transcribe_openai() calls the Whisper API through a mocked client."""

    def test_no_key(self, monkeypatch):
        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
        from tools.transcription_tools import _transcribe_openai
        result = _transcribe_openai("/tmp/test.ogg", "whisper-1")
        assert result["success"] is False
        assert "VOICE_TOOLS_OPENAI_KEY" in result["error"]

    def test_successful_transcription(self, monkeypatch, tmp_path):
        monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test")
        audio_file = tmp_path / "test.ogg"
        audio_file.write_bytes(b"fake audio")

        mock_client = MagicMock()
        mock_client.audio.transcriptions.create.return_value = "Hello from OpenAI"

        # create=True: the module only binds ``OpenAI`` when the openai package
        # imports, so without it this patch fails on machines lacking openai.
        with patch("tools.transcription_tools._HAS_OPENAI", True), \
             patch("tools.transcription_tools.OpenAI", return_value=mock_client, create=True):
            from tools.transcription_tools import _transcribe_openai
            result = _transcribe_openai(str(audio_file), "whisper-1")

        assert result["success"] is True
        assert result["transcript"] == "Hello from OpenAI"


# ---------------------------------------------------------------------------
# Main transcribe_audio() dispatch
# ---------------------------------------------------------------------------


class TestTranscribeAudio:
    """transcribe_audio() validates the file, then dispatches by provider."""

    def test_dispatches_to_local(self, tmp_path):
        audio_file = tmp_path / "test.ogg"
        audio_file.write_bytes(b"fake audio")

        with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "local"}), \
             patch("tools.transcription_tools._get_provider", return_value="local"), \
             patch("tools.transcription_tools._transcribe_local", return_value={"success": True, "transcript": "hi"}) as mock_local:
            from tools.transcription_tools import transcribe_audio
            result = transcribe_audio(str(audio_file))

        assert result["success"] is True
        mock_local.assert_called_once()

    def test_dispatches_to_openai(self, tmp_path):
        audio_file = tmp_path / "test.ogg"
        audio_file.write_bytes(b"fake audio")

        with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "openai"}), \
             patch("tools.transcription_tools._get_provider", return_value="openai"), \
             patch("tools.transcription_tools._transcribe_openai", return_value={"success": True, "transcript": "hi"}) as mock_openai:
            from tools.transcription_tools import transcribe_audio
            result = transcribe_audio(str(audio_file))

        assert result["success"] is True
        mock_openai.assert_called_once()

    def test_no_provider_returns_error(self, tmp_path):
        audio_file = tmp_path / "test.ogg"
        audio_file.write_bytes(b"fake audio")

        with patch("tools.transcription_tools._load_stt_config", return_value={}), \
             patch("tools.transcription_tools._get_provider", return_value="none"):
            from tools.transcription_tools import transcribe_audio
            result = transcribe_audio(str(audio_file))

        assert result["success"] is False
        assert "No STT provider" in result["error"]

    def test_invalid_file_returns_error(self):
        from tools.transcription_tools import transcribe_audio
        result = transcribe_audio("/nonexistent/file.ogg")
        assert result["success"] is False
        assert "not found" in result["error"]
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Optional imports — graceful degradation
# ---------------------------------------------------------------------------

try:
    from faster_whisper import WhisperModel
    _HAS_FASTER_WHISPER = True
except ImportError:
    _HAS_FASTER_WHISPER = False
    WhisperModel = None  # type: ignore[assignment,misc]

try:
    from openai import OpenAI, APIError, APIConnectionError, APITimeoutError
    _HAS_OPENAI = True
except ImportError:
    _HAS_OPENAI = False
    # Keep every name bound even without the package (as done for WhisperModel
    # above): the ``except`` clauses in _transcribe_openai() must always
    # resolve, and tests must be able to patch ``OpenAI``.
    OpenAI = None  # type: ignore[assignment,misc]

    class APIError(Exception):  # type: ignore[no-redef]
        """Stand-in for ``openai.APIError`` when the openai package is missing."""

    class APIConnectionError(APIError):  # type: ignore[no-redef]
        """Stand-in for ``openai.APIConnectionError``."""

    class APITimeoutError(APIConnectionError):  # type: ignore[no-redef]
        """Stand-in for ``openai.APITimeoutError``."""

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

DEFAULT_PROVIDER = "local"
DEFAULT_LOCAL_MODEL = "base"
DEFAULT_OPENAI_MODEL = "whisper-1"

SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
MAX_FILE_SIZE = 25 * 1024 * 1024  # 25 MB

_BYTES_PER_MB = 1024 * 1024

# Singleton for the local model — loaded once, reused across calls
_local_model: Optional["WhisperModel"] = None
_local_model_name: Optional[str] = None

# ---------------------------------------------------------------------------
# Small shared helpers
# ---------------------------------------------------------------------------


def _failure(message: str) -> Dict[str, Any]:
    """Build the standard failure payload with *message* as the error text."""
    return {"success": False, "transcript": "", "error": message}


def _as_dict(value: Any) -> dict:
    """Return *value* if it is a dict, else an empty dict.

    A YAML key with no body (``stt:`` or ``local:``) loads as ``None``; this
    keeps later ``.get()`` calls from raising AttributeError.
    """
    return value if isinstance(value, dict) else {}


def _openai_usable() -> bool:
    """Return True when the openai package imported and the API key is set."""
    return _HAS_OPENAI and bool(os.getenv("VOICE_TOOLS_OPENAI_KEY"))

# ---------------------------------------------------------------------------
# Config helpers
# ---------------------------------------------------------------------------


def _load_stt_config() -> dict:
    """Load the ``stt`` section from user config, falling back to defaults.

    Returns:
        The ``stt`` mapping, or ``{}`` when the config cannot be loaded or the
        section is missing, null, or not a mapping.
    """
    try:
        from hermes_cli.config import load_config
        stt_config = load_config().get("stt")
    except Exception as exc:
        # Deliberately best-effort: transcription must still work with defaults
        # when the config is unavailable. Leave a trace for debugging.
        logger.debug("Could not load STT config, using defaults: %s", exc)
        return {}
    return _as_dict(stt_config)


def _get_provider(stt_config: dict) -> str:
    """Determine which STT provider to use.

    The configured ``stt.provider`` (default ``"local"``) is matched
    case-insensitively. If the requested provider is unusable, the other one
    is tried before giving up.

    Args:
        stt_config: The ``stt`` config section; may be empty.

    Returns:
        ``"local"``, ``"openai"``, or ``"none"`` when neither is usable. An
        unrecognised provider name is returned (lower-cased) so the caller can
        report it.
    """
    configured = _as_dict(stt_config).get("provider") or DEFAULT_PROVIDER
    provider = str(configured).strip().lower()

    if provider == "local":
        if _HAS_FASTER_WHISPER:
            return "local"
        # Local requested but not available — fall back to openai if possible
        if _openai_usable():
            logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
            return "openai"
        return "none"

    if provider == "openai":
        if _openai_usable():
            return "openai"
        # OpenAI requested but unusable — fall back to local if possible
        if _HAS_FASTER_WHISPER:
            logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper")
            return "local"
        return "none"

    logger.warning("Unknown STT provider %r (expected 'local' or 'openai')", configured)
    return provider

# ---------------------------------------------------------------------------
# Shared validation
# ---------------------------------------------------------------------------


def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]:
    """Validate the audio file. Returns an error dict or None if OK.

    Checks, in order: the path exists, is a regular file, has a supported
    extension, and is no larger than ``MAX_FILE_SIZE``.
    """
    audio_path = Path(file_path)

    # exists()/is_file() can raise OSError themselves (e.g. PermissionError on
    # a parent directory), so the filesystem probes share one handler.
    try:
        if not audio_path.exists():
            return _failure(f"Audio file not found: {file_path}")
        if not audio_path.is_file():
            return _failure(f"Path is not a file: {file_path}")
        if audio_path.suffix.lower() not in SUPPORTED_FORMATS:
            return _failure(
                f"Unsupported format: {audio_path.suffix}. "
                f"Supported: {', '.join(sorted(SUPPORTED_FORMATS))}"
            )
        file_size = audio_path.stat().st_size
    except OSError as e:
        return _failure(f"Failed to access file: {e}")

    if file_size > MAX_FILE_SIZE:
        return _failure(
            f"File too large: {file_size / _BYTES_PER_MB:.1f}MB "
            f"(max {MAX_FILE_SIZE / _BYTES_PER_MB:.0f}MB)"
        )

    return None

# ---------------------------------------------------------------------------
# Provider: local (faster-whisper)
# ---------------------------------------------------------------------------


def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
    """Transcribe using faster-whisper (local, free).

    The model is cached in module globals and reloaded only when *model_name*
    differs from the cached one.

    Args:
        file_path: Path of the audio file to transcribe.
        model_name: faster-whisper model name (e.g. ``"base"``).

    Returns:
        A success dict with the transcript, or a failure dict with ``error``.
    """
    global _local_model, _local_model_name

    if not _HAS_FASTER_WHISPER:
        return _failure("faster-whisper not installed")

    try:
        # Lazy-load the model (downloads on first use, ~150 MB for 'base')
        if _local_model is None or _local_model_name != model_name:
            logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name)
            _local_model = WhisperModel(model_name, device="auto", compute_type="auto")
            _local_model_name = model_name

        segments, info = _local_model.transcribe(file_path, beam_size=5)
        # Drop whitespace-only segments so the join never yields double spaces.
        pieces = (segment.text.strip() for segment in segments)
        transcript = " ".join(piece for piece in pieces if piece)

        logger.info(
            "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)",
            Path(file_path).name, model_name, info.language, info.duration,
        )

        return {"success": True, "transcript": transcript}

    except Exception as e:
        logger.error("Local transcription failed: %s", e, exc_info=True)
        return _failure(f"Local transcription failed: {e}")

# ---------------------------------------------------------------------------
# Provider: openai (Whisper API)
# ---------------------------------------------------------------------------


def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
    """Transcribe using OpenAI Whisper API (paid).

    Args:
        file_path: Path of the audio file to upload.
        model_name: OpenAI transcription model (e.g. ``"whisper-1"``).

    Returns:
        A success dict with the transcript, or a failure dict with ``error``.
    """
    api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY")
    if not api_key:
        return _failure("VOICE_TOOLS_OPENAI_KEY not set")

    if not _HAS_OPENAI:
        return _failure("openai package not installed")

    try:
        client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1")

        with open(file_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model=model_name,
                file=audio_file,
                response_format="text",
            )

        # The response is a plain string when response_format="text"
        transcript_text = str(transcription).strip()
        logger.info("Transcribed %s via OpenAI API (%s, %d chars)",
                    Path(file_path).name, model_name, len(transcript_text))

        return {"success": True, "transcript": transcript_text}

    except PermissionError:
        return _failure(f"Permission denied: {file_path}")
    # APITimeoutError subclasses APIConnectionError, so it must be caught first
    # or timeouts get reported as generic connection errors.
    except APITimeoutError as e:
        return _failure(f"Request timeout: {e}")
    except APIConnectionError as e:
        return _failure(f"Connection error: {e}")
    except APIError as e:
        return _failure(f"API error: {e}")
    except Exception as e:
        logger.error("OpenAI transcription failed: %s", e, exc_info=True)
        return _failure(f"Transcription failed: {e}")

# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]:
    """
    Transcribe an audio file using the configured STT provider.

    The provider comes from ``stt.provider`` in the user config (default
    ``"local"``). If it is unusable, the other provider is tried.

    Args:
        file_path: Absolute path to the audio file to transcribe.
        model: Override the model. If None, uses config or provider default.

    Returns:
        dict with keys:
            - "success" (bool): Whether transcription succeeded
            - "transcript" (str): The transcribed text (empty on failure)
            - "error" (str, optional): Error message if success is False
    """
    # Validate input
    error = _validate_audio_file(file_path)
    if error:
        return error

    # Load config and determine provider
    stt_config = _as_dict(_load_stt_config())
    provider = _get_provider(stt_config)

    if provider == "local":
        local_cfg = _as_dict(stt_config.get("local"))
        model_name = model or local_cfg.get("model") or DEFAULT_LOCAL_MODEL
        return _transcribe_local(file_path, model_name)

    if provider == "openai":
        openai_cfg = _as_dict(stt_config.get("openai"))
        model_name = model or openai_cfg.get("model") or DEFAULT_OPENAI_MODEL
        return _transcribe_openai(file_path, model_name)

    if provider != "none":
        # A typo in stt.provider is a config error, not a missing backend.
        return _failure(
            f"Unknown STT provider '{provider}'. Set stt.provider to 'local' or 'openai'."
        )

    # No provider available
    return _failure(
        "No STT provider available. Install faster-whisper for free local "
        "transcription, or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
    )
-| Provider | Model | Quality | Cost | -|----------|-------|---------|------| -| **OpenAI Whisper** | `whisper-1` (default) | Good | Low | -| **OpenAI GPT-4o** | `gpt-4o-mini-transcribe` | Better | Medium | -| **OpenAI GPT-4o** | `gpt-4o-transcribe` | Best | Higher | +| Provider | Quality | Cost | API Key | +|----------|---------|------|---------| +| **Local Whisper** (default) | Good | Free | None needed | +| **OpenAI Whisper API** | Good–Best | Paid | `VOICE_TOOLS_OPENAI_KEY` | -Requires `VOICE_TOOLS_OPENAI_KEY` in `~/.hermes/.env`. +:::info Zero Config +Local transcription works out of the box — no API key needed. The `faster-whisper` model (~150 MB for `base`) is auto-downloaded on first voice message. +::: ### Configuration ```yaml # In ~/.hermes/config.yaml stt: - enabled: true - model: "whisper-1" + provider: "local" # "local" (free, faster-whisper) | "openai" (API) + local: + model: "base" # tiny, base, small, medium, large-v3 + openai: + model: "whisper-1" # whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe ``` + +### Provider Details + +**Local (faster-whisper)** — Runs Whisper locally via [faster-whisper](https://github.com/SYSTRAN/faster-whisper). Uses CPU by default, GPU if available. Model sizes: + +| Model | Size | Speed | Quality | +|-------|------|-------|---------| +| `tiny` | ~75 MB | Fastest | Basic | +| `base` | ~150 MB | Fast | Good (default) | +| `small` | ~500 MB | Medium | Better | +| `medium` | ~1.5 GB | Slower | Great | +| `large-v3` | ~3 GB | Slowest | Best | + +**OpenAI API** — Requires `VOICE_TOOLS_OPENAI_KEY`. Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + +### Fallback Behavior + +If your configured provider isn't available, Hermes automatically falls back: +- **Local not installed** → Falls back to OpenAI API (if key is set) +- **OpenAI key not set** → Falls back to local Whisper (if installed) +- **Neither available** → Voice messages pass through with a note to the user