Merge pull request #1490 from NousResearch/fix/1033-telegram-voice-fallback

fix: restore local STT fallback for gateway voice notes
This commit is contained in:
Teknium 2026-03-15 21:58:32 -07:00 committed by GitHub
commit 70e24d77a1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 324 additions and 18 deletions

View file

@ -3635,7 +3635,10 @@ class GatewayRunner:
) )
else: else:
error = result.get("error", "unknown error") error = result.get("error", "unknown error")
if "No STT provider" in error or "not set" in error: if (
"No STT provider" in error
or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set")
):
enriched_parts.append( enriched_parts.append(
"[The user sent a voice message but I can't listen " "[The user sent a voice message but I can't listen "
"to it right now~ No STT provider is configured " "to it right now~ No STT provider is configured "

View file

@ -51,3 +51,27 @@ async def test_enrich_message_with_transcription_skips_when_stt_disabled():
assert "transcription is disabled" in result.lower() assert "transcription is disabled" in result.lower()
assert "caption" in result assert "caption" in result
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors():
from gateway.run import GatewayRunner
runner = GatewayRunner.__new__(GatewayRunner)
runner.config = GatewayConfig(stt_enabled=True)
with patch(
"tools.transcription_tools.transcribe_audio",
return_value={"success": False, "error": "VOICE_TOOLS_OPENAI_KEY not set"},
), patch(
"tools.transcription_tools.get_stt_model_from_config",
return_value=None,
):
result = await runner._enrich_message_with_transcription(
"caption",
["/tmp/voice.ogg"],
)
assert "No STT provider is configured" not in result
assert "trouble transcribing" in result
assert "caption" in result

View file

@ -7,6 +7,7 @@ end-to-end dispatch. All external dependencies are mocked.
import os import os
import struct import struct
import subprocess
import wave import wave
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
@ -45,7 +46,10 @@ def sample_ogg(tmp_path):
def clean_env(monkeypatch): def clean_env(monkeypatch):
"""Ensure no real API keys leak into tests.""" """Ensure no real API keys leak into tests."""
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
monkeypatch.delenv("GROQ_API_KEY", raising=False) monkeypatch.delenv("GROQ_API_KEY", raising=False)
monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False)
monkeypatch.delenv("HERMES_LOCAL_STT_LANGUAGE", raising=False)
# ============================================================================ # ============================================================================
@ -132,6 +136,19 @@ class TestGetProviderFallbackPriority:
from tools.transcription_tools import _get_provider from tools.transcription_tools import _get_provider
assert _get_provider({}) == "local" assert _get_provider({}) == "local"
def test_openai_fallback_to_local_command(self, monkeypatch):
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
monkeypatch.delenv("GROQ_API_KEY", raising=False)
monkeypatch.setenv(
"HERMES_LOCAL_STT_COMMAND",
"whisper {input_path} --output_dir {output_dir} --language {language}",
)
with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
patch("tools.transcription_tools._HAS_OPENAI", True):
from tools.transcription_tools import _get_provider
assert _get_provider({"provider": "openai"}) == "local_command"
# ============================================================================ # ============================================================================
# _transcribe_groq # _transcribe_groq
@ -279,6 +296,63 @@ class TestTranscribeOpenAIExtended:
assert "Permission denied" in result["error"] assert "Permission denied" in result["error"]
class TestTranscribeLocalCommand:
def test_auto_detects_local_whisper_binary(self, monkeypatch):
monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False)
monkeypatch.setattr("tools.transcription_tools._find_whisper_binary", lambda: "/opt/homebrew/bin/whisper")
from tools.transcription_tools import _get_local_command_template
template = _get_local_command_template()
assert template is not None
assert template.startswith("/opt/homebrew/bin/whisper ")
assert "{model}" in template
assert "{output_dir}" in template
def test_command_fallback_with_template(self, monkeypatch, sample_ogg, tmp_path):
out_dir = tmp_path / "local-out"
out_dir.mkdir()
monkeypatch.setenv(
"HERMES_LOCAL_STT_COMMAND",
"whisper {input_path} --model {model} --output_dir {output_dir} --language {language}",
)
monkeypatch.setenv("HERMES_LOCAL_STT_LANGUAGE", "en")
def fake_tempdir(prefix=None):
class _TempDir:
def __enter__(self_inner):
return str(out_dir)
def __exit__(self_inner, exc_type, exc, tb):
return False
return _TempDir()
def fake_run(cmd, *args, **kwargs):
if isinstance(cmd, list):
output_path = cmd[-1]
with open(output_path, "wb") as handle:
handle.write(b"RIFF....WAVEfmt ")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
(out_dir / "test.txt").write_text("hello from local command\n", encoding="utf-8")
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
monkeypatch.setattr("tools.transcription_tools.tempfile.TemporaryDirectory", fake_tempdir)
monkeypatch.setattr("tools.transcription_tools._find_ffmpeg_binary", lambda: "/opt/homebrew/bin/ffmpeg")
monkeypatch.setattr("tools.transcription_tools.subprocess.run", fake_run)
from tools.transcription_tools import _transcribe_local_command
result = _transcribe_local_command(sample_ogg, "base")
assert result["success"] is True
assert result["transcript"] == "hello from local command"
assert result["provider"] == "local_command"
# ============================================================================ # ============================================================================
# _transcribe_local — additional tests # _transcribe_local — additional tests
# ============================================================================ # ============================================================================
@ -612,6 +686,29 @@ class TestTranscribeAudioDispatch:
assert "faster-whisper" in result["error"] assert "faster-whisper" in result["error"]
assert "GROQ_API_KEY" in result["error"] assert "GROQ_API_KEY" in result["error"]
def test_openai_provider_falls_back_to_local_command(self, monkeypatch, sample_ogg):
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
monkeypatch.setenv(
"HERMES_LOCAL_STT_COMMAND",
"whisper {input_path} --model {model} --output_dir {output_dir} --language {language}",
)
with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "openai"}), \
patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
patch("tools.transcription_tools._HAS_OPENAI", True), \
patch("tools.transcription_tools._transcribe_local_command", return_value={
"success": True,
"transcript": "hello from fallback",
"provider": "local_command",
}) as mock_local_command:
from tools.transcription_tools import transcribe_audio
result = transcribe_audio(sample_ogg)
assert result["success"] is True
assert result["transcript"] == "hello from fallback"
mock_local_command.assert_called_once_with(sample_ogg, "base")
def test_invalid_file_short_circuits(self): def test_invalid_file_short_circuits(self):
from tools.transcription_tools import transcribe_audio from tools.transcription_tools import transcribe_audio
result = transcribe_audio("/nonexistent/audio.wav") result = transcribe_audio("/nonexistent/audio.wav")

View file

@ -25,6 +25,10 @@ Usage::
import logging import logging
import os import os
import shlex
import shutil
import subprocess
import tempfile
from pathlib import Path from pathlib import Path
from typing import Optional, Dict, Any from typing import Optional, Dict, Any
@ -44,13 +48,18 @@ _HAS_OPENAI = _ilu.find_spec("openai") is not None
DEFAULT_PROVIDER = "local" DEFAULT_PROVIDER = "local"
DEFAULT_LOCAL_MODEL = "base" DEFAULT_LOCAL_MODEL = "base"
DEFAULT_LOCAL_STT_LANGUAGE = "en"
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1") DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo") DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND"
LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE"
COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1") GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1") OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
# Known model sets for auto-correction # Known model sets for auto-correction
@ -105,6 +114,53 @@ def is_stt_enabled(stt_config: Optional[dict] = None) -> bool:
return bool(enabled) return bool(enabled)
def _resolve_openai_api_key() -> str:
"""Prefer the voice-tools key, but fall back to the normal OpenAI key."""
return os.getenv("VOICE_TOOLS_OPENAI_KEY", "") or os.getenv("OPENAI_API_KEY", "")
def _find_binary(binary_name: str) -> Optional[str]:
"""Find a local binary, checking common Homebrew/local prefixes as well as PATH."""
for directory in COMMON_LOCAL_BIN_DIRS:
candidate = Path(directory) / binary_name
if candidate.exists() and os.access(candidate, os.X_OK):
return str(candidate)
return shutil.which(binary_name)
def _find_ffmpeg_binary() -> Optional[str]:
return _find_binary("ffmpeg")
def _find_whisper_binary() -> Optional[str]:
return _find_binary("whisper")
def _get_local_command_template() -> Optional[str]:
configured = os.getenv(LOCAL_STT_COMMAND_ENV, "").strip()
if configured:
return configured
whisper_binary = _find_whisper_binary()
if whisper_binary:
quoted_binary = shlex.quote(whisper_binary)
return (
f"{quoted_binary} {{input_path}} --model {{model}} --output_format txt "
"--output_dir {output_dir} --language {language}"
)
return None
def _has_local_command() -> bool:
return _get_local_command_template() is not None
def _normalize_local_command_model(model_name: Optional[str]) -> str:
if not model_name or model_name in OPENAI_MODELS or model_name in GROQ_MODELS:
return DEFAULT_LOCAL_MODEL
return model_name
def _get_provider(stt_config: dict) -> str: def _get_provider(stt_config: dict) -> str:
"""Determine which STT provider to use. """Determine which STT provider to use.
@ -121,15 +177,32 @@ def _get_provider(stt_config: dict) -> str:
if provider == "local": if provider == "local":
if _HAS_FASTER_WHISPER: if _HAS_FASTER_WHISPER:
return "local" return "local"
if _has_local_command():
logger.info("faster-whisper not installed, falling back to local STT command")
return "local_command"
# Local requested but not available — fall back to groq, then openai # Local requested but not available — fall back to groq, then openai
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
logger.info("faster-whisper not installed, falling back to Groq Whisper API") logger.info("faster-whisper not installed, falling back to Groq Whisper API")
return "groq" return "groq"
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): if _HAS_OPENAI and _resolve_openai_api_key():
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API") logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
return "openai" return "openai"
return "none" return "none"
if provider == "local_command":
if _has_local_command():
return "local_command"
if _HAS_FASTER_WHISPER:
logger.info("Local STT command unavailable, falling back to local faster-whisper")
return "local"
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
logger.info("Local STT command unavailable, falling back to Groq Whisper API")
return "groq"
if _HAS_OPENAI and _resolve_openai_api_key():
logger.info("Local STT command unavailable, falling back to OpenAI Whisper API")
return "openai"
return "none"
if provider == "groq": if provider == "groq":
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
return "groq" return "groq"
@ -137,20 +210,26 @@ def _get_provider(stt_config: dict) -> str:
if _HAS_FASTER_WHISPER: if _HAS_FASTER_WHISPER:
logger.info("GROQ_API_KEY not set, falling back to local faster-whisper") logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
return "local" return "local"
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): if _has_local_command():
logger.info("GROQ_API_KEY not set, falling back to local STT command")
return "local_command"
if _HAS_OPENAI and _resolve_openai_api_key():
logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API") logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
return "openai" return "openai"
return "none" return "none"
if provider == "openai": if provider == "openai":
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): if _HAS_OPENAI and _resolve_openai_api_key():
return "openai" return "openai"
# OpenAI requested but no key — fall back # OpenAI requested but no key — fall back
if _HAS_FASTER_WHISPER: if _HAS_FASTER_WHISPER:
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper") logger.info("OpenAI STT key not set, falling back to local faster-whisper")
return "local" return "local"
if _has_local_command():
logger.info("OpenAI STT key not set, falling back to local STT command")
return "local_command"
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"): if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to Groq Whisper API") logger.info("OpenAI STT key not set, falling back to Groq Whisper API")
return "groq" return "groq"
return "none" return "none"
@ -222,6 +301,89 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
logger.error("Local transcription failed: %s", e, exc_info=True) logger.error("Local transcription failed: %s", e, exc_info=True)
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
def _prepare_local_audio(file_path: str, work_dir: str) -> tuple[Optional[str], Optional[str]]:
"""Normalize audio for local CLI STT when needed."""
audio_path = Path(file_path)
if audio_path.suffix.lower() in LOCAL_NATIVE_AUDIO_FORMATS:
return file_path, None
ffmpeg = _find_ffmpeg_binary()
if not ffmpeg:
return None, "Local STT fallback requires ffmpeg for non-WAV inputs, but ffmpeg was not found"
converted_path = os.path.join(work_dir, f"{audio_path.stem}.wav")
command = [ffmpeg, "-y", "-i", file_path, converted_path]
try:
subprocess.run(command, check=True, capture_output=True, text=True)
return converted_path, None
except subprocess.CalledProcessError as e:
details = e.stderr.strip() or e.stdout.strip() or str(e)
logger.error("ffmpeg conversion failed for %s: %s", file_path, details)
return None, f"Failed to convert audio for local STT: {details}"
def _transcribe_local_command(file_path: str, model_name: str) -> Dict[str, Any]:
"""Run the configured local STT command template and read back a .txt transcript."""
command_template = _get_local_command_template()
if not command_template:
return {
"success": False,
"transcript": "",
"error": (
f"{LOCAL_STT_COMMAND_ENV} not configured and no local whisper binary was found"
),
}
language = os.getenv(LOCAL_STT_LANGUAGE_ENV, DEFAULT_LOCAL_STT_LANGUAGE)
normalized_model = _normalize_local_command_model(model_name)
try:
with tempfile.TemporaryDirectory(prefix="hermes-local-stt-") as output_dir:
prepared_input, prep_error = _prepare_local_audio(file_path, output_dir)
if prep_error:
return {"success": False, "transcript": "", "error": prep_error}
command = command_template.format(
input_path=shlex.quote(prepared_input),
output_dir=shlex.quote(output_dir),
language=shlex.quote(language),
model=shlex.quote(normalized_model),
)
subprocess.run(command, shell=True, check=True, capture_output=True, text=True)
txt_files = sorted(Path(output_dir).glob("*.txt"))
if not txt_files:
return {
"success": False,
"transcript": "",
"error": "Local STT command completed but did not produce a .txt transcript",
}
transcript_text = txt_files[0].read_text(encoding="utf-8").strip()
logger.info(
"Transcribed %s via local STT command (%s, %d chars)",
Path(file_path).name,
normalized_model,
len(transcript_text),
)
return {"success": True, "transcript": transcript_text, "provider": "local_command"}
except KeyError as e:
return {
"success": False,
"transcript": "",
"error": f"Invalid {LOCAL_STT_COMMAND_ENV} template, missing placeholder: {e}",
}
except subprocess.CalledProcessError as e:
details = e.stderr.strip() or e.stdout.strip() or str(e)
logger.error("Local STT command failed for %s: %s", file_path, details)
return {"success": False, "transcript": "", "error": f"Local STT failed: {details}"}
except Exception as e:
logger.error("Unexpected error during local command transcription: %s", e, exc_info=True)
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Provider: groq (Whisper API — free tier) # Provider: groq (Whisper API — free tier)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -277,9 +439,13 @@ def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]:
def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]: def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
"""Transcribe using OpenAI Whisper API (paid).""" """Transcribe using OpenAI Whisper API (paid)."""
api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY") api_key = _resolve_openai_api_key()
if not api_key: if not api_key:
return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"} return {
"success": False,
"transcript": "",
"error": "Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set",
}
if not _HAS_OPENAI: if not _HAS_OPENAI:
return {"success": False, "transcript": "", "error": "openai package not installed"} return {"success": False, "transcript": "", "error": "openai package not installed"}
@ -363,6 +529,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
return _transcribe_local(file_path, model_name) return _transcribe_local(file_path, model_name)
if provider == "local_command":
local_cfg = stt_config.get("local", {})
model_name = _normalize_local_command_model(
model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
)
return _transcribe_local_command(file_path, model_name)
if provider == "groq": if provider == "groq":
model_name = model or DEFAULT_GROQ_STT_MODEL model_name = model or DEFAULT_GROQ_STT_MODEL
return _transcribe_groq(file_path, model_name) return _transcribe_groq(file_path, model_name)
@ -378,7 +551,8 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
"transcript": "", "transcript": "",
"error": ( "error": (
"No STT provider available. Install faster-whisper for free local " "No STT provider available. Install faster-whisper for free local "
"transcription, set GROQ_API_KEY for free Groq Whisper, " f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
"or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API." "set GROQ_API_KEY for free Groq Whisper, or set VOICE_TOOLS_OPENAI_KEY "
"or OPENAI_API_KEY for the OpenAI Whisper API."
), ),
} }

View file

@ -31,7 +31,9 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config
| `CLAUDE_CODE_OAUTH_TOKEN` | Explicit Claude Code token override if you export one manually | | `CLAUDE_CODE_OAUTH_TOKEN` | Explicit Claude Code token override if you export one manually |
| `HERMES_MODEL` | Preferred model name (checked before `LLM_MODEL`, used by gateway) | | `HERMES_MODEL` | Preferred model name (checked before `LLM_MODEL`, used by gateway) |
| `LLM_MODEL` | Default model name (fallback when not set in config.yaml) | | `LLM_MODEL` | Default model name (fallback when not set in config.yaml) |
| `VOICE_TOOLS_OPENAI_KEY` | OpenAI key for OpenAI speech-to-text and text-to-speech providers | | `VOICE_TOOLS_OPENAI_KEY` | Preferred OpenAI key for OpenAI speech-to-text and text-to-speech providers |
| `HERMES_LOCAL_STT_COMMAND` | Optional local speech-to-text command template. Supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders |
| `HERMES_LOCAL_STT_LANGUAGE` | Default language passed to `HERMES_LOCAL_STT_COMMAND` or auto-detected local `whisper` CLI fallback (default: `en`) |
| `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) | | `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) |
## Provider Auth (OAuth) ## Provider Auth (OAuth)

View file

@ -74,10 +74,11 @@ Voice messages sent on Telegram, Discord, WhatsApp, Slack, or Signal are automat
| Provider | Quality | Cost | API Key | | Provider | Quality | Cost | API Key |
|----------|---------|------|---------| |----------|---------|------|---------|
| **Local Whisper** (default) | Good | Free | None needed | | **Local Whisper** (default) | Good | Free | None needed |
| **OpenAI Whisper API** | GoodBest | Paid | `VOICE_TOOLS_OPENAI_KEY` | | **Groq Whisper API** | GoodBest | Free tier | `GROQ_API_KEY` |
| **OpenAI Whisper API** | GoodBest | Paid | `VOICE_TOOLS_OPENAI_KEY` or `OPENAI_API_KEY` |
:::info Zero Config :::info Zero Config
Local transcription works out of the box — no API key needed. The `faster-whisper` model (~150 MB for `base`) is auto-downloaded on first voice message. Local transcription works out of the box when `faster-whisper` is installed. If that's unavailable, Hermes can also use a local `whisper` CLI from common install locations (like `/opt/homebrew/bin`) or a custom command via `HERMES_LOCAL_STT_COMMAND`.
::: :::
### Configuration ### Configuration
@ -85,7 +86,7 @@ Local transcription works out of the box — no API key needed. The `faster-whis
```yaml ```yaml
# In ~/.hermes/config.yaml # In ~/.hermes/config.yaml
stt: stt:
provider: "local" # "local" (free, faster-whisper) | "openai" (API) provider: "local" # "local" | "groq" | "openai"
local: local:
model: "base" # tiny, base, small, medium, large-v3 model: "base" # tiny, base, small, medium, large-v3
openai: openai:
@ -104,11 +105,16 @@ stt:
| `medium` | ~1.5 GB | Slower | Great | | `medium` | ~1.5 GB | Slower | Great |
| `large-v3` | ~3 GB | Slowest | Best | | `large-v3` | ~3 GB | Slowest | Best |
**OpenAI API** — Requires `VOICE_TOOLS_OPENAI_KEY`. Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. **Groq API** — Requires `GROQ_API_KEY`. Good cloud fallback when you want a free hosted STT option.
**OpenAI API** — Accepts `VOICE_TOOLS_OPENAI_KEY` first and falls back to `OPENAI_API_KEY`. Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
**Custom local CLI fallback** — Set `HERMES_LOCAL_STT_COMMAND` if you want Hermes to call a local transcription command directly. The command template supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders.
### Fallback Behavior ### Fallback Behavior
If your configured provider isn't available, Hermes automatically falls back: If your configured provider isn't available, Hermes automatically falls back:
- **Local not installed** → Falls back to OpenAI API (if key is set) - **Local faster-whisper unavailable** → Tries a local `whisper` CLI or `HERMES_LOCAL_STT_COMMAND` before cloud providers
- **OpenAI key not set** → Falls back to local Whisper (if installed) - **Groq key not set** → Falls back to local transcription, then OpenAI
- **Neither available** → Voice messages pass through with a note to the user - **OpenAI key not set** → Falls back to local transcription, then Groq
- **Nothing available** → Voice messages pass through with an accurate note to the user