mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge pull request #1490 from NousResearch/fix/1033-telegram-voice-fallback
fix: restore local STT fallback for gateway voice notes
This commit is contained in:
commit
70e24d77a1
6 changed files with 324 additions and 18 deletions
|
|
@ -3635,7 +3635,10 @@ class GatewayRunner:
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
error = result.get("error", "unknown error")
|
error = result.get("error", "unknown error")
|
||||||
if "No STT provider" in error or "not set" in error:
|
if (
|
||||||
|
"No STT provider" in error
|
||||||
|
or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set")
|
||||||
|
):
|
||||||
enriched_parts.append(
|
enriched_parts.append(
|
||||||
"[The user sent a voice message but I can't listen "
|
"[The user sent a voice message but I can't listen "
|
||||||
"to it right now~ No STT provider is configured "
|
"to it right now~ No STT provider is configured "
|
||||||
|
|
|
||||||
|
|
@ -51,3 +51,27 @@ async def test_enrich_message_with_transcription_skips_when_stt_disabled():
|
||||||
|
|
||||||
assert "transcription is disabled" in result.lower()
|
assert "transcription is disabled" in result.lower()
|
||||||
assert "caption" in result
|
assert "caption" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_enrich_message_with_transcription_avoids_bogus_no_provider_message_for_backend_key_errors():
    """A backend key error is reported as a transcription problem, not as "no provider"."""
    from gateway.run import GatewayRunner

    # Build the runner without running __init__; only `config` is set up here.
    gateway = GatewayRunner.__new__(GatewayRunner)
    gateway.config = GatewayConfig(stt_enabled=True)

    failed_transcription = {"success": False, "error": "VOICE_TOOLS_OPENAI_KEY not set"}
    transcribe_patch = patch(
        "tools.transcription_tools.transcribe_audio",
        return_value=failed_transcription,
    )
    model_patch = patch(
        "tools.transcription_tools.get_stt_model_from_config",
        return_value=None,
    )

    with transcribe_patch, model_patch:
        enriched = await gateway._enrich_message_with_transcription(
            "caption",
            ["/tmp/voice.ogg"],
        )

    assert "No STT provider is configured" not in enriched
    for expected_fragment in ("trouble transcribing", "caption"):
        assert expected_fragment in enriched
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ end-to-end dispatch. All external dependencies are mocked.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import struct
|
import struct
|
||||||
|
import subprocess
|
||||||
import wave
|
import wave
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
|
@ -45,7 +46,10 @@ def sample_ogg(tmp_path):
|
||||||
def clean_env(monkeypatch):
|
def clean_env(monkeypatch):
|
||||||
"""Ensure no real API keys leak into tests."""
|
"""Ensure no real API keys leak into tests."""
|
||||||
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
|
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
|
||||||
|
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||||
monkeypatch.delenv("GROQ_API_KEY", raising=False)
|
monkeypatch.delenv("GROQ_API_KEY", raising=False)
|
||||||
|
monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False)
|
||||||
|
monkeypatch.delenv("HERMES_LOCAL_STT_LANGUAGE", raising=False)
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
@ -132,6 +136,19 @@ class TestGetProviderFallbackPriority:
|
||||||
from tools.transcription_tools import _get_provider
|
from tools.transcription_tools import _get_provider
|
||||||
assert _get_provider({}) == "local"
|
assert _get_provider({}) == "local"
|
||||||
|
|
||||||
|
def test_openai_fallback_to_local_command(self, monkeypatch):
    """With no cloud keys and no faster-whisper, "openai" resolves to "local_command"."""
    for cloud_key in ("VOICE_TOOLS_OPENAI_KEY", "OPENAI_API_KEY", "GROQ_API_KEY"):
        monkeypatch.delenv(cloud_key, raising=False)

    command_template = "whisper {input_path} --output_dir {output_dir} --language {language}"
    monkeypatch.setenv("HERMES_LOCAL_STT_COMMAND", command_template)

    no_faster_whisper = patch("tools.transcription_tools._HAS_FASTER_WHISPER", False)
    openai_installed = patch("tools.transcription_tools._HAS_OPENAI", True)
    with no_faster_whisper, openai_installed:
        from tools.transcription_tools import _get_provider

        resolved = _get_provider({"provider": "openai"})
        assert resolved == "local_command"
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# _transcribe_groq
|
# _transcribe_groq
|
||||||
|
|
@ -279,6 +296,63 @@ class TestTranscribeOpenAIExtended:
|
||||||
assert "Permission denied" in result["error"]
|
assert "Permission denied" in result["error"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestTranscribeLocalCommand:
    """Tests for the local command-line speech-to-text path."""

    def test_auto_detects_local_whisper_binary(self, monkeypatch):
        """Without an explicit command, a discovered whisper binary yields a template."""
        monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False)
        monkeypatch.setattr(
            "tools.transcription_tools._find_whisper_binary",
            lambda: "/opt/homebrew/bin/whisper",
        )

        from tools.transcription_tools import _get_local_command_template

        detected = _get_local_command_template()

        assert detected is not None
        assert detected.startswith("/opt/homebrew/bin/whisper ")
        for placeholder in ("{model}", "{output_dir}"):
            assert placeholder in detected

    def test_command_fallback_with_template(self, monkeypatch, sample_ogg, tmp_path):
        """The configured command runs and its .txt output becomes the transcript."""
        work_dir = tmp_path / "local-out"
        work_dir.mkdir()

        monkeypatch.setenv(
            "HERMES_LOCAL_STT_COMMAND",
            "whisper {input_path} --model {model} --output_dir {output_dir} --language {language}",
        )
        monkeypatch.setenv("HERMES_LOCAL_STT_LANGUAGE", "en")

        class _FixedTempDir:
            """Context manager that always hands back the pre-created directory."""

            def __enter__(self_inner):
                return str(work_dir)

            def __exit__(self_inner, exc_type, exc, tb):
                return False

        def fake_tempdir(prefix=None):
            return _FixedTempDir()

        def fake_run(cmd, *args, **kwargs):
            completed = subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
            if not isinstance(cmd, list):
                # Non-list command: pretend the STT command wrote its transcript file.
                (work_dir / "test.txt").write_text("hello from local command\n", encoding="utf-8")
                return completed

            # List command: the last argument is the output path; write a stub file there.
            with open(cmd[-1], "wb") as converted:
                converted.write(b"RIFF....WAVEfmt ")
            return completed

        replacements = {
            "tools.transcription_tools.tempfile.TemporaryDirectory": fake_tempdir,
            "tools.transcription_tools._find_ffmpeg_binary": lambda: "/opt/homebrew/bin/ffmpeg",
            "tools.transcription_tools.subprocess.run": fake_run,
        }
        for target, replacement in replacements.items():
            monkeypatch.setattr(target, replacement)

        from tools.transcription_tools import _transcribe_local_command

        outcome = _transcribe_local_command(sample_ogg, "base")

        assert outcome["success"] is True
        assert outcome["transcript"] == "hello from local command"
        assert outcome["provider"] == "local_command"
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# _transcribe_local — additional tests
|
# _transcribe_local — additional tests
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
@ -612,6 +686,29 @@ class TestTranscribeAudioDispatch:
|
||||||
assert "faster-whisper" in result["error"]
|
assert "faster-whisper" in result["error"]
|
||||||
assert "GROQ_API_KEY" in result["error"]
|
assert "GROQ_API_KEY" in result["error"]
|
||||||
|
|
||||||
|
def test_openai_provider_falls_back_to_local_command(self, monkeypatch, sample_ogg):
    """transcribe_audio dispatches to the local command when "openai" has no key."""
    for openai_key in ("VOICE_TOOLS_OPENAI_KEY", "OPENAI_API_KEY"):
        monkeypatch.delenv(openai_key, raising=False)
    monkeypatch.setenv(
        "HERMES_LOCAL_STT_COMMAND",
        "whisper {input_path} --model {model} --output_dir {output_dir} --language {language}",
    )

    fallback_result = {
        "success": True,
        "transcript": "hello from fallback",
        "provider": "local_command",
    }
    stt_config = patch("tools.transcription_tools._load_stt_config", return_value={"provider": "openai"})
    no_faster_whisper = patch("tools.transcription_tools._HAS_FASTER_WHISPER", False)
    openai_installed = patch("tools.transcription_tools._HAS_OPENAI", True)
    local_command = patch(
        "tools.transcription_tools._transcribe_local_command",
        return_value=fallback_result,
    )

    with stt_config, no_faster_whisper, openai_installed, local_command as mock_local_command:
        from tools.transcription_tools import transcribe_audio

        result = transcribe_audio(sample_ogg)

    assert result["success"] is True
    assert result["transcript"] == "hello from fallback"
    mock_local_command.assert_called_once_with(sample_ogg, "base")
|
||||||
|
|
||||||
def test_invalid_file_short_circuits(self):
|
def test_invalid_file_short_circuits(self):
|
||||||
from tools.transcription_tools import transcribe_audio
|
from tools.transcription_tools import transcribe_audio
|
||||||
result = transcribe_audio("/nonexistent/audio.wav")
|
result = transcribe_audio("/nonexistent/audio.wav")
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,10 @@ Usage::
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import shlex
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Dict, Any
|
from typing import Optional, Dict, Any
|
||||||
|
|
||||||
|
|
@ -44,13 +48,18 @@ _HAS_OPENAI = _ilu.find_spec("openai") is not None
|
||||||
|
|
||||||
DEFAULT_PROVIDER = "local"
|
DEFAULT_PROVIDER = "local"
|
||||||
DEFAULT_LOCAL_MODEL = "base"
|
DEFAULT_LOCAL_MODEL = "base"
|
||||||
|
DEFAULT_LOCAL_STT_LANGUAGE = "en"
|
||||||
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
||||||
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
||||||
|
LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND"
|
||||||
|
LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE"
|
||||||
|
COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
|
||||||
|
|
||||||
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||||
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||||
|
|
||||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
||||||
|
LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
|
||||||
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
|
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
|
||||||
|
|
||||||
# Known model sets for auto-correction
|
# Known model sets for auto-correction
|
||||||
|
|
@ -105,6 +114,53 @@ def is_stt_enabled(stt_config: Optional[dict] = None) -> bool:
|
||||||
return bool(enabled)
|
return bool(enabled)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_openai_api_key() -> str:
    """Return the OpenAI API key to use, preferring the voice-tools key.

    ``VOICE_TOOLS_OPENAI_KEY`` is checked first, then ``OPENAI_API_KEY``.
    Values are stripped, so a blank or whitespace-only entry counts as
    "not set" and the next variable is tried.

    Returns:
        The resolved key, or an empty string when neither variable holds one.
    """
    for env_name in ("VOICE_TOOLS_OPENAI_KEY", "OPENAI_API_KEY"):
        api_key = os.getenv(env_name, "").strip()
        if api_key:
            return api_key
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _find_binary(binary_name: str, search_dirs: Optional[tuple] = None) -> Optional[str]:
    """Find a local binary, checking common Homebrew/local prefixes as well as PATH.

    Args:
        binary_name: Executable name to look for (e.g. ``"ffmpeg"``).
        search_dirs: Directories probed before PATH.  Defaults to
            ``COMMON_LOCAL_BIN_DIRS`` when omitted.

    Returns:
        The path of the first matching executable file, or ``None`` when the
        binary is neither in ``search_dirs`` nor on PATH.
    """
    directories = COMMON_LOCAL_BIN_DIRS if search_dirs is None else search_dirs
    for directory in directories:
        candidate = Path(directory) / binary_name
        # is_file() rather than exists(): a directory with the binary's name
        # also passes the X_OK check but cannot be executed.
        if candidate.is_file() and os.access(candidate, os.X_OK):
            return str(candidate)
    return shutil.which(binary_name)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_ffmpeg_binary() -> Optional[str]:
    """Locate the ``ffmpeg`` executable via ``_find_binary``."""
    ffmpeg_path = _find_binary("ffmpeg")
    return ffmpeg_path
|
||||||
|
|
||||||
|
|
||||||
|
def _find_whisper_binary() -> Optional[str]:
    """Locate the ``whisper`` executable via ``_find_binary``."""
    whisper_path = _find_binary("whisper")
    return whisper_path
|
||||||
|
|
||||||
|
|
||||||
|
def _get_local_command_template() -> Optional[str]:
    """Return the shell command template used for local CLI transcription.

    The value of the ``LOCAL_STT_COMMAND_ENV`` environment variable wins when
    it is non-blank.  Otherwise a default template is built around a
    discovered ``whisper`` binary.

    Returns:
        A ``str.format`` template with ``{input_path}``, ``{model}``,
        ``{output_dir}`` and ``{language}`` placeholders (for the built-in
        default), the user-configured template verbatim, or ``None`` when
        nothing is configured and no whisper binary is found.
    """
    configured = os.getenv(LOCAL_STT_COMMAND_ENV, "").strip()
    if configured:
        return configured

    whisper_binary = _find_whisper_binary()
    if not whisper_binary:
        return None

    # The template is later expanded with str.format(), so literal braces in
    # the binary path must be doubled or they would be parsed as placeholders.
    quoted_binary = shlex.quote(whisper_binary).replace("{", "{{").replace("}", "}}")
    return (
        f"{quoted_binary} {{input_path}} --model {{model}} --output_format txt "
        "--output_dir {output_dir} --language {language}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _has_local_command() -> bool:
    """Report whether a local STT command template can be resolved."""
    template = _get_local_command_template()
    return template is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_local_command_model(model_name: Optional[str]) -> str:
    """Pick the model name to pass to the local STT command.

    A missing name, or one listed in ``OPENAI_MODELS`` or ``GROQ_MODELS``,
    is replaced by ``DEFAULT_LOCAL_MODEL``; any other name is returned as-is.
    """
    if not model_name:
        return DEFAULT_LOCAL_MODEL
    if model_name in OPENAI_MODELS or model_name in GROQ_MODELS:
        return DEFAULT_LOCAL_MODEL
    return model_name
|
||||||
|
|
||||||
|
|
||||||
def _get_provider(stt_config: dict) -> str:
|
def _get_provider(stt_config: dict) -> str:
|
||||||
"""Determine which STT provider to use.
|
"""Determine which STT provider to use.
|
||||||
|
|
||||||
|
|
@ -121,15 +177,32 @@ def _get_provider(stt_config: dict) -> str:
|
||||||
if provider == "local":
|
if provider == "local":
|
||||||
if _HAS_FASTER_WHISPER:
|
if _HAS_FASTER_WHISPER:
|
||||||
return "local"
|
return "local"
|
||||||
|
if _has_local_command():
|
||||||
|
logger.info("faster-whisper not installed, falling back to local STT command")
|
||||||
|
return "local_command"
|
||||||
# Local requested but not available — fall back to groq, then openai
|
# Local requested but not available — fall back to groq, then openai
|
||||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||||
logger.info("faster-whisper not installed, falling back to Groq Whisper API")
|
logger.info("faster-whisper not installed, falling back to Groq Whisper API")
|
||||||
return "groq"
|
return "groq"
|
||||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||||
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
|
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
|
||||||
return "openai"
|
return "openai"
|
||||||
return "none"
|
return "none"
|
||||||
|
|
||||||
|
if provider == "local_command":
|
||||||
|
if _has_local_command():
|
||||||
|
return "local_command"
|
||||||
|
if _HAS_FASTER_WHISPER:
|
||||||
|
logger.info("Local STT command unavailable, falling back to local faster-whisper")
|
||||||
|
return "local"
|
||||||
|
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||||
|
logger.info("Local STT command unavailable, falling back to Groq Whisper API")
|
||||||
|
return "groq"
|
||||||
|
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||||
|
logger.info("Local STT command unavailable, falling back to OpenAI Whisper API")
|
||||||
|
return "openai"
|
||||||
|
return "none"
|
||||||
|
|
||||||
if provider == "groq":
|
if provider == "groq":
|
||||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||||
return "groq"
|
return "groq"
|
||||||
|
|
@ -137,20 +210,26 @@ def _get_provider(stt_config: dict) -> str:
|
||||||
if _HAS_FASTER_WHISPER:
|
if _HAS_FASTER_WHISPER:
|
||||||
logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
|
logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
|
||||||
return "local"
|
return "local"
|
||||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
if _has_local_command():
|
||||||
|
logger.info("GROQ_API_KEY not set, falling back to local STT command")
|
||||||
|
return "local_command"
|
||||||
|
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||||
logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
|
logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
|
||||||
return "openai"
|
return "openai"
|
||||||
return "none"
|
return "none"
|
||||||
|
|
||||||
if provider == "openai":
|
if provider == "openai":
|
||||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||||
return "openai"
|
return "openai"
|
||||||
# OpenAI requested but no key — fall back
|
# OpenAI requested but no key — fall back
|
||||||
if _HAS_FASTER_WHISPER:
|
if _HAS_FASTER_WHISPER:
|
||||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper")
|
logger.info("OpenAI STT key not set, falling back to local faster-whisper")
|
||||||
return "local"
|
return "local"
|
||||||
|
if _has_local_command():
|
||||||
|
logger.info("OpenAI STT key not set, falling back to local STT command")
|
||||||
|
return "local_command"
|
||||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to Groq Whisper API")
|
logger.info("OpenAI STT key not set, falling back to Groq Whisper API")
|
||||||
return "groq"
|
return "groq"
|
||||||
return "none"
|
return "none"
|
||||||
|
|
||||||
|
|
@ -222,6 +301,89 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||||
logger.error("Local transcription failed: %s", e, exc_info=True)
|
logger.error("Local transcription failed: %s", e, exc_info=True)
|
||||||
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
||||||
|
|
||||||
|
|
||||||
|
def _prepare_local_audio(file_path: str, work_dir: str) -> tuple[Optional[str], Optional[str]]:
    """Make sure the audio is in a format the local CLI can read.

    Files whose suffix is in ``LOCAL_NATIVE_AUDIO_FORMATS`` are used as-is.
    Anything else is converted with ffmpeg into ``<stem>.wav`` in ``work_dir``.

    Returns:
        ``(path, None)`` on success, or ``(None, error_message)`` when ffmpeg
        is missing or the conversion fails.
    """
    source = Path(file_path)
    needs_conversion = source.suffix.lower() not in LOCAL_NATIVE_AUDIO_FORMATS
    if not needs_conversion:
        return file_path, None

    ffmpeg_binary = _find_ffmpeg_binary()
    if not ffmpeg_binary:
        return None, "Local STT fallback requires ffmpeg for non-WAV inputs, but ffmpeg was not found"

    wav_path = os.path.join(work_dir, f"{source.stem}.wav")
    try:
        # -y: overwrite an existing output file without prompting.
        subprocess.run(
            [ffmpeg_binary, "-y", "-i", file_path, wav_path],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        details = e.stderr.strip() or e.stdout.strip() or str(e)
        logger.error("ffmpeg conversion failed for %s: %s", file_path, details)
        return None, f"Failed to convert audio for local STT: {details}"
    return wav_path, None
|
||||||
|
|
||||||
|
|
||||||
|
def _transcribe_local_command(file_path: str, model_name: str) -> Dict[str, Any]:
    """Run the configured local STT command template and read back a .txt transcript.

    Args:
        file_path: Path of the audio file to transcribe.
        model_name: Requested model; normalized via
            ``_normalize_local_command_model`` before use.

    Returns:
        ``{"success": True, "transcript": ..., "provider": "local_command"}``
        on success, otherwise ``{"success": False, "transcript": "", "error": ...}``.
        Failures are returned as error dicts rather than raised.
    """
    command_template = _get_local_command_template()
    if not command_template:
        return {
            "success": False,
            "transcript": "",
            "error": (
                f"{LOCAL_STT_COMMAND_ENV} not configured and no local whisper binary was found"
            ),
        }

    # A variable that is set but blank must not turn into `--language ''`;
    # treat it as unset and use the default.
    language = os.getenv(LOCAL_STT_LANGUAGE_ENV, "").strip() or DEFAULT_LOCAL_STT_LANGUAGE
    normalized_model = _normalize_local_command_model(model_name)

    try:
        with tempfile.TemporaryDirectory(prefix="hermes-local-stt-") as output_dir:
            prepared_input, prep_error = _prepare_local_audio(file_path, output_dir)
            if prep_error:
                return {"success": False, "transcript": "", "error": prep_error}

            # Only the template expansion is guarded here so that template
            # mistakes are reported as such and not as a transcription failure.
            try:
                command = command_template.format(
                    input_path=shlex.quote(prepared_input),
                    output_dir=shlex.quote(output_dir),
                    language=shlex.quote(language),
                    model=shlex.quote(normalized_model),
                )
            except KeyError as e:
                return {
                    "success": False,
                    "transcript": "",
                    "error": f"Invalid {LOCAL_STT_COMMAND_ENV} template, missing placeholder: {e}",
                }
            except (IndexError, ValueError) as e:
                # Positional fields ("{}", "{0}") or unbalanced braces.
                return {
                    "success": False,
                    "transcript": "",
                    "error": f"Invalid {LOCAL_STT_COMMAND_ENV} template: {e}",
                }

            # shell=True is required because the template is a shell command
            # line; every substituted value was shell-quoted above.
            subprocess.run(command, shell=True, check=True, capture_output=True, text=True)

            txt_files = sorted(Path(output_dir).glob("*.txt"))
            if not txt_files:
                return {
                    "success": False,
                    "transcript": "",
                    "error": "Local STT command completed but did not produce a .txt transcript",
                }

            transcript_text = txt_files[0].read_text(encoding="utf-8").strip()
            logger.info(
                "Transcribed %s via local STT command (%s, %d chars)",
                Path(file_path).name,
                normalized_model,
                len(transcript_text),
            )
            return {"success": True, "transcript": transcript_text, "provider": "local_command"}

    except subprocess.CalledProcessError as e:
        details = e.stderr.strip() or e.stdout.strip() or str(e)
        logger.error("Local STT command failed for %s: %s", file_path, details)
        return {"success": False, "transcript": "", "error": f"Local STT failed: {details}"}
    except Exception as e:
        logger.error("Unexpected error during local command transcription: %s", e, exc_info=True)
        return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Provider: groq (Whisper API — free tier)
|
# Provider: groq (Whisper API — free tier)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -277,9 +439,13 @@ def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||||
|
|
||||||
def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||||
"""Transcribe using OpenAI Whisper API (paid)."""
|
"""Transcribe using OpenAI Whisper API (paid)."""
|
||||||
api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY")
|
api_key = _resolve_openai_api_key()
|
||||||
if not api_key:
|
if not api_key:
|
||||||
return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"}
|
return {
|
||||||
|
"success": False,
|
||||||
|
"transcript": "",
|
||||||
|
"error": "Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set",
|
||||||
|
}
|
||||||
|
|
||||||
if not _HAS_OPENAI:
|
if not _HAS_OPENAI:
|
||||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
||||||
|
|
@ -363,6 +529,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||||
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
||||||
return _transcribe_local(file_path, model_name)
|
return _transcribe_local(file_path, model_name)
|
||||||
|
|
||||||
|
if provider == "local_command":
|
||||||
|
local_cfg = stt_config.get("local", {})
|
||||||
|
model_name = _normalize_local_command_model(
|
||||||
|
model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
||||||
|
)
|
||||||
|
return _transcribe_local_command(file_path, model_name)
|
||||||
|
|
||||||
if provider == "groq":
|
if provider == "groq":
|
||||||
model_name = model or DEFAULT_GROQ_STT_MODEL
|
model_name = model or DEFAULT_GROQ_STT_MODEL
|
||||||
return _transcribe_groq(file_path, model_name)
|
return _transcribe_groq(file_path, model_name)
|
||||||
|
|
@ -378,7 +551,8 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
||||||
"transcript": "",
|
"transcript": "",
|
||||||
"error": (
|
"error": (
|
||||||
"No STT provider available. Install faster-whisper for free local "
|
"No STT provider available. Install faster-whisper for free local "
|
||||||
"transcription, set GROQ_API_KEY for free Groq Whisper, "
|
f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
|
||||||
"or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
"set GROQ_API_KEY for free Groq Whisper, or set VOICE_TOOLS_OPENAI_KEY "
|
||||||
|
"or OPENAI_API_KEY for the OpenAI Whisper API."
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -31,7 +31,9 @@ All variables go in `~/.hermes/.env`. You can also set them with `hermes config
|
||||||
| `CLAUDE_CODE_OAUTH_TOKEN` | Explicit Claude Code token override if you export one manually |
|
| `CLAUDE_CODE_OAUTH_TOKEN` | Explicit Claude Code token override if you export one manually |
|
||||||
| `HERMES_MODEL` | Preferred model name (checked before `LLM_MODEL`, used by gateway) |
|
| `HERMES_MODEL` | Preferred model name (checked before `LLM_MODEL`, used by gateway) |
|
||||||
| `LLM_MODEL` | Default model name (fallback when not set in config.yaml) |
|
| `LLM_MODEL` | Default model name (fallback when not set in config.yaml) |
|
||||||
| `VOICE_TOOLS_OPENAI_KEY` | OpenAI key for OpenAI speech-to-text and text-to-speech providers |
|
| `VOICE_TOOLS_OPENAI_KEY` | Preferred OpenAI key for OpenAI speech-to-text and text-to-speech providers; speech-to-text falls back to `OPENAI_API_KEY` when this is unset |
|
||||||
|
| `HERMES_LOCAL_STT_COMMAND` | Optional local speech-to-text command template. Supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders |
|
||||||
|
| `HERMES_LOCAL_STT_LANGUAGE` | Default language passed to `HERMES_LOCAL_STT_COMMAND` or auto-detected local `whisper` CLI fallback (default: `en`) |
|
||||||
| `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) |
|
| `HERMES_HOME` | Override Hermes config directory (default: `~/.hermes`) |
|
||||||
|
|
||||||
## Provider Auth (OAuth)
|
## Provider Auth (OAuth)
|
||||||
|
|
|
||||||
|
|
@ -74,10 +74,11 @@ Voice messages sent on Telegram, Discord, WhatsApp, Slack, or Signal are automat
|
||||||
| Provider | Quality | Cost | API Key |
|
| Provider | Quality | Cost | API Key |
|
||||||
|----------|---------|------|---------|
|
|----------|---------|------|---------|
|
||||||
| **Local Whisper** (default) | Good | Free | None needed |
|
| **Local Whisper** (default) | Good | Free | None needed |
|
||||||
| **OpenAI Whisper API** | Good–Best | Paid | `VOICE_TOOLS_OPENAI_KEY` |
|
| **Groq Whisper API** | Good–Best | Free tier | `GROQ_API_KEY` |
|
||||||
|
| **OpenAI Whisper API** | Good–Best | Paid | `VOICE_TOOLS_OPENAI_KEY` or `OPENAI_API_KEY` |
|
||||||
|
|
||||||
:::info Zero Config
|
:::info Zero Config
|
||||||
Local transcription works out of the box — no API key needed. The `faster-whisper` model (~150 MB for `base`) is auto-downloaded on first voice message.
|
Local transcription works out of the box when `faster-whisper` is installed. If that's unavailable, Hermes can also use a local `whisper` CLI from common install locations (like `/opt/homebrew/bin`) or a custom command via `HERMES_LOCAL_STT_COMMAND`.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
### Configuration
|
### Configuration
|
||||||
|
|
@ -85,7 +86,7 @@ Local transcription works out of the box — no API key needed. The `faster-whis
|
||||||
```yaml
|
```yaml
|
||||||
# In ~/.hermes/config.yaml
|
# In ~/.hermes/config.yaml
|
||||||
stt:
|
stt:
|
||||||
provider: "local" # "local" (free, faster-whisper) | "openai" (API)
|
provider: "local" # "local" | "local_command" | "groq" | "openai"
|
||||||
local:
|
local:
|
||||||
model: "base" # tiny, base, small, medium, large-v3
|
model: "base" # tiny, base, small, medium, large-v3
|
||||||
openai:
|
openai:
|
||||||
|
|
@ -104,11 +105,16 @@ stt:
|
||||||
| `medium` | ~1.5 GB | Slower | Great |
|
| `medium` | ~1.5 GB | Slower | Great |
|
||||||
| `large-v3` | ~3 GB | Slowest | Best |
|
| `large-v3` | ~3 GB | Slowest | Best |
|
||||||
|
|
||||||
**OpenAI API** — Requires `VOICE_TOOLS_OPENAI_KEY`. Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
|
**Groq API** — Requires `GROQ_API_KEY`. Good cloud fallback when you want a free hosted STT option.
|
||||||
|
|
||||||
|
**OpenAI API** — Accepts `VOICE_TOOLS_OPENAI_KEY` first and falls back to `OPENAI_API_KEY`. Supports `whisper-1`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`.
|
||||||
|
|
||||||
|
**Custom local CLI fallback** — Set `HERMES_LOCAL_STT_COMMAND` if you want Hermes to call a local transcription command directly. The command template supports `{input_path}`, `{output_dir}`, `{language}`, and `{model}` placeholders.
|
||||||
|
|
||||||
### Fallback Behavior
|
### Fallback Behavior
|
||||||
|
|
||||||
If your configured provider isn't available, Hermes automatically falls back:
|
If your configured provider isn't available, Hermes automatically falls back:
|
||||||
- **Local not installed** → Falls back to OpenAI API (if key is set)
|
- **Local faster-whisper unavailable** → Tries a local `whisper` CLI or `HERMES_LOCAL_STT_COMMAND` before cloud providers
|
||||||
- **OpenAI key not set** → Falls back to local Whisper (if installed)
|
- **Groq key not set** → Falls back to local transcription, then OpenAI
|
||||||
- **Neither available** → Voice messages pass through with a note to the user
|
- **OpenAI key not set** → Falls back to local transcription, then Groq
|
||||||
|
- **Nothing available** → Voice messages pass through with an accurate note to the user
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue