mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: restore local STT fallback for gateway voice notes
Restore local STT command fallback for voice transcription, detect whisper and ffmpeg in common local install paths, and avoid bogus no-provider messaging when only a backend-specific key is missing.
This commit is contained in:
parent
5beb681c70
commit
1f72ce71b7
6 changed files with 324 additions and 18 deletions
|
|
@ -25,6 +25,10 @@ Usage::
|
|||
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
|
|
@ -44,13 +48,18 @@ _HAS_OPENAI = _ilu.find_spec("openai") is not None
|
|||
|
||||
DEFAULT_PROVIDER = "local"
|
||||
DEFAULT_LOCAL_MODEL = "base"
|
||||
DEFAULT_LOCAL_STT_LANGUAGE = "en"
|
||||
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
||||
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
||||
LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND"
|
||||
LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE"
|
||||
COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
|
||||
|
||||
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
|
||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
||||
LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
|
||||
MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB
|
||||
|
||||
# Known model sets for auto-correction
|
||||
|
|
@ -105,6 +114,53 @@ def is_stt_enabled(stt_config: Optional[dict] = None) -> bool:
|
|||
return bool(enabled)
|
||||
|
||||
|
||||
def _resolve_openai_api_key() -> str:
    """Return the OpenAI API key to use for speech-to-text.

    ``VOICE_TOOLS_OPENAI_KEY`` takes precedence; ``OPENAI_API_KEY`` is used
    when the former is unset or empty.

    Returns:
        The first non-empty key found, or an empty string if neither
        environment variable holds a value.
    """
    # Order matters: the voice-specific key wins over the general one.
    for env_name in ("VOICE_TOOLS_OPENAI_KEY", "OPENAI_API_KEY"):
        api_key = os.getenv(env_name, "")
        if api_key:
            return api_key
    return ""
|
||||
|
||||
|
||||
def _find_binary(
    binary_name: str,
    search_dirs: Optional[tuple[str, ...]] = None,
) -> Optional[str]:
    """Find a local executable, checking common install prefixes before PATH.

    Args:
        binary_name: Bare name of the executable (e.g. ``"ffmpeg"``).
        search_dirs: Directories to probe before consulting ``PATH``.
            Defaults to ``COMMON_LOCAL_BIN_DIRS`` when ``None``.

    Returns:
        The path of the first matching executable file as a string, or
        whatever ``shutil.which`` returns (``None`` if nothing is found).
    """
    directories = COMMON_LOCAL_BIN_DIRS if search_dirs is None else search_dirs
    for directory in directories:
        candidate = Path(directory) / binary_name
        # is_file() instead of exists(): a directory with the same name also
        # passes the X_OK check and must not be reported as the binary.
        if candidate.is_file() and os.access(candidate, os.X_OK):
            return str(candidate)
    return shutil.which(binary_name)
|
||||
|
||||
|
||||
def _find_ffmpeg_binary() -> Optional[str]:
    """Return the ``ffmpeg`` executable located by ``_find_binary``, if any."""
    ffmpeg_path = _find_binary("ffmpeg")
    return ffmpeg_path
|
||||
|
||||
|
||||
def _find_whisper_binary() -> Optional[str]:
    """Return the ``whisper`` executable located by ``_find_binary``, if any."""
    whisper_path = _find_binary("whisper")
    return whisper_path
|
||||
|
||||
|
||||
def _get_local_command_template() -> Optional[str]:
    """Return the shell command template used for local CLI transcription.

    An explicit template from the ``LOCAL_STT_COMMAND_ENV`` environment
    variable wins. Otherwise, if a ``whisper`` binary can be located, a
    default template is built around it.

    Returns:
        A ``str.format`` template with ``{input_path}``, ``{model}``,
        ``{output_dir}`` and ``{language}`` placeholders (for the built-in
        default), the user-supplied template verbatim after stripping
        surrounding whitespace, or ``None`` when neither source is available.
    """
    override = os.getenv(LOCAL_STT_COMMAND_ENV, "").strip()
    if override:
        return override

    whisper_path = _find_whisper_binary()
    if not whisper_path:
        return None

    # The binary path is shell-quoted now; the placeholders are filled in
    # (and quoted) later by the caller.
    template_parts = (
        shlex.quote(whisper_path),
        "{input_path}",
        "--model {model}",
        "--output_format txt",
        "--output_dir {output_dir}",
        "--language {language}",
    )
    return " ".join(template_parts)
|
||||
|
||||
|
||||
def _has_local_command() -> bool:
    """Report whether a local STT command template can be resolved."""
    template = _get_local_command_template()
    return template is not None
|
||||
|
||||
|
||||
def _normalize_local_command_model(model_name: Optional[str]) -> str:
    """Pick the model name to hand to the local STT command.

    Args:
        model_name: Requested model, possibly ``None`` or empty.

    Returns:
        ``DEFAULT_LOCAL_MODEL`` when no model was given or when the name is a
        member of ``OPENAI_MODELS`` or ``GROQ_MODELS``; otherwise the name
        unchanged.
    """
    if not model_name:
        return DEFAULT_LOCAL_MODEL
    # NOTE(review): OPENAI_MODELS / GROQ_MODELS are defined elsewhere in the
    # file; presumably they list API-only model names. Confirm.
    listed_for_api_provider = model_name in OPENAI_MODELS or model_name in GROQ_MODELS
    return DEFAULT_LOCAL_MODEL if listed_for_api_provider else model_name
|
||||
|
||||
|
||||
def _get_provider(stt_config: dict) -> str:
|
||||
"""Determine which STT provider to use.
|
||||
|
||||
|
|
@ -121,15 +177,32 @@ def _get_provider(stt_config: dict) -> str:
|
|||
if provider == "local":
|
||||
if _HAS_FASTER_WHISPER:
|
||||
return "local"
|
||||
if _has_local_command():
|
||||
logger.info("faster-whisper not installed, falling back to local STT command")
|
||||
return "local_command"
|
||||
# Local requested but not available — fall back to groq, then openai
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("faster-whisper not installed, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||
logger.info("faster-whisper not installed, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "local_command":
|
||||
if _has_local_command():
|
||||
return "local_command"
|
||||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("Local STT command unavailable, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("Local STT command unavailable, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||
logger.info("Local STT command unavailable, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "groq":
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
return "groq"
|
||||
|
|
@ -137,20 +210,26 @@ def _get_provider(stt_config: dict) -> str:
|
|||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("GROQ_API_KEY not set, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
if _has_local_command():
|
||||
logger.info("GROQ_API_KEY not set, falling back to local STT command")
|
||||
return "local_command"
|
||||
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||
logger.info("GROQ_API_KEY not set, falling back to OpenAI Whisper API")
|
||||
return "openai"
|
||||
return "none"
|
||||
|
||||
if provider == "openai":
|
||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||
if _HAS_OPENAI and _resolve_openai_api_key():
|
||||
return "openai"
|
||||
# OpenAI requested but no key — fall back
|
||||
if _HAS_FASTER_WHISPER:
|
||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper")
|
||||
logger.info("OpenAI STT key not set, falling back to local faster-whisper")
|
||||
return "local"
|
||||
if _has_local_command():
|
||||
logger.info("OpenAI STT key not set, falling back to local STT command")
|
||||
return "local_command"
|
||||
if _HAS_OPENAI and os.getenv("GROQ_API_KEY"):
|
||||
logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to Groq Whisper API")
|
||||
logger.info("OpenAI STT key not set, falling back to Groq Whisper API")
|
||||
return "groq"
|
||||
return "none"
|
||||
|
||||
|
|
@ -222,6 +301,89 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]:
|
|||
logger.error("Local transcription failed: %s", e, exc_info=True)
|
||||
return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
||||
|
||||
|
||||
def _prepare_local_audio(file_path: str, work_dir: str) -> tuple[Optional[str], Optional[str]]:
    """Normalize audio for local CLI STT when needed.

    Files whose extension is in ``LOCAL_NATIVE_AUDIO_FORMATS`` are passed
    through untouched. Anything else is converted to WAV with ffmpeg inside
    ``work_dir``.

    Args:
        file_path: Path of the source audio file.
        work_dir: Existing directory where the converted file is written.

    Returns:
        ``(path, None)`` on success, where ``path`` is either the original
        ``file_path`` or the converted ``.wav`` file; ``(None, message)`` when
        ffmpeg is missing, cannot be launched, or exits with an error.
    """
    audio_path = Path(file_path)
    if audio_path.suffix.lower() in LOCAL_NATIVE_AUDIO_FORMATS:
        return file_path, None

    ffmpeg = _find_ffmpeg_binary()
    if not ffmpeg:
        return None, "Local STT fallback requires ffmpeg for non-WAV inputs, but ffmpeg was not found"

    converted_path = os.path.join(work_dir, f"{audio_path.stem}.wav")
    command = [ffmpeg, "-y", "-i", file_path, converted_path]

    try:
        # ffmpeg reads interactive commands from stdin by default; detach it
        # so a long-running parent process never blocks on (or loses) input.
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            stdin=subprocess.DEVNULL,
        )
    except subprocess.CalledProcessError as e:
        details = (e.stderr or "").strip() or (e.stdout or "").strip() or str(e)
        logger.error("ffmpeg conversion failed for %s: %s", file_path, details)
        return None, f"Failed to convert audio for local STT: {details}"
    except OSError as e:
        # The binary vanished or is not runnable; report it through the same
        # (None, error) contract rather than raising.
        logger.error("ffmpeg conversion failed for %s: %s", file_path, e)
        return None, f"Failed to convert audio for local STT: {e}"

    return converted_path, None
|
||||
|
||||
|
||||
def _transcribe_local_command(file_path: str, model_name: str) -> Dict[str, Any]:
    """Run the configured local STT command template and read back a .txt transcript.

    The template from ``_get_local_command_template`` is filled with
    shell-quoted values for ``input_path``, ``output_dir``, ``language`` and
    ``model``, executed through the shell, and the first ``*.txt`` file (in
    sorted order) found in the temporary output directory is returned as the
    transcript.

    Args:
        file_path: Path of the audio file to transcribe.
        model_name: Requested model; passed through
            ``_normalize_local_command_model`` before use.

    Returns:
        A dict with ``success`` and ``transcript`` keys, plus ``provider`` on
        success or ``error`` on failure. This function does not raise; every
        failure is reported through the returned dict.
    """
    command_template = _get_local_command_template()
    if not command_template:
        return {
            "success": False,
            "transcript": "",
            "error": (
                f"{LOCAL_STT_COMMAND_ENV} not configured and no local whisper binary was found"
            ),
        }

    # An env var that is set but blank would otherwise yield `--language ''`;
    # treat it the same as unset.
    language = os.getenv(LOCAL_STT_LANGUAGE_ENV, "").strip() or DEFAULT_LOCAL_STT_LANGUAGE
    normalized_model = _normalize_local_command_model(model_name)

    try:
        with tempfile.TemporaryDirectory(prefix="hermes-local-stt-") as output_dir:
            prepared_input, prep_error = _prepare_local_audio(file_path, output_dir)
            if prep_error:
                return {"success": False, "transcript": "", "error": prep_error}

            # Every substituted value is shell-quoted because the resulting
            # string is executed with shell=True.
            command = command_template.format(
                input_path=shlex.quote(prepared_input),
                output_dir=shlex.quote(output_dir),
                language=shlex.quote(language),
                model=shlex.quote(normalized_model),
            )
            subprocess.run(command, shell=True, check=True, capture_output=True, text=True)

            txt_files = sorted(Path(output_dir).glob("*.txt"))
            if not txt_files:
                return {
                    "success": False,
                    "transcript": "",
                    "error": "Local STT command completed but did not produce a .txt transcript",
                }

            # Read before leaving the `with` block: the directory is deleted on exit.
            transcript_text = txt_files[0].read_text(encoding="utf-8").strip()
            logger.info(
                "Transcribed %s via local STT command (%s, %d chars)",
                Path(file_path).name,
                normalized_model,
                len(transcript_text),
            )
            return {"success": True, "transcript": transcript_text, "provider": "local_command"}

    except KeyError as e:
        # Raised by str.format when the template names a placeholder that is
        # not one of the four supplied above.
        return {
            "success": False,
            "transcript": "",
            "error": f"Invalid {LOCAL_STT_COMMAND_ENV} template, missing placeholder: {e}",
        }
    except subprocess.CalledProcessError as e:
        details = e.stderr.strip() or e.stdout.strip() or str(e)
        logger.error("Local STT command failed for %s: %s", file_path, details)
        return {"success": False, "transcript": "", "error": f"Local STT failed: {details}"}
    except Exception as e:
        logger.error("Unexpected error during local command transcription: %s", e, exc_info=True)
        return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: groq (Whisper API — free tier)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -277,9 +439,13 @@ def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]:
|
|||
|
||||
def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]:
|
||||
"""Transcribe using OpenAI Whisper API (paid)."""
|
||||
api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY")
|
||||
api_key = _resolve_openai_api_key()
|
||||
if not api_key:
|
||||
return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"}
|
||||
return {
|
||||
"success": False,
|
||||
"transcript": "",
|
||||
"error": "Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set",
|
||||
}
|
||||
|
||||
if not _HAS_OPENAI:
|
||||
return {"success": False, "transcript": "", "error": "openai package not installed"}
|
||||
|
|
@ -363,6 +529,13 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
||||
return _transcribe_local(file_path, model_name)
|
||||
|
||||
if provider == "local_command":
|
||||
local_cfg = stt_config.get("local", {})
|
||||
model_name = _normalize_local_command_model(
|
||||
model or local_cfg.get("model", DEFAULT_LOCAL_MODEL)
|
||||
)
|
||||
return _transcribe_local_command(file_path, model_name)
|
||||
|
||||
if provider == "groq":
|
||||
model_name = model or DEFAULT_GROQ_STT_MODEL
|
||||
return _transcribe_groq(file_path, model_name)
|
||||
|
|
@ -378,7 +551,8 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
"transcript": "",
|
||||
"error": (
|
||||
"No STT provider available. Install faster-whisper for free local "
|
||||
"transcription, set GROQ_API_KEY for free Groq Whisper, "
|
||||
"or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API."
|
||||
f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
|
||||
"set GROQ_API_KEY for free Groq Whisper, or set VOICE_TOOLS_OPENAI_KEY "
|
||||
"or OPENAI_API_KEY for the OpenAI Whisper API."
|
||||
),
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue