mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-27 01:11:40 +00:00
feat: add voice mode with push-to-talk and TTS output for CLI
Implements Issue #314 Phase 2 & 3: - /voice command to toggle voice mode (on/off/tts/status) - Ctrl+Space push-to-talk recording via sounddevice - Whisper STT transcription via existing transcription_tools - Optional TTS response playback via existing tts_tool - Visual indicators in prompt (recording/transcribing/voice) - 21 unit tests, all mocked (no real mic/API) - Optional deps: sounddevice, numpy (pip install hermes-agent[voice])
This commit is contained in:
parent
cf3dceafe1
commit
1a6fbef8a9
6 changed files with 977 additions and 1 deletions
344
tools/voice_mode.py
Normal file
344
tools/voice_mode.py
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
"""Voice Mode -- Push-to-talk audio recording and playback for the CLI.
|
||||
|
||||
Provides audio capture via sounddevice, WAV encoding via stdlib wave,
|
||||
STT dispatch via tools.transcription_tools, and TTS playback via
|
||||
sounddevice or system audio players.
|
||||
|
||||
Dependencies (optional):
|
||||
pip install sounddevice numpy
|
||||
or: pip install hermes-agent[voice]
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional imports with graceful degradation
|
||||
# ---------------------------------------------------------------------------
|
||||
# sounddevice loads the PortAudio shared library at import time and raises
# OSError (not ImportError) when that library is missing, so both exception
# types must be treated as "audio support unavailable".
try:
    import sounddevice as sd
    import numpy as np

    _HAS_AUDIO = True
except (ImportError, OSError):
    sd = None  # type: ignore[assignment]
    np = None  # type: ignore[assignment]
    _HAS_AUDIO = False
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Recording parameters
|
||||
# ---------------------------------------------------------------------------
|
||||
# Capture format: 16 kHz (Whisper native rate), mono, signed 16-bit PCM.
SAMPLE_RATE = 16_000
CHANNELS = 1
DTYPE = "int16"
# Bytes per sample; must agree with DTYPE (int16 -> 2 bytes).
SAMPLE_WIDTH = 2
# Safety cap on the length of a single recording, in seconds.
MAX_RECORDING_SECONDS = 120

# Directory (under the system temp dir) where voice recordings are written.
_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# AudioRecorder
|
||||
# ============================================================================
|
||||
class AudioRecorder:
    """Thread-safe audio recorder using sounddevice.InputStream.

    State changes (start/stop/cancel) are serialized by an internal lock.
    Captured chunks are accumulated in memory and written to a WAV file
    when the recording is stopped.

    Usage::

        recorder = AudioRecorder()
        recorder.start()
        # ... user speaks ...
        wav_path = recorder.stop()   # returns path to WAV file
        # or
        recorder.cancel()            # discard without saving
    """

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._stream: Any = None          # active sounddevice.InputStream, if any
        self._frames: List[Any] = []      # chunks copied out of the audio callback
        self._recording = False
        self._start_time: float = 0.0     # time.monotonic() at start()

    # -- public properties ---------------------------------------------------

    @property
    def is_recording(self) -> bool:
        """Whether a recording is currently in progress."""
        return self._recording

    @property
    def elapsed_seconds(self) -> float:
        """Seconds since ``start()``; ``0.0`` when not recording."""
        if not self._recording:
            return 0.0
        return time.monotonic() - self._start_time

    # -- public methods ------------------------------------------------------

    def start(self) -> None:
        """Start capturing audio from the default input device.

        Calling ``start()`` while a recording is already in progress is a
        no-op.

        Raises:
            RuntimeError: if sounddevice/numpy are not available.
            Exception: whatever sounddevice raises when the input stream
                cannot be opened or started; the recorder is left idle and
                the half-opened stream is closed.
        """
        if not _HAS_AUDIO:
            raise RuntimeError(
                "Voice mode requires sounddevice and numpy.\n"
                "Install with: pip install sounddevice numpy\n"
                "Or: pip install hermes-agent[voice]"
            )

        with self._lock:
            if self._recording:
                return  # already recording

            # Release any stream left behind by an earlier failed start so
            # it is not leaked when we overwrite the reference below.
            self._close_stream()

            self._frames = []
            self._start_time = time.monotonic()

            def _callback(indata, frames, time_info, status):  # noqa: ARG001
                if status:
                    logger.debug("sounddevice status: %s", status)
                # sounddevice reuses the input buffer, so keep a copy.
                self._frames.append(indata.copy())

            stream = sd.InputStream(
                samplerate=SAMPLE_RATE,
                channels=CHANNELS,
                dtype=DTYPE,
                callback=_callback,
            )
            try:
                stream.start()
            except Exception:
                # Do not leak the device handle if the stream cannot start.
                try:
                    stream.close()
                except Exception as exc:
                    logger.debug("Closing failed audio stream raised: %s", exc)
                raise

            self._stream = stream
            self._recording = True
            logger.info(
                "Voice recording started (rate=%d, channels=%d)",
                SAMPLE_RATE,
                CHANNELS,
            )

    def stop(self) -> Optional[str]:
        """Stop recording and write captured audio to a WAV file.

        Returns:
            Path to the WAV file, or ``None`` if no recording was in
            progress, no audio was captured, or the capture was shorter
            than 0.3 seconds.
        """
        with self._lock:
            if not self._recording:
                return None

            self._recording = False
            self._close_stream()

            # Take ownership of the captured chunks and reset the buffer.
            frames, self._frames = self._frames, []
            if not frames:
                return None

            audio_data = np.concatenate(frames, axis=0)

            elapsed = time.monotonic() - self._start_time
            logger.info(
                "Voice recording stopped (%.1fs, %d samples)",
                elapsed,
                len(audio_data),
            )

            # Skip very short recordings (< 0.3s of audio)
            min_samples = int(SAMPLE_RATE * 0.3)
            if len(audio_data) < min_samples:
                logger.debug(
                    "Recording too short (%d samples), discarding",
                    len(audio_data),
                )
                return None

            return self._write_wav(audio_data)

    def cancel(self) -> None:
        """Stop recording and discard all captured audio."""
        with self._lock:
            self._recording = False
            # Close the stream first so a late callback cannot repopulate
            # the buffer after it has been cleared.
            self._close_stream()
            self._frames = []

        logger.info("Voice recording cancelled")

    # -- private helpers -----------------------------------------------------

    def _close_stream(self) -> None:
        """Stop and close the current stream, if any.

        Must be called with ``self._lock`` held. Errors are logged rather
        than raised (teardown is best-effort), and ``close()`` is attempted
        even when ``stop()`` fails.
        """
        stream, self._stream = self._stream, None
        if stream is None:
            return
        for method_name in ("stop", "close"):
            try:
                getattr(stream, method_name)()
            except Exception as exc:
                logger.debug("Audio stream %s() failed: %s", method_name, exc)

    @staticmethod
    def _write_wav(audio_data) -> str:
        """Write numpy int16 audio data to a uniquely named WAV file.

        The file is created in ``_TEMP_DIR`` with a ``recording_<timestamp>_``
        prefix and ``.wav`` suffix. ``tempfile.mkstemp`` guarantees a fresh
        file, so two recordings finished within the same second do not
        overwrite each other.

        Returns the file path.
        """
        os.makedirs(_TEMP_DIR, exist_ok=True)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        fd, wav_path = tempfile.mkstemp(
            prefix=f"recording_{timestamp}_",
            suffix=".wav",
            dir=_TEMP_DIR,
        )
        # Only the reserved unique name is needed; wave reopens it by path.
        os.close(fd)

        try:
            with wave.open(wav_path, "wb") as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(SAMPLE_WIDTH)
                wf.setframerate(SAMPLE_RATE)
                wf.writeframes(audio_data.tobytes())
        except Exception:
            # Do not leave an empty or partial file behind on failure.
            try:
                os.unlink(wav_path)
            except OSError:
                pass
            raise

        file_size = os.path.getsize(wav_path)
        logger.info("WAV written: %s (%d bytes)", wav_path, file_size)
        return wav_path
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# STT dispatch
|
||||
# ============================================================================
|
||||
def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]:
    """Run speech-to-text on a recorded WAV file.

    This is a thin wrapper that forwards to
    ``tools.transcription_tools.transcribe_audio()``.

    Args:
        wav_path: Path to the WAV file.
        model: Whisper model name (default: from config or ``whisper-1``).

    Returns:
        Dict with ``success``, ``transcript``, and optionally ``error``.
    """
    # Imported at call time rather than at module import.
    from tools.transcription_tools import transcribe_audio

    result = transcribe_audio(wav_path, model=model)
    return result
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Audio playback
|
||||
# ============================================================================
|
||||
def play_audio_file(file_path: str) -> bool:
    """Play an audio file through the default output device.

    Strategy:
        1. 16-bit PCM WAV files via ``sounddevice.play()`` when available
           (mono or multi-channel).
        2. System commands, tried in order: ``afplay`` (macOS only),
           ``ffplay`` (any platform), ``aplay`` (Linux only). A player
           counts as successful only if it exits with status 0.

    Args:
        file_path: Path to the audio file to play.

    Returns:
        ``True`` if playback succeeded, ``False`` otherwise (file missing,
        no usable player, or every player failed).
    """
    if not os.path.isfile(file_path):
        logger.warning("Audio file not found: %s", file_path)
        return False

    # Try sounddevice for WAV files (extension match is case-insensitive).
    if _HAS_AUDIO and file_path.lower().endswith(".wav"):
        try:
            with wave.open(file_path, "rb") as wf:
                sample_width = wf.getsampwidth()
                channel_count = wf.getnchannels()
                sample_rate = wf.getframerate()
                frames = wf.readframes(wf.getnframes())

            # The buffer is decoded as int16 below; other sample widths
            # would be misinterpreted, so hand them to a system player.
            if sample_width != 2:
                raise ValueError(
                    f"unsupported sample width ({sample_width} bytes)"
                )

            audio_data = np.frombuffer(frames, dtype=np.int16)
            if channel_count > 1:
                # WAV interleaves channels; sounddevice expects
                # (frames, channels) for multi-channel audio.
                audio_data = audio_data.reshape(-1, channel_count)

            sd.play(audio_data, samplerate=sample_rate)
            sd.wait()
            return True
        except Exception as e:
            # Best-effort: any failure here falls through to system players.
            logger.debug("sounddevice playback failed: %s", e)

    # Fall back to system audio players
    system = platform.system()
    players = []

    if system == "Darwin":
        players.append(["afplay", file_path])
    players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
    if system == "Linux":
        players.append(["aplay", "-q", file_path])

    for cmd in players:
        if not shutil.which(cmd[0]):
            continue
        try:
            result = subprocess.run(cmd, capture_output=True, timeout=300)
        except (OSError, subprocess.SubprocessError) as e:
            logger.debug("System player %s failed: %s", cmd[0], e)
            continue

        if result.returncode == 0:
            return True
        # Non-zero exit means the player could not play the file; try the next.
        logger.debug(
            "System player %s exited with status %s", cmd[0], result.returncode
        )

    logger.warning("No audio player available for %s", file_path)
    return False
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Requirements check
|
||||
# ============================================================================
|
||||
def check_voice_requirements() -> Dict[str, Any]:
    """Report whether voice mode can be used in this environment.

    Voice mode is considered available when the audio packages imported
    successfully and the ``VOICE_TOOLS_OPENAI_KEY`` environment variable is
    set to a non-empty value.

    Returns:
        Dict with ``available``, ``audio_available``, ``stt_key_set``,
        ``missing_packages``, and ``details`` (a human-readable,
        newline-separated summary).
    """
    has_stt_key = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY"))

    missing_packages: List[str] = []
    if not _HAS_AUDIO:
        missing_packages += ["sounddevice", "numpy"]

    audio_line = (
        "Audio capture: OK"
        if _HAS_AUDIO
        else "Audio capture: MISSING (pip install sounddevice numpy)"
    )
    key_line = (
        "STT API key: OK"
        if has_stt_key
        else "STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)"
    )

    return {
        "available": _HAS_AUDIO and has_stt_key,
        "audio_available": _HAS_AUDIO,
        "stt_key_set": has_stt_key,
        "missing_packages": missing_packages,
        "details": "\n".join([audio_line, key_line]),
    }
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Temp file cleanup
|
||||
# ============================================================================
|
||||
def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
    """Remove old temporary voice recording files.

    Only regular files in ``_TEMP_DIR`` whose names start with
    ``recording_`` and end with ``.wav`` are considered. Files that cannot
    be inspected or deleted are skipped.

    Args:
        max_age_seconds: Delete files older than this (default: 1 hour).

    Returns:
        Number of files deleted (``0`` if the directory does not exist).
    """
    deleted = 0
    now = time.time()

    try:
        # The context manager closes the directory handle even if the loop
        # exits early.
        with os.scandir(_TEMP_DIR) as entries:
            for entry in entries:
                name = entry.name
                if not (name.startswith("recording_") and name.endswith(".wav")):
                    continue
                try:
                    if not entry.is_file():
                        continue
                    age = now - entry.stat().st_mtime
                    if age > max_age_seconds:
                        os.unlink(entry.path)
                        deleted += 1
                except OSError:
                    # File vanished or is not accessible; best-effort cleanup.
                    continue
    except (FileNotFoundError, NotADirectoryError):
        # Nothing to clean: the directory is absent (or was removed while
        # we were scanning it).
        pass

    if deleted:
        logger.debug("Cleaned up %d old voice recordings", deleted)
    return deleted
|
||||
Loading…
Add table
Add a link
Reference in a new issue