feat: add voice mode with push-to-talk and TTS output for CLI

Implements Issue #314, Phases 2 & 3:
- /voice command to toggle voice mode (on/off/tts/status)
- Ctrl+Space push-to-talk recording via sounddevice
- Whisper STT transcription via the existing transcription_tools
- Optional TTS response playback via the existing tts_tool
- Visual indicators in the prompt (recording/transcribing/voice)
- 21 unit tests, all mocked (no real mic/API)
- Optional deps: sounddevice, numpy (pip install hermes-agent[voice])
2026-04-27 01:11:40 +00:00 · 2026-03-03 16:17:05 +03:00 · 2026-03-03 16:17:05 +03:00 · 1a6fbef8a9
commit 1a6fbef8a9
parent cf3dceafe1
6 changed files with 977 additions and 1 deletions
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@ -0,0 +1,344 @@
+"""Voice Mode -- Push-to-talk audio recording and playback for the CLI.
+
+Provides audio capture via sounddevice, WAV encoding via stdlib wave,
+STT dispatch via tools.transcription_tools, and TTS playback via
+sounddevice or system audio players.
+
+Dependencies (optional):
+    pip install sounddevice numpy
+    or: pip install hermes-agent[voice]
+"""
+
+import logging
+import os
+import platform
+import shutil
+import subprocess
+import tempfile
+import threading
+import time
+import wave
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Optional imports with graceful degradation
+# ---------------------------------------------------------------------------
+try:
+    import sounddevice as sd
+    import numpy as np
+
+    _HAS_AUDIO = True
+except ImportError:
+    sd = None  # type: ignore[assignment]
+    np = None  # type: ignore[assignment]
+    _HAS_AUDIO = False
+
+# ---------------------------------------------------------------------------
+# Recording parameters
+# ---------------------------------------------------------------------------
+# These values are used both to open the input stream and to write the WAV
+# header, so they must describe the same sample format.
+SAMPLE_RATE = 16000  # Hz; Whisper native rate
+CHANNELS = 1  # Mono
+DTYPE = "int16"  # 16-bit PCM; sample format requested from sounddevice
+SAMPLE_WIDTH = 2  # bytes per sample (int16); keep in sync with DTYPE
+# NOTE(review): nothing in this module reads MAX_RECORDING_SECONDS -- the cap
+# is presumably enforced by the caller (e.g. by polling
+# AudioRecorder.elapsed_seconds); confirm.
+MAX_RECORDING_SECONDS = 120  # Safety cap
+
+# Temp directory for voice recordings (created on demand when a WAV file is
+# written; old files are pruned by cleanup_temp_recordings)
+_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice")
+
+
+# ============================================================================
+# AudioRecorder
+# ============================================================================
+class AudioRecorder:
+    """Thread-safe audio recorder using sounddevice.InputStream.
+
+    Usage::
+
+        recorder = AudioRecorder()
+        recorder.start()
+        # ... user speaks ...
+        wav_path = recorder.stop()   # returns path to WAV file
+        # or
+        recorder.cancel()            # discard without saving
+    """
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._stream: Any = None
+        self._frames: List[Any] = []
+        self._recording = False
+        self._start_time: float = 0.0
+
+    # -- public properties ---------------------------------------------------
+
+    @property
+    def is_recording(self) -> bool:
+        return self._recording
+
+    @property
+    def elapsed_seconds(self) -> float:
+        if not self._recording:
+            return 0.0
+        return time.monotonic() - self._start_time
+
+    # -- public methods ------------------------------------------------------
+
+    def start(self) -> None:
+        """Start capturing audio from the default input device.
+
+        Raises ``RuntimeError`` if sounddevice/numpy are not installed
+        or if a recording is already in progress.
+        """
+        if not _HAS_AUDIO:
+            raise RuntimeError(
+                "Voice mode requires sounddevice and numpy.\n"
+                "Install with: pip install sounddevice numpy\n"
+                "Or: pip install hermes-agent[voice]"
+            )
+
+        with self._lock:
+            if self._recording:
+                return  # already recording
+
+            self._frames = []
+            self._start_time = time.monotonic()
+
+            def _callback(indata, frames, time_info, status):  # noqa: ARG001
+                if status:
+                    logger.debug("sounddevice status: %s", status)
+                self._frames.append(indata.copy())
+
+            self._stream = sd.InputStream(
+                samplerate=SAMPLE_RATE,
+                channels=CHANNELS,
+                dtype=DTYPE,
+                callback=_callback,
+            )
+            self._stream.start()
+            self._recording = True
+            logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
+
+    def stop(self) -> Optional[str]:
+        """Stop recording and write captured audio to a WAV file.
+
+        Returns:
+            Path to the WAV file, or ``None`` if no audio was captured.
+        """
+        with self._lock:
+            if not self._recording:
+                return None
+
+            self._recording = False
+
+            if self._stream is not None:
+                try:
+                    self._stream.stop()
+                    self._stream.close()
+                except Exception:
+                    pass
+                self._stream = None
+
+            if not self._frames:
+                return None
+
+            # Concatenate frames and write WAV
+            audio_data = np.concatenate(self._frames, axis=0)
+            self._frames = []
+
+            elapsed = time.monotonic() - self._start_time
+            logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data))
+
+            # Skip very short recordings (< 0.3s of audio)
+            min_samples = int(SAMPLE_RATE * 0.3)
+            if len(audio_data) < min_samples:
+                logger.debug("Recording too short (%d samples), discarding", len(audio_data))
+                return None
+
+            return self._write_wav(audio_data)
+
+    def cancel(self) -> None:
+        """Stop recording and discard all captured audio."""
+        with self._lock:
+            self._recording = False
+            self._frames = []
+
+            if self._stream is not None:
+                try:
+                    self._stream.stop()
+                    self._stream.close()
+                except Exception:
+                    pass
+                self._stream = None
+
+            logger.info("Voice recording cancelled")
+
+    # -- private helpers -----------------------------------------------------
+
+    @staticmethod
+    def _write_wav(audio_data) -> str:
+        """Write numpy int16 audio data to a WAV file.
+
+        Returns the file path.
+        """
+        os.makedirs(_TEMP_DIR, exist_ok=True)
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav")
+
+        with wave.open(wav_path, "wb") as wf:
+            wf.setnchannels(CHANNELS)
+            wf.setsampwidth(SAMPLE_WIDTH)
+            wf.setframerate(SAMPLE_RATE)
+            wf.writeframes(audio_data.tobytes())
+
+        file_size = os.path.getsize(wav_path)
+        logger.info("WAV written: %s (%d bytes)", wav_path, file_size)
+        return wav_path
+
+
+# ============================================================================
+# STT dispatch
+# ============================================================================
+def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]:
+    """Transcribe a WAV recording using the existing Whisper pipeline.
+
+    Delegates to ``tools.transcription_tools.transcribe_audio()``.
+
+    Args:
+        wav_path: Path to the WAV file.
+        model: Whisper model name (default: from config or ``whisper-1``).
+
+    Returns:
+        Dict with ``success``, ``transcript``, and optionally ``error``.
+    """
+    from tools.transcription_tools import transcribe_audio
+
+    return transcribe_audio(wav_path, model=model)
+
+
+# ============================================================================
+# Audio playback
+# ============================================================================
+def play_audio_file(file_path: str) -> bool:
+    """Play an audio file through the default output device.
+
+    Strategy:
+    1. WAV files via ``sounddevice.play()`` when available.
+    2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
+       ``aplay`` (Linux ALSA).
+
+    Returns:
+        ``True`` if playback succeeded, ``False`` otherwise.
+    """
+    if not os.path.isfile(file_path):
+        logger.warning("Audio file not found: %s", file_path)
+        return False
+
+    # Try sounddevice for WAV files
+    if _HAS_AUDIO and file_path.endswith(".wav"):
+        try:
+            with wave.open(file_path, "rb") as wf:
+                frames = wf.readframes(wf.getnframes())
+                audio_data = np.frombuffer(frames, dtype=np.int16)
+                sample_rate = wf.getframerate()
+
+            sd.play(audio_data, samplerate=sample_rate)
+            sd.wait()
+            return True
+        except Exception as e:
+            logger.debug("sounddevice playback failed: %s", e)
+
+    # Fall back to system audio players
+    system = platform.system()
+    players = []
+
+    if system == "Darwin":
+        players.append(["afplay", file_path])
+    players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path])
+    if system == "Linux":
+        players.append(["aplay", "-q", file_path])
+
+    for cmd in players:
+        exe = shutil.which(cmd[0])
+        if exe:
+            try:
+                subprocess.run(cmd, capture_output=True, timeout=300)
+                return True
+            except Exception as e:
+                logger.debug("System player %s failed: %s", cmd[0], e)
+
+    logger.warning("No audio player available for %s", file_path)
+    return False
+
+
+# ============================================================================
+# Requirements check
+# ============================================================================
+def check_voice_requirements() -> Dict[str, Any]:
+    """Check if all voice mode requirements are met.
+
+    Returns:
+        Dict with ``available``, ``audio_available``, ``stt_key_set``,
+        ``missing_packages``, and ``details``.
+    """
+    stt_key_set = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY"))
+    missing: List[str] = []
+
+    if not _HAS_AUDIO:
+        missing.extend(["sounddevice", "numpy"])
+
+    available = _HAS_AUDIO and stt_key_set
+    details_parts = []
+
+    if _HAS_AUDIO:
+        details_parts.append("Audio capture: OK")
+    else:
+        details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
+
+    if stt_key_set:
+        details_parts.append("STT API key: OK")
+    else:
+        details_parts.append("STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)")
+
+    return {
+        "available": available,
+        "audio_available": _HAS_AUDIO,
+        "stt_key_set": stt_key_set,
+        "missing_packages": missing,
+        "details": "\n".join(details_parts),
+    }
+
+
+# ============================================================================
+# Temp file cleanup
+# ============================================================================
+def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int:
+    """Remove old temporary voice recording files.
+
+    Args:
+        max_age_seconds: Delete files older than this (default: 1 hour).
+
+    Returns:
+        Number of files deleted.
+    """
+    if not os.path.isdir(_TEMP_DIR):
+        return 0
+
+    deleted = 0
+    now = time.time()
+
+    for entry in os.scandir(_TEMP_DIR):
+        if entry.is_file() and entry.name.startswith("recording_") and entry.name.endswith(".wav"):
+            try:
+                age = now - entry.stat().st_mtime
+                if age > max_age_seconds:
+                    os.unlink(entry.path)
+                    deleted += 1
+            except OSError:
+                pass
+
+    if deleted:
+        logger.debug("Cleaned up %d old voice recordings", deleted)
+    return deleted