diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py
new file mode 100644
index 0000000000..71f1805633
--- /dev/null
+++ b/hermes_cli/voice.py
@@ -0,0 +1,136 @@
+"""Process-wide voice recording + TTS API for the TUI gateway.
+
+Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
+(text-to-speech) behind idempotent, stateful entry points that the gateway's
+``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a
+dedicated thread. The gateway imports this module lazily so missing optional
+audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError``
+at call time, not at startup.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import threading
+from typing import Optional
+
+from tools.voice_mode import (
+    create_audio_recorder,
+    is_whisper_hallucination,
+    play_audio_file,
+    transcribe_recording,
+)
+
+logger = logging.getLogger(__name__)
+
+_recorder = None
+_recorder_lock = threading.Lock()
+
+
+def start_recording() -> None:
+    """Begin capturing from the default input device.
+
+    Idempotent — calling again while a recording is in progress is a no-op,
+    which matches the TUI's toggle semantics (Ctrl+B starts, Ctrl+B stops).
+    """
+    global _recorder
+
+    with _recorder_lock:
+        if _recorder is not None and getattr(_recorder, "is_recording", False):
+            return
+        rec = create_audio_recorder()
+        # No silence callback: the TUI drives start/stop explicitly via
+        # the voice.record RPC. VAD auto-stop is a CLI-mode feature.
+        rec.start()
+        _recorder = rec
+
+
+def stop_and_transcribe() -> Optional[str]:
+    """Stop the active recording, transcribe it, and return the text.
+
+    Returns ``None`` when no recording is active, when the microphone
+    captured no speech, when transcription fails (the error is logged),
+    or when Whisper returned a known hallucination token (silence
+    artefacts like "Thanks for watching!"). The caller treats ``None``
+    as "no speech detected" and leaves the composer untouched.
+    """
+    global _recorder
+
+    with _recorder_lock:
+        rec = _recorder
+        _recorder = None
+
+    if rec is None:
+        return None
+
+    wav_path = rec.stop()
+    if not wav_path:
+        return None
+
+    try:
+        result = transcribe_recording(wav_path)
+    except Exception as e:
+        logger.warning("voice transcription failed: %s", e)
+        return None
+
+    if not isinstance(result, dict):
+        logger.warning("voice transcription returned a non-dict result")
+        return None
+
+    # NOTE(review): the result schema is defined in tools.voice_mode and is
+    # not visible from this module. "text" is the key this wrapper was
+    # written against; "transcript" is accepted as a fallback so a schema
+    # mismatch does not silently drop every recording — confirm which key
+    # the helper actually emits.
+    text = (result.get("text") or result.get("transcript") or "").strip()
+    if not text or is_whisper_hallucination(text):
+        return None
+
+    return text
+
+
+def speak_text(text: str) -> None:
+    """Synthesize ``text`` with the configured TTS provider and play it.
+
+    The gateway spawns a daemon thread to call this so the RPC returns
+    immediately. Failures are logged and swallowed — the UI already
+    acknowledged "speaking" by the time we get here.
+    """
+    if not text or not text.strip():
+        return
+
+    try:
+        # Lazy import — tts_tool pulls optional provider SDKs (OpenAI,
+        # ElevenLabs, etc.) and config-reading machinery that we don't
+        # want to load at module import time. It lives inside the ``try``
+        # so a missing SDK is logged instead of killing the worker thread.
+        from tools.tts_tool import text_to_speech_tool
+
+        raw = text_to_speech_tool(text)
+    except Exception as e:
+        logger.warning("TTS synthesis failed: %s", e)
+        return
+
+    try:
+        result = json.loads(raw) if isinstance(raw, str) else raw
+    except json.JSONDecodeError:
+        logger.warning("TTS returned non-JSON result")
+        return
+
+    if not isinstance(result, dict):
+        return
+
+    file_path = result.get("file_path")
+    if not file_path:
+        err = result.get("error") or "no file_path in TTS result"
+        logger.warning("TTS succeeded but produced no audio: %s", err)
+        return
+
+    # Playback falls under the same "logged and swallowed" contract as
+    # synthesis: an exception from play_audio_file must not escape the
+    # gateway's worker thread.
+    try:
+        play_audio_file(file_path)
+    except Exception as e:
+        logger.warning("TTS playback failed: %s", e)
diff --git a/tests/hermes_cli/test_voice_wrapper.py b/tests/hermes_cli/test_voice_wrapper.py
new file mode 100644
index 0000000000..f711ec356f
--- /dev/null
+++ b/tests/hermes_cli/test_voice_wrapper.py
@@ -0,0 +1,73 @@
+"""Tests for ``hermes_cli.voice`` — the TUI gateway's voice wrapper.
+
+The module is imported *lazily* by ``tui_gateway/server.py`` so that a
+box with missing audio deps fails at call time (returning a clean RPC
+error) rather than at gateway startup. These tests therefore only
+assert the public contract the gateway depends on: the three symbols
+exist, ``stop_and_transcribe`` is a no-op when nothing is recording,
+``speak_text`` tolerates empty input without touching the provider
+stack, and a playback failure never escapes ``speak_text``.
+"""
+
+import os
+import sys
+import types
+
+import pytest
+
+# This file lives in tests/hermes_cli/, so the repository root (which holds
+# the ``hermes_cli`` package under test) is two directories up.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
+
+
+class TestPublicAPI:
+    def test_gateway_symbols_importable(self):
+        """Match the exact import shape tui_gateway/server.py uses."""
+        from hermes_cli.voice import (
+            speak_text,
+            start_recording,
+            stop_and_transcribe,
+        )
+
+        assert callable(start_recording)
+        assert callable(stop_and_transcribe)
+        assert callable(speak_text)
+
+
+class TestStopWithoutStart:
+    def test_returns_none_when_no_recording_active(self, monkeypatch):
+        """Idempotent no-op: stop before start must not raise or touch state."""
+        import hermes_cli.voice as voice
+
+        monkeypatch.setattr(voice, "_recorder", None)
+
+        assert voice.stop_and_transcribe() is None
+
+
+class TestSpeakTextGuards:
+    @pytest.mark.parametrize("text", ["", " ", "\n\t "])
+    def test_empty_text_is_noop(self, text):
+        """Empty / whitespace-only text must return without importing tts_tool
+        (the gateway spawns a thread per call, so a no-op on empty input
+        keeps the thread pool from churning on trivial inputs)."""
+        from hermes_cli.voice import speak_text
+
+        # Should simply return None without raising.
+        assert speak_text(text) is None
+
+
+class TestSpeakTextSwallowsFailures:
+    def test_playback_error_does_not_propagate(self, monkeypatch):
+        """A failing player must be logged, not raised into the caller."""
+        import hermes_cli.voice as voice
+
+        fake_tts = types.ModuleType("tools.tts_tool")
+        fake_tts.text_to_speech_tool = lambda text: '{"file_path": "x.wav"}'
+        monkeypatch.setitem(sys.modules, "tools.tts_tool", fake_tts)
+
+        def _raise(_path):
+            raise RuntimeError("playback unavailable")
+
+        monkeypatch.setattr(voice, "play_audio_file", _raise)
+
+        assert voice.speak_text("hello") is None