From 04c489b5873dae86caa4c99757e004c767e1303f Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:55:17 +0300 Subject: [PATCH] feat(tui): match CLI's voice slash + VAD-continuous recording model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TUI had drifted from the CLI's voice model in two ways: - /voice on was lighting up the microphone immediately and Ctrl+B was interpreted as a mode toggle. The CLI separates the two: /voice on just flips the umbrella bit; recording only starts once the user presses Ctrl+B, which also sets _voice_continuous so the VAD loop auto-restarts until the user presses Ctrl+B again or three silent cycles pass. - /voice tts was missing entirely, so users couldn't turn agent reply speech on/off from inside the TUI. This commit brings the TUI to parity. Python - hermes_cli/voice.py: continuous-mode API (start_continuous, stop_continuous, is_continuous_active) layered on the existing PTT wrappers. The silence callback transcribes, fires on_transcript, tracks consecutive no-speech cycles, and auto-restarts — mirroring cli.py:_voice_stop_and_transcribe + _restart_recording. - tui_gateway/server.py: - voice.toggle now supports on / off / tts / status. The umbrella bit lives in HERMES_VOICE + display.voice_enabled; tts lives in HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down any active continuous loop so a toggle-off really releases the microphone. - voice.record start/stop now drives start_continuous/stop_continuous. start is refused with a clear error when the mode is off, matching cli.py:handle_voice_record's early return on `not _voice_mode`. - New voice.transcript / voice.status events emit through _voice_emit (remembers the session_id passed to the most recent voice.record start so events land in the right session). 
TypeScript - gatewayTypes.ts: voice.status + voice.transcript event discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse gains status for the new "recording/stopped" responses. - interfaces.ts: GatewayEventHandlerContext gains composer.setInput + submission.submitRef + voice.{setRecording, setProcessing, setVoiceEnabled}; InputHandlerContext.voice gains enabled + setVoiceEnabled for the mode-aware Ctrl+B handler. - createGatewayEventHandler.ts: voice.status drives REC/STT badges; voice.transcript auto-submits when the composer is empty (CLI _pending_input.put parity) and appends when a draft is in flight. A voice.transcript event carrying no_speech_limit flips voice off and prints a sys line explaining why. - useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop), not voice.toggle, and nudges the user with a sys line when the mode is off instead of silently flipping it on. - useMainApp.ts: wires the new event-handler context fields. - slash/commands/session.ts: /voice handles on / off / tts / status with CLI-matching output ("voice: mode on · tts off"). Compatibility note for voice.record: the RPC keeps its start/stop actions (start is now gated on voice mode being on), but stop now returns {"status": "stopped"} instead of {"text": ...}; transcripts are delivered through voice.transcript events rather than in the stop response. 
--- hermes_cli/voice.py | 360 +++++++++++++++++- tests/hermes_cli/test_voice_wrapper.py | 202 ++++++++++ tui_gateway/server.py | 147 ++++++- .../createGatewayEventHandler.test.ts | 11 +- ui-tui/src/app/createGatewayEventHandler.ts | 57 +++ ui-tui/src/app/interfaces.ts | 13 + ui-tui/src/app/slash/commands/session.ts | 55 ++- ui-tui/src/app/useInputHandlers.ts | 64 ++-- ui-tui/src/app/useMainApp.ts | 22 +- ui-tui/src/gatewayTypes.ts | 8 + 10 files changed, 861 insertions(+), 78 deletions(-) diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py index 71f180563..70e097e77 100644 --- a/hermes_cli/voice.py +++ b/hermes_cli/voice.py @@ -2,18 +2,31 @@ Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool`` (text-to-speech) behind idempotent, stateful entry points that the gateway's -``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a -dedicated thread. The gateway imports this module lazily so missing optional -audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError`` -at call time, not at startup. +``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can +call from a dedicated thread. The gateway imports this module lazily so that +missing optional audio deps (sounddevice, faster-whisper, numpy) surface as +an ``ImportError`` at call time, not at startup. + +Two usage modes are exposed: + +* **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single + manually-bounded capture used when the caller drives the start/stop pair + explicitly. +* **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors + the classic CLI voice mode: recording auto-stops on silence, transcribes, + hands the result to a callback, and then auto-restarts for the next turn. + Three consecutive no-speech cycles stop the loop and fire + ``on_silent_limit`` so the UI can turn the mode off. 
""" from __future__ import annotations import json import logging +import os +import sys import threading -from typing import Optional +from typing import Any, Callable, Optional from tools.voice_mode import ( create_audio_recorder, @@ -24,15 +37,71 @@ from tools.voice_mode import ( logger = logging.getLogger(__name__) + +def _debug(msg: str) -> None: + """Emit a debug breadcrumb when HERMES_VOICE_DEBUG=1. + + Goes to stderr so the TUI gateway wraps it as a gateway.stderr event, + which createGatewayEventHandler shows as an Activity line — exactly + what we need to diagnose "why didn't the loop auto-restart?" in the + user's real terminal without shipping a separate debug RPC. + """ + if os.environ.get("HERMES_VOICE_DEBUG", "").strip() == "1": + print(f"[voice] {msg}", file=sys.stderr, flush=True) + + +def _beeps_enabled() -> bool: + """CLI parity: voice.beep_enabled in config.yaml (default True).""" + try: + from hermes_cli.config import load_config + + voice_cfg = load_config().get("voice", {}) + if isinstance(voice_cfg, dict): + return bool(voice_cfg.get("beep_enabled", True)) + except Exception: + pass + return True + + +def _play_beep(frequency: int, count: int = 1) -> None: + """Audible cue matching cli.py's record/stop beeps. + + 880 Hz single-beep on start (cli.py:_voice_start_recording line 7532), + 660 Hz double-beep on stop (cli.py:_voice_stop_and_transcribe line 7585). + Best-effort — sounddevice failures are silently swallowed so the + voice loop never breaks because a speaker was unavailable. 
+ """ + if not _beeps_enabled(): + return + try: + from tools.voice_mode import play_beep + + play_beep(frequency=frequency, count=count) + except Exception as e: + _debug(f"beep {frequency}Hz failed: {e}") + +# ── Push-to-talk state ─────────────────────────────────────────────── _recorder = None _recorder_lock = threading.Lock() +# ── Continuous (VAD) state ─────────────────────────────────────────── +_continuous_lock = threading.Lock() +_continuous_active = False +_continuous_recorder: Any = None +_continuous_on_transcript: Optional[Callable[[str], None]] = None +_continuous_on_status: Optional[Callable[[str], None]] = None +_continuous_on_silent_limit: Optional[Callable[[], None]] = None +_continuous_no_speech_count = 0 +_CONTINUOUS_NO_SPEECH_LIMIT = 3 + + +# ── Push-to-talk API ───────────────────────────────────────────────── + def start_recording() -> None: - """Begin capturing from the default input device. + """Begin capturing from the default input device (push-to-talk). - Idempotent — calling again while a recording is in progress is a no-op, - which matches the TUI's toggle semantics (Ctrl+B starts, Ctrl+B stops). + Idempotent — calling again while a recording is in progress is a no-op. """ global _recorder @@ -40,20 +109,15 @@ def start_recording() -> None: if _recorder is not None and getattr(_recorder, "is_recording", False): return rec = create_audio_recorder() - # No silence callback: the TUI drives start/stop explicitly via - # the voice.record RPC. VAD auto-stop is a CLI-mode feature. rec.start() _recorder = rec def stop_and_transcribe() -> Optional[str]: - """Stop the active recording, transcribe it, and return the text. + """Stop the active push-to-talk recording, transcribe, return text. Returns ``None`` when no recording is active, when the microphone - captured no speech, or when Whisper returned a known hallucination - token (silence artefacts like "Thanks for watching!"). 
The caller - treats ``None`` as "no speech detected" and leaves the composer - untouched. + captured no speech, or when Whisper returned a known hallucination. """ global _recorder @@ -73,27 +137,281 @@ def stop_and_transcribe() -> Optional[str]: except Exception as e: logger.warning("voice transcription failed: %s", e) return None + finally: + try: + if os.path.isfile(wav_path): + os.unlink(wav_path) + except Exception: + pass - text = (result.get("text") or "").strip() + # transcribe_recording returns {"success": bool, "transcript": str, ...} + # — matches cli.py:_voice_stop_and_transcribe's result.get("transcript"). + if not result.get("success"): + return None + text = (result.get("transcript") or "").strip() if not text or is_whisper_hallucination(text): return None return text +# ── Continuous (VAD) API ───────────────────────────────────────────── + + +def start_continuous( + on_transcript: Callable[[str], None], + on_status: Optional[Callable[[str], None]] = None, + on_silent_limit: Optional[Callable[[], None]] = None, + silence_threshold: int = 200, + silence_duration: float = 3.0, +) -> None: + """Start a VAD-driven continuous recording loop. + + The loop calls ``on_transcript(text)`` each time speech is detected and + transcribed successfully, then auto-restarts. After + ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech + picked up at all) the loop stops itself and calls ``on_silent_limit`` + so the UI can reflect "voice off". Idempotent — calling while already + active is a no-op. + + ``on_status`` is called with ``"listening"`` / ``"transcribing"`` / + ``"idle"`` so the UI can show a live indicator. 
+ """ + global _continuous_active, _continuous_recorder + global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit + global _continuous_no_speech_count + + with _continuous_lock: + if _continuous_active: + _debug("start_continuous: already active — no-op") + return + _continuous_active = True + _continuous_on_transcript = on_transcript + _continuous_on_status = on_status + _continuous_on_silent_limit = on_silent_limit + _continuous_no_speech_count = 0 + + if _continuous_recorder is None: + _continuous_recorder = create_audio_recorder() + + _continuous_recorder._silence_threshold = silence_threshold + _continuous_recorder._silence_duration = silence_duration + rec = _continuous_recorder + + _debug( + f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)" + ) + + # CLI parity: single 880 Hz beep *before* opening the stream — placing + # the beep after stream.start() on macOS triggers a CoreAudio conflict + # (cli.py:7528 comment). + _play_beep(frequency=880, count=1) + + try: + rec.start(on_silence_stop=_continuous_on_silence) + except Exception as e: + logger.error("failed to start continuous recording: %s", e) + _debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}") + with _continuous_lock: + _continuous_active = False + raise + + if on_status: + try: + on_status("listening") + except Exception: + pass + + +def stop_continuous() -> None: + """Stop the active continuous loop and release the microphone. + + Idempotent — calling while not active is a no-op. Any in-flight + transcription completes but its result is discarded (the callback + checks ``_continuous_active`` before firing). 
+ """ + global _continuous_active, _continuous_on_transcript + global _continuous_on_status, _continuous_on_silent_limit + global _continuous_recorder, _continuous_no_speech_count + + with _continuous_lock: + if not _continuous_active: + return + _continuous_active = False + rec = _continuous_recorder + on_status = _continuous_on_status + _continuous_on_transcript = None + _continuous_on_status = None + _continuous_on_silent_limit = None + _continuous_no_speech_count = 0 + + if rec is not None: + try: + # cancel() (not stop()) discards buffered frames — the loop + # is over, we don't want to transcribe a half-captured turn. + rec.cancel() + except Exception as e: + logger.warning("failed to cancel recorder: %s", e) + + # Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the + # silence-auto-stop path plays). + _play_beep(frequency=660, count=2) + + if on_status: + try: + on_status("idle") + except Exception: + pass + + +def is_continuous_active() -> bool: + """Whether a continuous voice loop is currently running.""" + with _continuous_lock: + return _continuous_active + + +def _continuous_on_silence() -> None: + """AudioRecorder silence callback — runs in a daemon thread. + + Stops the current capture, transcribes, delivers the text via + ``on_transcript``, and — if the loop is still active — starts the + next capture. Three consecutive silent cycles end the loop. 
+ """ + global _continuous_active, _continuous_no_speech_count + + _debug("_continuous_on_silence: fired") + + with _continuous_lock: + if not _continuous_active: + _debug("_continuous_on_silence: loop inactive — abort") + return + rec = _continuous_recorder + on_transcript = _continuous_on_transcript + on_status = _continuous_on_status + on_silent_limit = _continuous_on_silent_limit + + if rec is None: + _debug("_continuous_on_silence: no recorder — abort") + return + + if on_status: + try: + on_status("transcribing") + except Exception: + pass + + wav_path = rec.stop() + # Peak RMS is the critical diagnostic when stop() returns None despite + # the VAD firing — tells us at a glance whether the mic was too quiet + # for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree. + peak_rms = getattr(rec, "_peak_rms", -1) + _debug( + f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})" + ) + + # CLI parity: double 660 Hz beep after the stream stops (safe from the + # CoreAudio conflict that blocks pre-start beeps). + _play_beep(frequency=660, count=2) + + transcript: Optional[str] = None + + if wav_path: + try: + result = transcribe_recording(wav_path) + # transcribe_recording returns {"success": bool, "transcript": str, + # "error": str?} — NOT {"text": str}. Using the wrong key silently + # produced empty transcripts even when Groq/local STT returned fine, + # which masqueraded as "not hearing the user" to the caller. 
+ success = bool(result.get("success")) + text = (result.get("transcript") or "").strip() + err = result.get("error") + _debug( + f"_continuous_on_silence: transcribe -> success={success} " + f"text={text!r} err={err!r}" + ) + if success and text and not is_whisper_hallucination(text): + transcript = text + except Exception as e: + logger.warning("continuous transcription failed: %s", e) + _debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}") + finally: + try: + if os.path.isfile(wav_path): + os.unlink(wav_path) + except Exception: + pass + + with _continuous_lock: + if not _continuous_active: + # User stopped us while we were transcribing — discard. + _debug("_continuous_on_silence: stopped during transcribe — no restart") + return + if transcript: + _continuous_no_speech_count = 0 + else: + _continuous_no_speech_count += 1 + should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT + no_speech = _continuous_no_speech_count + + if transcript and on_transcript: + try: + on_transcript(transcript) + except Exception as e: + logger.warning("on_transcript callback raised: %s", e) + + if should_halt: + _debug(f"_continuous_on_silence: {no_speech} silent cycles — halting") + with _continuous_lock: + _continuous_active = False + _continuous_no_speech_count = 0 + if on_silent_limit: + try: + on_silent_limit() + except Exception: + pass + try: + rec.cancel() + except Exception: + pass + if on_status: + try: + on_status("idle") + except Exception: + pass + return + + # Restart for the next turn. 
+ _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})") + _play_beep(frequency=880, count=1) + try: + rec.start(on_silence_stop=_continuous_on_silence) + except Exception as e: + logger.error("failed to restart continuous recording: %s", e) + _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}") + with _continuous_lock: + _continuous_active = False + return + + if on_status: + try: + on_status("listening") + except Exception: + pass + + +# ── TTS API ────────────────────────────────────────────────────────── + + def speak_text(text: str) -> None: """Synthesize ``text`` with the configured TTS provider and play it. The gateway spawns a daemon thread to call this so the RPC returns - immediately. Failures are logged and swallowed — the UI already - acknowledged "speaking" by the time we get here. + immediately. Failures are logged and swallowed. """ if not text or not text.strip(): return - # Lazy import — tts_tool pulls optional provider SDKs (OpenAI, - # ElevenLabs, etc.) and config-reading machinery that we don't - # want to load at module import time. + # Lazy import — tts_tool pulls optional provider SDKs. from tools.tts_tool import text_to_speech_tool try: diff --git a/tests/hermes_cli/test_voice_wrapper.py b/tests/hermes_cli/test_voice_wrapper.py index f711ec356..a372c1194 100644 --- a/tests/hermes_cli/test_voice_wrapper.py +++ b/tests/hermes_cli/test_voice_wrapper.py @@ -51,3 +51,205 @@ class TestSpeakTextGuards: # Should simply return None without raising. 
assert speak_text(text) is None + + +class TestContinuousAPI: + """Continuous (VAD) mode API — CLI-parity loop entry points.""" + + def test_continuous_exports(self): + from hermes_cli.voice import ( + is_continuous_active, + start_continuous, + stop_continuous, + ) + + assert callable(start_continuous) + assert callable(stop_continuous) + assert callable(is_continuous_active) + + def test_not_active_by_default(self, monkeypatch): + import hermes_cli.voice as voice + + # Isolate from any state left behind by other tests in the session. + monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_recorder", None) + + assert voice.is_continuous_active() is False + + def test_stop_continuous_idempotent_when_inactive(self, monkeypatch): + """stop_continuous must not raise when no loop is active — the + gateway's voice.toggle off path calls it unconditionally.""" + import hermes_cli.voice as voice + + monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_recorder", None) + + # Should return cleanly without exceptions + assert voice.stop_continuous() is None + assert voice.is_continuous_active() is False + + def test_double_start_is_idempotent(self, monkeypatch): + """A second start_continuous while already active is a no-op — prevents + two overlapping capture threads fighting over the microphone when the + UI double-fires (e.g. 
both /voice on and Ctrl+B within the same tick).""" + import hermes_cli.voice as voice + + monkeypatch.setattr(voice, "_continuous_active", True) + called = {"n": 0} + + class FakeRecorder: + def start(self, on_silence_stop=None): + called["n"] += 1 + + def cancel(self): + pass + + monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder()) + + voice.start_continuous(on_transcript=lambda _t: None) + + # The guard inside start_continuous short-circuits before rec.start() + assert called["n"] == 0 + + +class TestContinuousLoopSimulation: + """End-to-end simulation of the VAD loop with a fake recorder. + + Proves auto-restart works: the silence callback must trigger transcribe → + on_transcript → re-call rec.start(on_silence_stop=same_cb). Also covers + the 3-strikes no-speech halt. + """ + + @pytest.fixture + def fake_recorder(self, monkeypatch): + import hermes_cli.voice as voice + + # Reset module state between tests. + monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_recorder", None) + monkeypatch.setattr(voice, "_continuous_no_speech_count", 0) + monkeypatch.setattr(voice, "_continuous_on_transcript", None) + monkeypatch.setattr(voice, "_continuous_on_status", None) + monkeypatch.setattr(voice, "_continuous_on_silent_limit", None) + + class FakeRecorder: + _silence_threshold = 200 + _silence_duration = 3.0 + is_recording = False + + def __init__(self): + self.start_calls = 0 + self.last_callback = None + self.stopped = 0 + self.cancelled = 0 + # Preset WAV path returned by stop() + self.next_stop_wav = "/tmp/fake.wav" + + def start(self, on_silence_stop=None): + self.start_calls += 1 + self.last_callback = on_silence_stop + self.is_recording = True + + def stop(self): + self.stopped += 1 + self.is_recording = False + return self.next_stop_wav + + def cancel(self): + self.cancelled += 1 + self.is_recording = False + + rec = FakeRecorder() + monkeypatch.setattr(voice, "create_audio_recorder", lambda: rec) + # 
Skip real file ops in the silence callback. + monkeypatch.setattr(voice.os.path, "isfile", lambda _p: False) + return rec + + def test_loop_auto_restarts_after_transcript(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "hello world"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + statuses = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_status=lambda s: statuses.append(s), + ) + + assert fake_recorder.start_calls == 1 + assert statuses == ["listening"] + + # Simulate AudioRecorder's silence detector firing. + fake_recorder.last_callback() + + assert transcripts == ["hello world"] + assert fake_recorder.start_calls == 2 # auto-restarted + assert statuses == ["listening", "transcribing", "listening"] + assert voice.is_continuous_active() is True + + voice.stop_continuous() + + def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + # Transcription returns no speech — fake_recorder.stop() returns the + # path, but transcribe returns empty text, counting as silence. 
+ monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": ""}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + silent_limit_fired = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_silent_limit=lambda: silent_limit_fired.append(True), + ) + + # Fire silence callback 3 times + for _ in range(3): + fake_recorder.last_callback() + + assert transcripts == [] + assert silent_limit_fired == [True] + assert voice.is_continuous_active() is False + assert fake_recorder.cancelled >= 1 + + def test_stop_during_transcription_discards_restart(self, fake_recorder, monkeypatch): + """User hits Ctrl+B mid-transcription: the in-flight transcript must + still fire (it's a real utterance), but the loop must NOT restart.""" + import hermes_cli.voice as voice + + stop_triggered = {"flag": False} + + def late_transcribe(_p): + # Simulate stop_continuous arriving while we're inside transcribe + voice.stop_continuous() + stop_triggered["flag"] = True + return {"success": True, "transcript": "final word"} + + monkeypatch.setattr(voice, "transcribe_recording", late_transcribe) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + voice.start_continuous(on_transcript=lambda t: transcripts.append(t)) + + initial_starts = fake_recorder.start_calls # 1 + fake_recorder.last_callback() + + assert stop_triggered["flag"] is True + # Loop is stopped — no auto-restart + assert fake_recorder.start_calls == initial_starts + # The in-flight transcript was suppressed because we stopped mid-flight + assert transcripts == [] + assert voice.is_continuous_active() is False diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 165b47bf9..130b60576 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -3455,43 +3455,154 @@ def _(rid, params: dict) -> dict: # ── Methods: voice 
─────────────────────────────────────────────────── +_voice_sid_lock = threading.Lock() +_voice_event_sid: str = "" + + +def _voice_emit(event: str, payload: dict | None = None) -> None: + """Emit a voice event toward the session that most recently turned the + mode on. Voice is process-global (one microphone), so there's only ever + one sid to target; the TUI handler treats an empty sid as "active + session". Kept separate from _emit to make the lack of per-call sid + argument explicit.""" + with _voice_sid_lock: + sid = _voice_event_sid + _emit(event, sid, payload) + + +def _voice_mode_enabled() -> bool: + """Current voice-mode flag. HERMES_VOICE env var wins over config so + the gateway and CLI agree when one of them was launched with an + explicit override.""" + env = os.environ.get("HERMES_VOICE", "").strip() + if env in {"0", "1"}: + return env == "1" + return bool(_load_cfg().get("display", {}).get("voice_enabled", False)) + + +def _voice_tts_enabled() -> bool: + """Whether agent replies should be spoken back via TTS.""" + env = os.environ.get("HERMES_VOICE_TTS", "").strip() + if env in {"0", "1"}: + return env == "1" + return bool(_load_cfg().get("display", {}).get("voice_tts", False)) + + @method("voice.toggle") def _(rid, params: dict) -> dict: + """CLI parity for the ``/voice`` slash command. + + Subcommands: + + * ``status`` — report mode + TTS flags (default when action is unknown). + * ``on`` / ``off`` — flip voice *mode* (the umbrella bit). Turning it + off also tears down any active continuous recording loop. Does NOT + start recording on its own; recording is driven by ``voice.record`` + (Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split. + * ``tts`` — toggle speech-output of agent replies. Requires mode on + (mirrors CLI's _toggle_voice_tts guard). 
+ """ action = params.get("action", "status") + if action == "status": - env = os.environ.get("HERMES_VOICE", "").strip() - if env in {"0", "1"}: - return _ok(rid, {"enabled": env == "1"}) - return _ok( - rid, - { - "enabled": bool( - _load_cfg().get("display", {}).get("voice_enabled", False) - ) - }, - ) + # Mirror CLI's _show_voice_status: include STT/TTS provider + # availability so the user can tell at a glance *why* voice mode + # isn't working ("STT provider: MISSING ..." is the common case). + payload: dict = { + "enabled": _voice_mode_enabled(), + "tts": _voice_tts_enabled(), + } + try: + from tools.voice_mode import check_voice_requirements + + reqs = check_voice_requirements() + payload["available"] = bool(reqs.get("available")) + payload["audio_available"] = bool(reqs.get("audio_available")) + payload["stt_available"] = bool(reqs.get("stt_available")) + payload["details"] = reqs.get("details") or "" + except Exception as e: + # check_voice_requirements pulls optional transcription deps — + # swallow so /voice status always returns something useful. + logger.warning("voice.toggle status: requirements probe failed: %s", e) + + return _ok(rid, payload) + if action in ("on", "off"): enabled = action == "on" os.environ["HERMES_VOICE"] = "1" if enabled else "0" _write_config_key("display.voice_enabled", enabled) - return _ok(rid, {"enabled": action == "on"}) + + if not enabled: + # Disabling the mode must tear the continuous loop down; the + # loop holds the microphone and would otherwise keep running. 
+ try: + from hermes_cli.voice import stop_continuous + + stop_continuous() + except ImportError: + pass + except Exception as e: + logger.warning("voice: stop_continuous failed during toggle off: %s", e) + + return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()}) + + if action == "tts": + if not _voice_mode_enabled(): + return _err(rid, 4014, "enable voice mode first: /voice on") + new_value = not _voice_tts_enabled() + os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0" + _write_config_key("display.voice_tts", new_value) + return _ok(rid, {"enabled": True, "tts": new_value}) + return _err(rid, 4013, f"unknown voice action: {action}") @method("voice.record") def _(rid, params: dict) -> dict: + """VAD-driven continuous record loop, CLI-parity. + + ``start`` turns on a VAD loop that emits ``voice.transcript`` events + for each detected utterance and auto-restarts for the next turn. + ``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while- + recording branch clearing ``_voice_continuous``). Three consecutive + silent cycles stop the loop automatically and emit a + ``voice.transcript`` with ``no_speech_limit=True``. 
+ """ action = params.get("action", "start") + + if action not in {"start", "stop"}: + return _err(rid, 4019, f"unknown voice action: {action}") + try: if action == "start": - from hermes_cli.voice import start_recording + if not _voice_mode_enabled(): + return _err(rid, 4015, "voice mode is off — enable with /voice on") - start_recording() + with _voice_sid_lock: + global _voice_event_sid + _voice_event_sid = params.get("session_id") or _voice_event_sid + + from hermes_cli.voice import start_continuous + + voice_cfg = _load_cfg().get("voice", {}) + start_continuous( + on_transcript=lambda t: _voice_emit( + "voice.transcript", {"text": t} + ), + on_status=lambda s: _voice_emit("voice.status", {"state": s}), + on_silent_limit=lambda: _voice_emit( + "voice.transcript", {"no_speech_limit": True} + ), + silence_threshold=voice_cfg.get("silence_threshold", 200), + silence_duration=voice_cfg.get("silence_duration", 3.0), + ) return _ok(rid, {"status": "recording"}) - if action == "stop": - from hermes_cli.voice import stop_and_transcribe - return _ok(rid, {"text": stop_and_transcribe() or ""}) - return _err(rid, 4019, f"unknown voice action: {action}") + # action == "stop" + from hermes_cli.voice import stop_continuous + + stop_continuous() + return _ok(rid, {"status": "stopped"}) except ImportError: return _err( rid, 5025, "voice module not available — install audio dependencies" diff --git a/ui-tui/src/__tests__/createGatewayEventHandler.test.ts b/ui-tui/src/__tests__/createGatewayEventHandler.test.ts index 23f7c4646..ef55d807c 100644 --- a/ui-tui/src/__tests__/createGatewayEventHandler.test.ts +++ b/ui-tui/src/__tests__/createGatewayEventHandler.test.ts @@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) => composer: { dequeue: () => undefined, queueEditRef: ref(null), - sendQueued: vi.fn() + sendQueued: vi.fn(), + setInput: vi.fn() }, gateway: { gw: { request: vi.fn() }, @@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) => resumeById: vi.fn(), setCatalog: vi.fn() 
}, + submission: { + submitRef: { current: vi.fn() } + }, system: { bellOnComplete: false, sys: vi.fn() @@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) => panel: (title: string, sections: any[]) => appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }), setHistoryItems: vi.fn() + }, + voice: { + setProcessing: vi.fn(), + setRecording: vi.fn(), + setVoiceEnabled: vi.fn() } }) as any diff --git a/ui-tui/src/app/createGatewayEventHandler.ts b/ui-tui/src/app/createGatewayEventHandler.ts index 1ec123f11..377735ca9 100644 --- a/ui-tui/src/app/createGatewayEventHandler.ts +++ b/ui-tui/src/app/createGatewayEventHandler.ts @@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session const { bellOnComplete, stdout, sys } = ctx.system const { appendMessage, panel, setHistoryItems } = ctx.transcript + const { setInput } = ctx.composer + const { submitRef } = ctx.submission + const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice let pendingThinkingStatus = '' let thinkingStatusTimer: null | ReturnType = null @@ -261,6 +264,60 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev: return } + case 'voice.status': { + // Continuous VAD loop reports its internal state so the status bar + // can show listening / transcribing / idle without polling. + const state = String(ev.payload?.state ?? '') + + if (state === 'listening') { + setVoiceRecording(true) + setVoiceProcessing(false) + } else if (state === 'transcribing') { + setVoiceRecording(false) + setVoiceProcessing(true) + } else { + setVoiceRecording(false) + setVoiceProcessing(false) + } + + return + } + + case 'voice.transcript': { + // CLI parity: the 3-strikes silence detector flipped off automatically. + // Mirror that on the UI side and tell the user why the mode is off. 
+ if (ev.payload?.no_speech_limit) { + setVoiceEnabled(false) + setVoiceRecording(false) + setVoiceProcessing(false) + sys('voice: no speech detected 3 times, continuous mode stopped') + + return + } + + const text = String(ev.payload?.text ?? '').trim() + + if (!text) { + return + } + + // Match CLI's _pending_input.put(transcript): auto-submit when the + // composer is empty, otherwise append so the user can keep editing + // a partial draft they were working on. + setInput(prev => { + if (!prev) { + // defer submit so React commits the state change first + setTimeout(() => submitRef.current(text), 0) + + return '' + } + + return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}` + }) + + return + } + case 'gateway.start_timeout': { const { cwd, python } = ev.payload ?? {} const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : '' diff --git a/ui-tui/src/app/interfaces.ts b/ui-tui/src/app/interfaces.ts index c1c427739..81def036c 100644 --- a/ui-tui/src/app/interfaces.ts +++ b/ui-tui/src/app/interfaces.ts @@ -189,9 +189,11 @@ export interface InputHandlerContext { stdout?: NodeJS.WriteStream } voice: { + enabled: boolean recording: boolean setProcessing: StateSetter setRecording: StateSetter + setVoiceEnabled: StateSetter } wheelStep: number } @@ -201,6 +203,9 @@ export interface InputHandlerResult { } export interface GatewayEventHandlerContext { + composer: { + setInput: StateSetter + } gateway: GatewayServices session: { STARTUP_RESUME_ID: string @@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext { resumeById: (id: string) => void setCatalog: StateSetter } + submission: { + submitRef: MutableRefObject<(value: string) => void> + } system: { bellOnComplete: boolean stdout?: NodeJS.WriteStream @@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext { panel: (title: string, sections: PanelSection[]) => void setHistoryItems: StateSetter } + voice: { + setProcessing: StateSetter + setRecording: StateSetter + 
setVoiceEnabled: StateSetter + } } export interface SlashHandlerContext { diff --git a/ui-tui/src/app/slash/commands/session.ts b/ui-tui/src/app/slash/commands/session.ts index 90a1beb3f..cf36fee6c 100644 --- a/ui-tui/src/app/slash/commands/session.ts +++ b/ui-tui/src/app/slash/commands/session.ts @@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [ }, { - help: 'toggle voice input', + help: 'voice mode: [on|off|tts|status]', name: 'voice', run: (arg, ctx) => { - const action = arg === 'on' || arg === 'off' ? arg : 'status' + const normalized = (arg ?? '').trim().toLowerCase() + + const action = + normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status' + ? normalized + : 'status' ctx.gateway.rpc('voice.toggle', { action }).then( ctx.guarded(r => { ctx.voice.setVoiceEnabled(!!r.enabled) - ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`) + + // Match CLI's _show_voice_status / _enable_voice_mode / + // _toggle_voice_tts output shape so users don't have to learn + // two vocabularies. + if (action === 'status') { + const mode = r.enabled ? 'ON' : 'OFF' + const tts = r.tts ? 'ON' : 'OFF' + ctx.transcript.sys('Voice Mode Status') + ctx.transcript.sys(` Mode: ${mode}`) + ctx.transcript.sys(` TTS: ${tts}`) + ctx.transcript.sys(' Record key: Ctrl+B') + + // CLI's "Requirements:" block — surfaces STT/audio setup issues + // so the user sees "STT provider: MISSING ..." instead of + // silently failing on every Ctrl+B press. + if (r.details) { + ctx.transcript.sys('') + ctx.transcript.sys(' Requirements:') + + for (const line of r.details.split('\n')) { + if (line.trim()) { + ctx.transcript.sys(` ${line}`) + } + } + } + + return + } + + if (action === 'tts') { + ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`) + + return + } + + // on/off — mirror cli.py:_enable_voice_mode's 3-line output + if (r.enabled) { + const tts = r.tts ? 
' (TTS enabled)' : '' + ctx.transcript.sys(`Voice mode enabled${tts}`) + ctx.transcript.sys(' Ctrl+B to start/stop recording') + ctx.transcript.sys(' /voice tts to toggle speech output') + ctx.transcript.sys(' /voice off to disable voice mode') + } else { + ctx.transcript.sys('Voice mode disabled.') + } }) ) } diff --git a/ui-tui/src/app/useInputHandlers.ts b/ui-tui/src/app/useInputHandlers.ts index cfc3eed7c..47fe8a216 100644 --- a/ui-tui/src/app/useInputHandlers.ts +++ b/ui-tui/src/app/useInputHandlers.ts @@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { } } - const voiceStop = () => { - voice.setRecording(false) - voice.setProcessing(true) + // CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop + // (NOT the voice-mode umbrella bit). The mode is enabled via /voice on; + // Ctrl+B while the mode is off sys-nudges the user. While the mode is + // on, the first press starts a continuous loop (gateway → start_continuous, + // VAD auto-stop → transcribe → auto-restart), a subsequent press stops it. + // The gateway publishes voice.status + voice.transcript events that + // createGatewayEventHandler turns into UI badges and composer injection. + const voiceRecordToggle = () => { + if (!voice.enabled) { + return actions.sys('voice: mode is off — enable with /voice on') + } + + const starting = !voice.recording + const action = starting ? 'start' : 'stop' + + // Optimistic UI — flip the REC badge immediately so the user gets + // feedback while the RPC round-trips; the voice.status event is the + // authoritative source and may correct us. + if (starting) { + voice.setRecording(true) + } else { + voice.setRecording(false) + voice.setProcessing(false) + } gateway - .rpc('voice.record', { action: 'stop' }) - .then(r => { - if (!r) { - return + .rpc('voice.record', { action }) + .catch((e: Error) => { + // Revert optimistic UI on failure. 
+ if (starting) { + voice.setRecording(false) } - const transcript = String(r.text || '').trim() - - if (!transcript) { - return actions.sys('voice: no speech detected') - } - - cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript)) - }) - .catch((e: Error) => actions.sys(`voice error: ${e.message}`)) - .finally(() => { - voice.setProcessing(false) - patchUiState({ status: 'ready' }) + actions.sys(`voice error: ${e.message}`) }) } - const voiceStart = () => - gateway - .rpc('voice.record', { action: 'start' }) - .then(r => { - if (!r) { - return - } - - voice.setRecording(true) - patchUiState({ status: 'recording…' }) - }) - .catch((e: Error) => actions.sys(`voice error: ${e.message}`)) - useInput((ch, key) => { const live = getUiState() @@ -371,7 +369,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { } if (isVoiceToggleKey(key, ch)) { - return voice.recording ? voiceStop() : voiceStart() + return voiceRecordToggle() } if (isAction(key, ch, 'g')) { diff --git a/ui-tui/src/app/useMainApp.ts b/ui-tui/src/app/useMainApp.ts index 41edcc828..c061aa5dd 100644 --- a/ui-tui/src/app/useMainApp.ts +++ b/ui-tui/src/app/useMainApp.ts @@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) { composer: { actions: composerActions, refs: composerRefs, state: composerState }, gateway, terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout }, - voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording }, + voice: { + enabled: voiceEnabled, + recording: voiceRecording, + setProcessing: setVoiceProcessing, + setRecording: setVoiceRecording, + setVoiceEnabled + }, wheelStep: WHEEL_SCROLL_STEP }) const onEvent = useMemo( () => createGatewayEventHandler({ + composer: { setInput: composerActions.setInput }, gateway, session: { STARTUP_RESUME_ID, @@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) { resumeById: 
session.resumeById, setCatalog }, + submission: { submitRef }, system: { bellOnComplete, stdout, sys }, - transcript: { appendMessage, panel, setHistoryItems } + transcript: { appendMessage, panel, setHistoryItems }, + voice: { + setProcessing: setVoiceProcessing, + setRecording: setVoiceRecording, + setVoiceEnabled + } }), [ appendMessage, bellOnComplete, + composerActions.setInput, gateway, panel, session.newSession, session.resetSession, session.resumeById, + setVoiceEnabled, + setVoiceProcessing, + setVoiceRecording, stdout, + submitRef, sys ] ) diff --git a/ui-tui/src/gatewayTypes.ts b/ui-tui/src/gatewayTypes.ts index 1dc8ea5be..05f8d9a41 100644 --- a/ui-tui/src/gatewayTypes.ts +++ b/ui-tui/src/gatewayTypes.ts @@ -236,10 +236,16 @@ export interface ImageAttachResponse { // ── Voice ──────────────────────────────────────────────────────────── export interface VoiceToggleResponse { + audio_available?: boolean + available?: boolean + details?: string enabled?: boolean + stt_available?: boolean + tts?: boolean } export interface VoiceRecordResponse { + status?: string text?: string } @@ -368,6 +374,8 @@ export type GatewayEvent = | { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' } | { payload?: undefined; session_id?: string; type: 'message.start' } | { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' } + | { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' } + | { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' } | { payload: { line: string }; session_id?: string; type: 'gateway.stderr' } | { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' } | { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }