feat(tui): match CLI's voice slash + VAD-continuous recording model

The TUI had drifted from the CLI's voice model in two ways:

- /voice on was lighting up the microphone immediately and Ctrl+B was interpreted as a mode toggle. The CLI separates the two: /voice on just flips the umbrella bit, recording only starts once the user presses Ctrl+B, which also sets _voice_continuous so the VAD loop auto-restarts until the user presses Ctrl+B again or three silent cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply speech on/off from inside the TUI.

This commit brings the TUI to parity.

Python

- hermes_cli/voice.py: continuous-mode API (start_continuous, stop_continuous, is_continuous_active) layered on the existing PTT wrappers. The silence callback transcribes, fires on_transcript, tracks consecutive no-speech cycles, and auto-restarts — mirroring cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
  - voice.toggle now supports on / off / tts / status. The umbrella bit lives in HERMES_VOICE + display.voice_enabled; tts lives in HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down any active continuous loop so a toggle-off really releases the microphone.
  - voice.record start/stop now drives start_continuous/stop_continuous. start is refused with a clear error when the mode is off, matching cli.py:handle_voice_record's early return on `not _voice_mode`.
  - New voice.transcript / voice.status events emit through _voice_emit (remembers the sid that last enabled the mode so events land in the right session).

TypeScript

- gatewayTypes.ts: voice.status + voice.transcript event discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput + submission.submitRef + voice.{setRecording, setProcessing, setVoiceEnabled}; InputHandlerContext.voice gains enabled + setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges; voice.transcript auto-submits when the composer is empty (CLI _pending_input.put parity) and appends when a draft is in flight. no_speech_limit flips voice off + sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop), not voice.toggle, and nudges the user with a sys line when the mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status with CLI-matching output ("voice: mode on · tts off").

Backward compat preserved for voice.record (was always PTT shape; gateway still honours start/stop with mode-gating added).
2026-04-25 00:51:20 +00:00 · 2026-04-24 00:55:17 +03:00 · 2026-04-24 00:55:17 +03:00 · 04c489b587
commit 04c489b587
parent 0bb460b070
10 changed files with 861 additions and 78 deletions
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@ -2,18 +2,31 @@
 Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
 (text-to-speech) behind idempotent, stateful entry points that the gateway's
-``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a
+``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can
-dedicated thread. The gateway imports this module lazily so missing optional
+call from a dedicated thread. The gateway imports this module lazily so that
-audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError``
+missing optional audio deps (sounddevice, faster-whisper, numpy) surface as
-at call time, not at startup.
+an ``ImportError`` at call time, not at startup.
 Two usage modes are exposed:
 * **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single
  manually-bounded capture used when the caller drives the start/stop pair
  explicitly.
 * **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors
  the classic CLI voice mode: recording auto-stops on silence, transcribes,
  hands the result to a callback, and then auto-restarts for the next turn.
  Three consecutive no-speech cycles stop the loop and fire
  ``on_silent_limit`` so the UI can turn the mode off.
 """
 from __future__ import annotations
 import json
 import logging
 import os
 import sys
 import threading
-from typing import Optional
+from typing import Any, Callable, Optional
 from tools.voice_mode import (
    create_audio_recorder,
@ -24,15 +37,71 @@ from tools.voice_mode import (
 logger = logging.getLogger(__name__)
 def _debug(msg: str) -> None:
    """Print ``msg`` to stderr as a ``[voice]`` line when debugging is on.

    Debugging is on only while the ``HERMES_VOICE_DEBUG`` environment
    variable (surrounding whitespace ignored) equals ``"1"``; otherwise the
    call does nothing. stderr is used because the TUI gateway forwards it as
    a gateway.stderr event that the UI renders as an Activity line, which
    makes loop problems diagnosable in the user's real terminal without a
    dedicated debug RPC. Every line is flushed immediately.
    """
    debug_flag = os.environ.get("HERMES_VOICE_DEBUG", "").strip()
    if debug_flag != "1":
        return
    print(f"[voice] {msg}", file=sys.stderr, flush=True)
 def _beeps_enabled() -> bool:
    """Report whether audible record/stop cues should be played.

    Reads ``voice.beep_enabled`` from the loaded config (CLI parity). The
    answer is ``True`` when the key is absent, when the ``voice`` section is
    not a mapping, or when loading the config fails for any reason — the
    broad ``except`` is deliberate so a broken config can never take the
    voice loop down.
    """
    try:
        from hermes_cli.config import load_config
        voice_section = load_config().get("voice", {})
        if isinstance(voice_section, dict):
            enabled = bool(voice_section.get("beep_enabled", True))
        else:
            enabled = True
    except Exception:
        enabled = True
    return enabled
 def _play_beep(frequency: int, count: int = 1) -> None:
    """Play an audible cue, mirroring cli.py's record/stop beeps.

    Callers in this module use 880 Hz × 1 when a capture starts and
    660 Hz × 2 when it stops. Nothing is played when beeps are disabled in
    the config. Playback is best-effort: any failure (including a missing
    ``play_beep`` import) is reported through the debug stream only, so the
    voice loop never breaks because a speaker was unavailable.
    """
    if _beeps_enabled():
        try:
            from tools.voice_mode import play_beep
            play_beep(frequency=frequency, count=count)
        except Exception as exc:
            _debug(f"beep {frequency}Hz failed: {exc}")
 # ── Push-to-talk state ───────────────────────────────────────────────
 # Recorder of the in-progress push-to-talk capture; assigned by
 # start_recording(). NOTE(review): presumably always accessed under
 # _recorder_lock — the `with` line is outside the visible hunks; confirm.
 _recorder = None
 _recorder_lock = threading.Lock()
 # ── Continuous (VAD) state ───────────────────────────────────────────
 # _continuous_lock guards every variable below. Callbacks and the recorder
 # are copied out under the lock and then invoked *outside* it.
 _continuous_lock = threading.Lock()
 # True between a successful start_continuous() and the loop ending (manual
 # stop, silent-cycle limit, or a failed restart).
 _continuous_active = False
 # Created lazily by the first start_continuous() call and reused afterwards.
 _continuous_recorder: Any = None
 # Callbacks registered by start_continuous(); stop_continuous() resets them.
 _continuous_on_transcript: Optional[Callable[[str], None]] = None
 _continuous_on_status: Optional[Callable[[str], None]] = None
 _continuous_on_silent_limit: Optional[Callable[[], None]] = None
 # Consecutive capture cycles that produced no usable transcript; reset to 0
 # whenever a transcript is delivered.
 _continuous_no_speech_count = 0
 # The loop halts itself once the counter above reaches this value.
 _CONTINUOUS_NO_SPEECH_LIMIT = 3
 # ── Push-to-talk API ─────────────────────────────────────────────────
 def start_recording() -> None:
-    """Begin capturing from the default input device.
+    """Begin capturing from the default input device (push-to-talk).
-    Idempotent — calling again while a recording is in progress is a no-op,
+    Idempotent — calling again while a recording is in progress is a no-op,
    which matches the TUI's toggle semantics (Ctrl+B starts, Ctrl+B stops).
    """
    global _recorder
@ -40,20 +109,15 @@ def start_recording() -> None:
        if _recorder is not None and getattr(_recorder, "is_recording", False):
            return
        rec = create_audio_recorder()
        # No silence callback: the TUI drives start/stop explicitly via
        # the voice.record RPC. VAD auto-stop is a CLI-mode feature.
        rec.start()
        _recorder = rec
 def stop_and_transcribe() -> Optional[str]:
-    """Stop the active recording, transcribe it, and return the text.
+    """Stop the active push-to-talk recording, transcribe, return text.
    Returns ``None`` when no recording is active, when the microphone
-    captured no speech, or when Whisper returned a known hallucination
+    captured no speech, or when Whisper returned a known hallucination
    token (silence artefacts like "Thanks for watching!"). The caller
    treats ``None`` as "no speech detected" and leaves the composer
    untouched.
    """
    global _recorder
@ -73,27 +137,281 @@ def stop_and_transcribe() -> Optional[str]:
    except Exception as e:
        logger.warning("voice transcription failed: %s", e)
        return None
    finally:
        try:
            if os.path.isfile(wav_path):
                os.unlink(wav_path)
        except Exception:
            pass
-    text = (result.get("text") or "").strip()
+    # transcribe_recording returns {"success": bool, "transcript": str, ...}
    # — matches cli.py:_voice_stop_and_transcribe's result.get("transcript").
    if not result.get("success"):
        return None
    text = (result.get("transcript") or "").strip()
    if not text or is_whisper_hallucination(text):
        return None
    return text
 # ── Continuous (VAD) API ─────────────────────────────────────────────
 def start_continuous(
    on_transcript: Callable[[str], None],
    on_status: Optional[Callable[[str], None]] = None,
    on_silent_limit: Optional[Callable[[], None]] = None,
    silence_threshold: int = 200,
    silence_duration: float = 3.0,
 ) -> None:
    """Start a VAD-driven continuous recording loop.

    The loop calls ``on_transcript(text)`` each time speech is detected and
    transcribed successfully, then auto-restarts. After
    ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
    picked up at all) the loop stops itself and calls ``on_silent_limit``
    so the UI can reflect "voice off". Idempotent — calling while already
    active is a no-op.

    Args:
        on_transcript: Receives each successfully transcribed utterance.
        on_status: Optional; called with ``"listening"`` /
            ``"transcribing"`` / ``"idle"`` so the UI can show a live
            indicator. Exceptions it raises are swallowed.
        on_silent_limit: Optional; called once when the silent-cycle limit
            ends the loop.
        silence_threshold: Written to the recorder's ``_silence_threshold``.
        silence_duration: Written to the recorder's ``_silence_duration``.

    Raises:
        Exception: Whatever ``create_audio_recorder()`` or the recorder's
            ``start()`` raises is propagated. In both cases the loop is
            left marked inactive so a later call can retry.
    """
    global _continuous_active, _continuous_recorder
    global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
    global _continuous_no_speech_count
    with _continuous_lock:
        if _continuous_active:
            _debug("start_continuous: already active — no-op")
            return
        # Obtain and configure the recorder *before* flipping any loop
        # state. If recorder creation fails, the exception must not leave
        # _continuous_active stuck at True — that would turn every later
        # start into a silent "already active" no-op.
        rec = _continuous_recorder
        if rec is None:
            rec = create_audio_recorder()
        rec._silence_threshold = silence_threshold
        rec._silence_duration = silence_duration
        _continuous_recorder = rec
        _continuous_active = True
        _continuous_on_transcript = on_transcript
        _continuous_on_status = on_status
        _continuous_on_silent_limit = on_silent_limit
        _continuous_no_speech_count = 0
    _debug(
        f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)"
    )
    # CLI parity: single 880 Hz beep *before* opening the stream — placing
    # the beep after stream.start() on macOS triggers a CoreAudio conflict
    # (per the matching comment in cli.py).
    _play_beep(frequency=880, count=1)
    try:
        rec.start(on_silence_stop=_continuous_on_silence)
    except Exception as e:
        logger.error("failed to start continuous recording: %s", e)
        _debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}")
        with _continuous_lock:
            _continuous_active = False
        raise
    if on_status:
        try:
            on_status("listening")
        except Exception:
            # A failing UI callback must not abort a capture that started.
            pass
 def stop_continuous() -> None:
    """Halt the continuous loop and release the microphone.

    Safe to call when no loop is running (returns immediately). A
    transcription that is already in flight is allowed to finish, but the
    silence callback re-checks ``_continuous_active`` afterwards and throws
    the result away instead of delivering it or restarting.
    """
    global _continuous_active, _continuous_on_transcript
    global _continuous_on_status, _continuous_on_silent_limit
    global _continuous_recorder, _continuous_no_speech_count
    with _continuous_lock:
        if not _continuous_active:
            return
        # Snapshot what we still need, then reset all loop state in one go.
        recorder, status_cb = _continuous_recorder, _continuous_on_status
        _continuous_active = False
        _continuous_on_transcript = None
        _continuous_on_status = None
        _continuous_on_silent_limit = None
        _continuous_no_speech_count = 0
    if recorder is not None:
        try:
            # cancel() rather than stop(): buffered frames are dropped
            # because a half-captured turn must not be transcribed.
            recorder.cancel()
        except Exception as exc:
            logger.warning("failed to cancel recorder: %s", exc)
    # Same 660 Hz double beep the silence-auto-stop path plays (CLI parity).
    _play_beep(frequency=660, count=2)
    if not status_cb:
        return
    try:
        status_cb("idle")
    except Exception:
        pass
 def is_continuous_active() -> bool:
    """Return ``True`` while a continuous voice loop is running."""
    with _continuous_lock:
        active = _continuous_active
    return active
 def _notify_continuous_status(on_status: Optional[Callable[[str], None]], state: str) -> None:
    """Best-effort delivery of a loop state to the optional status callback.

    A failing UI callback must never break the capture loop, so exceptions
    are swallowed and only surfaced through the opt-in debug stream.
    """
    if not on_status:
        return
    try:
        on_status(state)
    except Exception as e:
        _debug(f"on_status({state!r}) raised {type(e).__name__}: {e}")
 def _continuous_on_silence() -> None:
    """AudioRecorder silence callback — runs in a daemon thread.

    Stops the current capture, transcribes it, delivers the text via
    ``on_transcript``, and — if the loop is still active — starts the next
    capture. ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive cycles without a
    usable transcript end the loop and fire ``on_silent_limit``.

    The loop can be stopped from another thread at any point, so
    ``_continuous_active`` is re-checked under the lock after every step
    that may block (transcription, callbacks, beeps, ``rec.start``). Once
    the loop has been stopped, nothing is delivered and the microphone is
    not left open. Every exit that ends the loop reports ``"idle"`` so the
    UI cannot remain stuck on ``"transcribing"``.
    """
    global _continuous_active, _continuous_no_speech_count
    _debug("_continuous_on_silence: fired")
    with _continuous_lock:
        if not _continuous_active:
            _debug("_continuous_on_silence: loop inactive — abort")
            return
        # Snapshot under the lock; everything below runs outside it so slow
        # work (STT, audio playback, UI callbacks) never blocks start/stop.
        rec = _continuous_recorder
        on_transcript = _continuous_on_transcript
        on_status = _continuous_on_status
        on_silent_limit = _continuous_on_silent_limit
    if rec is None:
        _debug("_continuous_on_silence: no recorder — abort")
        return
    _notify_continuous_status(on_status, "transcribing")
    wav_path = rec.stop()
    # Peak RMS is the critical diagnostic when stop() returns None despite
    # the VAD firing — tells us at a glance whether the mic was too quiet
    # for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree.
    peak_rms = getattr(rec, "_peak_rms", -1)
    _debug(
        f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})"
    )
    # CLI parity: double 660 Hz beep after the stream stops (safe from the
    # CoreAudio conflict that blocks pre-start beeps).
    _play_beep(frequency=660, count=2)
    transcript: Optional[str] = None
    if wav_path:
        try:
            result = transcribe_recording(wav_path)
            # transcribe_recording returns {"success": bool, "transcript": str,
            # "error": str?} — NOT {"text": str}.  Using the wrong key silently
            # produced empty transcripts even when Groq/local STT returned fine,
            # which masqueraded as "not hearing the user" to the caller.
            success = bool(result.get("success"))
            text = (result.get("transcript") or "").strip()
            err = result.get("error")
            _debug(
                f"_continuous_on_silence: transcribe -> success={success} "
                f"text={text!r} err={err!r}"
            )
            if success and text and not is_whisper_hallucination(text):
                transcript = text
        except Exception as e:
            logger.warning("continuous transcription failed: %s", e)
            _debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}")
        finally:
            # The WAV is a temp artefact; removal is best-effort.
            try:
                if os.path.isfile(wav_path):
                    os.unlink(wav_path)
            except Exception:
                pass
    with _continuous_lock:
        if not _continuous_active:
            # User stopped us while we were transcribing — discard.
            _debug("_continuous_on_silence: stopped during transcribe — no restart")
            return
        if transcript:
            _continuous_no_speech_count = 0
        else:
            _continuous_no_speech_count += 1
        should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT
        no_speech = _continuous_no_speech_count
    if transcript and on_transcript:
        try:
            on_transcript(transcript)
        except Exception as e:
            logger.warning("on_transcript callback raised: %s", e)
    if should_halt:
        _debug(f"_continuous_on_silence: {no_speech} silent cycles — halting")
        with _continuous_lock:
            _continuous_active = False
            _continuous_no_speech_count = 0
        if on_silent_limit:
            try:
                on_silent_limit()
            except Exception as e:
                # Logged like on_transcript failures so a broken UI hook
                # is visible instead of vanishing silently.
                logger.warning("on_silent_limit callback raised: %s", e)
        try:
            rec.cancel()
        except Exception:
            pass
        _notify_continuous_status(on_status, "idle")
        return
    # stop_continuous() may have run while on_transcript was executing; it
    # has already cancelled the recorder and reported "idle", so bail out
    # before the beep instead of reopening the microphone.
    with _continuous_lock:
        if not _continuous_active:
            _debug("_continuous_on_silence: stopped before restart — no restart")
            return
    # Restart for the next turn.
    _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
    _play_beep(frequency=880, count=1)
    try:
        rec.start(on_silence_stop=_continuous_on_silence)
    except Exception as e:
        logger.error("failed to restart continuous recording: %s", e)
        _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
        with _continuous_lock:
            _continuous_active = False
        # The loop is dead: tell the UI, otherwise its indicator stays on
        # the "transcribing" state reported above.
        _notify_continuous_status(on_status, "idle")
        return
    with _continuous_lock:
        still_active = _continuous_active
    if not still_active:
        # stop_continuous() raced with the beep / rec.start(): its cancel()
        # ran before the stream was reopened, so close it again here.
        _debug("_continuous_on_silence: stopped during restart — cancelling")
        try:
            rec.cancel()
        except Exception as e:
            logger.warning("failed to cancel recorder: %s", e)
        return
    _notify_continuous_status(on_status, "listening")
 # ── TTS API ──────────────────────────────────────────────────────────
 def speak_text(text: str) -> None:
    """Synthesize ``text`` with the configured TTS provider and play it.
    The gateway spawns a daemon thread to call this so the RPC returns
-    immediately. Failures are logged and swallowed — the UI already
+    immediately. Failures are logged and swallowed — the UI already
    acknowledged "speaking" by the time we get here.
    """
    if not text or not text.strip():
        return
-    # Lazy import — tts_tool pulls optional provider SDKs (OpenAI,
+    # Lazy import — tts_tool pulls optional provider SDKs (OpenAI,
    # ElevenLabs, etc.) and config-reading machinery that we don't
    # want to load at module import time.
    from tools.tts_tool import text_to_speech_tool
    try:
--- a/tests/hermes_cli/test_voice_wrapper.py
+++ b/tests/hermes_cli/test_voice_wrapper.py
@ -51,3 +51,205 @@ class TestSpeakTextGuards:
        # Should simply return None without raising.
        assert speak_text(text) is None
 class TestContinuousAPI:
    """Continuous (VAD) mode API — CLI-parity loop entry points.
    These tests only touch the module-level guards (exports, the
    ``_continuous_active`` flag, idempotence); no capture is ever started.
    """
    def test_continuous_exports(self):
        """The three public continuous-mode entry points are importable."""
        from hermes_cli.voice import (
            is_continuous_active,
            start_continuous,
            stop_continuous,
        )
        assert callable(start_continuous)
        assert callable(stop_continuous)
        assert callable(is_continuous_active)
    def test_not_active_by_default(self, monkeypatch):
        """With pristine module state the loop reports itself inactive."""
        import hermes_cli.voice as voice
        # Isolate from any state left behind by other tests in the session.
        monkeypatch.setattr(voice, "_continuous_active", False)
        monkeypatch.setattr(voice, "_continuous_recorder", None)
        assert voice.is_continuous_active() is False
    def test_stop_continuous_idempotent_when_inactive(self, monkeypatch):
        """stop_continuous must not raise when no loop is active — the
        gateway's voice.toggle off path calls it unconditionally."""
        import hermes_cli.voice as voice
        monkeypatch.setattr(voice, "_continuous_active", False)
        monkeypatch.setattr(voice, "_continuous_recorder", None)
        # Should return cleanly without exceptions
        assert voice.stop_continuous() is None
        assert voice.is_continuous_active() is False
    def test_double_start_is_idempotent(self, monkeypatch):
        """A second start_continuous while already active is a no-op — prevents
        two overlapping capture threads fighting over the microphone when the
        UI double-fires a start (e.g. two voice.record start requests from
        rapid Ctrl+B presses; /voice on itself never starts recording)."""
        import hermes_cli.voice as voice
        # Pretend a loop is already running.
        monkeypatch.setattr(voice, "_continuous_active", True)
        called = {"n": 0}
        class FakeRecorder:
            def start(self, on_silence_stop=None):
                called["n"] += 1
            def cancel(self):
                pass
        monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder())
        voice.start_continuous(on_transcript=lambda _t: None)
        # The guard inside start_continuous short-circuits before rec.start()
        assert called["n"] == 0
 class TestContinuousLoopSimulation:
    """End-to-end simulation of the VAD loop with a fake recorder.
    Proves auto-restart works: the silence callback must trigger transcribe →
    on_transcript → re-call rec.start(on_silence_stop=same_cb). Also covers
    the 3-strikes no-speech halt and a stop that lands mid-transcription.
    The tests drive the loop synchronously by invoking the callback that the
    fake recorder captured from ``start()``; no threads or audio are involved.
    """
    @pytest.fixture
    def fake_recorder(self, monkeypatch):
        """Reset the voice module's loop state and install a fake recorder.
        Returns the single ``FakeRecorder`` instance that
        ``create_audio_recorder`` will hand out, so tests can inspect its
        call counters and fire ``last_callback`` by hand.
        """
        import hermes_cli.voice as voice
        # Reset module state between tests.
        monkeypatch.setattr(voice, "_continuous_active", False)
        monkeypatch.setattr(voice, "_continuous_recorder", None)
        monkeypatch.setattr(voice, "_continuous_no_speech_count", 0)
        monkeypatch.setattr(voice, "_continuous_on_transcript", None)
        monkeypatch.setattr(voice, "_continuous_on_status", None)
        monkeypatch.setattr(voice, "_continuous_on_silent_limit", None)
        class FakeRecorder:
            # Attributes that start_continuous() overwrites on the recorder.
            _silence_threshold = 200
            _silence_duration = 3.0
            is_recording = False
            def __init__(self):
                self.start_calls = 0
                self.last_callback = None
                self.stopped = 0
                self.cancelled = 0
                # Preset WAV path returned by stop()
                self.next_stop_wav = "/tmp/fake.wav"
            def start(self, on_silence_stop=None):
                self.start_calls += 1
                # Remember the silence callback so the test can fire it.
                self.last_callback = on_silence_stop
                self.is_recording = True
            def stop(self):
                self.stopped += 1
                self.is_recording = False
                return self.next_stop_wav
            def cancel(self):
                self.cancelled += 1
                self.is_recording = False
        rec = FakeRecorder()
        monkeypatch.setattr(voice, "create_audio_recorder", lambda: rec)
        # Skip real file ops in the silence callback.
        monkeypatch.setattr(voice.os.path, "isfile", lambda _p: False)
        return rec
    def test_loop_auto_restarts_after_transcript(self, fake_recorder, monkeypatch):
        """One successful utterance is delivered and the recorder restarts."""
        import hermes_cli.voice as voice
        monkeypatch.setattr(
            voice,
            "transcribe_recording",
            lambda _p: {"success": True, "transcript": "hello world"},
        )
        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
        transcripts = []
        statuses = []
        voice.start_continuous(
            on_transcript=lambda t: transcripts.append(t),
            on_status=lambda s: statuses.append(s),
        )
        assert fake_recorder.start_calls == 1
        assert statuses == ["listening"]
        # Simulate AudioRecorder's silence detector firing.
        fake_recorder.last_callback()
        assert transcripts == ["hello world"]
        assert fake_recorder.start_calls == 2  # auto-restarted
        assert statuses == ["listening", "transcribing", "listening"]
        assert voice.is_continuous_active() is True
        # Leave the module inactive for whichever test runs next.
        voice.stop_continuous()
    def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch):
        """Three consecutive empty cycles fire on_silent_limit once and stop."""
        import hermes_cli.voice as voice
        # Transcription returns no speech — fake_recorder.stop() returns the
        # path, but transcribe returns empty text, counting as silence.
        monkeypatch.setattr(
            voice,
            "transcribe_recording",
            lambda _p: {"success": True, "transcript": ""},
        )
        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
        transcripts = []
        silent_limit_fired = []
        voice.start_continuous(
            on_transcript=lambda t: transcripts.append(t),
            on_silent_limit=lambda: silent_limit_fired.append(True),
        )
        # Fire silence callback 3 times
        for _ in range(3):
            fake_recorder.last_callback()
        assert transcripts == []
        assert silent_limit_fired == [True]
        assert voice.is_continuous_active() is False
        assert fake_recorder.cancelled >= 1
    def test_stop_during_transcription_discards_restart(self, fake_recorder, monkeypatch):
        """User hits Ctrl+B mid-transcription: the in-flight transcript is
        discarded (the loop was stopped before it could be delivered) and the
        loop must NOT restart."""
        import hermes_cli.voice as voice
        stop_triggered = {"flag": False}
        def late_transcribe(_p):
            # Simulate stop_continuous arriving while we're inside transcribe
            voice.stop_continuous()
            stop_triggered["flag"] = True
            return {"success": True, "transcript": "final word"}
        monkeypatch.setattr(voice, "transcribe_recording", late_transcribe)
        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
        transcripts = []
        voice.start_continuous(on_transcript=lambda t: transcripts.append(t))
        initial_starts = fake_recorder.start_calls  # 1
        fake_recorder.last_callback()
        assert stop_triggered["flag"] is True
        # Loop is stopped — no auto-restart
        assert fake_recorder.start_calls == initial_starts
        # The in-flight transcript was suppressed because we stopped mid-flight
        assert transcripts == []
        assert voice.is_continuous_active() is False
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -3455,43 +3455,154 @@ def _(rid, params: dict) -> dict:
 # ── Methods: voice ───────────────────────────────────────────────────
 # Guards _voice_event_sid: the voice.record handler writes it while
 # _voice_emit reads it from the voice loop's callbacks.
 _voice_sid_lock = threading.Lock()
 # Session id that voice events are addressed to. Empty until a
 # voice.record "start" request supplies a session_id.
 _voice_event_sid: str = ""
 def _voice_emit(event: str, payload: dict | None = None) -> None:
    """Send a voice event to the session remembered in ``_voice_event_sid``.

    The sid is captured when a ``voice.record`` start request carries a
    ``session_id``. Voice is process-global (one microphone), so there is
    only ever one sid to target; the TUI handler treats an empty sid as
    "active session". This is a separate helper from ``_emit`` so that the
    absence of a per-call sid argument is explicit at every call site.
    """
    with _voice_sid_lock:
        target_sid = _voice_event_sid
    # Emit outside the lock so a slow transport never blocks sid updates.
    _emit(event, target_sid, payload)
 def _voice_mode_enabled() -> bool:
    """Return the current voice-mode flag.

    A ``HERMES_VOICE`` environment value of exactly ``"0"`` or ``"1"``
    (after stripping whitespace) overrides the config, so the gateway and
    the CLI agree when one of them was launched with an explicit override.
    Any other value falls back to ``display.voice_enabled`` from the
    config, defaulting to ``False``.
    """
    override = os.environ.get("HERMES_VOICE", "").strip()
    if override == "1":
        return True
    if override == "0":
        return False
    display_cfg = _load_cfg().get("display", {})
    return bool(display_cfg.get("voice_enabled", False))
 def _voice_tts_enabled() -> bool:
    """Return whether agent replies should be spoken back via TTS.

    A ``HERMES_VOICE_TTS`` environment value of exactly ``"0"`` or ``"1"``
    (after stripping whitespace) takes precedence; any other value falls
    back to ``display.voice_tts`` from the config, defaulting to ``False``.
    """
    override = os.environ.get("HERMES_VOICE_TTS", "").strip()
    if override == "1":
        return True
    if override == "0":
        return False
    display_cfg = _load_cfg().get("display", {})
    return bool(display_cfg.get("voice_tts", False))
@method("voice.toggle")
 def _(rid, params: dict) -> dict:
    """CLI parity for the ``/voice`` slash command.
    Subcommands:
    * ``status`` — report mode + TTS flags (default when action is omitted).
    * ``on`` / ``off`` — flip voice *mode* (the umbrella bit). Turning it
      off also tears down any active continuous recording loop. Does NOT
      start recording on its own; recording is driven by ``voice.record``
      (Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split.
    * ``tts`` — toggle speech-output of agent replies. Requires mode on
      (mirrors CLI's _toggle_voice_tts guard).
    """
    action = params.get("action", "status")
    if action == "status":
-        env = os.environ.get("HERMES_VOICE", "").strip()
+        # Mirror CLI's _show_voice_status: include STT/TTS provider
-        if env in {"0", "1"}:
+        # availability so the user can tell at a glance *why* voice mode
-            return _ok(rid, {"enabled": env == "1"})
+        # isn't working ("STT provider: MISSING ..." is the common case).
-        return _ok(
+        payload: dict = {
-            rid,
+            "enabled": _voice_mode_enabled(),
-            {
+            "tts": _voice_tts_enabled(),
-                "enabled": bool(
+        }
-                    _load_cfg().get("display", {}).get("voice_enabled", False)
+        try:
-                )
+            from tools.voice_mode import check_voice_requirements
-            },
+
-        )
+            reqs = check_voice_requirements()
            payload["available"] = bool(reqs.get("available"))
            payload["audio_available"] = bool(reqs.get("audio_available"))
            payload["stt_available"] = bool(reqs.get("stt_available"))
            payload["details"] = reqs.get("details") or ""
        except Exception as e:
            # check_voice_requirements pulls optional transcription deps —
            # swallow so /voice status always returns something useful.
            logger.warning("voice.toggle status: requirements probe failed: %s", e)
        return _ok(rid, payload)
    if action in ("on", "off"):
        enabled = action == "on"
        os.environ["HERMES_VOICE"] = "1" if enabled else "0"
        _write_config_key("display.voice_enabled", enabled)
-        return _ok(rid, {"enabled": action == "on"})
+
        if not enabled:
            # Disabling the mode must tear the continuous loop down; the
            # loop holds the microphone and would otherwise keep running.
            try:
                from hermes_cli.voice import stop_continuous
                stop_continuous()
            except ImportError:
                pass
            except Exception as e:
                logger.warning("voice: stop_continuous failed during toggle off: %s", e)
        return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()})
    if action == "tts":
        if not _voice_mode_enabled():
            return _err(rid, 4014, "enable voice mode first: /voice on")
        new_value = not _voice_tts_enabled()
        os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0"
        _write_config_key("display.voice_tts", new_value)
        return _ok(rid, {"enabled": True, "tts": new_value})
    return _err(rid, 4013, f"unknown voice action: {action}")
@method("voice.record")
 def _(rid, params: dict) -> dict:
    """VAD-driven continuous record loop, CLI-parity.
    ``start`` turns on a VAD loop that emits ``voice.transcript`` events
    for each detected utterance and auto-restarts for the next turn.
    ``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while-
    recording branch clearing ``_voice_continuous``). Three consecutive
    silent cycles stop the loop automatically and emit a
    ``voice.transcript`` with ``no_speech_limit=True``.
    """
    action = params.get("action", "start")
    if action not in {"start", "stop"}:
        return _err(rid, 4019, f"unknown voice action: {action}")
    try:
        if action == "start":
-            from hermes_cli.voice import start_recording
+            if not _voice_mode_enabled():
                return _err(rid, 4015, "voice mode is off — enable with /voice on")
-            start_recording()
+            with _voice_sid_lock:
                global _voice_event_sid
                _voice_event_sid = params.get("session_id") or _voice_event_sid
            from hermes_cli.voice import start_continuous
            voice_cfg = _load_cfg().get("voice", {})
            start_continuous(
                on_transcript=lambda t: _voice_emit(
                    "voice.transcript", {"text": t}
                ),
                on_status=lambda s: _voice_emit("voice.status", {"state": s}),
                on_silent_limit=lambda: _voice_emit(
                    "voice.transcript", {"no_speech_limit": True}
                ),
                silence_threshold=voice_cfg.get("silence_threshold", 200),
                silence_duration=voice_cfg.get("silence_duration", 3.0),
            )
            return _ok(rid, {"status": "recording"})
        if action == "stop":
            from hermes_cli.voice import stop_and_transcribe
-            return _ok(rid, {"text": stop_and_transcribe() or ""})
+        # action == "stop"
-        return _err(rid, 4019, f"unknown voice action: {action}")
+        from hermes_cli.voice import stop_continuous
        stop_continuous()
        return _ok(rid, {"status": "stopped"})
    except ImportError:
        return _err(
            rid, 5025, "voice module not available — install audio dependencies"
--- a/ui-tui/src/tests/createGatewayEventHandler.test.ts
+++ b/ui-tui/src/tests/createGatewayEventHandler.test.ts
@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
    composer: {
      dequeue: () => undefined,
      queueEditRef: ref<null | number>(null),
-      sendQueued: vi.fn()
+      sendQueued: vi.fn(),
      setInput: vi.fn()
    },
    gateway: {
      gw: { request: vi.fn() },
@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
      resumeById: vi.fn(),
      setCatalog: vi.fn()
    },
    submission: {
      submitRef: { current: vi.fn() }
    },
    system: {
      bellOnComplete: false,
      sys: vi.fn()
@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
      panel: (title: string, sections: any[]) =>
        appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
      setHistoryItems: vi.fn()
    },
    voice: {
      setProcessing: vi.fn(),
      setRecording: vi.fn(),
      setVoiceEnabled: vi.fn()
    }
  }) as any
--- a/ui-tui/src/app/createGatewayEventHandler.ts
+++ b/ui-tui/src/app/createGatewayEventHandler.ts
@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
  const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
  const { bellOnComplete, stdout, sys } = ctx.system
  const { appendMessage, panel, setHistoryItems } = ctx.transcript
  const { setInput } = ctx.composer
  const { submitRef } = ctx.submission
  const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice
  let pendingThinkingStatus = ''
  let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
@ -261,6 +264,60 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
        return
      }
      case 'voice.status': {
        // The continuous VAD loop pushes its current phase through this event,
        // so the status bar follows it without polling. 'listening' lights the
        // recording badge, 'transcribing' lights the processing badge, and any
        // other value (e.g. idle) clears both.
        const phase = String(ev.payload?.state ?? '')

        setVoiceRecording(phase === 'listening')
        setVoiceProcessing(phase === 'transcribing')

        return
      }
      case 'voice.transcript': {
        // This event carries either a finished transcription (payload.text)
        // or the silence-limit notice (payload.no_speech_limit).
        if (ev.payload?.no_speech_limit) {
          // CLI parity: the 3-strikes silence detector flipped off automatically.
          // Mirror that on the UI side and tell the user why the mode is off.
          // NOTE(review): the gateway's on_silent_limit callback only emits
          // this event — confirm the server-side voice mode bit is cleared too,
          // otherwise setVoiceEnabled(false) leaves UI and gateway out of sync.
          setVoiceEnabled(false)
          setVoiceRecording(false)
          setVoiceProcessing(false)
          sys('voice: no speech detected 3 times, continuous mode stopped')

          return
        }

        const text = String(ev.payload?.text ?? '').trim()

        if (!text) {
          return
        }

        // Match CLI's _pending_input.put(transcript): auto-submit when the
        // composer is empty, otherwise append so the user can keep editing
        // a partial draft they were working on.
        //
        // The updater below has to peek at the previous value to decide, which
        // forces the submit to be scheduled from inside it. If setInput is a
        // React state setter, the updater may run more than once for a single
        // update (StrictMode double-invokes updaters), so the guard makes sure
        // one transcript is submitted at most once.
        let submitScheduled = false

        setInput(prev => {
          if (prev) {
            return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
          }

          if (!submitScheduled) {
            submitScheduled = true
            // defer submit so React commits the state change first
            setTimeout(() => submitRef.current(text), 0)
          }

          return ''
        })

        return
      }
      case 'gateway.start_timeout': {
        const { cwd, python } = ev.payload ?? {}
        const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''
--- a/ui-tui/src/app/interfaces.ts
+++ b/ui-tui/src/app/interfaces.ts
@ -189,9 +189,11 @@ export interface InputHandlerContext {
    stdout?: NodeJS.WriteStream
  }
  voice: {
    enabled: boolean
    recording: boolean
    setProcessing: StateSetter<boolean>
    setRecording: StateSetter<boolean>
    setVoiceEnabled: StateSetter<boolean>
  }
  wheelStep: number
 }
@ -201,6 +203,9 @@ export interface InputHandlerResult {
 }
 export interface GatewayEventHandlerContext {
  composer: {
    setInput: StateSetter<string>
  }
  gateway: GatewayServices
  session: {
    STARTUP_RESUME_ID: string
@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
    resumeById: (id: string) => void
    setCatalog: StateSetter<null | SlashCatalog>
  }
  submission: {
    submitRef: MutableRefObject<(value: string) => void>
  }
  system: {
    bellOnComplete: boolean
    stdout?: NodeJS.WriteStream
@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
    panel: (title: string, sections: PanelSection[]) => void
    setHistoryItems: StateSetter<Msg[]>
  }
  voice: {
    setProcessing: StateSetter<boolean>
    setRecording: StateSetter<boolean>
    setVoiceEnabled: StateSetter<boolean>
  }
 }
 export interface SlashHandlerContext {
--- a/ui-tui/src/app/slash/commands/session.ts
+++ b/ui-tui/src/app/slash/commands/session.ts
@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
  },
  {
-    help: 'toggle voice input',
+    help: 'voice mode: [on|off|tts|status]',
    name: 'voice',
    run: (arg, ctx) => {
-      const action = arg === 'on' || arg === 'off' ? arg : 'status'
+      const normalized = (arg ?? '').trim().toLowerCase()
      const action =
        normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
          ? normalized
          : 'status'
      ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
        ctx.guarded<VoiceToggleResponse>(r => {
          ctx.voice.setVoiceEnabled(!!r.enabled)
-          ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`)
+
          // Match CLI's _show_voice_status / _enable_voice_mode /
          // _toggle_voice_tts output shape so users don't have to learn
          // two vocabularies.
          if (action === 'status') {
            const mode = r.enabled ? 'ON' : 'OFF'
            const tts = r.tts ? 'ON' : 'OFF'
            ctx.transcript.sys('Voice Mode Status')
            ctx.transcript.sys(`  Mode:       ${mode}`)
            ctx.transcript.sys(`  TTS:        ${tts}`)
            ctx.transcript.sys('  Record key: Ctrl+B')
            // CLI's "Requirements:" block — surfaces STT/audio setup issues
            // so the user sees "STT provider: MISSING ..." instead of
            // silently failing on every Ctrl+B press.
            if (r.details) {
              ctx.transcript.sys('')
              ctx.transcript.sys('  Requirements:')
              for (const line of r.details.split('\n')) {
                if (line.trim()) {
                  ctx.transcript.sys(`    ${line}`)
                }
              }
            }
            return
          }
          if (action === 'tts') {
            ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
            return
          }
          // on/off — mirror cli.py:_enable_voice_mode's 3-line output
          if (r.enabled) {
            const tts = r.tts ? ' (TTS enabled)' : ''
            ctx.transcript.sys(`Voice mode enabled${tts}`)
            ctx.transcript.sys('  Ctrl+B to start/stop recording')
            ctx.transcript.sys('  /voice tts  to toggle speech output')
            ctx.transcript.sys('  /voice off  to disable voice mode')
          } else {
            ctx.transcript.sys('Voice mode disabled.')
          }
        })
      )
    }
--- a/ui-tui/src/app/useInputHandlers.ts
+++ b/ui-tui/src/app/useInputHandlers.ts
@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
    }
  }
-  const voiceStop = () => {
+  // CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
-    voice.setRecording(false)
+  // (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
-    voice.setProcessing(true)
+  // Ctrl+B while the mode is off sys-nudges the user. While the mode is
  // on, the first press starts a continuous loop (gateway → start_continuous,
  // VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
  // The gateway publishes voice.status + voice.transcript events that
  // createGatewayEventHandler turns into UI badges and composer injection.
  const voiceRecordToggle = () => {
    if (!voice.enabled) {
      return actions.sys('voice: mode is off — enable with /voice on')
    }
    const starting = !voice.recording
    const action = starting ? 'start' : 'stop'
    // Optimistic UI — flip the REC badge immediately so the user gets
    // feedback while the RPC round-trips; the voice.status event is the
    // authoritative source and may correct us.
    if (starting) {
      voice.setRecording(true)
    } else {
      voice.setRecording(false)
      voice.setProcessing(false)
    }
    gateway
-      .rpc<VoiceRecordResponse>('voice.record', { action: 'stop' })
+      .rpc<VoiceRecordResponse>('voice.record', { action })
-      .then(r => {
+      .catch((e: Error) => {
-        if (!r) {
+        // Revert optimistic UI on failure.
-          return
+        if (starting) {
          voice.setRecording(false)
        }
-        const transcript = String(r.text || '').trim()
+        actions.sys(`voice error: ${e.message}`)
        if (!transcript) {
          return actions.sys('voice: no speech detected')
        }
        cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript))
      })
      .catch((e: Error) => actions.sys(`voice error: ${e.message}`))
      .finally(() => {
        voice.setProcessing(false)
        patchUiState({ status: 'ready' })
      })
  }
  const voiceStart = () =>
    gateway
      .rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
      .then(r => {
        if (!r) {
          return
        }
        voice.setRecording(true)
        patchUiState({ status: 'recording…' })
      })
      .catch((e: Error) => actions.sys(`voice error: ${e.message}`))
  useInput((ch, key) => {
    const live = getUiState()
@ -371,7 +369,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
    }
    if (isVoiceToggleKey(key, ch)) {
-      return voice.recording ? voiceStop() : voiceStart()
+      return voiceRecordToggle()
    }
    if (isAction(key, ch, 'g')) {
--- a/ui-tui/src/app/useMainApp.ts
+++ b/ui-tui/src/app/useMainApp.ts
@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
    composer: { actions: composerActions, refs: composerRefs, state: composerState },
    gateway,
    terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
-    voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
+    voice: {
      enabled: voiceEnabled,
      recording: voiceRecording,
      setProcessing: setVoiceProcessing,
      setRecording: setVoiceRecording,
      setVoiceEnabled
    },
    wheelStep: WHEEL_SCROLL_STEP
  })
  const onEvent = useMemo(
    () =>
      createGatewayEventHandler({
        composer: { setInput: composerActions.setInput },
        gateway,
        session: {
          STARTUP_RESUME_ID,
@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
          resumeById: session.resumeById,
          setCatalog
        },
        submission: { submitRef },
        system: { bellOnComplete, stdout, sys },
-        transcript: { appendMessage, panel, setHistoryItems }
+        transcript: { appendMessage, panel, setHistoryItems },
        voice: {
          setProcessing: setVoiceProcessing,
          setRecording: setVoiceRecording,
          setVoiceEnabled
        }
      }),
    [
      appendMessage,
      bellOnComplete,
      composerActions.setInput,
      gateway,
      panel,
      session.newSession,
      session.resetSession,
      session.resumeById,
      setVoiceEnabled,
      setVoiceProcessing,
      setVoiceRecording,
      stdout,
      submitRef,
      sys
    ]
  )
--- a/ui-tui/src/gatewayTypes.ts
+++ b/ui-tui/src/gatewayTypes.ts
@ -236,10 +236,16 @@ export interface ImageAttachResponse {
 // ── Voice ────────────────────────────────────────────────────────────
 /**
  * Result of the `voice.toggle` RPC (issued by the `/voice` slash command
  * with action on / off / tts / status). Every field is optional.
  */
 export interface VoiceToggleResponse {
  // Availability flags reported by the gateway — presumably audio capture,
  // overall readiness and STT readiness respectively; confirm against the
  // voice.toggle handler.
  audio_available?: boolean
  available?: boolean
  // Multi-line requirements text; `/voice status` splits it on '\n' and
  // prints each non-blank line under "Requirements:".
  details?: string
  // Whether voice mode is on; the `/voice` command copies it into UI state
  // via setVoiceEnabled.
  enabled?: boolean
  stt_available?: boolean
  // Whether spoken replies (TTS) are on; shown by `/voice status` and
  // `/voice tts`.
  tts?: boolean
 }
 /** Result of the `voice.record` RPC (action 'start' or 'stop'). */
 export interface VoiceRecordResponse {
  // 'recording' after a successful start, 'stopped' after a stop.
  status?: string
  // Transcript text — presumably kept from the earlier stop-and-transcribe
  // response shape; TODO confirm the gateway still sends it.
  text?: string
 }
@ -368,6 +374,8 @@ export type GatewayEvent =
  | { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
  | { payload?: undefined; session_id?: string; type: 'message.start' }
  | { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
  | { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
  | { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
  | { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
  | { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
  | { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }