feat(tui): match CLI's voice slash + VAD-continuous recording model

The TUI had drifted from the CLI's voice model in two ways:

- /voice on was lighting up the microphone immediately and Ctrl+B was interpreted as a mode toggle. The CLI separates the two: /voice on just flips the umbrella bit, recording only starts once the user presses Ctrl+B, which also sets _voice_continuous so the VAD loop auto-restarts until the user presses Ctrl+B again or three silent cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply speech on/off from inside the TUI.

This commit brings the TUI to parity.

Python

- hermes_cli/voice.py: continuous-mode API (start_continuous, stop_continuous, is_continuous_active) layered on the existing PTT wrappers. The silence callback transcribes, fires on_transcript, tracks consecutive no-speech cycles, and auto-restarts — mirroring cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
  - voice.toggle now supports on / off / tts / status. The umbrella bit lives in HERMES_VOICE + display.voice_enabled; tts lives in HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down any active continuous loop so a toggle-off really releases the microphone.
  - voice.record start/stop now drives start_continuous/stop_continuous. start is refused with a clear error when the mode is off, matching cli.py:handle_voice_record's early return on `not _voice_mode`.
  - New voice.transcript / voice.status events emit through _voice_emit (remembers the sid that last started recording so events land in the right session).

TypeScript

- gatewayTypes.ts: voice.status + voice.transcript event discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput + submission.submitRef + voice.{setRecording, setProcessing, setVoiceEnabled}; InputHandlerContext.voice gains enabled + setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges; voice.transcript auto-submits when the composer is empty (CLI _pending_input.put parity) and appends when a draft is in flight. no_speech_limit flips voice off + sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop), not voice.toggle, and nudges the user with a sys line when the mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status with CLI-matching output ("voice: mode on · tts off").

Backward compat preserved for voice.record (was always PTT shape; gateway still honours start/stop with mode-gating added).
2026-07-27 17:58:07 +00:00 · 2026-04-24 00:55:17 +03:00 · 2026-04-24 00:55:17 +03:00 · 04c489b587
commit 04c489b587
parent 0bb460b070
10 changed files with 861 additions and 78 deletions
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@ -2,18 +2,31 @@

 Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
 (text-to-speech) behind idempotent, stateful entry points that the gateway's
-``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a
-dedicated thread. The gateway imports this module lazily so missing optional
-audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError``
-at call time, not at startup.
+``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can
+call from a dedicated thread. The gateway imports this module lazily so that
+missing optional audio deps (sounddevice, faster-whisper, numpy) surface as
+an ``ImportError`` at call time, not at startup.
+
+Two usage modes are exposed:
+
+* **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single
+  manually-bounded capture used when the caller drives the start/stop pair
+  explicitly.
+* **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors
+  the classic CLI voice mode: recording auto-stops on silence, transcribes,
+  hands the result to a callback, and then auto-restarts for the next turn.
+  Three consecutive no-speech cycles stop the loop and fire
+  ``on_silent_limit`` so the UI can turn the mode off.
 """

 from __future__ import annotations

 import json
 import logging
+import os
+import sys
 import threading
-from typing import Optional
+from typing import Any, Callable, Optional

 from tools.voice_mode import (
    create_audio_recorder,
@ -24,15 +37,71 @@ from tools.voice_mode import (

 logger = logging.getLogger(__name__)

+
+def _debug(msg: str) -> None:
+    """Emit a debug breadcrumb when HERMES_VOICE_DEBUG=1.
+
+    Goes to stderr so the TUI gateway wraps it as a gateway.stderr event,
+    which createGatewayEventHandler shows as an Activity line — exactly
+    what we need to diagnose "why didn't the loop auto-restart?" in the
+    user's real terminal without shipping a separate debug RPC.
+
+    ``msg`` is printed with a ``[voice] `` prefix and flushed immediately.
+    The env var is re-read on every call, so it can be flipped while the
+    process is running; any value other than ``1`` (after stripping
+    whitespace) keeps the function silent.
+    """
+    if os.environ.get("HERMES_VOICE_DEBUG", "").strip() == "1":
+        print(f"[voice] {msg}", file=sys.stderr, flush=True)
+
+
+def _beeps_enabled() -> bool:
+    """CLI parity: voice.beep_enabled in config.yaml (default True).
+
+    Returns the truthiness of ``voice.beep_enabled`` when the ``voice``
+    section is a dict. Falls back to ``True`` when the section has another
+    type or when importing/loading the config raises.
+    """
+    try:
+        from hermes_cli.config import load_config
+
+        voice_cfg = load_config().get("voice", {})
+        if isinstance(voice_cfg, dict):
+            return bool(voice_cfg.get("beep_enabled", True))
+    except Exception:
+        # Best-effort: a broken or missing config must not disable the
+        # voice loop, so fall through to the default below.
+        pass
+    return True
+
+
+def _play_beep(frequency: int, count: int = 1) -> None:
+    """Audible cue matching cli.py's record/stop beeps.
+
+    880 Hz single-beep on start (cli.py:_voice_start_recording line 7532),
+    660 Hz double-beep on stop (cli.py:_voice_stop_and_transcribe line 7585).
+    Best-effort — sounddevice failures are silently swallowed so the
+    voice loop never breaks because a speaker was unavailable.
+
+    ``frequency`` and ``count`` are forwarded unchanged to
+    ``tools.voice_mode.play_beep``. Nothing is played when
+    ``_beeps_enabled()`` is false. A failure is only reported through
+    ``_debug`` (i.e. visible with HERMES_VOICE_DEBUG=1).
+    """
+    if not _beeps_enabled():
+        return
+    try:
+        # Imported at call time rather than with the module-level
+        # tools.voice_mode imports; an ImportError here is treated like
+        # any other beep failure.
+        from tools.voice_mode import play_beep
+
+        play_beep(frequency=frequency, count=count)
+    except Exception as e:
+        _debug(f"beep {frequency}Hz failed: {e}")
+
+# ── Push-to-talk state ───────────────────────────────────────────────
 _recorder = None
 _recorder_lock = threading.Lock()

+# ── Continuous (VAD) state ───────────────────────────────────────────
+# Process-wide singletons. The functions in this module read and write
+# them while holding ``_continuous_lock``.
+_continuous_lock = threading.Lock()
+# True while a continuous loop is considered running.
+_continuous_active = False
+# Recorder created on first start_continuous() and reused afterwards.
+_continuous_recorder: Any = None
+# Callbacks supplied by the caller of start_continuous(); cleared again
+# by stop_continuous().
+_continuous_on_transcript: Optional[Callable[[str], None]] = None
+_continuous_on_status: Optional[Callable[[str], None]] = None
+_continuous_on_silent_limit: Optional[Callable[[], None]] = None
+# Consecutive cycles that produced no usable transcript; reset to 0 by a
+# successful transcript.
+_continuous_no_speech_count = 0
+# The loop halts itself once the counter above reaches this value.
+_CONTINUOUS_NO_SPEECH_LIMIT = 3
+
+
+# ── Push-to-talk API ─────────────────────────────────────────────────
+

 def start_recording() -> None:
-    """Begin capturing from the default input device.
+    """Begin capturing from the default input device (push-to-talk).

-    Idempotent — calling again while a recording is in progress is a no-op,
-    which matches the TUI's toggle semantics (Ctrl+B starts, Ctrl+B stops).
+    Idempotent — calling again while a recording is in progress is a no-op.
    """
    global _recorder

@ -40,20 +109,15 @@ def start_recording() -> None:
        if _recorder is not None and getattr(_recorder, "is_recording", False):
            return
        rec = create_audio_recorder()
-        # No silence callback: the TUI drives start/stop explicitly via
-        # the voice.record RPC. VAD auto-stop is a CLI-mode feature.
        rec.start()
        _recorder = rec


 def stop_and_transcribe() -> Optional[str]:
-    """Stop the active recording, transcribe it, and return the text.
+    """Stop the active push-to-talk recording, transcribe, return text.

    Returns ``None`` when no recording is active, when the microphone
-    captured no speech, or when Whisper returned a known hallucination
-    token (silence artefacts like "Thanks for watching!"). The caller
-    treats ``None`` as "no speech detected" and leaves the composer
-    untouched.
+    captured no speech, or when Whisper returned a known hallucination.
    """
    global _recorder

@ -73,27 +137,281 @@ def stop_and_transcribe() -> Optional[str]:
    except Exception as e:
        logger.warning("voice transcription failed: %s", e)
        return None
+    finally:
+        try:
+            if os.path.isfile(wav_path):
+                os.unlink(wav_path)
+        except Exception:
+            pass

-    text = (result.get("text") or "").strip()
+    # transcribe_recording returns {"success": bool, "transcript": str, ...}
+    # — matches cli.py:_voice_stop_and_transcribe's result.get("transcript").
+    if not result.get("success"):
+        return None
+    text = (result.get("transcript") or "").strip()
    if not text or is_whisper_hallucination(text):
        return None

    return text


+# ── Continuous (VAD) API ─────────────────────────────────────────────
+
+
+def start_continuous(
+    on_transcript: Callable[[str], None],
+    on_status: Optional[Callable[[str], None]] = None,
+    on_silent_limit: Optional[Callable[[], None]] = None,
+    silence_threshold: int = 200,
+    silence_duration: float = 3.0,
+) -> None:
+    """Start a VAD-driven continuous recording loop.
+
+    The loop calls ``on_transcript(text)`` each time speech is detected and
+    transcribed successfully, then auto-restarts. After
+    ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
+    picked up at all) the loop stops itself and calls ``on_silent_limit``
+    so the UI can reflect "voice off". Idempotent — calling while already
+    active is a no-op.
+
+    ``on_status`` is called with ``"listening"`` / ``"transcribing"`` /
+    ``"idle"`` so the UI can show a live indicator.
+
+    ``silence_threshold`` and ``silence_duration`` are written onto the
+    recorder's ``_silence_threshold`` / ``_silence_duration`` attributes
+    before the capture starts.
+
+    If the recorder fails to start, the active flag is reset and the
+    exception is re-raised to the caller. If ``stop_continuous()`` runs
+    while this call is still starting up, the freshly opened capture is
+    cancelled and ``"listening"`` is not reported.
+    """
+    global _continuous_active, _continuous_recorder
+    global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
+    global _continuous_no_speech_count
+
+    with _continuous_lock:
+        if _continuous_active:
+            _debug("start_continuous: already active — no-op")
+            return
+        _continuous_active = True
+        _continuous_on_transcript = on_transcript
+        _continuous_on_status = on_status
+        _continuous_on_silent_limit = on_silent_limit
+        _continuous_no_speech_count = 0
+
+        # The recorder is created once and reused by later loops.
+        if _continuous_recorder is None:
+            _continuous_recorder = create_audio_recorder()
+
+        _continuous_recorder._silence_threshold = silence_threshold
+        _continuous_recorder._silence_duration = silence_duration
+        rec = _continuous_recorder
+
+    _debug(
+        f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)"
+    )
+
+    # CLI parity: single 880 Hz beep *before* opening the stream — placing
+    # the beep after stream.start() on macOS triggers a CoreAudio conflict
+    # (cli.py:7528 comment).
+    _play_beep(frequency=880, count=1)
+
+    try:
+        rec.start(on_silence_stop=_continuous_on_silence)
+    except Exception as e:
+        logger.error("failed to start continuous recording: %s", e)
+        _debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}")
+        with _continuous_lock:
+            _continuous_active = False
+        raise
+
+    # The lock is not held across the beep and rec.start(), so
+    # stop_continuous() may have flipped the flag (and issued its cancel)
+    # before the stream was opened. Re-check and release the microphone
+    # here; otherwise it would keep recording with the loop inactive.
+    with _continuous_lock:
+        still_active = _continuous_active
+    if not still_active:
+        _debug("start_continuous: stopped during start — releasing microphone")
+        try:
+            rec.cancel()
+        except Exception as e:
+            logger.warning("failed to cancel recorder: %s", e)
+        return
+
+    if on_status:
+        try:
+            on_status("listening")
+        except Exception:
+            # A misbehaving UI callback must not take the loop down.
+            pass
+
+
+def stop_continuous() -> None:
+    """Stop the active continuous loop and release the microphone.
+
+    Idempotent — calling while not active is a no-op. Any in-flight
+    transcription completes but its result is discarded (the callback
+    checks ``_continuous_active`` before firing).
+
+    The recorder object itself stays in ``_continuous_recorder`` so the
+    next ``start_continuous()`` can reuse it; only the callbacks and the
+    no-speech counter are reset.
+    """
+    global _continuous_active, _continuous_on_transcript
+    global _continuous_on_status, _continuous_on_silent_limit
+    global _continuous_recorder, _continuous_no_speech_count
+
+    with _continuous_lock:
+        if not _continuous_active:
+            return
+        _continuous_active = False
+        rec = _continuous_recorder
+        # Snapshot on_status before clearing it so the "idle" notification
+        # below can still be delivered outside the lock.
+        on_status = _continuous_on_status
+        _continuous_on_transcript = None
+        _continuous_on_status = None
+        _continuous_on_silent_limit = None
+        _continuous_no_speech_count = 0
+
+    if rec is not None:
+        try:
+            # cancel() (not stop()) discards buffered frames — the loop
+            # is over, we don't want to transcribe a half-captured turn.
+            rec.cancel()
+        except Exception as e:
+            logger.warning("failed to cancel recorder: %s", e)
+
+    # Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the
+    # silence-auto-stop path plays).
+    _play_beep(frequency=660, count=2)
+
+    if on_status:
+        try:
+            on_status("idle")
+        except Exception:
+            # UI callback errors are ignored; the loop is already stopped.
+            pass
+
+
+def is_continuous_active() -> bool:
+    """Whether a continuous voice loop is currently running.
+
+    Reads ``_continuous_active`` while holding ``_continuous_lock``.
+    """
+    with _continuous_lock:
+        return _continuous_active
+
+
+def _continuous_on_silence() -> None:
+    """AudioRecorder silence callback — runs in a daemon thread.
+
+    Stops the current capture, transcribes, delivers the text via
+    ``on_transcript``, and — if the loop is still active — starts the
+    next capture. Three consecutive silent cycles end the loop.
+
+    The lock is only held for short state snapshots, never across
+    recorder calls, transcription, beeps or user callbacks, so
+    ``stop_continuous()`` can interleave at any point. The flag is
+    therefore re-checked after transcription and again after the restart.
+    Every exit that ends the loop from this function reports ``"idle"``
+    through ``on_status``.
+    """
+    global _continuous_active, _continuous_no_speech_count
+
+    _debug("_continuous_on_silence: fired")
+
+    # Snapshot the shared state; stop_continuous() may clear the callbacks
+    # at any time after the lock is released.
+    with _continuous_lock:
+        if not _continuous_active:
+            _debug("_continuous_on_silence: loop inactive — abort")
+            return
+        rec = _continuous_recorder
+        on_transcript = _continuous_on_transcript
+        on_status = _continuous_on_status
+        on_silent_limit = _continuous_on_silent_limit
+
+    if rec is None:
+        _debug("_continuous_on_silence: no recorder — abort")
+        return
+
+    if on_status:
+        try:
+            on_status("transcribing")
+        except Exception:
+            pass
+
+    wav_path = rec.stop()
+    # Peak RMS is the critical diagnostic when stop() returns None despite
+    # the VAD firing — tells us at a glance whether the mic was too quiet
+    # for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree.
+    peak_rms = getattr(rec, "_peak_rms", -1)
+    _debug(
+        f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})"
+    )
+
+    # CLI parity: double 660 Hz beep after the stream stops (safe from the
+    # CoreAudio conflict that blocks pre-start beeps).
+    _play_beep(frequency=660, count=2)
+
+    transcript: Optional[str] = None
+
+    if wav_path:
+        try:
+            result = transcribe_recording(wav_path)
+            # transcribe_recording returns {"success": bool, "transcript": str,
+            # "error": str?} — NOT {"text": str}.  Using the wrong key silently
+            # produced empty transcripts even when Groq/local STT returned fine,
+            # which masqueraded as "not hearing the user" to the caller.
+            success = bool(result.get("success"))
+            text = (result.get("transcript") or "").strip()
+            err = result.get("error")
+            _debug(
+                f"_continuous_on_silence: transcribe -> success={success} "
+                f"text={text!r} err={err!r}"
+            )
+            if success and text and not is_whisper_hallucination(text):
+                transcript = text
+        except Exception as e:
+            logger.warning("continuous transcription failed: %s", e)
+            _debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}")
+        finally:
+            # Remove the recording whether or not transcription succeeded.
+            try:
+                if os.path.isfile(wav_path):
+                    os.unlink(wav_path)
+            except Exception:
+                pass
+
+    with _continuous_lock:
+        if not _continuous_active:
+            # User stopped us while we were transcribing — discard.
+            _debug("_continuous_on_silence: stopped during transcribe — no restart")
+            return
+        if transcript:
+            _continuous_no_speech_count = 0
+        else:
+            _continuous_no_speech_count += 1
+        should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT
+        no_speech = _continuous_no_speech_count
+
+    if transcript and on_transcript:
+        try:
+            on_transcript(transcript)
+        except Exception as e:
+            logger.warning("on_transcript callback raised: %s", e)
+
+    if should_halt:
+        _debug(f"_continuous_on_silence: {no_speech} silent cycles — halting")
+        with _continuous_lock:
+            _continuous_active = False
+            _continuous_no_speech_count = 0
+        if on_silent_limit:
+            try:
+                on_silent_limit()
+            except Exception:
+                pass
+        try:
+            rec.cancel()
+        except Exception:
+            pass
+        if on_status:
+            try:
+                on_status("idle")
+            except Exception:
+                pass
+        return
+
+    # Restart for the next turn.
+    _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
+    _play_beep(frequency=880, count=1)
+    try:
+        rec.start(on_silence_stop=_continuous_on_silence)
+    except Exception as e:
+        logger.error("failed to restart continuous recording: %s", e)
+        _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
+        with _continuous_lock:
+            _continuous_active = False
+        # The loop is dead: report "idle" so the last status the UI saw
+        # is not left at "transcribing".
+        if on_status:
+            try:
+                on_status("idle")
+            except Exception:
+                pass
+        return
+
+    # stop_continuous() may have run between the active check above and
+    # rec.start() (its cancel() then hit a recorder that was not yet
+    # recording). Re-check so the microphone is not left open with the
+    # loop inactive.
+    with _continuous_lock:
+        still_active = _continuous_active
+    if not still_active:
+        _debug("_continuous_on_silence: stopped during restart — releasing microphone")
+        try:
+            rec.cancel()
+        except Exception as e:
+            logger.warning("failed to cancel recorder: %s", e)
+        return
+
+    if on_status:
+        try:
+            on_status("listening")
+        except Exception:
+            pass
+
+
+# ── TTS API ──────────────────────────────────────────────────────────
+
+
 def speak_text(text: str) -> None:
    """Synthesize ``text`` with the configured TTS provider and play it.

    The gateway spawns a daemon thread to call this so the RPC returns
-    immediately. Failures are logged and swallowed — the UI already
-    acknowledged "speaking" by the time we get here.
+    immediately. Failures are logged and swallowed.
    """
    if not text or not text.strip():
        return

-    # Lazy import — tts_tool pulls optional provider SDKs (OpenAI,
-    # ElevenLabs, etc.) and config-reading machinery that we don't
-    # want to load at module import time.
+    # Lazy import — tts_tool pulls optional provider SDKs.
    from tools.tts_tool import text_to_speech_tool

    try:
--- a/tests/hermes_cli/test_voice_wrapper.py
+++ b/tests/hermes_cli/test_voice_wrapper.py
@ -51,3 +51,205 @@ class TestSpeakTextGuards:

        # Should simply return None without raising.
        assert speak_text(text) is None
+
+
+class TestContinuousAPI:
+    """Continuous (VAD) mode API — CLI-parity loop entry points."""
+
+    def test_continuous_exports(self):
+        # Smoke test: the three public entry points import and are callable.
+        from hermes_cli.voice import (
+            is_continuous_active,
+            start_continuous,
+            stop_continuous,
+        )
+
+        assert callable(start_continuous)
+        assert callable(stop_continuous)
+        assert callable(is_continuous_active)
+
+    def test_not_active_by_default(self, monkeypatch):
+        import hermes_cli.voice as voice
+
+        # Isolate from any state left behind by other tests in the session.
+        monkeypatch.setattr(voice, "_continuous_active", False)
+        monkeypatch.setattr(voice, "_continuous_recorder", None)
+
+        assert voice.is_continuous_active() is False
+
+    def test_stop_continuous_idempotent_when_inactive(self, monkeypatch):
+        """stop_continuous must not raise when no loop is active — the
+        gateway's voice.toggle off path calls it unconditionally."""
+        import hermes_cli.voice as voice
+
+        monkeypatch.setattr(voice, "_continuous_active", False)
+        monkeypatch.setattr(voice, "_continuous_recorder", None)
+
+        # Should return cleanly without exceptions
+        assert voice.stop_continuous() is None
+        assert voice.is_continuous_active() is False
+
+    def test_double_start_is_idempotent(self, monkeypatch):
+        """A second start_continuous while already active is a no-op — prevents
+        two overlapping capture threads fighting over the microphone when the
+        UI double-fires (e.g. two voice.record start requests within the same
+        tick)."""
+        import hermes_cli.voice as voice
+
+        # Pretend a loop is already running so the guard is what gets tested.
+        monkeypatch.setattr(voice, "_continuous_active", True)
+        called = {"n": 0}
+
+        class FakeRecorder:
+            def start(self, on_silence_stop=None):
+                called["n"] += 1
+
+            def cancel(self):
+                pass
+
+        monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder())
+
+        voice.start_continuous(on_transcript=lambda _t: None)
+
+        # The guard inside start_continuous short-circuits before rec.start()
+        assert called["n"] == 0
+
+
+class TestContinuousLoopSimulation:
+    """End-to-end simulation of the VAD loop with a fake recorder.
+
+    Proves auto-restart works: the silence callback must trigger transcribe →
+    on_transcript → re-call rec.start(on_silence_stop=same_cb). Also covers
+    the 3-strikes no-speech halt.
+    """
+
+    @pytest.fixture
+    def fake_recorder(self, monkeypatch):
+        import hermes_cli.voice as voice
+
+        # Reset module state between tests.
+        monkeypatch.setattr(voice, "_continuous_active", False)
+        monkeypatch.setattr(voice, "_continuous_recorder", None)
+        monkeypatch.setattr(voice, "_continuous_no_speech_count", 0)
+        monkeypatch.setattr(voice, "_continuous_on_transcript", None)
+        monkeypatch.setattr(voice, "_continuous_on_status", None)
+        monkeypatch.setattr(voice, "_continuous_on_silent_limit", None)
+
+        class FakeRecorder:
+            _silence_threshold = 200
+            _silence_duration = 3.0
+            is_recording = False
+
+            def __init__(self):
+                self.start_calls = 0
+                self.last_callback = None
+                self.stopped = 0
+                self.cancelled = 0
+                # Preset WAV path returned by stop()
+                self.next_stop_wav = "/tmp/fake.wav"
+
+            def start(self, on_silence_stop=None):
+                # Remember the callback so tests can fire "silence" by hand.
+                self.start_calls += 1
+                self.last_callback = on_silence_stop
+                self.is_recording = True
+
+            def stop(self):
+                self.stopped += 1
+                self.is_recording = False
+                return self.next_stop_wav
+
+            def cancel(self):
+                self.cancelled += 1
+                self.is_recording = False
+
+        rec = FakeRecorder()
+        monkeypatch.setattr(voice, "create_audio_recorder", lambda: rec)
+        # Skip real file ops in the silence callback.
+        monkeypatch.setattr(voice.os.path, "isfile", lambda _p: False)
+        return rec
+
+    def test_loop_auto_restarts_after_transcript(self, fake_recorder, monkeypatch):
+        import hermes_cli.voice as voice
+
+        monkeypatch.setattr(
+            voice,
+            "transcribe_recording",
+            lambda _p: {"success": True, "transcript": "hello world"},
+        )
+        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
+
+        transcripts = []
+        statuses = []
+
+        voice.start_continuous(
+            on_transcript=lambda t: transcripts.append(t),
+            on_status=lambda s: statuses.append(s),
+        )
+
+        assert fake_recorder.start_calls == 1
+        assert statuses == ["listening"]
+
+        # Simulate AudioRecorder's silence detector firing.
+        fake_recorder.last_callback()
+
+        assert transcripts == ["hello world"]
+        assert fake_recorder.start_calls == 2  # auto-restarted
+        assert statuses == ["listening", "transcribing", "listening"]
+        assert voice.is_continuous_active() is True
+
+        # Leave the module inactive for whichever test runs next.
+        voice.stop_continuous()
+
+    def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch):
+        import hermes_cli.voice as voice
+
+        # Transcription returns no speech — fake_recorder.stop() returns the
+        # path, but transcribe returns empty text, counting as silence.
+        monkeypatch.setattr(
+            voice,
+            "transcribe_recording",
+            lambda _p: {"success": True, "transcript": ""},
+        )
+        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
+
+        transcripts = []
+        silent_limit_fired = []
+
+        voice.start_continuous(
+            on_transcript=lambda t: transcripts.append(t),
+            on_silent_limit=lambda: silent_limit_fired.append(True),
+        )
+
+        # Fire silence callback 3 times
+        for _ in range(3):
+            fake_recorder.last_callback()
+
+        assert transcripts == []
+        assert silent_limit_fired == [True]
+        assert voice.is_continuous_active() is False
+        assert fake_recorder.cancelled >= 1
+
+    def test_stop_during_transcription_discards_restart(self, fake_recorder, monkeypatch):
+        """User hits Ctrl+B mid-transcription: the in-flight transcript must
+        be discarded (the loop was stopped before it could be delivered), and
+        the loop must NOT restart."""
+        import hermes_cli.voice as voice
+
+        stop_triggered = {"flag": False}
+
+        def late_transcribe(_p):
+            # Simulate stop_continuous arriving while we're inside transcribe
+            voice.stop_continuous()
+            stop_triggered["flag"] = True
+            return {"success": True, "transcript": "final word"}
+
+        monkeypatch.setattr(voice, "transcribe_recording", late_transcribe)
+        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)
+
+        transcripts = []
+        voice.start_continuous(on_transcript=lambda t: transcripts.append(t))
+
+        initial_starts = fake_recorder.start_calls  # 1
+        fake_recorder.last_callback()
+
+        assert stop_triggered["flag"] is True
+        # Loop is stopped — no auto-restart
+        assert fake_recorder.start_calls == initial_starts
+        # The in-flight transcript was suppressed because we stopped mid-flight
+        assert transcripts == []
+        assert voice.is_continuous_active() is False
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -3455,43 +3455,154 @@ def _(rid, params: dict) -> dict:
 # ── Methods: voice ───────────────────────────────────────────────────


+# Session id that voice events are addressed to, guarded by
+# ``_voice_sid_lock``. Empty until a ``voice.record`` start request
+# supplies a ``session_id``.
+_voice_sid_lock = threading.Lock()
+_voice_event_sid: str = ""
+
+
+def _voice_emit(event: str, payload: dict | None = None) -> None:
+    """Emit a voice event toward the session recorded by the most recent
+    ``voice.record`` start request that carried a ``session_id``.
+    Voice is process-global (one microphone), so there's only ever
+    one sid to target; the TUI handler treats an empty sid as "active
+    session". Kept separate from _emit to make the lack of per-call sid
+    argument explicit."""
+    # Copy the sid under the lock, then emit outside it so the lock is
+    # never held across _emit.
+    with _voice_sid_lock:
+        sid = _voice_event_sid
+    _emit(event, sid, payload)
+
+
+def _voice_mode_enabled() -> bool:
+    """Current voice-mode flag. HERMES_VOICE env var wins over config so
+    the gateway and CLI agree when one of them was launched with an
+    explicit override.
+
+    Only the exact values ``0`` and ``1`` (after stripping whitespace)
+    count as an override; anything else falls back to
+    ``display.voice_enabled`` in the config, defaulting to ``False``."""
+    env = os.environ.get("HERMES_VOICE", "").strip()
+    if env in {"0", "1"}:
+        return env == "1"
+    return bool(_load_cfg().get("display", {}).get("voice_enabled", False))
+
+
+def _voice_tts_enabled() -> bool:
+    """Whether agent replies should be spoken back via TTS.
+
+    HERMES_VOICE_TTS set to exactly ``0`` or ``1`` (after stripping
+    whitespace) wins; otherwise ``display.voice_tts`` from the config is
+    used, defaulting to ``False``."""
+    env = os.environ.get("HERMES_VOICE_TTS", "").strip()
+    if env in {"0", "1"}:
+        return env == "1"
+    return bool(_load_cfg().get("display", {}).get("voice_tts", False))
+
+
@method("voice.toggle")
 def _(rid, params: dict) -> dict:
+    """CLI parity for the ``/voice`` slash command.
+
+    Subcommands:
+
+    * ``status`` — report mode + TTS flags plus the voice-requirements
+      probe (default when ``action`` is omitted; an unrecognised action
+      returns error 4013).
+    * ``on`` / ``off`` — flip voice *mode* (the umbrella bit). Turning it
+      off also tears down any active continuous recording loop. Does NOT
+      start recording on its own; recording is driven by ``voice.record``
+      (Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split.
+    * ``tts`` — toggle speech-output of agent replies. Requires mode on
+      (mirrors CLI's _toggle_voice_tts guard); otherwise returns error
+      4014.
+    """
     action = params.get("action", "status")
+
     if action == "status":
-        env = os.environ.get("HERMES_VOICE", "").strip()
-        if env in {"0", "1"}:
-            return _ok(rid, {"enabled": env == "1"})
-        return _ok(
-            rid,
-            {
-                "enabled": bool(
-                    _load_cfg().get("display", {}).get("voice_enabled", False)
-                )
-            },
-        )
+        # Mirror CLI's _show_voice_status: include STT/TTS provider
+        # availability so the user can tell at a glance *why* voice mode
+        # isn't working ("STT provider: MISSING ..." is the common case).
+        payload: dict = {
+            "enabled": _voice_mode_enabled(),
+            "tts": _voice_tts_enabled(),
+        }
+        try:
+            from tools.voice_mode import check_voice_requirements
+
+            reqs = check_voice_requirements()
+            payload["available"] = bool(reqs.get("available"))
+            payload["audio_available"] = bool(reqs.get("audio_available"))
+            payload["stt_available"] = bool(reqs.get("stt_available"))
+            payload["details"] = reqs.get("details") or ""
+        except Exception as e:
+            # check_voice_requirements pulls optional transcription deps —
+            # swallow so /voice status always returns something useful.
+            logger.warning("voice.toggle status: requirements probe failed: %s", e)
+
+        return _ok(rid, payload)
+
     if action in ("on", "off"):
         enabled = action == "on"
+        # Persist the flag both for this process (env) and for later runs
+        # (config key).
         os.environ["HERMES_VOICE"] = "1" if enabled else "0"
         _write_config_key("display.voice_enabled", enabled)
-        return _ok(rid, {"enabled": action == "on"})
+
+        if not enabled:
+            # Disabling the mode must tear the continuous loop down; the
+            # loop holds the microphone and would otherwise keep running.
+            try:
+                from hermes_cli.voice import stop_continuous
+
+                stop_continuous()
+            except ImportError:
+                # Import failed, so there is nothing to stop from here.
+                pass
+            except Exception as e:
+                logger.warning("voice: stop_continuous failed during toggle off: %s", e)
+
+        return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()})
+
+    if action == "tts":
+        if not _voice_mode_enabled():
+            return _err(rid, 4014, "enable voice mode first: /voice on")
+        # Flip the current value and persist it the same way as the mode bit.
+        new_value = not _voice_tts_enabled()
+        os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0"
+        _write_config_key("display.voice_tts", new_value)
+        return _ok(rid, {"enabled": True, "tts": new_value})
+
     return _err(rid, 4013, f"unknown voice action: {action}")


@method("voice.record")
 def _(rid, params: dict) -> dict:
+    """VAD-driven continuous record loop, CLI-parity.
+
+    ``start`` turns on a VAD loop that emits ``voice.transcript`` events
+    for each detected utterance and auto-restarts for the next turn.
+    ``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while-
+    recording branch clearing ``_voice_continuous``). Three consecutive
+    silent cycles stop the loop automatically and emit a
+    ``voice.transcript`` with ``no_speech_limit=True``.
+    """
    action = params.get("action", "start")
+
+    if action not in {"start", "stop"}:
+        return _err(rid, 4019, f"unknown voice action: {action}")
+
    try:
        if action == "start":
-            from hermes_cli.voice import start_recording
+            if not _voice_mode_enabled():
+                return _err(rid, 4015, "voice mode is off — enable with /voice on")

-            start_recording()
+            with _voice_sid_lock:
+                global _voice_event_sid
+                _voice_event_sid = params.get("session_id") or _voice_event_sid
+
+            from hermes_cli.voice import start_continuous
+
+            voice_cfg = _load_cfg().get("voice", {})
+            start_continuous(
+                on_transcript=lambda t: _voice_emit(
+                    "voice.transcript", {"text": t}
+                ),
+                on_status=lambda s: _voice_emit("voice.status", {"state": s}),
+                on_silent_limit=lambda: _voice_emit(
+                    "voice.transcript", {"no_speech_limit": True}
+                ),
+                silence_threshold=voice_cfg.get("silence_threshold", 200),
+                silence_duration=voice_cfg.get("silence_duration", 3.0),
+            )
            return _ok(rid, {"status": "recording"})
-        if action == "stop":
-            from hermes_cli.voice import stop_and_transcribe

-            return _ok(rid, {"text": stop_and_transcribe() or ""})
-        return _err(rid, 4019, f"unknown voice action: {action}")
+        # action == "stop"
+        from hermes_cli.voice import stop_continuous
+
+        stop_continuous()
+        return _ok(rid, {"status": "stopped"})
    except ImportError:
        return _err(
            rid, 5025, "voice module not available — install audio dependencies"
--- a/ui-tui/src/tests/createGatewayEventHandler.test.ts
+++ b/ui-tui/src/tests/createGatewayEventHandler.test.ts
@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
    composer: {
      dequeue: () => undefined,
      queueEditRef: ref<null | number>(null),
-      sendQueued: vi.fn()
+      sendQueued: vi.fn(),
+      setInput: vi.fn()
    },
    gateway: {
      gw: { request: vi.fn() },
@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
      resumeById: vi.fn(),
      setCatalog: vi.fn()
    },
+    submission: {
+      submitRef: { current: vi.fn() }
+    },
    system: {
      bellOnComplete: false,
      sys: vi.fn()
@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
      panel: (title: string, sections: any[]) =>
        appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
      setHistoryItems: vi.fn()
+    },
+    voice: {
+      setProcessing: vi.fn(),
+      setRecording: vi.fn(),
+      setVoiceEnabled: vi.fn()
    }
  }) as any

--- a/ui-tui/src/app/createGatewayEventHandler.ts
+++ b/ui-tui/src/app/createGatewayEventHandler.ts
@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
  const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
  const { bellOnComplete, stdout, sys } = ctx.system
  const { appendMessage, panel, setHistoryItems } = ctx.transcript
+  const { setInput } = ctx.composer
+  const { submitRef } = ctx.submission
+  const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice

  let pendingThinkingStatus = ''
  let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
@ -261,6 +264,71 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
        return
      }

+      case 'voice.status': {
+        // Continuous VAD loop reports its internal state so the status bar
+        // can show listening / transcribing / idle without polling.
+        const state = String(ev.payload?.state ?? '')
+
+        if (state === 'listening') {
+          setVoiceRecording(true)
+          setVoiceProcessing(false)
+        } else if (state === 'transcribing') {
+          setVoiceRecording(false)
+          setVoiceProcessing(true)
+        } else {
+          setVoiceRecording(false)
+          setVoiceProcessing(false)
+        }
+
+        return
+      }
+
+      case 'voice.transcript': {
+        // CLI parity: the 3-strikes silence detector flipped off automatically.
+        // Mirror that on the UI side and tell the user why the mode is off.
+        // NOTE(review): this only clears the UI flag — confirm the gateway
+        // also drops its voice-mode bit, or /voice status will disagree.
+        if (ev.payload?.no_speech_limit) {
+          setVoiceEnabled(false)
+          setVoiceRecording(false)
+          setVoiceProcessing(false)
+          sys('voice: no speech detected 3 times, continuous mode stopped')
+
+          return
+        }
+
+        const text = String(ev.payload?.text ?? '').trim()
+
+        if (!text) {
+          return
+        }
+
+        // Match CLI's _pending_input.put(transcript): auto-submit when the
+        // composer is empty, otherwise append so the user can keep editing
+        // a partial draft they were working on.
+        //
+        // A state updater may be invoked more than once for a single update
+        // (e.g. React Strict Mode double-invocation), so the submit is
+        // latched to fire at most once per transcript.
+        let submitScheduled = false
+
+        setInput(prev => {
+          if (!prev) {
+            if (!submitScheduled) {
+              submitScheduled = true
+              // defer submit so React commits the state change first
+              setTimeout(() => submitRef.current(text), 0)
+            }
+
+            return ''
+          }
+
+          return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
+        })
+
+        return
+      }
+
      case 'gateway.start_timeout': {
        const { cwd, python } = ev.payload ?? {}
        const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''
--- a/ui-tui/src/app/interfaces.ts
+++ b/ui-tui/src/app/interfaces.ts
@ -189,9 +189,11 @@ export interface InputHandlerContext {
    stdout?: NodeJS.WriteStream
  }
  voice: {
+    enabled: boolean
    recording: boolean
    setProcessing: StateSetter<boolean>
    setRecording: StateSetter<boolean>
+    setVoiceEnabled: StateSetter<boolean>
  }
  wheelStep: number
 }
@ -201,6 +203,9 @@ export interface InputHandlerResult {
 }

 export interface GatewayEventHandlerContext {
+  composer: {
+    setInput: StateSetter<string>
+  }
  gateway: GatewayServices
  session: {
    STARTUP_RESUME_ID: string
@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
    resumeById: (id: string) => void
    setCatalog: StateSetter<null | SlashCatalog>
  }
+  submission: {
+    submitRef: MutableRefObject<(value: string) => void>
+  }
  system: {
    bellOnComplete: boolean
    stdout?: NodeJS.WriteStream
@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
    panel: (title: string, sections: PanelSection[]) => void
    setHistoryItems: StateSetter<Msg[]>
  }
+  voice: {
+    setProcessing: StateSetter<boolean>
+    setRecording: StateSetter<boolean>
+    setVoiceEnabled: StateSetter<boolean>
+  }
 }

 export interface SlashHandlerContext {
--- a/ui-tui/src/app/slash/commands/session.ts
+++ b/ui-tui/src/app/slash/commands/session.ts
@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
  },

  {
-    help: 'toggle voice input',
+    help: 'voice mode: [on|off|tts|status]',
    name: 'voice',
    run: (arg, ctx) => {
-      const action = arg === 'on' || arg === 'off' ? arg : 'status'
+      const normalized = (arg ?? '').trim().toLowerCase()
+
+      const action =
+        normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
+          ? normalized
+          : 'status'

      ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
        ctx.guarded<VoiceToggleResponse>(r => {
          ctx.voice.setVoiceEnabled(!!r.enabled)
-          ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`)
+
+          // Match CLI's _show_voice_status / _enable_voice_mode /
+          // _toggle_voice_tts output shape so users don't have to learn
+          // two vocabularies.
+          if (action === 'status') {
+            const mode = r.enabled ? 'ON' : 'OFF'
+            const tts = r.tts ? 'ON' : 'OFF'
+            ctx.transcript.sys('Voice Mode Status')
+            ctx.transcript.sys(`  Mode:       ${mode}`)
+            ctx.transcript.sys(`  TTS:        ${tts}`)
+            ctx.transcript.sys('  Record key: Ctrl+B')
+
+            // CLI's "Requirements:" block — surfaces STT/audio setup issues
+            // so the user sees "STT provider: MISSING ..." instead of
+            // silently failing on every Ctrl+B press.
+            if (r.details) {
+              ctx.transcript.sys('')
+              ctx.transcript.sys('  Requirements:')
+
+              for (const line of r.details.split('\n')) {
+                if (line.trim()) {
+                  ctx.transcript.sys(`    ${line}`)
+                }
+              }
+            }
+
+            return
+          }
+
+          if (action === 'tts') {
+            ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
+
+            return
+          }
+
+          // on/off — mirror cli.py:_enable_voice_mode's 3-line output
+          if (r.enabled) {
+            const tts = r.tts ? ' (TTS enabled)' : ''
+            ctx.transcript.sys(`Voice mode enabled${tts}`)
+            ctx.transcript.sys('  Ctrl+B to start/stop recording')
+            ctx.transcript.sys('  /voice tts  to toggle speech output')
+            ctx.transcript.sys('  /voice off  to disable voice mode')
+          } else {
+            ctx.transcript.sys('Voice mode disabled.')
+          }
        })
      )
    }
--- a/ui-tui/src/app/useInputHandlers.ts
+++ b/ui-tui/src/app/useInputHandlers.ts
@ -134,45 +134,51 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
    }
  }

-  const voiceStop = () => {
-    voice.setRecording(false)
-    voice.setProcessing(true)
+  // CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
+  // (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
+  // Ctrl+B while the mode is off sys-nudges the user. While the mode is
+  // on, the first press starts a continuous loop (gateway → start_continuous,
+  // VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
+  // The gateway publishes voice.status + voice.transcript events that
+  // createGatewayEventHandler turns into UI badges and composer injection.
+  const voiceRecordToggle = () => {
+    if (!voice.enabled) {
+      return actions.sys('voice: mode is off — enable with /voice on')
+    }
+
+    const starting = !voice.recording
+    const action = starting ? 'start' : 'stop'
+
+    // Optimistic UI — flip the REC badge immediately so the user gets
+    // feedback while the RPC round-trips; the voice.status event is the
+    // authoritative source and may correct us.
+    if (starting) {
+      voice.setRecording(true)
+    } else {
+      voice.setRecording(false)
+      voice.setProcessing(false)
+    }

    gateway
-      .rpc<VoiceRecordResponse>('voice.record', { action: 'stop' })
-      .then(r => {
-        if (!r) {
-          return
+      .rpc<VoiceRecordResponse>('voice.record', { action })
+      .then(r => {
+        // The handlers this replaces bailed out on a falsy result, which
+        // suggests rpc() can resolve empty instead of rejecting. Drop the
+        // optimistic REC badge then too, or it sticks after a refused start.
+        if (!r && starting) {
+          voice.setRecording(false)
+        }
+      })
+      .catch((e: Error) => {
+        // Revert optimistic UI on failure.
+        if (starting) {
+          voice.setRecording(false)
        }

-        const transcript = String(r.text || '').trim()
-
-        if (!transcript) {
-          return actions.sys('voice: no speech detected')
-        }
-
-        cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript))
-      })
-      .catch((e: Error) => actions.sys(`voice error: ${e.message}`))
-      .finally(() => {
-        voice.setProcessing(false)
-        patchUiState({ status: 'ready' })
+        actions.sys(`voice error: ${e.message}`)
      })
  }

-  const voiceStart = () =>
-    gateway
-      .rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
-      .then(r => {
-        if (!r) {
-          return
-        }
-
-        voice.setRecording(true)
-        patchUiState({ status: 'recording…' })
-      })
-      .catch((e: Error) => actions.sys(`voice error: ${e.message}`))
-
  useInput((ch, key) => {
    const live = getUiState()

@ -371,7 +377,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
    }

    if (isVoiceToggleKey(key, ch)) {
-      return voice.recording ? voiceStop() : voiceStart()
+      return voiceRecordToggle()
    }

    if (isAction(key, ch, 'g')) {
--- a/ui-tui/src/app/useMainApp.ts
+++ b/ui-tui/src/app/useMainApp.ts
@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
    composer: { actions: composerActions, refs: composerRefs, state: composerState },
    gateway,
    terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
-    voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
+    voice: {
+      enabled: voiceEnabled,
+      recording: voiceRecording,
+      setProcessing: setVoiceProcessing,
+      setRecording: setVoiceRecording,
+      setVoiceEnabled
+    },
    wheelStep: WHEEL_SCROLL_STEP
  })

  const onEvent = useMemo(
    () =>
      createGatewayEventHandler({
+        composer: { setInput: composerActions.setInput },
        gateway,
        session: {
          STARTUP_RESUME_ID,
@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
          resumeById: session.resumeById,
          setCatalog
        },
+        submission: { submitRef },
        system: { bellOnComplete, stdout, sys },
-        transcript: { appendMessage, panel, setHistoryItems }
+        transcript: { appendMessage, panel, setHistoryItems },
+        voice: {
+          setProcessing: setVoiceProcessing,
+          setRecording: setVoiceRecording,
+          setVoiceEnabled
+        }
      }),
    [
      appendMessage,
      bellOnComplete,
+      composerActions.setInput,
      gateway,
      panel,
      session.newSession,
      session.resetSession,
      session.resumeById,
+      setVoiceEnabled,
+      setVoiceProcessing,
+      setVoiceRecording,
      stdout,
+      submitRef,
      sys
    ]
  )
--- a/ui-tui/src/gatewayTypes.ts
+++ b/ui-tui/src/gatewayTypes.ts
@ -236,10 +236,16 @@ export interface ImageAttachResponse {
 // ── Voice ────────────────────────────────────────────────────────────

 export interface VoiceToggleResponse {
+  audio_available?: boolean
+  available?: boolean
+  details?: string
  enabled?: boolean
+  stt_available?: boolean
+  tts?: boolean
 }

 export interface VoiceRecordResponse {
+  status?: 'recording' | 'stopped' // the two values the gateway's voice.record handler returns
  text?: string
 }

@ -368,6 +374,8 @@ export type GatewayEvent =
  | { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
  | { payload?: undefined; session_id?: string; type: 'message.start' }
  | { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
+  | { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
+  | { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
  | { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
  | { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
  | { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }