feat(tui): match CLI's voice slash + VAD-continuous recording model

The TUI had drifted from the CLI's voice model in two ways: - /voice on was lighting up the microphone immediately and Ctrl+B was interpreted as a mode toggle. The CLI separates the two: /voice on just flips the umbrella bit, recording only starts once the user presses Ctrl+B, which also sets _voice_continuous so the VAD loop auto-restarts until the user presses Ctrl+B again or three silent cycles pass. - /voice tts was missing entirely, so users couldn't turn agent reply speech on/off from inside the TUI. This commit brings the TUI to parity. Python - hermes_cli/voice.py: continuous-mode API (start_continuous, stop_continuous, is_continuous_active) layered on the existing PTT wrappers. The silence callback transcribes, fires on_transcript, tracks consecutive no-speech cycles, and auto-restarts — mirroring cli.py:_voice_stop_and_transcribe + _restart_recording. - tui_gateway/server.py: - voice.toggle now supports on / off / tts / status. The umbrella bit lives in HERMES_VOICE + display.voice_enabled; tts lives in HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down any active continuous loop so a toggle-off really releases the microphone. - voice.record start/stop now drives start_continuous/stop_continuous. start is refused with a clear error when the mode is off, matching cli.py:handle_voice_record's early return on `not _voice_mode`. - New voice.transcript / voice.status events emit through _voice_emit (remembers the sid that last enabled the mode so events land in the right session). TypeScript - gatewayTypes.ts: voice.status + voice.transcript event discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse gains status for the new "started/stopped" responses. - interfaces.ts: GatewayEventHandlerContext gains composer.setInput + submission.submitRef + voice.{setRecording, setProcessing, setVoiceEnabled}; InputHandlerContext.voice gains enabled + setVoiceEnabled for the mode-aware Ctrl+B handler. - createGatewayEventHandler.ts: voice.status drives REC/STT badges; voice.transcript auto-submits when the composer is empty (CLI _pending_input.put parity) and appends when a draft is in flight. no_speech_limit flips voice off + sys line. - useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop), not voice.toggle, and nudges the user with a sys line when the mode is off instead of silently flipping it on. - useMainApp.ts: wires the new event-handler context fields. - slash/commands/session.ts: /voice handles on / off / tts / status with CLI-matching output ("voice: mode on · tts off"). Backward compat preserved for voice.record (was always PTT shape; gateway still honours start/stop with mode-gating added).
2026-04-25 00:51:20 +00:00 · 2026-04-24 00:55:17 +03:00 · 2026-04-24 00:55:17 +03:00 · 04c489b587
commit 04c489b587
parent 0bb460b070
10 changed files with 861 additions and 78 deletions
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -3455,43 +3455,154 @@ def _(rid, params: dict) -> dict:
 # ── Methods: voice ───────────────────────────────────────────────────


+_voice_sid_lock = threading.Lock()
+_voice_event_sid: str = ""
+
+
+def _voice_emit(event: str, payload: dict | None = None) -> None:
+    """Emit a voice event toward the session that most recently turned the
+    mode on. Voice is process-global (one microphone), so there's only ever
+    one sid to target; the TUI handler treats an empty sid as "active
+    session". Kept separate from _emit to make the lack of per-call sid
+    argument explicit."""
+    with _voice_sid_lock:
+        sid = _voice_event_sid
+    _emit(event, sid, payload)
+
+
+def _voice_mode_enabled() -> bool:
+    """Current voice-mode flag. HERMES_VOICE env var wins over config so
+    the gateway and CLI agree when one of them was launched with an
+    explicit override."""
+    env = os.environ.get("HERMES_VOICE", "").strip()
+    if env in {"0", "1"}:
+        return env == "1"
+    return bool(_load_cfg().get("display", {}).get("voice_enabled", False))
+
+
+def _voice_tts_enabled() -> bool:
+    """Whether agent replies should be spoken back via TTS."""
+    env = os.environ.get("HERMES_VOICE_TTS", "").strip()
+    if env in {"0", "1"}:
+        return env == "1"
+    return bool(_load_cfg().get("display", {}).get("voice_tts", False))
+
+
@method("voice.toggle")
 def _(rid, params: dict) -> dict:
+    """CLI parity for the ``/voice`` slash command.
+
+    Subcommands:
+
+    * ``status`` — report mode + TTS flags (default when action is unknown).
+    * ``on`` / ``off`` — flip voice *mode* (the umbrella bit). Turning it
+      off also tears down any active continuous recording loop. Does NOT
+      start recording on its own; recording is driven by ``voice.record``
+      (Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split.
+    * ``tts`` — toggle speech-output of agent replies. Requires mode on
+      (mirrors CLI's _toggle_voice_tts guard).
+    """
    action = params.get("action", "status")
+
    if action == "status":
-        env = os.environ.get("HERMES_VOICE", "").strip()
-        if env in {"0", "1"}:
-            return _ok(rid, {"enabled": env == "1"})
-        return _ok(
-            rid,
-            {
-                "enabled": bool(
-                    _load_cfg().get("display", {}).get("voice_enabled", False)
-                )
-            },
-        )
+        # Mirror CLI's _show_voice_status: include STT/TTS provider
+        # availability so the user can tell at a glance *why* voice mode
+        # isn't working ("STT provider: MISSING ..." is the common case).
+        payload: dict = {
+            "enabled": _voice_mode_enabled(),
+            "tts": _voice_tts_enabled(),
+        }
+        try:
+            from tools.voice_mode import check_voice_requirements
+
+            reqs = check_voice_requirements()
+            payload["available"] = bool(reqs.get("available"))
+            payload["audio_available"] = bool(reqs.get("audio_available"))
+            payload["stt_available"] = bool(reqs.get("stt_available"))
+            payload["details"] = reqs.get("details") or ""
+        except Exception as e:
+            # check_voice_requirements pulls optional transcription deps —
+            # swallow so /voice status always returns something useful.
+            logger.warning("voice.toggle status: requirements probe failed: %s", e)
+
+        return _ok(rid, payload)
+
    if action in ("on", "off"):
        enabled = action == "on"
        os.environ["HERMES_VOICE"] = "1" if enabled else "0"
        _write_config_key("display.voice_enabled", enabled)
-        return _ok(rid, {"enabled": action == "on"})
+
+        if not enabled:
+            # Disabling the mode must tear the continuous loop down; the
+            # loop holds the microphone and would otherwise keep running.
+            try:
+                from hermes_cli.voice import stop_continuous
+
+                stop_continuous()
+            except ImportError:
+                pass
+            except Exception as e:
+                logger.warning("voice: stop_continuous failed during toggle off: %s", e)
+
+        return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()})
+
+    if action == "tts":
+        if not _voice_mode_enabled():
+            return _err(rid, 4014, "enable voice mode first: /voice on")
+        new_value = not _voice_tts_enabled()
+        os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0"
+        _write_config_key("display.voice_tts", new_value)
+        return _ok(rid, {"enabled": True, "tts": new_value})
+
    return _err(rid, 4013, f"unknown voice action: {action}")


@method("voice.record")
 def _(rid, params: dict) -> dict:
+    """VAD-driven continuous record loop, CLI-parity.
+
+    ``start`` turns on a VAD loop that emits ``voice.transcript`` events
+    for each detected utterance and auto-restarts for the next turn.
+    ``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while-
+    recording branch clearing ``_voice_continuous``). Three consecutive
+    silent cycles stop the loop automatically and emit a
+    ``voice.transcript`` with ``no_speech_limit=True``.
+    """
    action = params.get("action", "start")
+
+    if action not in {"start", "stop"}:
+        return _err(rid, 4019, f"unknown voice action: {action}")
+
    try:
        if action == "start":
-            from hermes_cli.voice import start_recording
+            if not _voice_mode_enabled():
+                return _err(rid, 4015, "voice mode is off — enable with /voice on")

-            start_recording()
+            with _voice_sid_lock:
+                global _voice_event_sid
+                _voice_event_sid = params.get("session_id") or _voice_event_sid
+
+            from hermes_cli.voice import start_continuous
+
+            voice_cfg = _load_cfg().get("voice", {})
+            start_continuous(
+                on_transcript=lambda t: _voice_emit(
+                    "voice.transcript", {"text": t}
+                ),
+                on_status=lambda s: _voice_emit("voice.status", {"state": s}),
+                on_silent_limit=lambda: _voice_emit(
+                    "voice.transcript", {"no_speech_limit": True}
+                ),
+                silence_threshold=voice_cfg.get("silence_threshold", 200),
+                silence_duration=voice_cfg.get("silence_duration", 3.0),
+            )
            return _ok(rid, {"status": "recording"})
-        if action == "stop":
-            from hermes_cli.voice import stop_and_transcribe

-            return _ok(rid, {"text": stop_and_transcribe() or ""})
-        return _err(rid, 4019, f"unknown voice action: {action}")
+        # action == "stop"
+        from hermes_cli.voice import stop_continuous
+
+        stop_continuous()
+        return _ok(rid, {"status": "stopped"})
    except ImportError:
        return _err(
            rid, 5025, "voice module not available — install audio dependencies"