feat(tui): match CLI's voice slash + VAD-continuous recording model

The TUI had drifted from the CLI's voice model in two ways:

- /voice on was lighting up the microphone immediately and Ctrl+B was
  interpreted as a mode toggle.  The CLI separates the two: /voice on
  just flips the umbrella bit, recording only starts once the user
  presses Ctrl+B, which also sets _voice_continuous so the VAD loop
  auto-restarts until the user presses Ctrl+B again or three silent
  cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply
  speech on/off from inside the TUI.

This commit brings the TUI to parity.

Python

- hermes_cli/voice.py: continuous-mode API (start_continuous,
  stop_continuous, is_continuous_active) layered on the existing PTT
  wrappers. The silence callback transcribes, fires on_transcript,
  tracks consecutive no-speech cycles, and auto-restarts — mirroring
  cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
  - voice.toggle now supports on / off / tts / status.  The umbrella
    bit lives in HERMES_VOICE + display.voice_enabled; tts lives in
    HERMES_VOICE_TTS + display.voice_tts.  /voice off also tears down
    any active continuous loop so a toggle-off really releases the
    microphone.
  - voice.record start/stop now drives start_continuous/stop_continuous.
    start is refused with a clear error when the mode is off, matching
    cli.py:handle_voice_record's early return on `not _voice_mode`.
  - New voice.transcript / voice.status events emit through
    _voice_emit (targets the sid recorded by the most recent
    voice.record start so events land in the right session).

TypeScript

- gatewayTypes.ts: voice.status + voice.transcript event
  discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse
  gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput +
  submission.submitRef + voice.{setRecording, setProcessing,
  setVoiceEnabled}; InputHandlerContext.voice gains enabled +
  setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges;
  voice.transcript auto-submits when the composer is empty (CLI
  _pending_input.put parity) and appends when a draft is in flight.
  A no_speech_limit transcript flips voice off and prints a sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop),
  not voice.toggle, and nudges the user with a sys line when the
  mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status
  with CLI-matching output ("voice: mode on · tts off").

Backward compat preserved for voice.record (was always PTT shape;
gateway still honours start/stop with mode-gating added).
This commit is contained in:
0xbyt4 2026-04-24 00:55:17 +03:00 committed by Teknium
parent 0bb460b070
commit 04c489b587
10 changed files with 861 additions and 78 deletions

View file

@ -3455,43 +3455,154 @@ def _(rid, params: dict) -> dict:
# ── Methods: voice ───────────────────────────────────────────────────
# Guards every read and write of ``_voice_event_sid``.
_voice_sid_lock = threading.Lock()
# Session id that voice events are addressed to.  Starts empty and is
# overwritten by ``voice.record`` start when the request carries a
# ``session_id``.
_voice_event_sid: str = ""
def _voice_emit(event: str, payload: dict | None = None) -> None:
    """Send a voice event to the session stored in ``_voice_event_sid``.

    Voice is process-global (one microphone), so a single target sid is
    enough; the TUI handler treats an empty sid as "active session".  This
    wrapper exists separately from ``_emit`` so that the absence of a
    per-call sid argument is explicit at every call site.

    Args:
        event: Event name, e.g. ``"voice.transcript"`` or ``"voice.status"``.
        payload: Optional event body, passed to ``_emit`` unchanged.
    """
    # Snapshot the sid under the lock, then emit without holding it.
    with _voice_sid_lock:
        target_sid = _voice_event_sid
    _emit(event, target_sid, payload)
def _voice_mode_enabled() -> bool:
    """Return the current voice-mode flag.

    The ``HERMES_VOICE`` environment variable wins over config so the
    gateway and CLI agree when one of them was launched with an explicit
    override.  Only the exact values ``"0"`` and ``"1"`` (after stripping
    whitespace) count as an override; anything else falls through to the
    ``display.voice_enabled`` config key, which defaults to ``False``.

    Returns:
        ``True`` when voice mode is on, ``False`` otherwise.
    """
    env = os.environ.get("HERMES_VOICE", "").strip()
    if env in {"0", "1"}:
        return env == "1"
    # A config with ``display: null`` (or any non-mapping value) would make
    # a chained ``.get`` raise AttributeError; treat it as "not set".
    display = _load_cfg().get("display")
    if not isinstance(display, dict):
        return False
    return bool(display.get("voice_enabled", False))
def _voice_tts_enabled() -> bool:
    """Return whether agent replies should be spoken back via TTS.

    The ``HERMES_VOICE_TTS`` environment variable takes precedence when it
    is exactly ``"0"`` or ``"1"`` (after stripping whitespace); otherwise
    the ``display.voice_tts`` config key is used, defaulting to ``False``.

    Returns:
        ``True`` when TTS output is on, ``False`` otherwise.
    """
    env = os.environ.get("HERMES_VOICE_TTS", "").strip()
    if env in {"0", "1"}:
        return env == "1"
    # A config with ``display: null`` (or any non-mapping value) would make
    # a chained ``.get`` raise AttributeError; treat it as "not set".
    display = _load_cfg().get("display")
    if not isinstance(display, dict):
        return False
    return bool(display.get("voice_tts", False))
@method("voice.toggle")
def _(rid, params: dict) -> dict:
    """CLI parity for the ``/voice`` slash command.

    Subcommands (``params["action"]``):

    * ``status``: report mode + TTS flags plus STT/audio availability.
      This is the default when ``action`` is omitted.
    * ``on`` / ``off``: flip voice *mode* (the umbrella bit).  Turning it
      off also tears down any active continuous recording loop.  Does NOT
      start recording on its own; recording is driven by ``voice.record``
      (Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split.
    * ``tts``: toggle speech output of agent replies.  Requires mode on
      (mirrors CLI's _toggle_voice_tts guard).

    Args:
        rid: Request id, passed through to ``_ok`` / ``_err``.
        params: Request parameters; only ``action`` is read.

    Returns:
        An ``_ok`` response carrying ``enabled`` (and ``tts``), or an
        ``_err`` response: 4014 when ``tts`` is requested while the mode is
        off, 4013 for an unrecognised action.
    """
    action = params.get("action", "status")

    if action == "status":
        # Mirror CLI's _show_voice_status: include STT/TTS provider
        # availability so the user can tell at a glance *why* voice mode
        # isn't working ("STT provider: MISSING ..." is the common case).
        payload: dict = {
            "enabled": _voice_mode_enabled(),
            "tts": _voice_tts_enabled(),
        }
        try:
            from tools.voice_mode import check_voice_requirements

            reqs = check_voice_requirements()
            payload["available"] = bool(reqs.get("available"))
            payload["audio_available"] = bool(reqs.get("audio_available"))
            payload["stt_available"] = bool(reqs.get("stt_available"))
            payload["details"] = reqs.get("details") or ""
        except Exception as e:
            # check_voice_requirements pulls optional transcription deps —
            # swallow so /voice status always returns something useful.
            logger.warning("voice.toggle status: requirements probe failed: %s", e)
        return _ok(rid, payload)

    if action in ("on", "off"):
        enabled = action == "on"
        os.environ["HERMES_VOICE"] = "1" if enabled else "0"
        _write_config_key("display.voice_enabled", enabled)
        if not enabled:
            # Disabling the mode must tear the continuous loop down; the
            # loop holds the microphone and would otherwise keep running.
            try:
                from hermes_cli.voice import stop_continuous

                stop_continuous()
            except ImportError:
                # No voice module installed means no loop to stop.
                pass
            except Exception as e:
                logger.warning("voice: stop_continuous failed during toggle off: %s", e)
        return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()})

    if action == "tts":
        if not _voice_mode_enabled():
            return _err(rid, 4014, "enable voice mode first: /voice on")
        new_value = not _voice_tts_enabled()
        os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0"
        _write_config_key("display.voice_tts", new_value)
        return _ok(rid, {"enabled": True, "tts": new_value})

    return _err(rid, 4013, f"unknown voice action: {action}")
@method("voice.record")
def _(rid, params: dict) -> dict:
"""VAD-driven continuous record loop, CLI-parity.
``start`` turns on a VAD loop that emits ``voice.transcript`` events
for each detected utterance and auto-restarts for the next turn.
``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while-
recording branch clearing ``_voice_continuous``). Three consecutive
silent cycles stop the loop automatically and emit a
``voice.transcript`` with ``no_speech_limit=True``.
"""
action = params.get("action", "start")
if action not in {"start", "stop"}:
return _err(rid, 4019, f"unknown voice action: {action}")
try:
if action == "start":
from hermes_cli.voice import start_recording
if not _voice_mode_enabled():
return _err(rid, 4015, "voice mode is off — enable with /voice on")
start_recording()
with _voice_sid_lock:
global _voice_event_sid
_voice_event_sid = params.get("session_id") or _voice_event_sid
from hermes_cli.voice import start_continuous
voice_cfg = _load_cfg().get("voice", {})
start_continuous(
on_transcript=lambda t: _voice_emit(
"voice.transcript", {"text": t}
),
on_status=lambda s: _voice_emit("voice.status", {"state": s}),
on_silent_limit=lambda: _voice_emit(
"voice.transcript", {"no_speech_limit": True}
),
silence_threshold=voice_cfg.get("silence_threshold", 200),
silence_duration=voice_cfg.get("silence_duration", 3.0),
)
return _ok(rid, {"status": "recording"})
if action == "stop":
from hermes_cli.voice import stop_and_transcribe
return _ok(rid, {"text": stop_and_transcribe() or ""})
return _err(rid, 4019, f"unknown voice action: {action}")
# action == "stop"
from hermes_cli.voice import stop_continuous
stop_continuous()
return _ok(rid, {"status": "stopped"})
except ImportError:
return _err(
rid, 5025, "voice module not available — install audio dependencies"