mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tui): match CLI's voice slash + VAD-continuous recording model
The TUI had drifted from the CLI's voice model in two ways:
- /voice on was lighting up the microphone immediately and Ctrl+B was
interpreted as a mode toggle. The CLI separates the two: /voice on
just flips the umbrella bit, recording only starts once the user
presses Ctrl+B, which also sets _voice_continuous so the VAD loop
auto-restarts until the user presses Ctrl+B again or three silent
cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply
speech on/off from inside the TUI.
This commit brings the TUI to parity.
Python
- hermes_cli/voice.py: continuous-mode API (start_continuous,
stop_continuous, is_continuous_active) layered on the existing PTT
wrappers. The silence callback transcribes, fires on_transcript,
tracks consecutive no-speech cycles, and auto-restarts — mirroring
cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
- voice.toggle now supports on / off / tts / status. The umbrella
bit lives in HERMES_VOICE + display.voice_enabled; tts lives in
HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down
any active continuous loop so a toggle-off really releases the
microphone.
- voice.record start/stop now drives start_continuous/stop_continuous.
start is refused with a clear error when the mode is off, matching
cli.py:handle_voice_record's early return on `not _voice_mode`.
- New voice.transcript / voice.status events emit through
_voice_emit (remembers the sid that last enabled the mode so
events land in the right session).
TypeScript
- gatewayTypes.ts: voice.status + voice.transcript event
discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse
gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput +
submission.submitRef + voice.{setRecording, setProcessing,
setVoiceEnabled}; InputHandlerContext.voice gains enabled +
setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges;
voice.transcript auto-submits when the composer is empty (CLI
_pending_input.put parity) and appends when a draft is in flight.
no_speech_limit flips voice off + sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop),
not voice.toggle, and nudges the user with a sys line when the
mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status
with CLI-matching output ("voice: mode on · tts off").
Backward compat preserved for voice.record (was always PTT shape;
gateway still honours start/stop with mode-gating added).
This commit is contained in:
parent
0bb460b070
commit
04c489b587
10 changed files with 861 additions and 78 deletions
|
|
@ -2,18 +2,31 @@
|
|||
|
||||
Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
|
||||
(text-to-speech) behind idempotent, stateful entry points that the gateway's
|
||||
``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a
|
||||
dedicated thread. The gateway imports this module lazily so missing optional
|
||||
audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError``
|
||||
at call time, not at startup.
|
||||
``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can
|
||||
call from a dedicated thread. The gateway imports this module lazily so that
|
||||
missing optional audio deps (sounddevice, faster-whisper, numpy) surface as
|
||||
an ``ImportError`` at call time, not at startup.
|
||||
|
||||
Two usage modes are exposed:
|
||||
|
||||
* **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single
|
||||
manually-bounded capture used when the caller drives the start/stop pair
|
||||
explicitly.
|
||||
* **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors
|
||||
the classic CLI voice mode: recording auto-stops on silence, transcribes,
|
||||
hands the result to a callback, and then auto-restarts for the next turn.
|
||||
Three consecutive no-speech cycles stop the loop and fire
|
||||
``on_silent_limit`` so the UI can turn the mode off.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
from typing import Optional
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from tools.voice_mode import (
|
||||
create_audio_recorder,
|
||||
|
|
@ -24,15 +37,71 @@ from tools.voice_mode import (
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _debug(msg: str) -> None:
|
||||
"""Emit a debug breadcrumb when HERMES_VOICE_DEBUG=1.
|
||||
|
||||
Goes to stderr so the TUI gateway wraps it as a gateway.stderr event,
|
||||
which createGatewayEventHandler shows as an Activity line — exactly
|
||||
what we need to diagnose "why didn't the loop auto-restart?" in the
|
||||
user's real terminal without shipping a separate debug RPC.
|
||||
"""
|
||||
if os.environ.get("HERMES_VOICE_DEBUG", "").strip() == "1":
|
||||
print(f"[voice] {msg}", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def _beeps_enabled() -> bool:
|
||||
"""CLI parity: voice.beep_enabled in config.yaml (default True)."""
|
||||
try:
|
||||
from hermes_cli.config import load_config
|
||||
|
||||
voice_cfg = load_config().get("voice", {})
|
||||
if isinstance(voice_cfg, dict):
|
||||
return bool(voice_cfg.get("beep_enabled", True))
|
||||
except Exception:
|
||||
pass
|
||||
return True
|
||||
|
||||
|
||||
def _play_beep(frequency: int, count: int = 1) -> None:
|
||||
"""Audible cue matching cli.py's record/stop beeps.
|
||||
|
||||
880 Hz single-beep on start (cli.py:_voice_start_recording line 7532),
|
||||
660 Hz double-beep on stop (cli.py:_voice_stop_and_transcribe line 7585).
|
||||
Best-effort — sounddevice failures are silently swallowed so the
|
||||
voice loop never breaks because a speaker was unavailable.
|
||||
"""
|
||||
if not _beeps_enabled():
|
||||
return
|
||||
try:
|
||||
from tools.voice_mode import play_beep
|
||||
|
||||
play_beep(frequency=frequency, count=count)
|
||||
except Exception as e:
|
||||
_debug(f"beep {frequency}Hz failed: {e}")
|
||||
|
||||
# ── Push-to-talk state ───────────────────────────────────────────────
|
||||
_recorder = None
|
||||
_recorder_lock = threading.Lock()
|
||||
|
||||
# ── Continuous (VAD) state ───────────────────────────────────────────
|
||||
_continuous_lock = threading.Lock()
|
||||
_continuous_active = False
|
||||
_continuous_recorder: Any = None
|
||||
_continuous_on_transcript: Optional[Callable[[str], None]] = None
|
||||
_continuous_on_status: Optional[Callable[[str], None]] = None
|
||||
_continuous_on_silent_limit: Optional[Callable[[], None]] = None
|
||||
_continuous_no_speech_count = 0
|
||||
_CONTINUOUS_NO_SPEECH_LIMIT = 3
|
||||
|
||||
|
||||
# ── Push-to-talk API ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
def start_recording() -> None:
|
||||
"""Begin capturing from the default input device.
|
||||
"""Begin capturing from the default input device (push-to-talk).
|
||||
|
||||
Idempotent — calling again while a recording is in progress is a no-op,
|
||||
which matches the TUI's toggle semantics (Ctrl+B starts, Ctrl+B stops).
|
||||
Idempotent — calling again while a recording is in progress is a no-op.
|
||||
"""
|
||||
global _recorder
|
||||
|
||||
|
|
@ -40,20 +109,15 @@ def start_recording() -> None:
|
|||
if _recorder is not None and getattr(_recorder, "is_recording", False):
|
||||
return
|
||||
rec = create_audio_recorder()
|
||||
# No silence callback: the TUI drives start/stop explicitly via
|
||||
# the voice.record RPC. VAD auto-stop is a CLI-mode feature.
|
||||
rec.start()
|
||||
_recorder = rec
|
||||
|
||||
|
||||
def stop_and_transcribe() -> Optional[str]:
|
||||
"""Stop the active recording, transcribe it, and return the text.
|
||||
"""Stop the active push-to-talk recording, transcribe, return text.
|
||||
|
||||
Returns ``None`` when no recording is active, when the microphone
|
||||
captured no speech, or when Whisper returned a known hallucination
|
||||
token (silence artefacts like "Thanks for watching!"). The caller
|
||||
treats ``None`` as "no speech detected" and leaves the composer
|
||||
untouched.
|
||||
captured no speech, or when Whisper returned a known hallucination.
|
||||
"""
|
||||
global _recorder
|
||||
|
||||
|
|
@ -73,27 +137,281 @@ def stop_and_transcribe() -> Optional[str]:
|
|||
except Exception as e:
|
||||
logger.warning("voice transcription failed: %s", e)
|
||||
return None
|
||||
finally:
|
||||
try:
|
||||
if os.path.isfile(wav_path):
|
||||
os.unlink(wav_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
text = (result.get("text") or "").strip()
|
||||
# transcribe_recording returns {"success": bool, "transcript": str, ...}
|
||||
# — matches cli.py:_voice_stop_and_transcribe's result.get("transcript").
|
||||
if not result.get("success"):
|
||||
return None
|
||||
text = (result.get("transcript") or "").strip()
|
||||
if not text or is_whisper_hallucination(text):
|
||||
return None
|
||||
|
||||
return text
|
||||
|
||||
|
||||
# ── Continuous (VAD) API ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def start_continuous(
|
||||
on_transcript: Callable[[str], None],
|
||||
on_status: Optional[Callable[[str], None]] = None,
|
||||
on_silent_limit: Optional[Callable[[], None]] = None,
|
||||
silence_threshold: int = 200,
|
||||
silence_duration: float = 3.0,
|
||||
) -> None:
|
||||
"""Start a VAD-driven continuous recording loop.
|
||||
|
||||
The loop calls ``on_transcript(text)`` each time speech is detected and
|
||||
transcribed successfully, then auto-restarts. After
|
||||
``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
|
||||
picked up at all) the loop stops itself and calls ``on_silent_limit``
|
||||
so the UI can reflect "voice off". Idempotent — calling while already
|
||||
active is a no-op.
|
||||
|
||||
``on_status`` is called with ``"listening"`` / ``"transcribing"`` /
|
||||
``"idle"`` so the UI can show a live indicator.
|
||||
"""
|
||||
global _continuous_active, _continuous_recorder
|
||||
global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
|
||||
global _continuous_no_speech_count
|
||||
|
||||
with _continuous_lock:
|
||||
if _continuous_active:
|
||||
_debug("start_continuous: already active — no-op")
|
||||
return
|
||||
_continuous_active = True
|
||||
_continuous_on_transcript = on_transcript
|
||||
_continuous_on_status = on_status
|
||||
_continuous_on_silent_limit = on_silent_limit
|
||||
_continuous_no_speech_count = 0
|
||||
|
||||
if _continuous_recorder is None:
|
||||
_continuous_recorder = create_audio_recorder()
|
||||
|
||||
_continuous_recorder._silence_threshold = silence_threshold
|
||||
_continuous_recorder._silence_duration = silence_duration
|
||||
rec = _continuous_recorder
|
||||
|
||||
_debug(
|
||||
f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)"
|
||||
)
|
||||
|
||||
# CLI parity: single 880 Hz beep *before* opening the stream — placing
|
||||
# the beep after stream.start() on macOS triggers a CoreAudio conflict
|
||||
# (cli.py:7528 comment).
|
||||
_play_beep(frequency=880, count=1)
|
||||
|
||||
try:
|
||||
rec.start(on_silence_stop=_continuous_on_silence)
|
||||
except Exception as e:
|
||||
logger.error("failed to start continuous recording: %s", e)
|
||||
_debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}")
|
||||
with _continuous_lock:
|
||||
_continuous_active = False
|
||||
raise
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("listening")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def stop_continuous() -> None:
|
||||
"""Stop the active continuous loop and release the microphone.
|
||||
|
||||
Idempotent — calling while not active is a no-op. Any in-flight
|
||||
transcription completes but its result is discarded (the callback
|
||||
checks ``_continuous_active`` before firing).
|
||||
"""
|
||||
global _continuous_active, _continuous_on_transcript
|
||||
global _continuous_on_status, _continuous_on_silent_limit
|
||||
global _continuous_recorder, _continuous_no_speech_count
|
||||
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
return
|
||||
_continuous_active = False
|
||||
rec = _continuous_recorder
|
||||
on_status = _continuous_on_status
|
||||
_continuous_on_transcript = None
|
||||
_continuous_on_status = None
|
||||
_continuous_on_silent_limit = None
|
||||
_continuous_no_speech_count = 0
|
||||
|
||||
if rec is not None:
|
||||
try:
|
||||
# cancel() (not stop()) discards buffered frames — the loop
|
||||
# is over, we don't want to transcribe a half-captured turn.
|
||||
rec.cancel()
|
||||
except Exception as e:
|
||||
logger.warning("failed to cancel recorder: %s", e)
|
||||
|
||||
# Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the
|
||||
# silence-auto-stop path plays).
|
||||
_play_beep(frequency=660, count=2)
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("idle")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def is_continuous_active() -> bool:
|
||||
"""Whether a continuous voice loop is currently running."""
|
||||
with _continuous_lock:
|
||||
return _continuous_active
|
||||
|
||||
|
||||
def _continuous_on_silence() -> None:
|
||||
"""AudioRecorder silence callback — runs in a daemon thread.
|
||||
|
||||
Stops the current capture, transcribes, delivers the text via
|
||||
``on_transcript``, and — if the loop is still active — starts the
|
||||
next capture. Three consecutive silent cycles end the loop.
|
||||
"""
|
||||
global _continuous_active, _continuous_no_speech_count
|
||||
|
||||
_debug("_continuous_on_silence: fired")
|
||||
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
_debug("_continuous_on_silence: loop inactive — abort")
|
||||
return
|
||||
rec = _continuous_recorder
|
||||
on_transcript = _continuous_on_transcript
|
||||
on_status = _continuous_on_status
|
||||
on_silent_limit = _continuous_on_silent_limit
|
||||
|
||||
if rec is None:
|
||||
_debug("_continuous_on_silence: no recorder — abort")
|
||||
return
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("transcribing")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
wav_path = rec.stop()
|
||||
# Peak RMS is the critical diagnostic when stop() returns None despite
|
||||
# the VAD firing — tells us at a glance whether the mic was too quiet
|
||||
# for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree.
|
||||
peak_rms = getattr(rec, "_peak_rms", -1)
|
||||
_debug(
|
||||
f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})"
|
||||
)
|
||||
|
||||
# CLI parity: double 660 Hz beep after the stream stops (safe from the
|
||||
# CoreAudio conflict that blocks pre-start beeps).
|
||||
_play_beep(frequency=660, count=2)
|
||||
|
||||
transcript: Optional[str] = None
|
||||
|
||||
if wav_path:
|
||||
try:
|
||||
result = transcribe_recording(wav_path)
|
||||
# transcribe_recording returns {"success": bool, "transcript": str,
|
||||
# "error": str?} — NOT {"text": str}. Using the wrong key silently
|
||||
# produced empty transcripts even when Groq/local STT returned fine,
|
||||
# which masqueraded as "not hearing the user" to the caller.
|
||||
success = bool(result.get("success"))
|
||||
text = (result.get("transcript") or "").strip()
|
||||
err = result.get("error")
|
||||
_debug(
|
||||
f"_continuous_on_silence: transcribe -> success={success} "
|
||||
f"text={text!r} err={err!r}"
|
||||
)
|
||||
if success and text and not is_whisper_hallucination(text):
|
||||
transcript = text
|
||||
except Exception as e:
|
||||
logger.warning("continuous transcription failed: %s", e)
|
||||
_debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}")
|
||||
finally:
|
||||
try:
|
||||
if os.path.isfile(wav_path):
|
||||
os.unlink(wav_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
with _continuous_lock:
|
||||
if not _continuous_active:
|
||||
# User stopped us while we were transcribing — discard.
|
||||
_debug("_continuous_on_silence: stopped during transcribe — no restart")
|
||||
return
|
||||
if transcript:
|
||||
_continuous_no_speech_count = 0
|
||||
else:
|
||||
_continuous_no_speech_count += 1
|
||||
should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT
|
||||
no_speech = _continuous_no_speech_count
|
||||
|
||||
if transcript and on_transcript:
|
||||
try:
|
||||
on_transcript(transcript)
|
||||
except Exception as e:
|
||||
logger.warning("on_transcript callback raised: %s", e)
|
||||
|
||||
if should_halt:
|
||||
_debug(f"_continuous_on_silence: {no_speech} silent cycles — halting")
|
||||
with _continuous_lock:
|
||||
_continuous_active = False
|
||||
_continuous_no_speech_count = 0
|
||||
if on_silent_limit:
|
||||
try:
|
||||
on_silent_limit()
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
rec.cancel()
|
||||
except Exception:
|
||||
pass
|
||||
if on_status:
|
||||
try:
|
||||
on_status("idle")
|
||||
except Exception:
|
||||
pass
|
||||
return
|
||||
|
||||
# Restart for the next turn.
|
||||
_debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
|
||||
_play_beep(frequency=880, count=1)
|
||||
try:
|
||||
rec.start(on_silence_stop=_continuous_on_silence)
|
||||
except Exception as e:
|
||||
logger.error("failed to restart continuous recording: %s", e)
|
||||
_debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
|
||||
with _continuous_lock:
|
||||
_continuous_active = False
|
||||
return
|
||||
|
||||
if on_status:
|
||||
try:
|
||||
on_status("listening")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# ── TTS API ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def speak_text(text: str) -> None:
|
||||
"""Synthesize ``text`` with the configured TTS provider and play it.
|
||||
|
||||
The gateway spawns a daemon thread to call this so the RPC returns
|
||||
immediately. Failures are logged and swallowed — the UI already
|
||||
acknowledged "speaking" by the time we get here.
|
||||
immediately. Failures are logged and swallowed.
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return
|
||||
|
||||
# Lazy import — tts_tool pulls optional provider SDKs (OpenAI,
|
||||
# ElevenLabs, etc.) and config-reading machinery that we don't
|
||||
# want to load at module import time.
|
||||
# Lazy import — tts_tool pulls optional provider SDKs.
|
||||
from tools.tts_tool import text_to_speech_tool
|
||||
|
||||
try:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue