mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tui): match CLI's voice slash + VAD-continuous recording model
The TUI had drifted from the CLI's voice model in two ways:
- /voice on was lighting up the microphone immediately and Ctrl+B was
interpreted as a mode toggle. The CLI separates the two: /voice on
just flips the umbrella bit, recording only starts once the user
presses Ctrl+B, which also sets _voice_continuous so the VAD loop
auto-restarts until the user presses Ctrl+B again or three silent
cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply
speech on/off from inside the TUI.
This commit brings the TUI to parity.
Python
- hermes_cli/voice.py: continuous-mode API (start_continuous,
stop_continuous, is_continuous_active) layered on the existing PTT
wrappers. The silence callback transcribes, fires on_transcript,
tracks consecutive no-speech cycles, and auto-restarts — mirroring
cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
- voice.toggle now supports on / off / tts / status. The umbrella
bit lives in HERMES_VOICE + display.voice_enabled; tts lives in
HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down
any active continuous loop so a toggle-off really releases the
microphone.
- voice.record start/stop now drives start_continuous/stop_continuous.
start is refused with a clear error when the mode is off, matching
cli.py:handle_voice_record's early return on `not _voice_mode`.
- New voice.transcript / voice.status events emit through
_voice_emit (remembers the sid that last enabled the mode so
events land in the right session).
TypeScript
- gatewayTypes.ts: voice.status + voice.transcript event
discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse
gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput +
submission.submitRef + voice.{setRecording, setProcessing,
setVoiceEnabled}; InputHandlerContext.voice gains enabled +
setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges;
voice.transcript auto-submits when the composer is empty (CLI
_pending_input.put parity) and appends when a draft is in flight.
On no_speech_limit the handler flips voice off and emits a sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop),
not voice.toggle, and nudges the user with a sys line when the
mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status
with CLI-matching output ("voice: mode on · tts off").
Backward compat preserved for voice.record (was always PTT shape;
gateway still honours start/stop with mode-gating added).
This commit is contained in:
parent
0bb460b070
commit
04c489b587
10 changed files with 861 additions and 78 deletions
|
|
@ -2,18 +2,31 @@
|
||||||
|
|
||||||
Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
|
Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
|
||||||
(text-to-speech) behind idempotent, stateful entry points that the gateway's
|
(text-to-speech) behind idempotent, stateful entry points that the gateway's
|
||||||
``voice.record`` and ``voice.tts`` JSON-RPC handlers can call from a
|
``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can
|
||||||
dedicated thread. The gateway imports this module lazily so missing optional
|
call from a dedicated thread. The gateway imports this module lazily so that
|
||||||
audio deps (sounddevice, faster-whisper, numpy) surface as an ``ImportError``
|
missing optional audio deps (sounddevice, faster-whisper, numpy) surface as
|
||||||
at call time, not at startup.
|
an ``ImportError`` at call time, not at startup.
|
||||||
|
|
||||||
|
Two usage modes are exposed:
|
||||||
|
|
||||||
|
* **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single
|
||||||
|
manually-bounded capture used when the caller drives the start/stop pair
|
||||||
|
explicitly.
|
||||||
|
* **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors
|
||||||
|
the classic CLI voice mode: recording auto-stops on silence, transcribes,
|
||||||
|
hands the result to a callback, and then auto-restarts for the next turn.
|
||||||
|
Three consecutive no-speech cycles stop the loop and fire
|
||||||
|
``on_silent_limit`` so the UI can turn the mode off.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
import threading
|
import threading
|
||||||
from typing import Optional
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
from tools.voice_mode import (
|
from tools.voice_mode import (
|
||||||
create_audio_recorder,
|
create_audio_recorder,
|
||||||
|
|
@ -24,15 +37,71 @@ from tools.voice_mode import (
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _debug(msg: str) -> None:
    """Write a ``[voice]``-prefixed breadcrumb to stderr when debugging is on.

    Debugging is on only when the ``HERMES_VOICE_DEBUG`` environment
    variable, stripped of surrounding whitespace, is exactly ``"1"``;
    otherwise the call does nothing.

    The message goes to stderr (flushed immediately) so the TUI gateway can
    surface it as a ``gateway.stderr`` event, which lets loop problems such
    as "why didn't the loop auto-restart?" be diagnosed in the user's real
    terminal without a separate debug RPC.
    """
    flag = os.environ.get("HERMES_VOICE_DEBUG", "").strip()
    if flag != "1":
        return
    print(f"[voice] {msg}", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _beeps_enabled() -> bool:
    """Report whether audible record/stop cues should be played.

    Reads ``voice.beep_enabled`` from the loaded config (CLI parity). The
    answer is ``True`` whenever the setting cannot be determined: the config
    module fails to import or load, the ``voice`` section is not a mapping,
    or the key is absent.
    """
    try:
        from hermes_cli.config import load_config

        section = load_config().get("voice", {})
        if not isinstance(section, dict):
            return True
        return bool(section.get("beep_enabled", True))
    except Exception:
        # Best-effort lookup: any config problem falls back to beeps on.
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def _play_beep(frequency: int, count: int = 1) -> None:
    """Play an audible cue, mirroring cli.py's record/stop beeps.

    Callers in this module use a single 880 Hz beep when capture starts and
    a double 660 Hz beep when it stops. The call is a no-op when beeps are
    disabled in config.

    Playback is best-effort: any failure (missing audio backend, no output
    device, ...) is reported through ``_debug`` and otherwise ignored so the
    voice loop never breaks because a speaker was unavailable.
    """
    if _beeps_enabled():
        try:
            # Imported lazily so a missing audio stack only matters here.
            from tools.voice_mode import play_beep

            play_beep(frequency=frequency, count=count)
        except Exception as exc:
            _debug(f"beep {frequency}Hz failed: {exc}")
|
||||||
|
|
||||||
|
# ── Push-to-talk state ───────────────────────────────────────────────
|
||||||
_recorder = None
|
_recorder = None
|
||||||
_recorder_lock = threading.Lock()
|
_recorder_lock = threading.Lock()
|
||||||
|
|
||||||
|
# ── Continuous (VAD) state ───────────────────────────────────────────
|
||||||
|
_continuous_lock = threading.Lock()
|
||||||
|
_continuous_active = False
|
||||||
|
_continuous_recorder: Any = None
|
||||||
|
_continuous_on_transcript: Optional[Callable[[str], None]] = None
|
||||||
|
_continuous_on_status: Optional[Callable[[str], None]] = None
|
||||||
|
_continuous_on_silent_limit: Optional[Callable[[], None]] = None
|
||||||
|
_continuous_no_speech_count = 0
|
||||||
|
_CONTINUOUS_NO_SPEECH_LIMIT = 3
|
||||||
|
|
||||||
|
|
||||||
|
# ── Push-to-talk API ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def start_recording() -> None:
|
def start_recording() -> None:
|
||||||
"""Begin capturing from the default input device.
|
"""Begin capturing from the default input device (push-to-talk).
|
||||||
|
|
||||||
Idempotent — calling again while a recording is in progress is a no-op,
|
Idempotent — calling again while a recording is in progress is a no-op.
|
||||||
which matches the TUI's toggle semantics (Ctrl+B starts, Ctrl+B stops).
|
|
||||||
"""
|
"""
|
||||||
global _recorder
|
global _recorder
|
||||||
|
|
||||||
|
|
@ -40,20 +109,15 @@ def start_recording() -> None:
|
||||||
if _recorder is not None and getattr(_recorder, "is_recording", False):
|
if _recorder is not None and getattr(_recorder, "is_recording", False):
|
||||||
return
|
return
|
||||||
rec = create_audio_recorder()
|
rec = create_audio_recorder()
|
||||||
# No silence callback: the TUI drives start/stop explicitly via
|
|
||||||
# the voice.record RPC. VAD auto-stop is a CLI-mode feature.
|
|
||||||
rec.start()
|
rec.start()
|
||||||
_recorder = rec
|
_recorder = rec
|
||||||
|
|
||||||
|
|
||||||
def stop_and_transcribe() -> Optional[str]:
|
def stop_and_transcribe() -> Optional[str]:
|
||||||
"""Stop the active recording, transcribe it, and return the text.
|
"""Stop the active push-to-talk recording, transcribe, return text.
|
||||||
|
|
||||||
Returns ``None`` when no recording is active, when the microphone
|
Returns ``None`` when no recording is active, when the microphone
|
||||||
captured no speech, or when Whisper returned a known hallucination
|
captured no speech, or when Whisper returned a known hallucination.
|
||||||
token (silence artefacts like "Thanks for watching!"). The caller
|
|
||||||
treats ``None`` as "no speech detected" and leaves the composer
|
|
||||||
untouched.
|
|
||||||
"""
|
"""
|
||||||
global _recorder
|
global _recorder
|
||||||
|
|
||||||
|
|
@ -73,27 +137,281 @@ def stop_and_transcribe() -> Optional[str]:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("voice transcription failed: %s", e)
|
logger.warning("voice transcription failed: %s", e)
|
||||||
return None
|
return None
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
if os.path.isfile(wav_path):
|
||||||
|
os.unlink(wav_path)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
text = (result.get("text") or "").strip()
|
# transcribe_recording returns {"success": bool, "transcript": str, ...}
|
||||||
|
# — matches cli.py:_voice_stop_and_transcribe's result.get("transcript").
|
||||||
|
if not result.get("success"):
|
||||||
|
return None
|
||||||
|
text = (result.get("transcript") or "").strip()
|
||||||
if not text or is_whisper_hallucination(text):
|
if not text or is_whisper_hallucination(text):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
# ── Continuous (VAD) API ─────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def start_continuous(
    on_transcript: Callable[[str], None],
    on_status: Optional[Callable[[str], None]] = None,
    on_silent_limit: Optional[Callable[[], None]] = None,
    silence_threshold: int = 200,
    silence_duration: float = 3.0,
) -> None:
    """Start a VAD-driven continuous recording loop.

    The loop calls ``on_transcript(text)`` each time speech is detected and
    transcribed successfully, then auto-restarts. After
    ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive cycles without a usable
    transcript the loop stops itself and calls ``on_silent_limit`` so the UI
    can reflect "voice off". Idempotent — calling while already active is a
    no-op.

    Args:
        on_transcript: Receives each accepted transcript.
        on_status: Optional; receives ``"listening"`` / ``"transcribing"`` /
            ``"idle"`` so the UI can show a live indicator.
        on_silent_limit: Optional; fired once when the no-speech limit halts
            the loop.
        silence_threshold: Written to the recorder's ``_silence_threshold``.
        silence_duration: Written to the recorder's ``_silence_duration``.

    Raises:
        Exception: Whatever ``create_audio_recorder()`` or the recorder's
            ``start()`` raises. In both cases the module is left inactive so
            a later call can retry.
    """
    global _continuous_active, _continuous_recorder
    global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
    global _continuous_no_speech_count

    with _continuous_lock:
        if _continuous_active:
            _debug("start_continuous: already active — no-op")
            return

        # Build the recorder *before* flipping any state. If recorder
        # creation raises, the module must not be left marked active,
        # otherwise every later start would be treated as "already active".
        if _continuous_recorder is None:
            _continuous_recorder = create_audio_recorder()
        rec = _continuous_recorder
        rec._silence_threshold = silence_threshold
        rec._silence_duration = silence_duration

        _continuous_active = True
        _continuous_on_transcript = on_transcript
        _continuous_on_status = on_status
        _continuous_on_silent_limit = on_silent_limit
        _continuous_no_speech_count = 0

    _debug(
        f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)"
    )

    # CLI parity: single 880 Hz beep *before* opening the stream — placing
    # the beep after stream.start() on macOS triggers a CoreAudio conflict
    # (per the matching comment in cli.py).
    _play_beep(frequency=880, count=1)

    try:
        rec.start(on_silence_stop=_continuous_on_silence)
    except Exception as e:
        logger.error("failed to start continuous recording: %s", e)
        _debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}")
        with _continuous_lock:
            # Roll back everything registered above so a failed start leaves
            # the same state as "never started" (no stale callbacks).
            _continuous_active = False
            _continuous_on_transcript = None
            _continuous_on_status = None
            _continuous_on_silent_limit = None
            _continuous_no_speech_count = 0
        raise

    if on_status:
        try:
            on_status("listening")
        except Exception as e:
            # A broken UI callback must not undo a successful start.
            _debug(f"start_continuous: on_status raised {type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def stop_continuous() -> None:
    """Tear down the continuous loop and let go of the microphone.

    Safe to call at any time: when no loop is running the function returns
    immediately. A transcription that is already in flight is allowed to
    finish, but its result is dropped because the silence callback re-checks
    ``_continuous_active`` before delivering anything.
    """
    global _continuous_active, _continuous_on_transcript
    global _continuous_on_status, _continuous_on_silent_limit
    global _continuous_recorder, _continuous_no_speech_count

    with _continuous_lock:
        if not _continuous_active:
            return
        # Snapshot what is needed after the lock is released, then reset.
        recorder = _continuous_recorder
        notify = _continuous_on_status
        _continuous_active = False
        _continuous_on_transcript = None
        _continuous_on_status = None
        _continuous_on_silent_limit = None
        _continuous_no_speech_count = 0

    if recorder is not None:
        try:
            # cancel() rather than stop(): buffered frames are discarded
            # because a half-captured turn should not be transcribed.
            recorder.cancel()
        except Exception as exc:
            logger.warning("failed to cancel recorder: %s", exc)

    # Same 660 Hz double beep the silence auto-stop path plays.
    _play_beep(frequency=660, count=2)

    if not notify:
        return
    try:
        notify("idle")
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def is_continuous_active() -> bool:
    """Return ``True`` while a continuous voice loop is marked as running.

    The flag is read under ``_continuous_lock``.
    """
    with _continuous_lock:
        active = _continuous_active
    return active
|
||||||
|
|
||||||
|
|
||||||
|
def _continuous_emit_status(
    on_status: Optional[Callable[[str], None]], state: str
) -> None:
    """Best-effort status notification for the continuous loop.

    A raising UI callback is reported through ``_debug`` and otherwise
    ignored so it can never break the capture loop.
    """
    if not on_status:
        return
    try:
        on_status(state)
    except Exception as e:
        _debug(f"on_status({state!r}) raised {type(e).__name__}: {e}")


def _continuous_on_silence() -> None:
    """AudioRecorder silence callback — runs in a daemon thread.

    Stops the current capture, transcribes it, delivers the text via
    ``on_transcript``, and — if the loop is still active — starts the next
    capture. ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive cycles without a
    usable transcript end the loop and fire ``on_silent_limit``.

    Every exit path that ends the loop from here clears
    ``_continuous_active`` and reports ``"idle"``, so the UI indicator and
    ``is_continuous_active()`` cannot get stuck after a recorder failure.
    """
    global _continuous_active, _continuous_no_speech_count

    _debug("_continuous_on_silence: fired")

    # Snapshot shared state under the lock; callbacks run outside it.
    with _continuous_lock:
        if not _continuous_active:
            _debug("_continuous_on_silence: loop inactive — abort")
            return
        rec = _continuous_recorder
        on_transcript = _continuous_on_transcript
        on_status = _continuous_on_status
        on_silent_limit = _continuous_on_silent_limit

    if rec is None:
        _debug("_continuous_on_silence: no recorder — abort")
        return

    _continuous_emit_status(on_status, "transcribing")

    try:
        wav_path = rec.stop()
    except Exception as e:
        # Without this guard the thread would die with the loop still marked
        # active, and every later start_continuous() would be a no-op.
        logger.error("failed to stop continuous recording: %s", e)
        _debug(f"_continuous_on_silence: rec.stop raised {type(e).__name__}: {e}")
        with _continuous_lock:
            _continuous_active = False
        try:
            rec.cancel()
        except Exception:
            pass
        _continuous_emit_status(on_status, "idle")
        return

    # Peak RMS is the critical diagnostic when stop() returns None despite
    # the VAD firing — tells us at a glance whether the mic was too quiet
    # for SILENCE_RMS_THRESHOLD (200) or the VAD + peak checks disagree.
    peak_rms = getattr(rec, "_peak_rms", -1)
    _debug(
        f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})"
    )

    # CLI parity: double 660 Hz beep once the stream has stopped, which
    # avoids the CoreAudio conflict seen when beeping while it is open.
    _play_beep(frequency=660, count=2)

    transcript: Optional[str] = None

    if wav_path:
        try:
            result = transcribe_recording(wav_path)
            # transcribe_recording returns {"success": bool, "transcript": str,
            # "error": str?} — NOT {"text": str}. Using the wrong key silently
            # produced empty transcripts even when STT returned fine.
            success = bool(result.get("success"))
            text = (result.get("transcript") or "").strip()
            err = result.get("error")
            _debug(
                f"_continuous_on_silence: transcribe -> success={success} "
                f"text={text!r} err={err!r}"
            )
            if success and text and not is_whisper_hallucination(text):
                transcript = text
        except Exception as e:
            logger.warning("continuous transcription failed: %s", e)
            _debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}")
        finally:
            # The temporary WAV is no longer needed either way.
            try:
                if os.path.isfile(wav_path):
                    os.unlink(wav_path)
            except Exception:
                pass

    with _continuous_lock:
        if not _continuous_active:
            # User stopped us while we were transcribing — discard.
            _debug("_continuous_on_silence: stopped during transcribe — no restart")
            return
        if transcript:
            _continuous_no_speech_count = 0
        else:
            _continuous_no_speech_count += 1
        should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT
        no_speech = _continuous_no_speech_count

    if transcript and on_transcript:
        try:
            on_transcript(transcript)
        except Exception as e:
            logger.warning("on_transcript callback raised: %s", e)

    if should_halt:
        _debug(f"_continuous_on_silence: {no_speech} silent cycles — halting")
        with _continuous_lock:
            _continuous_active = False
            _continuous_no_speech_count = 0
        if on_silent_limit:
            try:
                on_silent_limit()
            except Exception as e:
                _debug(
                    f"_continuous_on_silence: on_silent_limit raised "
                    f"{type(e).__name__}: {e}"
                )
        try:
            rec.cancel()
        except Exception:
            pass
        _continuous_emit_status(on_status, "idle")
        return

    # stop_continuous() may have run while on_transcript was executing;
    # restarting now would reopen the microphone after the user asked to
    # stop, so re-check the flag before touching the recorder again.
    with _continuous_lock:
        if not _continuous_active:
            _debug("_continuous_on_silence: stopped before restart — no restart")
            return

    # Restart for the next turn.
    _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
    _play_beep(frequency=880, count=1)
    try:
        rec.start(on_silence_stop=_continuous_on_silence)
    except Exception as e:
        logger.error("failed to restart continuous recording: %s", e)
        _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
        with _continuous_lock:
            _continuous_active = False
        # Tell the UI the loop is over; otherwise it stays on "transcribing".
        _continuous_emit_status(on_status, "idle")
        return

    _continuous_emit_status(on_status, "listening")
|
||||||
|
|
||||||
|
|
||||||
|
# ── TTS API ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def speak_text(text: str) -> None:
|
def speak_text(text: str) -> None:
|
||||||
"""Synthesize ``text`` with the configured TTS provider and play it.
|
"""Synthesize ``text`` with the configured TTS provider and play it.
|
||||||
|
|
||||||
The gateway spawns a daemon thread to call this so the RPC returns
|
The gateway spawns a daemon thread to call this so the RPC returns
|
||||||
immediately. Failures are logged and swallowed — the UI already
|
immediately. Failures are logged and swallowed.
|
||||||
acknowledged "speaking" by the time we get here.
|
|
||||||
"""
|
"""
|
||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
return
|
return
|
||||||
|
|
||||||
# Lazy import — tts_tool pulls optional provider SDKs (OpenAI,
|
# Lazy import — tts_tool pulls optional provider SDKs.
|
||||||
# ElevenLabs, etc.) and config-reading machinery that we don't
|
|
||||||
# want to load at module import time.
|
|
||||||
from tools.tts_tool import text_to_speech_tool
|
from tools.tts_tool import text_to_speech_tool
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -51,3 +51,205 @@ class TestSpeakTextGuards:
|
||||||
|
|
||||||
# Should simply return None without raising.
|
# Should simply return None without raising.
|
||||||
assert speak_text(text) is None
|
assert speak_text(text) is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestContinuousAPI:
    """Surface-level checks of the continuous (VAD) voice entry points."""

    def test_continuous_exports(self):
        """The three public loop functions exist and are callable."""
        import hermes_cli.voice as voice

        for name in ("start_continuous", "stop_continuous", "is_continuous_active"):
            assert callable(getattr(voice, name))

    def test_not_active_by_default(self, monkeypatch):
        """With clean module state the loop reports inactive."""
        import hermes_cli.voice as voice

        # Guard against state leaked by earlier tests in the same session.
        for attr, value in (("_continuous_active", False), ("_continuous_recorder", None)):
            monkeypatch.setattr(voice, attr, value)

        assert voice.is_continuous_active() is False

    def test_stop_continuous_idempotent_when_inactive(self, monkeypatch):
        """stop_continuous is a harmless no-op without a running loop.

        The gateway's voice.toggle off path calls it unconditionally.
        """
        import hermes_cli.voice as voice

        for attr, value in (("_continuous_active", False), ("_continuous_recorder", None)):
            monkeypatch.setattr(voice, attr, value)

        result = voice.stop_continuous()

        assert result is None
        assert voice.is_continuous_active() is False

    def test_double_start_is_idempotent(self, monkeypatch):
        """Starting while already active must not touch the recorder.

        This prevents two overlapping capture threads fighting over the
        microphone when the UI double-fires (e.g. both /voice on and Ctrl+B
        within the same tick).
        """
        import hermes_cli.voice as voice

        starts = {"n": 0}

        class FakeRecorder:
            def start(self, on_silence_stop=None):
                starts["n"] += 1

            def cancel(self):
                pass

        monkeypatch.setattr(voice, "_continuous_active", True)
        monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder())

        voice.start_continuous(on_transcript=lambda _t: None)

        # The active-guard returns before rec.start() is ever reached.
        assert starts["n"] == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestContinuousLoopSimulation:
    """End-to-end simulation of the VAD loop with a fake recorder.

    Proves auto-restart works: the silence callback must trigger transcribe →
    on_transcript → re-call rec.start(on_silence_stop=same_cb). Also covers
    the 3-strikes no-speech halt and a stop arriving mid-transcription.
    """

    @pytest.fixture
    def fake_recorder(self, monkeypatch):
        """Install a recording-free FakeRecorder and reset module state."""
        import hermes_cli.voice as voice

        # Reset module state between tests.
        monkeypatch.setattr(voice, "_continuous_active", False)
        monkeypatch.setattr(voice, "_continuous_recorder", None)
        monkeypatch.setattr(voice, "_continuous_no_speech_count", 0)
        monkeypatch.setattr(voice, "_continuous_on_transcript", None)
        monkeypatch.setattr(voice, "_continuous_on_status", None)
        monkeypatch.setattr(voice, "_continuous_on_silent_limit", None)

        class FakeRecorder:
            _silence_threshold = 200
            _silence_duration = 3.0
            is_recording = False

            def __init__(self):
                self.start_calls = 0
                self.last_callback = None
                self.stopped = 0
                self.cancelled = 0
                # Preset WAV path returned by stop()
                self.next_stop_wav = "/tmp/fake.wav"

            def start(self, on_silence_stop=None):
                self.start_calls += 1
                self.last_callback = on_silence_stop
                self.is_recording = True

            def stop(self):
                self.stopped += 1
                self.is_recording = False
                return self.next_stop_wav

            def cancel(self):
                self.cancelled += 1
                self.is_recording = False

        rec = FakeRecorder()
        monkeypatch.setattr(voice, "create_audio_recorder", lambda: rec)
        # Skip real file ops in the silence callback.
        monkeypatch.setattr(voice.os.path, "isfile", lambda _p: False)
        # Keep the loop away from the real config and audio output: every
        # start/stop path calls _play_beep, which would otherwise load
        # config.yaml and may play sound on machines with an audio stack.
        monkeypatch.setattr(voice, "_play_beep", lambda *args, **kwargs: None)
        return rec

    def test_loop_auto_restarts_after_transcript(self, fake_recorder, monkeypatch):
        import hermes_cli.voice as voice

        monkeypatch.setattr(
            voice,
            "transcribe_recording",
            lambda _p: {"success": True, "transcript": "hello world"},
        )
        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)

        transcripts = []
        statuses = []

        voice.start_continuous(
            on_transcript=lambda t: transcripts.append(t),
            on_status=lambda s: statuses.append(s),
        )

        assert fake_recorder.start_calls == 1
        assert statuses == ["listening"]

        # Simulate AudioRecorder's silence detector firing.
        fake_recorder.last_callback()

        assert transcripts == ["hello world"]
        assert fake_recorder.start_calls == 2  # auto-restarted
        assert statuses == ["listening", "transcribing", "listening"]
        assert voice.is_continuous_active() is True

        voice.stop_continuous()

    def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch):
        import hermes_cli.voice as voice

        # Transcription returns no speech — fake_recorder.stop() returns the
        # path, but transcribe returns empty text, counting as silence.
        monkeypatch.setattr(
            voice,
            "transcribe_recording",
            lambda _p: {"success": True, "transcript": ""},
        )
        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)

        transcripts = []
        silent_limit_fired = []

        voice.start_continuous(
            on_transcript=lambda t: transcripts.append(t),
            on_silent_limit=lambda: silent_limit_fired.append(True),
        )

        # Fire silence callback 3 times
        for _ in range(3):
            fake_recorder.last_callback()

        assert transcripts == []
        assert silent_limit_fired == [True]
        assert voice.is_continuous_active() is False
        assert fake_recorder.cancelled >= 1

    def test_stop_during_transcription_discards_restart(self, fake_recorder, monkeypatch):
        """User hits Ctrl+B mid-transcription: the in-flight transcript is
        discarded (the silence callback re-checks the active flag before
        delivering) and the loop must NOT restart."""
        import hermes_cli.voice as voice

        stop_triggered = {"flag": False}

        def late_transcribe(_p):
            # Simulate stop_continuous arriving while we're inside transcribe
            voice.stop_continuous()
            stop_triggered["flag"] = True
            return {"success": True, "transcript": "final word"}

        monkeypatch.setattr(voice, "transcribe_recording", late_transcribe)
        monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False)

        transcripts = []
        voice.start_continuous(on_transcript=lambda t: transcripts.append(t))

        initial_starts = fake_recorder.start_calls  # 1
        fake_recorder.last_callback()

        assert stop_triggered["flag"] is True
        # Loop is stopped — no auto-restart
        assert fake_recorder.start_calls == initial_starts
        # The in-flight transcript was suppressed because we stopped mid-flight
        assert transcripts == []
        assert voice.is_continuous_active() is False
|
||||||
|
|
|
||||||
|
|
@ -3455,43 +3455,154 @@ def _(rid, params: dict) -> dict:
|
||||||
# ── Methods: voice ───────────────────────────────────────────────────
|
# ── Methods: voice ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
_voice_sid_lock = threading.Lock()
|
||||||
|
_voice_event_sid: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
def _voice_emit(event: str, payload: dict | None = None) -> None:
    """Send a voice event to the session that most recently enabled the mode.

    Voice is process-global (one microphone), so a single remembered sid is
    the only target; the TUI handler treats an empty sid as "active
    session". This wrapper exists separately from ``_emit`` to make it
    explicit that callers do not pass a sid themselves.
    """
    # Copy the sid under the lock, emit outside it.
    _voice_sid_lock.acquire()
    try:
        target_sid = _voice_event_sid
    finally:
        _voice_sid_lock.release()
    _emit(event, target_sid, payload)
|
||||||
|
|
||||||
|
|
||||||
|
def _voice_mode_enabled() -> bool:
    """Return the current voice-mode flag.

    The ``HERMES_VOICE`` environment variable wins over config when it is
    exactly ``"0"`` or ``"1"`` (after stripping whitespace), so the gateway
    and CLI agree when one of them was launched with an explicit override.
    Otherwise the value comes from ``display.voice_enabled`` in the loaded
    config, defaulting to ``False``.
    """
    env = os.environ.get("HERMES_VOICE", "").strip()
    if env in {"0", "1"}:
        return env == "1"
    # ``.get("display", {})`` only covers a *missing* key; a config with an
    # empty ``display:`` entry yields None, which must not raise here.
    display = _load_cfg().get("display")
    if not isinstance(display, dict):
        return False
    return bool(display.get("voice_enabled", False))
|
||||||
|
|
||||||
|
|
||||||
|
def _voice_tts_enabled() -> bool:
    """Return whether agent replies should be spoken back via TTS.

    The ``HERMES_VOICE_TTS`` environment variable wins when it is exactly
    ``"0"`` or ``"1"`` (after stripping whitespace); otherwise the value
    comes from ``display.voice_tts`` in the loaded config, defaulting to
    ``False``.
    """
    env = os.environ.get("HERMES_VOICE_TTS", "").strip()
    if env in {"0", "1"}:
        return env == "1"
    # ``.get("display", {})`` only covers a *missing* key; a config with an
    # empty ``display:`` entry yields None, which must not raise here.
    display = _load_cfg().get("display")
    if not isinstance(display, dict):
        return False
    return bool(display.get("voice_tts", False))
|
||||||
|
|
||||||
|
|
||||||
@method("voice.toggle")
def _(rid, params: dict) -> dict:
    """CLI parity for the ``/voice`` slash command.

    Subcommands (``params["action"]``; string values are matched
    case-insensitively with surrounding whitespace ignored):

    * ``status`` — report mode + TTS flags plus, when the requirements probe
      succeeds, STT/audio availability details. This is the default when
      ``action`` is omitted.
    * ``on`` / ``off`` — flip voice *mode* (the umbrella bit) in both
      ``HERMES_VOICE`` and ``display.voice_enabled``. Turning it off also
      tears down any active continuous recording loop. Does NOT start
      recording on its own; recording is driven by ``voice.record``
      (Ctrl+B) after mode is on, matching cli.py's enable/Ctrl+B split.
    * ``tts`` — toggle speech-output of agent replies in both
      ``HERMES_VOICE_TTS`` and ``display.voice_tts``. Requires mode on
      (mirrors CLI's _toggle_voice_tts guard); otherwise error 4014.

    Any other action is rejected with error 4013.
    """
    raw_action = params.get("action", "status")
    # Normalise only genuine strings; anything else falls through to the
    # "unknown action" error below with its original value in the message.
    action = raw_action.strip().lower() if isinstance(raw_action, str) else raw_action

    if action == "status":
        # Mirror CLI's _show_voice_status: include STT/TTS provider
        # availability so the user can tell at a glance *why* voice mode
        # isn't working ("STT provider: MISSING ..." is the common case).
        payload: dict = {
            "enabled": _voice_mode_enabled(),
            "tts": _voice_tts_enabled(),
        }
        try:
            from tools.voice_mode import check_voice_requirements

            reqs = check_voice_requirements()
            payload["available"] = bool(reqs.get("available"))
            payload["audio_available"] = bool(reqs.get("audio_available"))
            payload["stt_available"] = bool(reqs.get("stt_available"))
            payload["details"] = reqs.get("details") or ""
        except Exception as e:
            # check_voice_requirements pulls optional transcription deps —
            # swallow so /voice status always returns something useful.
            logger.warning("voice.toggle status: requirements probe failed: %s", e)

        return _ok(rid, payload)

    if action in ("on", "off"):
        enabled = action == "on"
        os.environ["HERMES_VOICE"] = "1" if enabled else "0"
        _write_config_key("display.voice_enabled", enabled)

        if not enabled:
            # Disabling the mode must tear the continuous loop down; the
            # loop holds the microphone and would otherwise keep running.
            try:
                from hermes_cli.voice import stop_continuous

                stop_continuous()
            except ImportError:
                # Voice module not installed: there is no loop to stop.
                pass
            except Exception as e:
                logger.warning("voice: stop_continuous failed during toggle off: %s", e)

        return _ok(rid, {"enabled": enabled, "tts": _voice_tts_enabled()})

    if action == "tts":
        if not _voice_mode_enabled():
            return _err(rid, 4014, "enable voice mode first: /voice on")
        new_value = not _voice_tts_enabled()
        os.environ["HERMES_VOICE_TTS"] = "1" if new_value else "0"
        _write_config_key("display.voice_tts", new_value)
        # Mode is known to be on here because of the guard above.
        return _ok(rid, {"enabled": True, "tts": new_value})

    return _err(rid, 4013, f"unknown voice action: {raw_action}")
|
||||||
|
|
||||||
|
|
||||||
@method("voice.record")
|
@method("voice.record")
|
||||||
def _(rid, params: dict) -> dict:
|
def _(rid, params: dict) -> dict:
|
||||||
|
"""VAD-driven continuous record loop, CLI-parity.
|
||||||
|
|
||||||
|
``start`` turns on a VAD loop that emits ``voice.transcript`` events
|
||||||
|
for each detected utterance and auto-restarts for the next turn.
|
||||||
|
``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while-
|
||||||
|
recording branch clearing ``_voice_continuous``). Three consecutive
|
||||||
|
silent cycles stop the loop automatically and emit a
|
||||||
|
``voice.transcript`` with ``no_speech_limit=True``.
|
||||||
|
"""
|
||||||
action = params.get("action", "start")
|
action = params.get("action", "start")
|
||||||
|
|
||||||
|
if action not in {"start", "stop"}:
|
||||||
|
return _err(rid, 4019, f"unknown voice action: {action}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if action == "start":
|
if action == "start":
|
||||||
from hermes_cli.voice import start_recording
|
if not _voice_mode_enabled():
|
||||||
|
return _err(rid, 4015, "voice mode is off — enable with /voice on")
|
||||||
|
|
||||||
start_recording()
|
with _voice_sid_lock:
|
||||||
|
global _voice_event_sid
|
||||||
|
_voice_event_sid = params.get("session_id") or _voice_event_sid
|
||||||
|
|
||||||
|
from hermes_cli.voice import start_continuous
|
||||||
|
|
||||||
|
voice_cfg = _load_cfg().get("voice", {})
|
||||||
|
start_continuous(
|
||||||
|
on_transcript=lambda t: _voice_emit(
|
||||||
|
"voice.transcript", {"text": t}
|
||||||
|
),
|
||||||
|
on_status=lambda s: _voice_emit("voice.status", {"state": s}),
|
||||||
|
on_silent_limit=lambda: _voice_emit(
|
||||||
|
"voice.transcript", {"no_speech_limit": True}
|
||||||
|
),
|
||||||
|
silence_threshold=voice_cfg.get("silence_threshold", 200),
|
||||||
|
silence_duration=voice_cfg.get("silence_duration", 3.0),
|
||||||
|
)
|
||||||
return _ok(rid, {"status": "recording"})
|
return _ok(rid, {"status": "recording"})
|
||||||
if action == "stop":
|
|
||||||
from hermes_cli.voice import stop_and_transcribe
|
|
||||||
|
|
||||||
return _ok(rid, {"text": stop_and_transcribe() or ""})
|
# action == "stop"
|
||||||
return _err(rid, 4019, f"unknown voice action: {action}")
|
from hermes_cli.voice import stop_continuous
|
||||||
|
|
||||||
|
stop_continuous()
|
||||||
|
return _ok(rid, {"status": "stopped"})
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return _err(
|
return _err(
|
||||||
rid, 5025, "voice module not available — install audio dependencies"
|
rid, 5025, "voice module not available — install audio dependencies"
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
|
||||||
composer: {
|
composer: {
|
||||||
dequeue: () => undefined,
|
dequeue: () => undefined,
|
||||||
queueEditRef: ref<null | number>(null),
|
queueEditRef: ref<null | number>(null),
|
||||||
sendQueued: vi.fn()
|
sendQueued: vi.fn(),
|
||||||
|
setInput: vi.fn()
|
||||||
},
|
},
|
||||||
gateway: {
|
gateway: {
|
||||||
gw: { request: vi.fn() },
|
gw: { request: vi.fn() },
|
||||||
|
|
@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
|
||||||
resumeById: vi.fn(),
|
resumeById: vi.fn(),
|
||||||
setCatalog: vi.fn()
|
setCatalog: vi.fn()
|
||||||
},
|
},
|
||||||
|
submission: {
|
||||||
|
submitRef: { current: vi.fn() }
|
||||||
|
},
|
||||||
system: {
|
system: {
|
||||||
bellOnComplete: false,
|
bellOnComplete: false,
|
||||||
sys: vi.fn()
|
sys: vi.fn()
|
||||||
|
|
@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
|
||||||
panel: (title: string, sections: any[]) =>
|
panel: (title: string, sections: any[]) =>
|
||||||
appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
|
appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
|
||||||
setHistoryItems: vi.fn()
|
setHistoryItems: vi.fn()
|
||||||
|
},
|
||||||
|
voice: {
|
||||||
|
setProcessing: vi.fn(),
|
||||||
|
setRecording: vi.fn(),
|
||||||
|
setVoiceEnabled: vi.fn()
|
||||||
}
|
}
|
||||||
}) as any
|
}) as any
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
||||||
const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
|
const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
|
||||||
const { bellOnComplete, stdout, sys } = ctx.system
|
const { bellOnComplete, stdout, sys } = ctx.system
|
||||||
const { appendMessage, panel, setHistoryItems } = ctx.transcript
|
const { appendMessage, panel, setHistoryItems } = ctx.transcript
|
||||||
|
const { setInput } = ctx.composer
|
||||||
|
const { submitRef } = ctx.submission
|
||||||
|
const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice
|
||||||
|
|
||||||
let pendingThinkingStatus = ''
|
let pendingThinkingStatus = ''
|
||||||
let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
|
let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
|
||||||
|
|
@ -261,6 +264,60 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case 'voice.status': {
|
||||||
|
// Continuous VAD loop reports its internal state so the status bar
|
||||||
|
// can show listening / transcribing / idle without polling.
|
||||||
|
const state = String(ev.payload?.state ?? '')
|
||||||
|
|
||||||
|
if (state === 'listening') {
|
||||||
|
setVoiceRecording(true)
|
||||||
|
setVoiceProcessing(false)
|
||||||
|
} else if (state === 'transcribing') {
|
||||||
|
setVoiceRecording(false)
|
||||||
|
setVoiceProcessing(true)
|
||||||
|
} else {
|
||||||
|
setVoiceRecording(false)
|
||||||
|
setVoiceProcessing(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'voice.transcript': {
|
||||||
|
// CLI parity: the 3-strikes silence detector flipped off automatically.
|
||||||
|
// Mirror that on the UI side and tell the user why the mode is off.
|
||||||
|
if (ev.payload?.no_speech_limit) {
|
||||||
|
setVoiceEnabled(false)
|
||||||
|
setVoiceRecording(false)
|
||||||
|
setVoiceProcessing(false)
|
||||||
|
sys('voice: no speech detected 3 times, continuous mode stopped')
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const text = String(ev.payload?.text ?? '').trim()
|
||||||
|
|
||||||
|
if (!text) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Match CLI's _pending_input.put(transcript): auto-submit when the
|
||||||
|
// composer is empty, otherwise append so the user can keep editing
|
||||||
|
// a partial draft they were working on.
|
||||||
|
setInput(prev => {
|
||||||
|
if (!prev) {
|
||||||
|
// defer submit so React commits the state change first
|
||||||
|
setTimeout(() => submitRef.current(text), 0)
|
||||||
|
|
||||||
|
return ''
|
||||||
|
}
|
||||||
|
|
||||||
|
return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
|
||||||
|
})
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
case 'gateway.start_timeout': {
|
case 'gateway.start_timeout': {
|
||||||
const { cwd, python } = ev.payload ?? {}
|
const { cwd, python } = ev.payload ?? {}
|
||||||
const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''
|
const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''
|
||||||
|
|
|
||||||
|
|
@ -189,9 +189,11 @@ export interface InputHandlerContext {
|
||||||
stdout?: NodeJS.WriteStream
|
stdout?: NodeJS.WriteStream
|
||||||
}
|
}
|
||||||
voice: {
|
voice: {
|
||||||
|
enabled: boolean
|
||||||
recording: boolean
|
recording: boolean
|
||||||
setProcessing: StateSetter<boolean>
|
setProcessing: StateSetter<boolean>
|
||||||
setRecording: StateSetter<boolean>
|
setRecording: StateSetter<boolean>
|
||||||
|
setVoiceEnabled: StateSetter<boolean>
|
||||||
}
|
}
|
||||||
wheelStep: number
|
wheelStep: number
|
||||||
}
|
}
|
||||||
|
|
@ -201,6 +203,9 @@ export interface InputHandlerResult {
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface GatewayEventHandlerContext {
|
export interface GatewayEventHandlerContext {
|
||||||
|
composer: {
|
||||||
|
setInput: StateSetter<string>
|
||||||
|
}
|
||||||
gateway: GatewayServices
|
gateway: GatewayServices
|
||||||
session: {
|
session: {
|
||||||
STARTUP_RESUME_ID: string
|
STARTUP_RESUME_ID: string
|
||||||
|
|
@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
|
||||||
resumeById: (id: string) => void
|
resumeById: (id: string) => void
|
||||||
setCatalog: StateSetter<null | SlashCatalog>
|
setCatalog: StateSetter<null | SlashCatalog>
|
||||||
}
|
}
|
||||||
|
submission: {
|
||||||
|
submitRef: MutableRefObject<(value: string) => void>
|
||||||
|
}
|
||||||
system: {
|
system: {
|
||||||
bellOnComplete: boolean
|
bellOnComplete: boolean
|
||||||
stdout?: NodeJS.WriteStream
|
stdout?: NodeJS.WriteStream
|
||||||
|
|
@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
|
||||||
panel: (title: string, sections: PanelSection[]) => void
|
panel: (title: string, sections: PanelSection[]) => void
|
||||||
setHistoryItems: StateSetter<Msg[]>
|
setHistoryItems: StateSetter<Msg[]>
|
||||||
}
|
}
|
||||||
|
voice: {
|
||||||
|
setProcessing: StateSetter<boolean>
|
||||||
|
setRecording: StateSetter<boolean>
|
||||||
|
setVoiceEnabled: StateSetter<boolean>
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SlashHandlerContext {
|
export interface SlashHandlerContext {
|
||||||
|
|
|
||||||
|
|
@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
help: 'toggle voice input',
|
help: 'voice mode: [on|off|tts|status]',
|
||||||
name: 'voice',
|
name: 'voice',
|
||||||
run: (arg, ctx) => {
|
run: (arg, ctx) => {
|
||||||
const action = arg === 'on' || arg === 'off' ? arg : 'status'
|
const normalized = (arg ?? '').trim().toLowerCase()
|
||||||
|
|
||||||
|
const action =
|
||||||
|
normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
|
||||||
|
? normalized
|
||||||
|
: 'status'
|
||||||
|
|
||||||
ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
|
ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
|
||||||
ctx.guarded<VoiceToggleResponse>(r => {
|
ctx.guarded<VoiceToggleResponse>(r => {
|
||||||
ctx.voice.setVoiceEnabled(!!r.enabled)
|
ctx.voice.setVoiceEnabled(!!r.enabled)
|
||||||
ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`)
|
|
||||||
|
// Match CLI's _show_voice_status / _enable_voice_mode /
|
||||||
|
// _toggle_voice_tts output shape so users don't have to learn
|
||||||
|
// two vocabularies.
|
||||||
|
if (action === 'status') {
|
||||||
|
const mode = r.enabled ? 'ON' : 'OFF'
|
||||||
|
const tts = r.tts ? 'ON' : 'OFF'
|
||||||
|
ctx.transcript.sys('Voice Mode Status')
|
||||||
|
ctx.transcript.sys(` Mode: ${mode}`)
|
||||||
|
ctx.transcript.sys(` TTS: ${tts}`)
|
||||||
|
ctx.transcript.sys(' Record key: Ctrl+B')
|
||||||
|
|
||||||
|
// CLI's "Requirements:" block — surfaces STT/audio setup issues
|
||||||
|
// so the user sees "STT provider: MISSING ..." instead of
|
||||||
|
// silently failing on every Ctrl+B press.
|
||||||
|
if (r.details) {
|
||||||
|
ctx.transcript.sys('')
|
||||||
|
ctx.transcript.sys(' Requirements:')
|
||||||
|
|
||||||
|
for (const line of r.details.split('\n')) {
|
||||||
|
if (line.trim()) {
|
||||||
|
ctx.transcript.sys(` ${line}`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if (action === 'tts') {
|
||||||
|
ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// on/off — mirror cli.py:_enable_voice_mode's 3-line output
|
||||||
|
if (r.enabled) {
|
||||||
|
const tts = r.tts ? ' (TTS enabled)' : ''
|
||||||
|
ctx.transcript.sys(`Voice mode enabled${tts}`)
|
||||||
|
ctx.transcript.sys(' Ctrl+B to start/stop recording')
|
||||||
|
ctx.transcript.sys(' /voice tts to toggle speech output')
|
||||||
|
ctx.transcript.sys(' /voice off to disable voice mode')
|
||||||
|
} else {
|
||||||
|
ctx.transcript.sys('Voice mode disabled.')
|
||||||
|
}
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const voiceStop = () => {
|
// CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
// (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
// Ctrl+B while the mode is off sys-nudges the user. While the mode is
// on, the first press starts a continuous loop (gateway → start_continuous,
// VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
// The gateway publishes voice.status + voice.transcript events that
// createGatewayEventHandler turns into UI badges and composer injection.
const voiceRecordToggle = () => {
  if (!voice.enabled) {
    return actions.sys('voice: mode is off — enable with /voice on')
  }

  const starting = !voice.recording
  const action = starting ? 'start' : 'stop'

  // Optimistic UI — flip the REC badge immediately so the user gets
  // feedback while the RPC round-trips; the voice.status event is the
  // authoritative source and may correct us.
  if (starting) {
    voice.setRecording(true)
  } else {
    voice.setRecording(false)
    voice.setProcessing(false)
  }

  // Undo the optimistic REC badge when a start request did not go through.
  // A failed stop is left alone: the loop's real state is unknown here and
  // the next voice.status event is what corrects the badge.
  const revertOptimisticStart = () => {
    if (starting) {
      voice.setRecording(false)
    }
  }

  gateway
    .rpc<VoiceRecordResponse>('voice.record', { action })
    .then(r => {
      // NOTE(review): rpc appears able to resolve without a result instead
      // of rejecting (confirm its failure contract); without this branch a
      // refused start would leave the REC badge stuck on.
      if (!r) {
        revertOptimisticStart()
      }
    })
    .catch((e: Error) => {
      // Revert optimistic UI on failure.
      revertOptimisticStart()
      actions.sys(`voice error: ${e.message}`)
    })
}
|
||||||
|
|
||||||
const voiceStart = () =>
|
|
||||||
gateway
|
|
||||||
.rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
|
|
||||||
.then(r => {
|
|
||||||
if (!r) {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
voice.setRecording(true)
|
|
||||||
patchUiState({ status: 'recording…' })
|
|
||||||
})
|
|
||||||
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
|
|
||||||
|
|
||||||
useInput((ch, key) => {
|
useInput((ch, key) => {
|
||||||
const live = getUiState()
|
const live = getUiState()
|
||||||
|
|
||||||
|
|
@ -371,7 +369,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isVoiceToggleKey(key, ch)) {
|
if (isVoiceToggleKey(key, ch)) {
|
||||||
return voice.recording ? voiceStop() : voiceStart()
|
return voiceRecordToggle()
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isAction(key, ch, 'g')) {
|
if (isAction(key, ch, 'g')) {
|
||||||
|
|
|
||||||
|
|
@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
|
||||||
composer: { actions: composerActions, refs: composerRefs, state: composerState },
|
composer: { actions: composerActions, refs: composerRefs, state: composerState },
|
||||||
gateway,
|
gateway,
|
||||||
terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
|
terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
|
||||||
voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
|
voice: {
|
||||||
|
enabled: voiceEnabled,
|
||||||
|
recording: voiceRecording,
|
||||||
|
setProcessing: setVoiceProcessing,
|
||||||
|
setRecording: setVoiceRecording,
|
||||||
|
setVoiceEnabled
|
||||||
|
},
|
||||||
wheelStep: WHEEL_SCROLL_STEP
|
wheelStep: WHEEL_SCROLL_STEP
|
||||||
})
|
})
|
||||||
|
|
||||||
const onEvent = useMemo(
|
const onEvent = useMemo(
|
||||||
() =>
|
() =>
|
||||||
createGatewayEventHandler({
|
createGatewayEventHandler({
|
||||||
|
composer: { setInput: composerActions.setInput },
|
||||||
gateway,
|
gateway,
|
||||||
session: {
|
session: {
|
||||||
STARTUP_RESUME_ID,
|
STARTUP_RESUME_ID,
|
||||||
|
|
@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
|
||||||
resumeById: session.resumeById,
|
resumeById: session.resumeById,
|
||||||
setCatalog
|
setCatalog
|
||||||
},
|
},
|
||||||
|
submission: { submitRef },
|
||||||
system: { bellOnComplete, stdout, sys },
|
system: { bellOnComplete, stdout, sys },
|
||||||
transcript: { appendMessage, panel, setHistoryItems }
|
transcript: { appendMessage, panel, setHistoryItems },
|
||||||
|
voice: {
|
||||||
|
setProcessing: setVoiceProcessing,
|
||||||
|
setRecording: setVoiceRecording,
|
||||||
|
setVoiceEnabled
|
||||||
|
}
|
||||||
}),
|
}),
|
||||||
[
|
[
|
||||||
appendMessage,
|
appendMessage,
|
||||||
bellOnComplete,
|
bellOnComplete,
|
||||||
|
composerActions.setInput,
|
||||||
gateway,
|
gateway,
|
||||||
panel,
|
panel,
|
||||||
session.newSession,
|
session.newSession,
|
||||||
session.resetSession,
|
session.resetSession,
|
||||||
session.resumeById,
|
session.resumeById,
|
||||||
|
setVoiceEnabled,
|
||||||
|
setVoiceProcessing,
|
||||||
|
setVoiceRecording,
|
||||||
stdout,
|
stdout,
|
||||||
|
submitRef,
|
||||||
sys
|
sys
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -236,10 +236,16 @@ export interface ImageAttachResponse {
|
||||||
// ── Voice ────────────────────────────────────────────────────────────
|
// ── Voice ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
export interface VoiceToggleResponse {
|
export interface VoiceToggleResponse {
|
||||||
|
audio_available?: boolean
|
||||||
|
available?: boolean
|
||||||
|
details?: string
|
||||||
enabled?: boolean
|
enabled?: boolean
|
||||||
|
stt_available?: boolean
|
||||||
|
tts?: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface VoiceRecordResponse {
|
export interface VoiceRecordResponse {
|
||||||
|
status?: string
|
||||||
text?: string
|
text?: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -368,6 +374,8 @@ export type GatewayEvent =
|
||||||
| { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
|
| { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
|
||||||
| { payload?: undefined; session_id?: string; type: 'message.start' }
|
| { payload?: undefined; session_id?: string; type: 'message.start' }
|
||||||
| { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
|
| { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
|
||||||
|
| { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
|
||||||
|
| { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
|
||||||
| { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
|
| { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
|
||||||
| { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
|
| { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
|
||||||
| { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }
|
| { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue