From 04cf4788ccc05003785992682e3cb25205e509cc Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Wed, 6 May 2026 15:49:59 -0700 Subject: [PATCH] fix(tui): restore voice push-to-talk parity (#20897) * fix(tui): restore classic CLI voice push-to-talk parity (cherry picked from commit 93b9ae301bb89f5b5e01b4b9f8ac91ffa74fbd9d) * fix(tui): harden voice push-to-talk stop flow Address review feedback from PR #16189 by stopping the active recorder before background transcription, documenting single-shot voice capture, and covering the TUI gateway flags with regression tests. * fix(tui): preserve silent voice strike tracking Keep single-shot voice recording's no-speech counter alive across starts so the TUI can still emit the three-strikes auto-disable event, and bind the auto-restart state at module scope for type checking. * fix(tui): clean up voice stop failure path Address follow-up review by naming the TUI flow as single-shot push-to-talk and cancelling the recorder when forced stop cannot produce a WAV. * fix(tui): report busy voice capture starts Return explicit start state from the voice wrapper so the TUI gateway does not report recording while forced-stop transcription is still cleaning up. * fix(tui): handle busy voice record responses Apply the gateway busy status immediately in the TUI and route forced-stop voice events to the session that sent the stop request. * fix(tui): clear voice recording on null response Treat a null voice.record RPC result as a failed optimistic start so the REC badge cannot stick after gateway-side errors. * fix(tui): count silent manual voice stops Preserve single-shot voice no-speech strikes through forced stop transcription so empty push-to-talk captures still trigger the three-strikes guard. --------- Co-authored-by: Montbra --- hermes_cli/voice.py | 184 ++++++++++++--- tests/hermes_cli/test_voice_wrapper.py | 219 +++++++++++++++++- tests/test_tui_gateway_server.py | 75 ++++++ tui_gateway/server.py | 23 +- ui-tui/src/__tests__/useInputHandlers.test.ts | 37 +++ ui-tui/src/app/useInputHandlers.ts | 44 +++- ui-tui/src/gatewayTypes.ts | 2 +- 7 files changed, 527 insertions(+), 57 deletions(-) create mode 100644 ui-tui/src/__tests__/useInputHandlers.test.ts diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py index f85f30c7bf..a4ee6a0842 100644 --- a/hermes_cli/voice.py +++ b/hermes_cli/voice.py @@ -281,6 +281,8 @@ _recorder_lock = threading.Lock() # ── Continuous (VAD) state ─────────────────────────────────────────── _continuous_lock = threading.Lock() _continuous_active = False +_continuous_stopping = False +_continuous_auto_restart: bool = True _continuous_recorder: Any = None # ── TTS-vs-STT feedback guard ──────────────────────────────────────── @@ -370,32 +372,43 @@ def start_continuous( on_silent_limit: Optional[Callable[[], None]] = None, silence_threshold: int = 200, silence_duration: float = 3.0, -) -> None: + auto_restart: bool = True, +) -> bool: """Start a VAD-driven continuous recording loop. The loop calls ``on_transcript(text)`` each time speech is detected and - transcribed successfully, then auto-restarts. After - ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech - picked up at all) the loop stops itself and calls ``on_silent_limit`` - so the UI can reflect "voice off". Idempotent — calling while already - active is a no-op. + transcribed successfully. If ``auto_restart`` is True, it auto-restarts + for the next turn and resets the no-speech counter for that loop. If + ``auto_restart`` is False, the first silence-triggered transcription ends + the loop and reports ``"idle"``; no-speech counts are retained across + starts so a push-to-talk caller can still enforce the three-strikes guard. + After ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech + picked up at all) the loop stops itself and calls ``on_silent_limit`` so the + UI can reflect "voice off". Returns False if a previous stop is still + transcribing/cleaning up; otherwise returns True. Idempotent — calling while + already active is a successful no-op. ``on_status`` is called with ``"listening"`` / ``"transcribing"`` / ``"idle"`` so the UI can show a live indicator. """ - global _continuous_active, _continuous_recorder + global _continuous_active, _continuous_recorder, _continuous_auto_restart global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit global _continuous_no_speech_count with _continuous_lock: if _continuous_active: _debug("start_continuous: already active — no-op") - return + return True + if _continuous_stopping: + _debug("start_continuous: stop/transcribe in progress — busy") + return False _continuous_active = True + _continuous_auto_restart = auto_restart _continuous_on_transcript = on_transcript _continuous_on_status = on_status _continuous_on_silent_limit = on_silent_limit - _continuous_no_speech_count = 0 + if auto_restart: + _continuous_no_speech_count = 0 if _continuous_recorder is None: _continuous_recorder = create_audio_recorder() @@ -428,15 +441,18 @@ def start_continuous( except Exception: pass + return True -def stop_continuous() -> None: + +def stop_continuous(force_transcribe: bool = False) -> None: """Stop the active continuous loop and release the microphone. - Idempotent — calling while not active is a no-op. Any in-flight - transcription completes but its result is discarded (the callback - checks ``_continuous_active`` before firing). + Idempotent — calling while not active is a no-op. If ``force_transcribe`` is + True, the recorder stops synchronously, then transcription/cleanup runs on a + background thread before reporting ``"idle"``. Otherwise the buffer is + discarded. """ - global _continuous_active, _continuous_on_transcript + global _continuous_active, _continuous_on_transcript, _continuous_stopping global _continuous_on_status, _continuous_on_silent_limit global _continuous_recorder, _continuous_no_speech_count @@ -446,18 +462,98 @@ def stop_continuous() -> None: _continuous_active = False rec = _continuous_recorder on_status = _continuous_on_status + on_transcript = _continuous_on_transcript + on_silent_limit = _continuous_on_silent_limit + auto_restart = _continuous_auto_restart + track_no_speech = force_transcribe and not auto_restart + _continuous_stopping = rec is not None _continuous_on_transcript = None _continuous_on_status = None _continuous_on_silent_limit = None - _continuous_no_speech_count = 0 + if not track_no_speech: + _continuous_no_speech_count = 0 if rec is not None: - try: - # cancel() (not stop()) discards buffered frames — the loop - # is over, we don't want to transcribe a half-captured turn. - rec.cancel() - except Exception as e: - logger.warning("failed to cancel recorder: %s", e) + if force_transcribe and on_transcript: + if on_status: + try: + on_status("transcribing") + except Exception: + pass + try: + wav_path = rec.stop() + except Exception as e: + logger.warning("failed to stop recorder: %s", e) + try: + rec.cancel() + except Exception as cancel_error: + logger.warning("failed to cancel recorder: %s", cancel_error) + wav_path = None + + def _transcribe_and_cleanup(): + global _continuous_no_speech_count, _continuous_stopping + transcript: Optional[str] = None + should_halt = False + + try: + if wav_path: + try: + result = transcribe_recording(wav_path) + if result.get("success"): + text = (result.get("transcript") or "").strip() + if text and not is_whisper_hallucination(text): + transcript = text + finally: + if os.path.isfile(wav_path): + os.unlink(wav_path) + except Exception as e: + logger.warning("failed to stop/transcribe recorder: %s", e) + finally: + if transcript: + try: + on_transcript(transcript) + except Exception as e: + logger.warning("on_transcript callback raised: %s", e) + + if track_no_speech: + with _continuous_lock: + if transcript: + _continuous_no_speech_count = 0 + else: + _continuous_no_speech_count += 1 + should_halt = ( + _continuous_no_speech_count + >= _CONTINUOUS_NO_SPEECH_LIMIT + ) + if should_halt: + _continuous_no_speech_count = 0 + if should_halt and on_silent_limit: + try: + on_silent_limit() + except Exception: + pass + + _play_beep(frequency=660, count=2) + with _continuous_lock: + _continuous_stopping = False + if on_status: + try: + on_status("idle") + except Exception: + pass + + threading.Thread(target=_transcribe_and_cleanup, daemon=True).start() + return + else: + try: + # cancel() (not stop()) discards buffered frames — the loop + # is over, we don't want to transcribe a half-captured turn. + rec.cancel() + except Exception as e: + logger.warning("failed to cancel recorder: %s", e) + + with _continuous_lock: + _continuous_stopping = False # Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the # silence-auto-stop path plays). @@ -603,23 +699,39 @@ def _continuous_on_silence() -> None: _debug("_continuous_on_silence: stopped while waiting for TTS") return - # Restart for the next turn. - _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})") - _play_beep(frequency=880, count=1) - try: - rec.start(on_silence_stop=_continuous_on_silence) - except Exception as e: - logger.error("failed to restart continuous recording: %s", e) - _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}") + if _continuous_auto_restart: + # Restart for the next turn. + _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})") + _play_beep(frequency=880, count=1) + try: + rec.start(on_silence_stop=_continuous_on_silence) + except Exception as e: + logger.error("failed to restart continuous recording: %s", e) + _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}") + with _continuous_lock: + _continuous_active = False + if on_status: + try: + on_status("idle") + except Exception: + pass + return + + if on_status: + try: + on_status("listening") + except Exception: + pass + else: + # Do not auto-restart. Clean up state and notify idle. + _debug("_continuous_on_silence: auto_restart=False, stopping loop") with _continuous_lock: _continuous_active = False - return - - if on_status: - try: - on_status("listening") - except Exception: - pass + if on_status: + try: + on_status("idle") + except Exception: + pass # ── TTS API ────────────────────────────────────────────────────────── diff --git a/tests/hermes_cli/test_voice_wrapper.py b/tests/hermes_cli/test_voice_wrapper.py index 3caacf4313..c744c08d5b 100644 --- a/tests/hermes_cli/test_voice_wrapper.py +++ b/tests/hermes_cli/test_voice_wrapper.py @@ -309,6 +309,7 @@ class TestContinuousAPI: # Isolate from any state left behind by other tests in the session. monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_stopping", False, raising=False) monkeypatch.setattr(voice, "_continuous_recorder", None) assert voice.is_continuous_active() is False @@ -343,11 +344,20 @@ class TestContinuousAPI: monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder()) - voice.start_continuous(on_transcript=lambda _t: None) + started = voice.start_continuous(on_transcript=lambda _t: None) # The guard inside start_continuous short-circuits before rec.start() + assert started is True assert called["n"] == 0 + def test_start_returns_false_while_stopping(self, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_stopping", True, raising=False) + + assert voice.start_continuous(on_transcript=lambda _t: None) is False + class TestContinuousLoopSimulation: """End-to-end simulation of the VAD loop with a fake recorder. @@ -368,6 +378,8 @@ class TestContinuousLoopSimulation: monkeypatch.setattr(voice, "_continuous_on_transcript", None) monkeypatch.setattr(voice, "_continuous_on_status", None) monkeypatch.setattr(voice, "_continuous_on_silent_limit", None) + monkeypatch.setattr(voice, "_continuous_auto_restart", True, raising=False) + monkeypatch.setattr(voice, "_play_beep", lambda *_, **__: None) class FakeRecorder: _silence_threshold = 200 @@ -381,13 +393,20 @@ class TestContinuousLoopSimulation: self.cancelled = 0 # Preset WAV path returned by stop() self.next_stop_wav = "/tmp/fake.wav" + self.fail_stop = False + self.fail_next_start = False def start(self, on_silence_stop=None): + if self.fail_next_start: + self.fail_next_start = False + raise RuntimeError("boom") self.start_calls += 1 self.last_callback = on_silence_stop self.is_recording = True def stop(self): + if self.fail_stop: + raise RuntimeError("stop failed") self.stopped += 1 self.is_recording = False return self.next_stop_wav @@ -433,6 +452,204 @@ class TestContinuousLoopSimulation: voice.stop_continuous() + def test_auto_restart_false_stops_after_first_transcript(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "single shot"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + statuses = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_status=lambda s: statuses.append(s), + auto_restart=False, + ) + fake_recorder.last_callback() + + assert transcripts == ["single shot"] + assert fake_recorder.start_calls == 1 + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + + def test_auto_restart_false_retains_silent_strikes_across_starts( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": ""}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + silent_limit_fired = [] + + for _ in range(3): + voice.start_continuous( + on_transcript=lambda _t: None, + on_silent_limit=lambda: silent_limit_fired.append(True), + auto_restart=False, + ) + fake_recorder.last_callback() + + assert silent_limit_fired == [True] + assert voice.is_continuous_active() is False + assert fake_recorder.start_calls == 3 + + def test_force_transcribe_stop_delivers_current_buffer(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "manual stop"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + statuses = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_status=lambda s: statuses.append(s), + ) + voice.stop_continuous(force_transcribe=True) + + assert fake_recorder.stopped == 1 + assert transcripts == ["manual stop"] + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + + def test_force_transcribe_empty_single_shots_hit_silent_limit( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": ""}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + silent_limit_fired = [] + + for _ in range(3): + voice.start_continuous( + on_transcript=lambda _t: None, + on_silent_limit=lambda: silent_limit_fired.append(True), + auto_restart=False, + ) + voice.stop_continuous(force_transcribe=True) + + assert silent_limit_fired == [True] + assert fake_recorder.stopped == 3 + assert voice._continuous_no_speech_count == 0 + + def test_force_transcribe_valid_single_shot_resets_silent_strikes( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + monkeypatch.setattr(voice, "_continuous_no_speech_count", 2) + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "manual stop"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + silent_limit_fired = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_silent_limit=lambda: silent_limit_fired.append(True), + auto_restart=False, + ) + voice.stop_continuous(force_transcribe=True) + + assert transcripts == ["manual stop"] + assert silent_limit_fired == [] + assert voice._continuous_no_speech_count == 0 + + def test_force_transcribe_stop_failure_cancels_and_clears_stopping( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + fake_recorder.fail_stop = True + + statuses = [] + voice.start_continuous( + on_transcript=lambda _t: None, + on_status=lambda s: statuses.append(s), + ) + voice.stop_continuous(force_transcribe=True) + + assert fake_recorder.cancelled == 1 + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + assert voice._continuous_stopping is False + + def test_restart_failure_reports_idle(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "hello world"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + statuses = [] + voice.start_continuous(on_transcript=lambda _t: None, on_status=statuses.append) + + fake_recorder.fail_next_start = True + fake_recorder.last_callback() + + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch): import hermes_cli.voice as voice diff --git a/tests/test_tui_gateway_server.py b/tests/test_tui_gateway_server.py index 5a25a306ba..184f5606a8 100644 --- a/tests/test_tui_gateway_server.py +++ b/tests/test_tui_gateway_server.py @@ -204,6 +204,7 @@ def test_voice_record_start_handles_non_dict_voice_cfg(monkeypatch): assert resp["result"]["status"] == "recording" assert captured["silence_threshold"] == 200 assert captured["silence_duration"] == 3.0 + assert captured["auto_restart"] is False # Round-12 Copilot review regression on #19835: ``bool`` is a subclass # of ``int``, so the naive ``isinstance(threshold, (int, float))`` @@ -232,6 +233,80 @@ def test_voice_record_start_handles_non_dict_voice_cfg(monkeypatch): assert ( captured["silence_duration"] == 3.0 ), f"bool silence_duration leaked through for {bad_bool_cfg!r}" + assert captured["auto_restart"] is False + + +def test_voice_record_stop_forces_transcription(monkeypatch): + captured: dict = {} + + def fake_stop_continuous(**kwargs): + captured.update(kwargs) + + monkeypatch.setitem( + sys.modules, + "hermes_cli.voice", + types.SimpleNamespace( + start_continuous=lambda **_kwargs: None, + stop_continuous=fake_stop_continuous, + ), + ) + + resp = server.dispatch( + { + "id": "voice-record-stop", + "method": "voice.record", + "params": {"action": "stop"}, + } + ) + + assert resp["result"]["status"] == "stopped" + assert captured["force_transcribe"] is True + + +def test_voice_record_stop_updates_event_session_id(monkeypatch): + monkeypatch.setitem( + sys.modules, + "hermes_cli.voice", + types.SimpleNamespace( + start_continuous=lambda **_kwargs: True, + stop_continuous=lambda **_kwargs: None, + ), + ) + monkeypatch.setattr(server, "_voice_event_sid", "old-session") + + resp = server.dispatch( + { + "id": "voice-record-stop-session", + "method": "voice.record", + "params": {"action": "stop", "session_id": "new-session"}, + } + ) + + assert resp["result"]["status"] == "stopped" + assert server._voice_event_sid == "new-session" + + +def test_voice_record_start_reports_busy_when_stop_is_in_progress(monkeypatch): + monkeypatch.setitem( + sys.modules, + "hermes_cli.voice", + types.SimpleNamespace( + start_continuous=lambda **_kwargs: False, + stop_continuous=lambda **_kwargs: None, + ), + ) + monkeypatch.setenv("HERMES_VOICE", "1") + monkeypatch.setattr(server, "_load_cfg", lambda: {"voice": {}}) + + resp = server.dispatch( + { + "id": "voice-record-busy", + "method": "voice.record", + "params": {"action": "start"}, + } + ) + + assert resp["result"]["status"] == "busy" def test_voice_toggle_tts_branch_also_carries_record_key(monkeypatch): diff --git a/tui_gateway/server.py b/tui_gateway/server.py index b618c5bd56..4c36a561b1 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -5619,14 +5619,13 @@ def _(rid, params: dict) -> dict: @method("voice.record") def _(rid, params: dict) -> dict: - """VAD-driven continuous record loop, CLI-parity. + """VAD-bounded push-to-talk capture, CLI-parity. - ``start`` turns on a VAD loop that emits ``voice.transcript`` events - for each detected utterance and auto-restarts for the next turn. - ``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while- - recording branch clearing ``_voice_continuous``). Three consecutive - silent cycles stop the loop automatically and emit a - ``voice.transcript`` with ``no_speech_limit=True``. + ``start`` begins one VAD-bounded capture and emits ``voice.transcript`` + after silence stops the recorder. ``stop`` forces transcription of the + active buffer, matching classic CLI push-to-talk. The voice wrapper retains + no-speech counts across single-shot starts, so three consecutive silent + captures emit ``voice.transcript`` with ``no_speech_limit=True``. """ action = params.get("action", "start") @@ -5665,7 +5664,7 @@ def _(rid, params: dict) -> dict: if isinstance(duration, (int, float)) and not isinstance(duration, bool) else 3.0 ) - start_continuous( + started = start_continuous( on_transcript=lambda t: _voice_emit("voice.transcript", {"text": t}), on_status=lambda s: _voice_emit("voice.status", {"state": s}), on_silent_limit=lambda: _voice_emit( @@ -5673,13 +5672,19 @@ def _(rid, params: dict) -> dict: ), silence_threshold=safe_threshold, silence_duration=safe_duration, + auto_restart=False, ) + if started is False: + return _ok(rid, {"status": "busy"}) return _ok(rid, {"status": "recording"}) # action == "stop" + with _voice_sid_lock: + _voice_event_sid = params.get("session_id") or _voice_event_sid + from hermes_cli.voice import stop_continuous - stop_continuous() + stop_continuous(force_transcribe=True) return _ok(rid, {"status": "stopped"}) except ImportError: return _err( diff --git a/ui-tui/src/__tests__/useInputHandlers.test.ts b/ui-tui/src/__tests__/useInputHandlers.test.ts new file mode 100644 index 0000000000..066292abfa --- /dev/null +++ b/ui-tui/src/__tests__/useInputHandlers.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it, vi } from 'vitest' + +import { applyVoiceRecordResponse } from '../app/useInputHandlers.js' + +describe('applyVoiceRecordResponse', () => { + it('reverts optimistic REC state when the gateway reports voice busy', () => { + const setProcessing = vi.fn() + const setRecording = vi.fn() + const sys = vi.fn() + + applyVoiceRecordResponse({ status: 'busy' }, true, { setProcessing, setRecording }, sys) + + expect(setRecording).toHaveBeenCalledWith(false) + expect(setProcessing).toHaveBeenCalledWith(true) + expect(sys).toHaveBeenCalledWith('voice: still transcribing; try again shortly') + }) + + it('keeps optimistic REC state for successful recording starts', () => { + const setProcessing = vi.fn() + const setRecording = vi.fn() + + applyVoiceRecordResponse({ status: 'recording' }, true, { setProcessing, setRecording }, vi.fn()) + + expect(setRecording).not.toHaveBeenCalled() + expect(setProcessing).not.toHaveBeenCalled() + }) + + it('reverts optimistic REC state when the gateway returns null', () => { + const setProcessing = vi.fn() + const setRecording = vi.fn() + + applyVoiceRecordResponse(null, true, { setProcessing, setRecording }, vi.fn()) + + expect(setRecording).toHaveBeenCalledWith(false) + expect(setProcessing).toHaveBeenCalledWith(false) + }) +}) diff --git a/ui-tui/src/app/useInputHandlers.ts b/ui-tui/src/app/useInputHandlers.ts index 3d85a500d8..ce25af70ed 100644 --- a/ui-tui/src/app/useInputHandlers.ts +++ b/ui-tui/src/app/useInputHandlers.ts @@ -23,6 +23,26 @@ import { getUiState } from './uiStore.js' const isCtrl = (key: { ctrl: boolean }, ch: string, target: string) => key.ctrl && ch.toLowerCase() === target +export function applyVoiceRecordResponse( + response: null | VoiceRecordResponse, + starting: boolean, + voice: Pick, + sys: (text: string) => void +) { + if (!starting || response?.status === 'recording') { + return + } + + voice.setRecording(false) + + if (response?.status === 'busy') { + voice.setProcessing(true) + sys('voice: still transcribing; try again shortly') + } else { + voice.setProcessing(false) + } +} + export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { const { actions, composer, gateway, terminal, voice, wheelStep } = ctx const { actions: cActions, refs: cRefs, state: cState } = composer @@ -157,11 +177,12 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { } } - // CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop + // CLI parity: Ctrl+B toggles a VAD-bounded push-to-talk capture // (NOT the voice-mode umbrella bit). The mode is enabled via /voice on; // Ctrl+B while the mode is off sys-nudges the user. While the mode is - // on, the first press starts a continuous loop (gateway → start_continuous, - // VAD auto-stop → transcribe → auto-restart), a subsequent press stops it. + // on, the first press starts a single VAD-bounded capture + // (gateway -> start_continuous(auto_restart=false), VAD auto-stop -> + // transcribe -> idle), a subsequent press stops and transcribes it. // The gateway publishes voice.status + voice.transcript events that // createGatewayEventHandler turns into UI badges and composer injection. const voiceRecordToggle = () => { @@ -182,14 +203,17 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { voice.setProcessing(false) } - gateway.rpc('voice.record', { action }).catch((e: Error) => { - // Revert optimistic UI on failure. - if (starting) { - voice.setRecording(false) - } + gateway + .rpc('voice.record', { action, session_id: getUiState().sid }) + .then(r => applyVoiceRecordResponse(r, starting, voice, actions.sys)) + .catch((e: Error) => { + // Revert optimistic UI on failure. + if (starting) { + voice.setRecording(false) + } - actions.sys(`voice error: ${e.message}`) - }) + actions.sys(`voice error: ${e.message}`) + }) } useInput((ch, key) => { diff --git a/ui-tui/src/gatewayTypes.ts b/ui-tui/src/gatewayTypes.ts index 0dacd790f0..8c5cb18b23 100644 --- a/ui-tui/src/gatewayTypes.ts +++ b/ui-tui/src/gatewayTypes.ts @@ -295,7 +295,7 @@ export interface VoiceToggleResponse { } export interface VoiceRecordResponse { - status?: string + status?: 'busy' | 'recording' | 'stopped' text?: string }