fix(tui): restore voice push-to-talk parity (#20897)

* fix(tui): restore classic CLI voice push-to-talk parity (cherry picked from commit 93b9ae301b) * fix(tui): harden voice push-to-talk stop flow Address review feedback from PR #16189 by stopping the active recorder before background transcription, documenting single-shot voice capture, and covering the TUI gateway flags with regression tests. * fix(tui): preserve silent voice strike tracking Keep single-shot voice recording's no-speech counter alive across starts so the TUI can still emit the three-strikes auto-disable event, and bind the auto-restart state at module scope for type checking. * fix(tui): clean up voice stop failure path Address follow-up review by naming the TUI flow as single-shot push-to-talk and cancelling the recorder when forced stop cannot produce a WAV. * fix(tui): report busy voice capture starts Return explicit start state from the voice wrapper so the TUI gateway does not report recording while forced-stop transcription is still cleaning up. * fix(tui): handle busy voice record responses Apply the gateway busy status immediately in the TUI and route forced-stop voice events to the session that sent the stop request. * fix(tui): clear voice recording on null response Treat a null voice.record RPC result as a failed optimistic start so the REC badge cannot stick after gateway-side errors. * fix(tui): count silent manual voice stops Preserve single-shot voice no-speech strikes through forced stop transcription so empty push-to-talk captures still trigger the three-strikes guard. --------- Co-authored-by: Montbra <montbra@gmail.com>
2026-05-08 03:01:47 +00:00 · 2026-05-06 15:49:59 -07:00 · 2026-05-06 15:49:59 -07:00 · 04cf4788cc
commit 04cf4788cc
parent 5ccab51fa8
7 changed files with 527 additions and 57 deletions
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@ -281,6 +281,8 @@ _recorder_lock = threading.Lock()
 # ── Continuous (VAD) state ───────────────────────────────────────────
 _continuous_lock = threading.Lock()
 _continuous_active = False
+_continuous_stopping = False
+_continuous_auto_restart: bool = True
 _continuous_recorder: Any = None

 # ── TTS-vs-STT feedback guard ────────────────────────────────────────
@ -370,32 +372,43 @@ def start_continuous(
    on_silent_limit: Optional[Callable[[], None]] = None,
    silence_threshold: int = 200,
    silence_duration: float = 3.0,
-) -> None:
+    auto_restart: bool = True,
+) -> bool:
    """Start a VAD-driven continuous recording loop.

    The loop calls ``on_transcript(text)`` each time speech is detected and
-    transcribed successfully, then auto-restarts. After
-    ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
-    picked up at all) the loop stops itself and calls ``on_silent_limit``
-    so the UI can reflect "voice off". Idempotent — calling while already
-    active is a no-op.
+    transcribed successfully. If ``auto_restart`` is True, it auto-restarts
+    for the next turn and resets the no-speech counter for that loop. If
+    ``auto_restart`` is False, the first silence-triggered transcription ends
+    the loop and reports ``"idle"``; no-speech counts are retained across
+    starts so a push-to-talk caller can still enforce the three-strikes guard.
+    After ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
+    picked up at all) the loop stops itself and calls ``on_silent_limit`` so the
+    UI can reflect "voice off". Returns False if a previous stop is still
+    transcribing/cleaning up; otherwise returns True. Idempotent — calling while
+    already active is a successful no-op.

    ``on_status`` is called with ``"listening"`` / ``"transcribing"`` /
    ``"idle"`` so the UI can show a live indicator.
    """
-    global _continuous_active, _continuous_recorder
+    global _continuous_active, _continuous_recorder, _continuous_auto_restart
    global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
    global _continuous_no_speech_count

    with _continuous_lock:
        if _continuous_active:
            _debug("start_continuous: already active — no-op")
-            return
+            return True
+        if _continuous_stopping:
+            _debug("start_continuous: stop/transcribe in progress — busy")
+            return False
        _continuous_active = True
+        _continuous_auto_restart = auto_restart
        _continuous_on_transcript = on_transcript
        _continuous_on_status = on_status
        _continuous_on_silent_limit = on_silent_limit
-        _continuous_no_speech_count = 0
+        if auto_restart:
+            _continuous_no_speech_count = 0

        if _continuous_recorder is None:
            _continuous_recorder = create_audio_recorder()
@ -428,15 +441,18 @@ def start_continuous(
        except Exception:
            pass

+    return True

-def stop_continuous() -> None:
+
+def stop_continuous(force_transcribe: bool = False) -> None:
    """Stop the active continuous loop and release the microphone.

-    Idempotent — calling while not active is a no-op. Any in-flight
-    transcription completes but its result is discarded (the callback
-    checks ``_continuous_active`` before firing).
+    Idempotent — calling while not active is a no-op. If ``force_transcribe`` is
+    True, the recorder stops synchronously, then transcription/cleanup runs on a
+    background thread before reporting ``"idle"``. Otherwise the buffer is
+    discarded.
    """
-    global _continuous_active, _continuous_on_transcript
+    global _continuous_active, _continuous_on_transcript, _continuous_stopping
    global _continuous_on_status, _continuous_on_silent_limit
    global _continuous_recorder, _continuous_no_speech_count

@ -446,18 +462,98 @@ def stop_continuous() -> None:
        _continuous_active = False
        rec = _continuous_recorder
        on_status = _continuous_on_status
+        on_transcript = _continuous_on_transcript
+        on_silent_limit = _continuous_on_silent_limit
+        auto_restart = _continuous_auto_restart
+        track_no_speech = force_transcribe and not auto_restart
+        _continuous_stopping = rec is not None
        _continuous_on_transcript = None
        _continuous_on_status = None
        _continuous_on_silent_limit = None
-        _continuous_no_speech_count = 0
+        if not track_no_speech:
+            _continuous_no_speech_count = 0

    if rec is not None:
-        try:
-            # cancel() (not stop()) discards buffered frames — the loop
-            # is over, we don't want to transcribe a half-captured turn.
-            rec.cancel()
-        except Exception as e:
-            logger.warning("failed to cancel recorder: %s", e)
+        if force_transcribe and on_transcript:
+            if on_status:
+                try:
+                    on_status("transcribing")
+                except Exception:
+                    pass
+            try:
+                wav_path = rec.stop()
+            except Exception as e:
+                logger.warning("failed to stop recorder: %s", e)
+                try:
+                    rec.cancel()
+                except Exception as cancel_error:
+                    logger.warning("failed to cancel recorder: %s", cancel_error)
+                wav_path = None
+
+            def _transcribe_and_cleanup():
+                global _continuous_no_speech_count, _continuous_stopping
+                transcript: Optional[str] = None
+                should_halt = False
+
+                try:
+                    if wav_path:
+                        try:
+                            result = transcribe_recording(wav_path)
+                            if result.get("success"):
+                                text = (result.get("transcript") or "").strip()
+                                if text and not is_whisper_hallucination(text):
+                                    transcript = text
+                        finally:
+                            if os.path.isfile(wav_path):
+                                os.unlink(wav_path)
+                except Exception as e:
+                    logger.warning("failed to stop/transcribe recorder: %s", e)
+                finally:
+                    if transcript:
+                        try:
+                            on_transcript(transcript)
+                        except Exception as e:
+                            logger.warning("on_transcript callback raised: %s", e)
+
+                    if track_no_speech:
+                        with _continuous_lock:
+                            if transcript:
+                                _continuous_no_speech_count = 0
+                            else:
+                                _continuous_no_speech_count += 1
+                                should_halt = (
+                                    _continuous_no_speech_count
+                                    >= _CONTINUOUS_NO_SPEECH_LIMIT
+                                )
+                                if should_halt:
+                                    _continuous_no_speech_count = 0
+                        if should_halt and on_silent_limit:
+                            try:
+                                on_silent_limit()
+                            except Exception:
+                                pass
+
+                    _play_beep(frequency=660, count=2)
+                    with _continuous_lock:
+                        _continuous_stopping = False
+                    if on_status:
+                        try:
+                            on_status("idle")
+                        except Exception:
+                            pass
+
+            threading.Thread(target=_transcribe_and_cleanup, daemon=True).start()
+            return
+        else:
+            try:
+                # cancel() (not stop()) discards buffered frames — the loop
+                # is over, we don't want to transcribe a half-captured turn.
+                rec.cancel()
+            except Exception as e:
+                logger.warning("failed to cancel recorder: %s", e)
+
+    with _continuous_lock:
+        _continuous_stopping = False

    # Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the
    # silence-auto-stop path plays).
@ -603,23 +699,39 @@ def _continuous_on_silence() -> None:
                _debug("_continuous_on_silence: stopped while waiting for TTS")
                return

-    # Restart for the next turn.
-    _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
-    _play_beep(frequency=880, count=1)
-    try:
-        rec.start(on_silence_stop=_continuous_on_silence)
-    except Exception as e:
-        logger.error("failed to restart continuous recording: %s", e)
-        _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
+    if _continuous_auto_restart:
+        # Restart for the next turn.
+        _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
+        _play_beep(frequency=880, count=1)
+        try:
+            rec.start(on_silence_stop=_continuous_on_silence)
+        except Exception as e:
+            logger.error("failed to restart continuous recording: %s", e)
+            _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
+            with _continuous_lock:
+                _continuous_active = False
+            if on_status:
+                try:
+                    on_status("idle")
+                except Exception:
+                    pass
+            return
+
+        if on_status:
+            try:
+                on_status("listening")
+            except Exception:
+                pass
+    else:
+        # Do not auto-restart. Clean up state and notify idle.
+        _debug("_continuous_on_silence: auto_restart=False, stopping loop")
        with _continuous_lock:
            _continuous_active = False
-        return
-
-    if on_status:
-        try:
-            on_status("listening")
-        except Exception:
-            pass
+        if on_status:
+            try:
+                on_status("idle")
+            except Exception:
+                pass


 # ── TTS API ──────────────────────────────────────────────────────────