From 98418afd5d81a4e01813b819f3001dc360579d6c Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Fri, 24 Apr 2026 01:33:10 +0300
Subject: [PATCH] =?UTF-8?q?fix(tui):=20break=20TTS=E2=86=92STT=20feedback?=
 =?UTF-8?q?=20loop=20+=20colorize=20REC=20badge?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TTS feedback loop (hermes_cli/voice.py)

The VAD loop kept the microphone live while speak_text played the
agent's reply over the speakers, so the reply itself was picked up,
transcribed, and submitted — the agent then replied to its own echo
("Ha, looks like we're in a loop").

Ported cli.py:_voice_tts_done synchronisation:

- _tts_playing: threading.Event (initially set = "not playing").
- speak_text cancels the active recorder before opening the speakers,
  clears _tts_playing, and on exit waits 300 ms before re-starting the
  recorder — long enough for the OS audio device to settle so afplay
  and sounddevice don't race for it.
- _continuous_on_silence now waits on _tts_playing (up to 60 s) before
  re-arming the mic with another 300 ms gap, mirroring
  cli.py:10619-10621.  If the user flips voice off during the wait the
  loop exits cleanly instead of fighting for the device.

Without both halves the loop races: if the silence callback fires
before TTS starts it re-arms immediately; if TTS is already playing
the pause-and-resume path catches it.

Red REC badge (ui-tui appChrome + useMainApp)

Classic CLI (cli.py:_get_voice_status_fragments) renders "● REC" in
red and "◉ STT" in amber.  TUI was showing a dim "REC" with no dot,
making it hard to spot at a glance.  voiceLabel now emits the same
glyphs and appChrome colours them via t.color.error / t.color.warn,
falling back to dim for the idle label.
---
 hermes_cli/voice.py                 | 74 +++++++++++++++++++++++++++++
 ui-tui/src/app/useMainApp.ts        |  4 +-
 ui-tui/src/components/appChrome.tsx | 15 +++++-
 3 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py
index 448021d11..4deee8636 100644
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@@ -87,6 +87,18 @@ _recorder_lock = threading.Lock()
 _continuous_lock = threading.Lock()
 _continuous_active = False
 _continuous_recorder: Any = None
+
+# ── TTS-vs-STT feedback guard ────────────────────────────────────────
+# When TTS plays the agent reply over the speakers, the live microphone
+# picks it up and transcribes the agent's own voice as user input — an
+# infinite loop the agent happily joins ("Ha, looks like we're in a loop").
+# This Event mirrors cli.py:_voice_tts_done: cleared while speak_text is
+# playing, set while silent. _continuous_on_silence waits on it before
+# re-arming the recorder, and speak_text itself cancels any live capture
+# before starting playback so the tail of the previous utterance doesn't
+# leak into the mic.
+_tts_playing = threading.Event()
+_tts_playing.set()  # initially "not playing"
 _continuous_on_transcript: Optional[Callable[[str], None]] = None
 _continuous_on_status: Optional[Callable[[str], None]] = None
 _continuous_on_silent_limit: Optional[Callable[[], None]] = None
@@ -379,6 +391,23 @@ def _continuous_on_silence() -> None:
                 pass
         return
 
+    # CLI parity (cli.py:10619-10621): wait for any in-flight TTS to
+    # finish before re-arming the mic, then leave a small gap to avoid
+    # catching the tail of the speaker output.  Without this the voice
+    # loop becomes a feedback loop — the agent's spoken reply lands
+    # back in the mic and gets re-submitted.
+    if not _tts_playing.is_set():
+        _debug("_continuous_on_silence: waiting for TTS to finish")
+        _tts_playing.wait(timeout=60)
+        import time as _time
+        _time.sleep(0.3)
+
+        # User may have stopped the loop during the wait.
+        with _continuous_lock:
+            if not _continuous_active:
+                _debug("_continuous_on_silence: stopped while waiting for TTS")
+                return
+
     # Restart for the next turn.
     _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
     _play_beep(frequency=880, count=1)
@@ -409,6 +438,11 @@ def speak_text(text: str) -> None:
     MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
     of both extensions. Keeping these in sync means a voice-mode TTS
     session in the TUI sounds identical to one in the classic CLI.
+
+    While playback is in flight the module-level _tts_playing Event is
+    cleared so the continuous-recording loop knows to wait before
+    re-arming the mic (otherwise the agent's spoken reply feedback-loops
+    through the microphone and the agent ends up replying to itself).
     """
     if not text or not text.strip():
         return
@@ -417,6 +451,26 @@ def speak_text(text: str) -> None:
     import tempfile
     import time
 
+    # Cancel any live capture before we open the speakers — otherwise the
+    # last ~200ms of the user's turn tail + the first syllables of our TTS
+    # both end up in the next recording window.  The continuous loop will
+    # re-arm itself after _tts_playing flips back (see _continuous_on_silence).
+    paused_recording = False
+    with _continuous_lock:
+        if (
+            _continuous_active
+            and _continuous_recorder is not None
+            and getattr(_continuous_recorder, "is_recording", False)
+        ):
+            try:
+                _continuous_recorder.cancel()
+                paused_recording = True
+            except Exception as e:
+                logger.warning("failed to pause recorder for TTS: %s", e)
+
+    _tts_playing.clear()
+    _debug(f"speak_text: TTS begin (paused_recording={paused_recording})")
+
     try:
         from tools.tts_tool import text_to_speech_tool
 
@@ -463,3 +517,23 @@ def speak_text(text: str) -> None:
     except Exception as e:
         logger.warning("Voice TTS playback failed: %s", e)
         _debug(f"speak_text raised {type(e).__name__}: {e}")
+    finally:
+        _tts_playing.set()
+        _debug("speak_text: TTS done")
+
+        # Re-arm the mic so the user can answer without pressing Ctrl+B.
+        # Small delay lets the OS flush speaker output and afplay fully
+        # release the audio device before sounddevice re-opens the input.
+        if paused_recording:
+            time.sleep(0.3)
+            with _continuous_lock:
+                if _continuous_active and _continuous_recorder is not None:
+                    try:
+                        _continuous_recorder.start(
+                            on_silence_stop=_continuous_on_silence
+                        )
+                        _debug("speak_text: recording resumed after TTS")
+                    except Exception as e:
+                        logger.warning(
+                            "failed to resume recorder after TTS: %s", e
+                        )
diff --git a/ui-tui/src/app/useMainApp.ts b/ui-tui/src/app/useMainApp.ts
index c061aa5dd..7b742478e 100644
--- a/ui-tui/src/app/useMainApp.ts
+++ b/ui-tui/src/app/useMainApp.ts
@@ -716,7 +716,9 @@ export function useMainApp(gw: GatewayClient) {
       statusColor: statusColorOf(ui.status, ui.theme.color),
       stickyPrompt,
       turnStartedAt: ui.sid ? turnStartedAt : null,
-      voiceLabel: voiceRecording ? 'REC' : voiceProcessing ? 'STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
+      // CLI parity: the classic prompt_toolkit status bar shows a red dot
+      // on REC (cli.py:_get_voice_status_fragments line 2344).
+      voiceLabel: voiceRecording ? '● REC' : voiceProcessing ? '◉ STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
     }),
     [
       cwd,
diff --git a/ui-tui/src/components/appChrome.tsx b/ui-tui/src/components/appChrome.tsx
index 8de2a6301..7b697eedc 100644
--- a/ui-tui/src/components/appChrome.tsx
+++ b/ui-tui/src/components/appChrome.tsx
@@ -215,7 +215,20 @@ export function StatusRule({
             </Text>
           ) : null}
           <SpawnHud t={t} />
-          {voiceLabel ? <Text color={t.color.dim}> │ {voiceLabel}</Text> : null}
+          {voiceLabel ? (
+            <Text
+              color={
+                voiceLabel.startsWith('●')
+                  ? t.color.error
+                  : voiceLabel.startsWith('◉')
+                    ? t.color.warn
+                    : t.color.dim
+              }
+            >
+              {' │ '}
+              {voiceLabel}
+            </Text>
+          ) : null}
           {bgCount > 0 ? <Text color={t.color.dim}> │ {bgCount} bg</Text> : null}
           {showCost && typeof usage.cost_usd === 'number' ? (
             <Text color={t.color.dim}> │ ${usage.cost_usd.toFixed(4)}</Text>