From 98418afd5d81a4e01813b819f3001dc360579d6c Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Fri, 24 Apr 2026 01:33:10 +0300
Subject: [PATCH] =?UTF-8?q?fix(tui):=20break=20TTS=E2=86=92STT=20feedback?=
=?UTF-8?q?=20loop=20+=20colorize=20REC=20badge?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
TTS feedback loop (hermes_cli/voice.py)
The VAD loop kept the microphone live while speak_text played the
agent's reply over the speakers, so the reply itself was picked up,
transcribed, and submitted — the agent then replied to its own echo
("Ha, looks like we're in a loop").
Ported cli.py:_voice_tts_done synchronisation:
- _tts_playing: threading.Event with inverted polarity — set means
"not playing" (the initial state), cleared means TTS is playing.
- speak_text cancels the active recorder before opening the speakers
and clears _tts_playing; on exit it sets the event again and, if it
paused a capture, waits 300 ms before re-starting the recorder —
long enough for the OS audio device to settle so afplay and
sounddevice don't race for it.
- _continuous_on_silence now waits on _tts_playing (up to 60 s) before
re-arming the mic with another 300 ms gap, mirroring
cli.py:10619-10621. If the user flips voice off during the wait the
loop exits cleanly instead of fighting for the device.
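Both halves are needed because the order of events varies: if the
silence callback fires before TTS starts, the mic is re-armed
immediately and speak_text has to cancel that live capture; if TTS is
already playing when the callback fires, the wait in
_continuous_on_silence holds the re-arm until playback ends.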
Red REC badge (ui-tui appChrome + useMainApp)
Classic CLI (cli.py:_get_voice_status_fragments) renders "● REC" in
red and "◉ STT" in amber. TUI was showing a dim "REC" with no dot,
making it hard to spot at a glance. voiceLabel now emits the same
glyphs and appChrome colours them via t.color.error / t.color.warn,
falling back to dim for the idle label.
---
hermes_cli/voice.py | 74 +++++++++++++++++++++++++++++
ui-tui/src/app/useMainApp.ts | 4 +-
ui-tui/src/components/appChrome.tsx | 15 +++++-
3 files changed, 91 insertions(+), 2 deletions(-)
diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py
index 448021d11..4deee8636 100644
--- a/hermes_cli/voice.py
+++ b/hermes_cli/voice.py
@@ -87,6 +87,18 @@ _recorder_lock = threading.Lock()
_continuous_lock = threading.Lock()
_continuous_active = False
_continuous_recorder: Any = None
+
+# ── TTS-vs-STT feedback guard ────────────────────────────────────────
+# When TTS plays the agent reply over the speakers, the live microphone
+# picks it up and transcribes the agent's own voice as user input — an
+# infinite loop the agent happily joins ("Ha, looks like we're in a loop").
+# This Event mirrors cli.py:_voice_tts_done: cleared while speak_text is
+# playing, set while silent. _continuous_on_silence waits on it before
+# re-arming the recorder, and speak_text itself cancels any live capture
+# before starting playback so the tail of the previous utterance doesn't
+# leak into the mic.
+_tts_playing = threading.Event()
+_tts_playing.set() # initially "not playing"
_continuous_on_transcript: Optional[Callable[[str], None]] = None
_continuous_on_status: Optional[Callable[[str], None]] = None
_continuous_on_silent_limit: Optional[Callable[[], None]] = None
@@ -379,6 +391,23 @@ def _continuous_on_silence() -> None:
pass
return
+ # CLI parity (cli.py:10619-10621): wait for any in-flight TTS to
+ # finish before re-arming the mic, then leave a small gap to avoid
+ # catching the tail of the speaker output. Without this the voice
+ # loop becomes a feedback loop — the agent's spoken reply lands
+ # back in the mic and gets re-submitted.
+ if not _tts_playing.is_set():
+ _debug("_continuous_on_silence: waiting for TTS to finish")
+ _tts_playing.wait(timeout=60)
+ import time as _time
+ _time.sleep(0.3)
+
+ # User may have stopped the loop during the wait.
+ with _continuous_lock:
+ if not _continuous_active:
+ _debug("_continuous_on_silence: stopped while waiting for TTS")
+ return
+
# Restart for the next turn.
_debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
_play_beep(frequency=880, count=1)
@@ -409,6 +438,11 @@ def speak_text(text: str) -> None:
MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
of both extensions. Keeping these in sync means a voice-mode TTS
session in the TUI sounds identical to one in the classic CLI.
+
+ While playback is in flight the module-level _tts_playing Event is
+ cleared so the continuous-recording loop knows to wait before
+ re-arming the mic (otherwise the agent's spoken reply feedback-loops
+ through the microphone and the agent ends up replying to itself).
"""
if not text or not text.strip():
return
@@ -417,6 +451,26 @@ def speak_text(text: str) -> None:
import tempfile
import time
+ # Cancel any live capture before we open the speakers — otherwise the
+ # last ~200ms of the user's turn tail + the first syllables of our TTS
+ # both end up in the next recording window. The continuous loop will
+ # re-arm itself after _tts_playing flips back (see _continuous_on_silence).
+ paused_recording = False
+ with _continuous_lock:
+ if (
+ _continuous_active
+ and _continuous_recorder is not None
+ and getattr(_continuous_recorder, "is_recording", False)
+ ):
+ try:
+ _continuous_recorder.cancel()
+ paused_recording = True
+ except Exception as e:
+ logger.warning("failed to pause recorder for TTS: %s", e)
+
+ _tts_playing.clear()
+ _debug(f"speak_text: TTS begin (paused_recording={paused_recording})")
+
try:
from tools.tts_tool import text_to_speech_tool
@@ -463,3 +517,23 @@ def speak_text(text: str) -> None:
except Exception as e:
logger.warning("Voice TTS playback failed: %s", e)
_debug(f"speak_text raised {type(e).__name__}: {e}")
+ finally:
+ _tts_playing.set()
+ _debug("speak_text: TTS done")
+
+ # Re-arm the mic so the user can answer without pressing Ctrl+B.
+ # Small delay lets the OS flush speaker output and afplay fully
+ # release the audio device before sounddevice re-opens the input.
+ if paused_recording:
+ time.sleep(0.3)
+ with _continuous_lock:
+ if _continuous_active and _continuous_recorder is not None:
+ try:
+ _continuous_recorder.start(
+ on_silence_stop=_continuous_on_silence
+ )
+ _debug("speak_text: recording resumed after TTS")
+ except Exception as e:
+ logger.warning(
+ "failed to resume recorder after TTS: %s", e
+ )
diff --git a/ui-tui/src/app/useMainApp.ts b/ui-tui/src/app/useMainApp.ts
index c061aa5dd..7b742478e 100644
--- a/ui-tui/src/app/useMainApp.ts
+++ b/ui-tui/src/app/useMainApp.ts
@@ -716,7 +716,9 @@ export function useMainApp(gw: GatewayClient) {
statusColor: statusColorOf(ui.status, ui.theme.color),
stickyPrompt,
turnStartedAt: ui.sid ? turnStartedAt : null,
- voiceLabel: voiceRecording ? 'REC' : voiceProcessing ? 'STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
+ // CLI parity: the classic prompt_toolkit status bar shows a red dot
+ // on REC (cli.py:_get_voice_status_fragments line 2344).
+ voiceLabel: voiceRecording ? '● REC' : voiceProcessing ? '◉ STT' : `voice ${voiceEnabled ? 'on' : 'off'}`
}),
[
cwd,
diff --git a/ui-tui/src/components/appChrome.tsx b/ui-tui/src/components/appChrome.tsx
index 8de2a6301..7b697eedc 100644
--- a/ui-tui/src/components/appChrome.tsx
+++ b/ui-tui/src/components/appChrome.tsx
@@ -215,7 +215,20 @@ export function StatusRule({
) : null}
- {voiceLabel ? │ {voiceLabel} : null}
+ {voiceLabel ? (
+
+ {' │ '}
+ {voiceLabel}
+
+ ) : null}
{bgCount > 0 ? │ {bgCount} bg : null}
{showCost && typeof usage.cost_usd === 'number' ? (
│ ${usage.cost_usd.toFixed(4)}