mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(tui): voice TTS speak-back + transcript-key bug + auto-submit
Three issues surfaced during end-to-end testing of the CLI-parity voice
loop and are fixed together because they all blocked "speak → agent
responds → TTS reads it back" from working at all:
1. Wrong result key (hermes_cli/voice.py)
transcribe_recording() returns {"success": bool, "transcript": str},
matching cli.py:_voice_stop_and_transcribe. The wrapper was reading
result.get("text"), which is None, so every successful Groq / local
STT response was thrown away and the 3-strikes halt fired after
three silent-looking cycles. Fixed by reading "transcript" and also
honouring "success" like the CLI does. Updated the loop simulation
tests to return the correct shape.
2. TTS speak-back was missing (tui_gateway/server.py + hermes_cli/voice.py)
The TUI had a voice.toggle "tts" subcommand but nothing downstream
actually read the flag — agent replies never spoke. Mirrored
cli.py:8747-8754's dispatch: on message.complete with status ==
"complete", if _voice_tts_enabled() is true, spawn a daemon thread
running speak_text(response). Rewrote speak_text as a full port of
cli.py:_voice_speak_response — same markdown-strip regex pipeline
(code blocks, links, bold/italic, inline code, headers, list bullets,
horizontal rules, excessive newlines), same 4000-char cap, same
explicit mp3 output path, same MP3-over-OGG playback choice (afplay
misbehaves on OGG), same cleanup of both extensions. Keeps TUI TTS
audible output byte-for-byte identical to the classic CLI.
3. Auto-submit swallowed on non-empty composer (createGatewayEventHandler.ts)
The voice.transcript handler branched on prev input via a setInput
updater and fired submitRef.current inside the updater when prev was
empty. React strict mode double-invokes state updaters, which would
queue the submit twice; and when the composer had any content the
transcript was merely appended — the agent never saw it. CLI
_pending_input.put(transcript) unconditionally feeds the transcript
as the next turn, so match that: always clear the composer and
setTimeout(() => submitRef.current(text), 0) outside any updater.
Side effect can't run twice this way, and a half-typed draft on the
rare occasion is a fair trade vs. silently dropping the turn.
Also added peak_rms to the rec.stop debug line so "recording too quiet"
is diagnosable at a glance when HERMES_VOICE_DEBUG=1.
This commit is contained in:
parent
04c489b587
commit
42ff785771
3 changed files with 84 additions and 38 deletions
|
|
@ -21,7 +21,6 @@ Two usage modes are exposed:
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
|
@ -405,34 +404,62 @@ def _continuous_on_silence() -> None:
|
|||
def speak_text(text: str) -> None:
|
||||
"""Synthesize ``text`` with the configured TTS provider and play it.
|
||||
|
||||
The gateway spawns a daemon thread to call this so the RPC returns
|
||||
immediately. Failures are logged and swallowed.
|
||||
Mirrors cli.py:_voice_speak_response exactly — same markdown strip
|
||||
pipeline, same 4000-char cap, same explicit mp3 output path, same
|
||||
MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
|
||||
of both extensions. Keeping these in sync means a voice-mode TTS
|
||||
session in the TUI sounds identical to one in the classic CLI.
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return
|
||||
|
||||
# Lazy import — tts_tool pulls optional provider SDKs.
|
||||
from tools.tts_tool import text_to_speech_tool
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
try:
|
||||
raw = text_to_speech_tool(text)
|
||||
from tools.tts_tool import text_to_speech_tool
|
||||
|
||||
tts_text = text[:4000] if len(text) > 4000 else text
|
||||
tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text) # fenced code blocks
|
||||
tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text) # [text](url) → text
|
||||
tts_text = re.sub(r'https?://\S+', '', tts_text) # bare URLs
|
||||
tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold
|
||||
tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic
|
||||
tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # inline code
|
||||
tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers
|
||||
tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list bullets
|
||||
tts_text = re.sub(r'---+', '', tts_text) # horizontal rules
|
||||
tts_text = re.sub(r'\n{3,}', '\n\n', tts_text) # excess newlines
|
||||
tts_text = tts_text.strip()
|
||||
if not tts_text:
|
||||
return
|
||||
|
||||
# MP3 output path, pre-chosen so we can play the MP3 directly even
|
||||
# when text_to_speech_tool auto-converts to OGG for messaging
|
||||
# platforms. afplay's OGG support is flaky, MP3 always works.
|
||||
os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True)
|
||||
mp3_path = os.path.join(
|
||||
tempfile.gettempdir(),
|
||||
"hermes_voice",
|
||||
f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3",
|
||||
)
|
||||
|
||||
_debug(f"speak_text: synthesizing {len(tts_text)} chars -> {mp3_path}")
|
||||
text_to_speech_tool(text=tts_text, output_path=mp3_path)
|
||||
|
||||
if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0:
|
||||
_debug(f"speak_text: playing {mp3_path} ({os.path.getsize(mp3_path)} bytes)")
|
||||
play_audio_file(mp3_path)
|
||||
try:
|
||||
os.unlink(mp3_path)
|
||||
ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg"
|
||||
if os.path.isfile(ogg_path):
|
||||
os.unlink(ogg_path)
|
||||
except OSError:
|
||||
pass
|
||||
else:
|
||||
_debug(f"speak_text: TTS tool produced no audio at {mp3_path}")
|
||||
except Exception as e:
|
||||
logger.warning("TTS synthesis failed: %s", e)
|
||||
return
|
||||
|
||||
try:
|
||||
result = json.loads(raw) if isinstance(raw, str) else raw
|
||||
except json.JSONDecodeError:
|
||||
logger.warning("TTS returned non-JSON result")
|
||||
return
|
||||
|
||||
if not isinstance(result, dict):
|
||||
return
|
||||
|
||||
file_path = result.get("file_path")
|
||||
if not file_path:
|
||||
err = result.get("error") or "no file_path in TTS result"
|
||||
logger.warning("TTS succeeded but produced no audio: %s", err)
|
||||
return
|
||||
|
||||
play_audio_file(file_path)
|
||||
logger.warning("Voice TTS playback failed: %s", e)
|
||||
_debug(f"speak_text raised {type(e).__name__}: {e}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue