From 1a6fbef8a9c046ee2d45da8534663b64453b6502 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:17:05 +0300 Subject: [PATCH 01/93] feat: add voice mode with push-to-talk and TTS output for CLI Implements Issue #314 Phase 2 & 3: - /voice command to toggle voice mode (on/off/tts/status) - Ctrl+Space push-to-talk recording via sounddevice - Whisper STT transcription via existing transcription_tools - Optional TTS response playback via existing tts_tool - Visual indicators in prompt (recording/transcribing/voice) - 21 unit tests, all mocked (no real mic/API) - Optional deps: sounddevice, numpy (pip install hermes-agent[voice]) --- cli.py | 278 +++++++++++++++++++++++++- hermes_cli/commands.py | 1 + hermes_cli/config.py | 6 + pyproject.toml | 2 + tests/tools/test_voice_mode.py | 347 +++++++++++++++++++++++++++++++++ tools/voice_mode.py | 344 ++++++++++++++++++++++++++++++++ 6 files changed, 977 insertions(+), 1 deletion(-) create mode 100644 tests/tools/test_voice_mode.py create mode 100644 tools/voice_mode.py diff --git a/cli.py b/cli.py index 253cdd085e..b86e2bb82c 100755 --- a/cli.py +++ b/cli.py @@ -3017,6 +3017,8 @@ class HermesCLI: self._handle_background_command(cmd_original) elif cmd_lower.startswith("/skin"): self._handle_skin_command(cmd_original) + elif cmd_lower.startswith("/voice"): + self._handle_voice_command(cmd_original) else: # Check for user-defined quick commands (bypass agent loop, no LLM call) base_cmd = cmd_lower.split()[0] @@ -3511,6 +3513,201 @@ class HermesCLI: except Exception as e: print(f" ❌ MCP reload failed: {e}") + # ==================================================================== + # Voice mode methods + # ==================================================================== + + def _voice_start_recording(self): + """Start capturing audio from the microphone.""" + from tools.voice_mode import AudioRecorder, check_voice_requirements + + reqs = check_voice_requirements() + if not 
reqs["audio_available"]: + raise RuntimeError( + "Voice mode requires sounddevice and numpy.\n" + "Install with: pip install sounddevice numpy\n" + "Or: pip install hermes-agent[voice]" + ) + if not reqs["stt_key_set"]: + raise RuntimeError( + "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n" + "Get one at: https://platform.openai.com/api-keys" + ) + + if self._voice_recorder is None: + self._voice_recorder = AudioRecorder() + + self._voice_recorder.start() + self._voice_recording = True + _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+Space to stop, Ctrl+C to cancel){_RST}") + + def _voice_stop_and_transcribe(self): + """Stop recording, transcribe via STT, and queue the transcript as input.""" + try: + if self._voice_recorder is None: + return + + wav_path = self._voice_recorder.stop() + self._voice_recording = False + + if wav_path is None: + _cprint(f"{_DIM}No speech detected (recording too short).{_RST}") + return + + self._voice_processing = True + if hasattr(self, '_app') and self._app: + self._app.invalidate() + _cprint(f"{_DIM}Transcribing...{_RST}") + + # Get STT model from config + stt_model = None + try: + from hermes_cli.config import load_config + stt_config = load_config().get("stt", {}) + stt_model = stt_config.get("model") + except Exception: + pass + + from tools.voice_mode import transcribe_recording + result = transcribe_recording(wav_path, model=stt_model) + + if result.get("success") and result.get("transcript", "").strip(): + transcript = result["transcript"].strip() + _cprint(f"\n{_GOLD}●{_RST} {_BOLD}{transcript}{_RST}") + self._pending_input.put(transcript) + elif result.get("success"): + _cprint(f"{_DIM}No speech detected.{_RST}") + else: + error = result.get("error", "Unknown error") + _cprint(f"\n{_DIM}Transcription failed: {error}{_RST}") + + except Exception as e: + _cprint(f"\n{_DIM}Voice processing error: {e}{_RST}") + finally: + self._voice_processing = False + if hasattr(self, '_app') and self._app: + 
self._app.invalidate() + # Clean up temp file + try: + if wav_path and os.path.isfile(wav_path): + os.unlink(wav_path) + except Exception: + pass + + def _voice_speak_response(self, text: str): + """Speak the agent's response aloud using TTS (runs in background thread).""" + if not self._voice_tts: + return + try: + from tools.tts_tool import text_to_speech_tool + from tools.voice_mode import play_audio_file + import json + + # Truncate to TTS limit + tts_text = text[:4000] if len(text) > 4000 else text + result_json = text_to_speech_tool(text=tts_text) + result = json.loads(result_json) + + if result.get("success") and result.get("file_path"): + play_audio_file(result["file_path"]) + except Exception as e: + logger.debug("Voice TTS playback failed: %s", e) + + def _handle_voice_command(self, command: str): + """Handle /voice [on|off|tts|status] command.""" + parts = command.strip().split(maxsplit=1) + subcommand = parts[1].lower().strip() if len(parts) > 1 else "" + + if subcommand == "on": + self._enable_voice_mode() + elif subcommand == "off": + self._disable_voice_mode() + elif subcommand == "tts": + self._toggle_voice_tts() + elif subcommand == "status": + self._show_voice_status() + elif subcommand == "": + # Toggle + if self._voice_mode: + self._disable_voice_mode() + else: + self._enable_voice_mode() + else: + print(f"Unknown voice subcommand: {subcommand}") + print("Usage: /voice [on|off|tts|status]") + + def _enable_voice_mode(self): + """Enable voice mode after checking requirements.""" + from tools.voice_mode import check_voice_requirements + + reqs = check_voice_requirements() + if not reqs["available"]: + _cprint(f"\n{_GOLD}Voice mode requirements not met:{_RST}") + for line in reqs["details"].split("\n"): + _cprint(f" {_DIM}{line}{_RST}") + if reqs["missing_packages"]: + _cprint(f"\n {_BOLD}Install: pip install {' '.join(reqs['missing_packages'])}{_RST}") + _cprint(f" {_DIM}Or: pip install hermes-agent[voice]{_RST}") + return + + self._voice_mode = 
True + + # Check config for auto_tts + try: + from hermes_cli.config import load_config + voice_config = load_config().get("voice", {}) + if voice_config.get("auto_tts", False): + self._voice_tts = True + except Exception: + pass + + tts_status = " (TTS enabled)" if self._voice_tts else "" + _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}") + _cprint(f" {_DIM}Ctrl+Space to start/stop recording{_RST}") + _cprint(f" {_DIM}/voice tts to toggle speech output{_RST}") + _cprint(f" {_DIM}/voice off to disable voice mode{_RST}") + + def _disable_voice_mode(self): + """Disable voice mode and cancel any active recording.""" + if self._voice_recording and self._voice_recorder: + self._voice_recorder.cancel() + self._voice_recording = False + + self._voice_mode = False + self._voice_tts = False + _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") + + def _toggle_voice_tts(self): + """Toggle TTS output for voice mode.""" + if not self._voice_mode: + _cprint(f"{_DIM}Enable voice mode first: /voice on{_RST}") + return + + self._voice_tts = not self._voice_tts + status = "enabled" if self._voice_tts else "disabled" + + if self._voice_tts: + from tools.tts_tool import check_tts_requirements + if not check_tts_requirements(): + _cprint(f"{_DIM}Warning: No TTS provider available. 
Install edge-tts or set API keys.{_RST}") + + _cprint(f"{_GOLD}Voice TTS {status}.{_RST}") + + def _show_voice_status(self): + """Show current voice mode status.""" + from tools.voice_mode import check_voice_requirements + + reqs = check_voice_requirements() + + _cprint(f"\n{_BOLD}Voice Mode Status{_RST}") + _cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}") + _cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}") + _cprint(f" Recording: {'YES' if self._voice_recording else 'no'}") + _cprint(f" Record key: Ctrl+Space") + _cprint(f"\n {_BOLD}Requirements:{_RST}") + for line in reqs["details"].split("\n"): + _cprint(f" {line}") + def _clarify_callback(self, question, choices): """ Platform callback for the clarify tool. Called from the agent thread. @@ -3876,12 +4073,23 @@ class HermesCLI: padding=(1, 2), )) + + # Play terminal bell when agent finishes (if enabled). # Works over SSH — the bell propagates to the user's terminal. if self.bell_on_complete: sys.stdout.write("\a") sys.stdout.flush() - + + # Speak response aloud if voice TTS is enabled + if self._voice_tts and response: + threading.Thread( + target=self._voice_speak_response, + args=(response,), + daemon=True, + ).start() + + # Combine all interrupt messages (user may have typed multiple while waiting) # and re-queue as one prompt for process_loop if pending_message and hasattr(self, '_pending_input'): @@ -3964,6 +4172,10 @@ class HermesCLI: def _get_tui_prompt_fragments(self): """Return the prompt_toolkit fragments for the current interactive state.""" symbol, state_suffix = self._get_tui_prompt_symbols() + if self._voice_recording: + return [("class:voice-recording", f"● {state_suffix}")] + if self._voice_processing: + return [("class:voice-processing", f"◉ {state_suffix}")] if self._sudo_state: return [("class:sudo-prompt", f"🔐 {state_suffix}")] if self._secret_state: @@ -3978,6 +4190,8 @@ class HermesCLI: return [("class:prompt-working", f"{self._command_spinner_frame()} {state_suffix}")] if 
self._agent_running: return [("class:prompt-working", f"⚕ {state_suffix}")] + if self._voice_mode: + return [("class:voice-prompt", f"🎤 {state_suffix}")] return [("class:prompt", symbol)] def _get_tui_prompt_text(self) -> str: @@ -4070,6 +4284,13 @@ class HermesCLI: self._attached_images: list[Path] = [] self._image_counter = 0 + # Voice mode state + self._voice_mode = False # Whether voice mode is enabled + self._voice_tts = False # Whether TTS output is enabled + self._voice_recorder = None # AudioRecorder instance (lazy init) + self._voice_recording = False # Whether currently recording + self._voice_processing = False # Whether STT is in progress + # Register callbacks so terminal_tool prompts route through our UI set_sudo_password_callback(self._sudo_password_callback) set_approval_callback(self._approval_callback) @@ -4254,6 +4475,7 @@ class HermesCLI: """Handle Ctrl+C - cancel interactive prompts, interrupt agent, or exit. Priority: + 0. Cancel active voice recording 1. Cancel active sudo/approval/clarify prompt 2. Interrupt the running agent (first press) 3. 
Force exit (second press within 2s, or when idle) @@ -4261,6 +4483,14 @@ class HermesCLI: import time as _time now = _time.time() + # Cancel active voice recording + if cli_ref._voice_recording and cli_ref._voice_recorder: + cli_ref._voice_recorder.cancel() + cli_ref._voice_recording = False + _cprint(f"\n{_DIM}Recording cancelled.{_RST}") + event.app.invalidate() + return + # Cancel sudo prompt if self._sudo_state: self._sudo_state["response_queue"].put("") @@ -4367,6 +4597,30 @@ class HermesCLI: # No image found — show a hint pass # silent when no image (avoid noise on accidental press) + @kb.add('c-space') + def handle_ctrl_space(event): + """Toggle voice recording when voice mode is active.""" + if not cli_ref._voice_mode: + return + if cli_ref._agent_running: + return + # Block recording during interactive prompts + if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: + return + if cli_ref._voice_recording: + cli_ref._voice_recording = False + event.app.invalidate() + threading.Thread( + target=cli_ref._voice_stop_and_transcribe, + daemon=True, + ).start() + else: + try: + cli_ref._voice_start_recording() + event.app.invalidate() + except Exception as e: + _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") + # Dynamic prompt: shows Hermes symbol when agent is working, # or answer prompt when clarify freetext mode is active. cli_ref = self @@ -4460,6 +4714,10 @@ class HermesCLI: return Transformation(fragments=ti.fragments) def _get_placeholder(): + if cli_ref._voice_recording: + return "recording... Ctrl+Space to stop, Ctrl+C to cancel" + if cli_ref._voice_processing: + return "transcribing..." 
if cli_ref._sudo_state: return "type password (hidden), Enter to skip" if cli_ref._secret_state: @@ -4476,6 +4734,8 @@ class HermesCLI: return f"{frame} {status}" if cli_ref._agent_running: return "type a message + Enter to interrupt, Ctrl+C to cancel" + if cli_ref._voice_mode: + return "type or Ctrl+Space to record" return "" input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder)) @@ -4869,6 +5129,10 @@ class HermesCLI: 'approval-cmd': '#AAAAAA italic', 'approval-choice': '#AAAAAA', 'approval-selected': '#FFD700 bold', + # Voice mode + 'voice-prompt': '#87CEEB', + 'voice-recording': '#FF4444 bold', + 'voice-processing': '#FFA500 italic', } style = PTStyle.from_dict(self._build_tui_style_dict()) @@ -4993,6 +5257,18 @@ class HermesCLI: self.agent.flush_memories(self.conversation_history) except Exception: pass + # Cancel active voice recording + if hasattr(self, '_voice_recorder') and self._voice_recorder and self._voice_recording: + try: + self._voice_recorder.cancel() + except Exception: + pass + # Clean up old temp voice recordings + try: + from tools.voice_mode import cleanup_temp_recordings + cleanup_temp_recordings() + except Exception: + pass # Unregister callbacks to avoid dangling references set_sudo_password_callback(None) set_approval_callback(None) diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 57899cf085..a9a1a67ba7 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -37,6 +37,7 @@ COMMANDS_BY_CATEGORY = { "/verbose": "Cycle tool progress display: off → new → all → verbose", "/reasoning": "Manage reasoning effort and display (usage: /reasoning [level|show|hide])", "/skin": "Show or change the display skin/theme", + "/voice": "Toggle voice mode (Ctrl+B to record). 
Usage: /voice [on|off|tts|status]", }, "Tools & Skills": { "/tools": "List available tools", diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 02edad1fae..dbbe41c10b 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -202,6 +202,12 @@ DEFAULT_CONFIG = { "model": "whisper-1", # whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe }, }, + + "voice": { + "record_key": "ctrl+space", + "max_recording_seconds": 120, + "auto_tts": False, + }, "human_delay": { "mode": "off", diff --git a/pyproject.toml b/pyproject.toml index 7e4197724b..eb005ab942 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ cron = ["croniter"] slack = ["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cli = ["simple-term-menu"] tts-premium = ["elevenlabs"] +voice = ["sounddevice>=0.4.6", "numpy>=1.24.0"] pty = [ "ptyprocess>=0.7.0; sys_platform != 'win32'", "pywinpty>=2.0.0; sys_platform == 'win32'", @@ -78,6 +79,7 @@ all = [ "hermes-agent[mcp]", "hermes-agent[homeassistant]", "hermes-agent[acp]", + "hermes-agent[voice]", ] [project.scripts] diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py new file mode 100644 index 0000000000..fe841f5cb7 --- /dev/null +++ b/tests/tools/test_voice_mode.py @@ -0,0 +1,347 @@ +"""Tests for tools.voice_mode -- all mocked, no real microphone or API calls.""" + +import os +import struct +import time +import wave +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + + +# ============================================================================ +# Fixtures +# ============================================================================ + +@pytest.fixture +def sample_wav(tmp_path): + """Create a minimal valid WAV file (1 second of silence at 16kHz).""" + wav_path = tmp_path / "test.wav" + n_frames = 16000 # 1 second at 16kHz + silence = struct.pack(f"<{n_frames}h", *([0] * n_frames)) + + with wave.open(str(wav_path), "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + 
wf.setframerate(16000) + wf.writeframes(silence) + + return str(wav_path) + + +@pytest.fixture +def temp_voice_dir(tmp_path, monkeypatch): + """Redirect _TEMP_DIR to a temporary path.""" + voice_dir = tmp_path / "hermes_voice" + voice_dir.mkdir() + monkeypatch.setattr("tools.voice_mode._TEMP_DIR", str(voice_dir)) + return voice_dir + + +@pytest.fixture +def mock_sd(monkeypatch): + """Replace tools.voice_mode.sd with a MagicMock (sounddevice may not be installed).""" + mock = MagicMock() + monkeypatch.setattr("tools.voice_mode.sd", mock) + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + # Also ensure numpy is available (use real numpy if installed, else mock) + try: + import numpy as real_np + monkeypatch.setattr("tools.voice_mode.np", real_np) + except ImportError: + monkeypatch.setattr("tools.voice_mode.np", MagicMock()) + return mock + + +# ============================================================================ +# check_voice_requirements +# ============================================================================ + +class TestCheckVoiceRequirements: + def test_all_requirements_met(self, monkeypatch): + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key") + + from tools.voice_mode import check_voice_requirements + + result = check_voice_requirements() + assert result["available"] is True + assert result["audio_available"] is True + assert result["stt_key_set"] is True + assert result["missing_packages"] == [] + + def test_missing_audio_packages(self, monkeypatch): + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key") + + from tools.voice_mode import check_voice_requirements + + result = check_voice_requirements() + assert result["available"] is False + assert result["audio_available"] is False + assert "sounddevice" in result["missing_packages"] + assert "numpy" in result["missing_packages"] + + def 
test_missing_stt_key(self, monkeypatch): + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + + from tools.voice_mode import check_voice_requirements + + result = check_voice_requirements() + assert result["available"] is False + assert result["stt_key_set"] is False + assert "STT API key: MISSING" in result["details"] + + +# ============================================================================ +# AudioRecorder +# ============================================================================ + +class TestAudioRecorderStart: + def test_start_raises_without_audio(self, monkeypatch): + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + with pytest.raises(RuntimeError, match="sounddevice and numpy"): + recorder.start() + + def test_start_creates_and_starts_stream(self, mock_sd): + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + + assert recorder.is_recording is True + mock_sd.InputStream.assert_called_once() + mock_stream.start.assert_called_once() + + def test_double_start_is_noop(self, mock_sd): + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + recorder.start() # second call should be noop + + assert mock_sd.InputStream.call_count == 1 + + +class TestAudioRecorderStop: + def test_stop_returns_none_when_not_recording(self): + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + assert recorder.stop() is None + + def test_stop_writes_wav_file(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder, 
SAMPLE_RATE + + recorder = AudioRecorder() + recorder.start() + + # Simulate captured audio frames (1 second of silence) + frame = np.zeros((SAMPLE_RATE, 1), dtype="int16") + recorder._frames = [frame] + + wav_path = recorder.stop() + + assert wav_path is not None + assert os.path.isfile(wav_path) + assert wav_path.endswith(".wav") + assert recorder.is_recording is False + + # Verify it is a valid WAV + with wave.open(wav_path, "rb") as wf: + assert wf.getnchannels() == 1 + assert wf.getsampwidth() == 2 + assert wf.getframerate() == SAMPLE_RATE + + def test_stop_returns_none_for_very_short_recording(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + + # Very short recording (100 samples = ~6ms at 16kHz) + frame = np.zeros((100, 1), dtype="int16") + recorder._frames = [frame] + + wav_path = recorder.stop() + assert wav_path is None + + +class TestAudioRecorderCancel: + def test_cancel_discards_frames(self, mock_sd): + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + recorder._frames = [MagicMock()] # simulate captured data + + recorder.cancel() + + assert recorder.is_recording is False + assert recorder._frames == [] + mock_stream.stop.assert_called_once() + mock_stream.close.assert_called_once() + + def test_cancel_when_not_recording_is_safe(self): + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.cancel() # should not raise + assert recorder.is_recording is False + + +class TestAudioRecorderProperties: + def test_elapsed_seconds_when_not_recording(self): + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + assert recorder.elapsed_seconds == 0.0 + + def 
test_elapsed_seconds_when_recording(self, mock_sd): + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + + # Force start time to 1 second ago + recorder._start_time = time.monotonic() - 1.0 + elapsed = recorder.elapsed_seconds + assert 0.9 < elapsed < 2.0 + + recorder.cancel() + + +# ============================================================================ +# transcribe_recording +# ============================================================================ + +class TestTranscribeRecording: + def test_delegates_to_transcribe_audio(self): + mock_transcribe = MagicMock(return_value={ + "success": True, + "transcript": "hello world", + }) + + with patch("tools.transcription_tools.transcribe_audio", mock_transcribe): + from tools.voice_mode import transcribe_recording + result = transcribe_recording("/tmp/test.wav", model="whisper-1") + + assert result["success"] is True + assert result["transcript"] == "hello world" + mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1") + + +# ============================================================================ +# play_audio_file +# ============================================================================ + +class TestPlayAudioFile: + def test_play_wav_via_sounddevice(self, monkeypatch, sample_wav): + np = pytest.importorskip("numpy") + + mock_sd = MagicMock() + monkeypatch.setattr("tools.voice_mode.sd", mock_sd) + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.setattr("tools.voice_mode.np", np) + + from tools.voice_mode import play_audio_file + + result = play_audio_file(sample_wav) + + assert result is True + mock_sd.play.assert_called_once() + mock_sd.wait.assert_called_once() + + def test_returns_false_when_no_player(self, monkeypatch, sample_wav): + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + monkeypatch.setattr("shutil.which", 
lambda _: None) + + from tools.voice_mode import play_audio_file + + result = play_audio_file(sample_wav) + assert result is False + + def test_returns_false_for_missing_file(self): + from tools.voice_mode import play_audio_file + + result = play_audio_file("/nonexistent/file.wav") + assert result is False + + +# ============================================================================ +# cleanup_temp_recordings +# ============================================================================ + +class TestCleanupTempRecordings: + def test_old_files_deleted(self, temp_voice_dir): + # Create an "old" file + old_file = temp_voice_dir / "recording_20240101_000000.wav" + old_file.write_bytes(b"\x00" * 100) + # Set mtime to 2 hours ago + old_mtime = time.time() - 7200 + os.utime(str(old_file), (old_mtime, old_mtime)) + + from tools.voice_mode import cleanup_temp_recordings + + deleted = cleanup_temp_recordings(max_age_seconds=3600) + assert deleted == 1 + assert not old_file.exists() + + def test_recent_files_preserved(self, temp_voice_dir): + # Create a "recent" file + recent_file = temp_voice_dir / "recording_20260303_120000.wav" + recent_file.write_bytes(b"\x00" * 100) + + from tools.voice_mode import cleanup_temp_recordings + + deleted = cleanup_temp_recordings(max_age_seconds=3600) + assert deleted == 0 + assert recent_file.exists() + + def test_nonexistent_dir_returns_zero(self, monkeypatch): + monkeypatch.setattr("tools.voice_mode._TEMP_DIR", "/nonexistent/dir") + + from tools.voice_mode import cleanup_temp_recordings + + assert cleanup_temp_recordings() == 0 + + def test_non_recording_files_ignored(self, temp_voice_dir): + # Create a file that doesn't match the pattern + other_file = temp_voice_dir / "other_file.txt" + other_file.write_bytes(b"\x00" * 100) + old_mtime = time.time() - 7200 + os.utime(str(other_file), (old_mtime, old_mtime)) + + from tools.voice_mode import cleanup_temp_recordings + + deleted = cleanup_temp_recordings(max_age_seconds=3600) + 
assert deleted == 0 + assert other_file.exists() diff --git a/tools/voice_mode.py b/tools/voice_mode.py new file mode 100644 index 0000000000..2138020131 --- /dev/null +++ b/tools/voice_mode.py @@ -0,0 +1,344 @@ +"""Voice Mode -- Push-to-talk audio recording and playback for the CLI. + +Provides audio capture via sounddevice, WAV encoding via stdlib wave, +STT dispatch via tools.transcription_tools, and TTS playback via +sounddevice or system audio players. + +Dependencies (optional): + pip install sounddevice numpy + or: pip install hermes-agent[voice] +""" + +import logging +import os +import platform +import shutil +import subprocess +import tempfile +import threading +import time +import wave +from pathlib import Path +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Optional imports with graceful degradation +# --------------------------------------------------------------------------- +try: + import sounddevice as sd + import numpy as np + + _HAS_AUDIO = True +except ImportError: + sd = None # type: ignore[assignment] + np = None # type: ignore[assignment] + _HAS_AUDIO = False + +# --------------------------------------------------------------------------- +# Recording parameters +# --------------------------------------------------------------------------- +SAMPLE_RATE = 16000 # Whisper native rate +CHANNELS = 1 # Mono +DTYPE = "int16" # 16-bit PCM +SAMPLE_WIDTH = 2 # bytes per sample (int16) +MAX_RECORDING_SECONDS = 120 # Safety cap + +# Temp directory for voice recordings +_TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice") + + +# ============================================================================ +# AudioRecorder +# ============================================================================ +class AudioRecorder: + """Thread-safe audio recorder using sounddevice.InputStream. 
+ + Usage:: + + recorder = AudioRecorder() + recorder.start() + # ... user speaks ... + wav_path = recorder.stop() # returns path to WAV file + # or + recorder.cancel() # discard without saving + """ + + def __init__(self) -> None: + self._lock = threading.Lock() + self._stream: Any = None + self._frames: List[Any] = [] + self._recording = False + self._start_time: float = 0.0 + + # -- public properties --------------------------------------------------- + + @property + def is_recording(self) -> bool: + return self._recording + + @property + def elapsed_seconds(self) -> float: + if not self._recording: + return 0.0 + return time.monotonic() - self._start_time + + # -- public methods ------------------------------------------------------ + + def start(self) -> None: + """Start capturing audio from the default input device. + + Raises ``RuntimeError`` if sounddevice/numpy are not installed + or if a recording is already in progress. + """ + if not _HAS_AUDIO: + raise RuntimeError( + "Voice mode requires sounddevice and numpy.\n" + "Install with: pip install sounddevice numpy\n" + "Or: pip install hermes-agent[voice]" + ) + + with self._lock: + if self._recording: + return # already recording + + self._frames = [] + self._start_time = time.monotonic() + + def _callback(indata, frames, time_info, status): # noqa: ARG001 + if status: + logger.debug("sounddevice status: %s", status) + self._frames.append(indata.copy()) + + self._stream = sd.InputStream( + samplerate=SAMPLE_RATE, + channels=CHANNELS, + dtype=DTYPE, + callback=_callback, + ) + self._stream.start() + self._recording = True + logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS) + + def stop(self) -> Optional[str]: + """Stop recording and write captured audio to a WAV file. + + Returns: + Path to the WAV file, or ``None`` if no audio was captured. 
+ """ + with self._lock: + if not self._recording: + return None + + self._recording = False + + if self._stream is not None: + try: + self._stream.stop() + self._stream.close() + except Exception: + pass + self._stream = None + + if not self._frames: + return None + + # Concatenate frames and write WAV + audio_data = np.concatenate(self._frames, axis=0) + self._frames = [] + + elapsed = time.monotonic() - self._start_time + logger.info("Voice recording stopped (%.1fs, %d samples)", elapsed, len(audio_data)) + + # Skip very short recordings (< 0.3s of audio) + min_samples = int(SAMPLE_RATE * 0.3) + if len(audio_data) < min_samples: + logger.debug("Recording too short (%d samples), discarding", len(audio_data)) + return None + + return self._write_wav(audio_data) + + def cancel(self) -> None: + """Stop recording and discard all captured audio.""" + with self._lock: + self._recording = False + self._frames = [] + + if self._stream is not None: + try: + self._stream.stop() + self._stream.close() + except Exception: + pass + self._stream = None + + logger.info("Voice recording cancelled") + + # -- private helpers ----------------------------------------------------- + + @staticmethod + def _write_wav(audio_data) -> str: + """Write numpy int16 audio data to a WAV file. + + Returns the file path. 
+ """ + os.makedirs(_TEMP_DIR, exist_ok=True) + timestamp = time.strftime("%Y%m%d_%H%M%S") + wav_path = os.path.join(_TEMP_DIR, f"recording_{timestamp}.wav") + + with wave.open(wav_path, "wb") as wf: + wf.setnchannels(CHANNELS) + wf.setsampwidth(SAMPLE_WIDTH) + wf.setframerate(SAMPLE_RATE) + wf.writeframes(audio_data.tobytes()) + + file_size = os.path.getsize(wav_path) + logger.info("WAV written: %s (%d bytes)", wav_path, file_size) + return wav_path + + +# ============================================================================ +# STT dispatch +# ============================================================================ +def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str, Any]: + """Transcribe a WAV recording using the existing Whisper pipeline. + + Delegates to ``tools.transcription_tools.transcribe_audio()``. + + Args: + wav_path: Path to the WAV file. + model: Whisper model name (default: from config or ``whisper-1``). + + Returns: + Dict with ``success``, ``transcript``, and optionally ``error``. + """ + from tools.transcription_tools import transcribe_audio + + return transcribe_audio(wav_path, model=model) + + +# ============================================================================ +# Audio playback +# ============================================================================ +def play_audio_file(file_path: str) -> bool: + """Play an audio file through the default output device. + + Strategy: + 1. WAV files via ``sounddevice.play()`` when available. + 2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform), + ``aplay`` (Linux ALSA). + + Returns: + ``True`` if playback succeeded, ``False`` otherwise. 
+ """ + if not os.path.isfile(file_path): + logger.warning("Audio file not found: %s", file_path) + return False + + # Try sounddevice for WAV files + if _HAS_AUDIO and file_path.endswith(".wav"): + try: + with wave.open(file_path, "rb") as wf: + frames = wf.readframes(wf.getnframes()) + audio_data = np.frombuffer(frames, dtype=np.int16) + sample_rate = wf.getframerate() + + sd.play(audio_data, samplerate=sample_rate) + sd.wait() + return True + except Exception as e: + logger.debug("sounddevice playback failed: %s", e) + + # Fall back to system audio players + system = platform.system() + players = [] + + if system == "Darwin": + players.append(["afplay", file_path]) + players.append(["ffplay", "-nodisp", "-autoexit", "-loglevel", "quiet", file_path]) + if system == "Linux": + players.append(["aplay", "-q", file_path]) + + for cmd in players: + exe = shutil.which(cmd[0]) + if exe: + try: + subprocess.run(cmd, capture_output=True, timeout=300) + return True + except Exception as e: + logger.debug("System player %s failed: %s", cmd[0], e) + + logger.warning("No audio player available for %s", file_path) + return False + + +# ============================================================================ +# Requirements check +# ============================================================================ +def check_voice_requirements() -> Dict[str, Any]: + """Check if all voice mode requirements are met. + + Returns: + Dict with ``available``, ``audio_available``, ``stt_key_set``, + ``missing_packages``, and ``details``. 
+ """ + stt_key_set = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY")) + missing: List[str] = [] + + if not _HAS_AUDIO: + missing.extend(["sounddevice", "numpy"]) + + available = _HAS_AUDIO and stt_key_set + details_parts = [] + + if _HAS_AUDIO: + details_parts.append("Audio capture: OK") + else: + details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)") + + if stt_key_set: + details_parts.append("STT API key: OK") + else: + details_parts.append("STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)") + + return { + "available": available, + "audio_available": _HAS_AUDIO, + "stt_key_set": stt_key_set, + "missing_packages": missing, + "details": "\n".join(details_parts), + } + + +# ============================================================================ +# Temp file cleanup +# ============================================================================ +def cleanup_temp_recordings(max_age_seconds: int = 3600) -> int: + """Remove old temporary voice recording files. + + Args: + max_age_seconds: Delete files older than this (default: 1 hour). + + Returns: + Number of files deleted. 
+ """ + if not os.path.isdir(_TEMP_DIR): + return 0 + + deleted = 0 + now = time.time() + + for entry in os.scandir(_TEMP_DIR): + if entry.is_file() and entry.name.startswith("recording_") and entry.name.endswith(".wav"): + try: + age = now - entry.stat().st_mtime + if age > max_age_seconds: + os.unlink(entry.path) + deleted += 1 + except OSError: + pass + + if deleted: + logger.debug("Cleaned up %d old voice recordings", deleted) + return deleted From ec32e9a5406d32bc8923ffa0be2c196919a57ccd Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 17:17:40 +0300 Subject: [PATCH 02/93] feat: add Groq STT support and fix voice mode keybinding - Add multi-provider STT support (OpenAI > Groq fallback) in transcription_tools - Auto-correct model selection when provider doesn't support the configured model - Change voice record key from Ctrl+Space to Ctrl+R (macOS compatibility) - Fix duplicate transcript echo in voice pipeline - Add GROQ_API_KEY to .env.example --- .env.example | 3 + cli.py | 11 +- hermes_cli/config.py | 2 +- tools/transcription_tools.py | 370 +++++++++++++++-------------------- tools/voice_mode.py | 12 +- 5 files changed, 173 insertions(+), 225 deletions(-) diff --git a/.env.example b/.env.example index a5153d1d07..3d3ad1de96 100644 --- a/.env.example +++ b/.env.example @@ -275,3 +275,6 @@ WANDB_API_KEY= # GITHUB_APP_ID= # GITHUB_APP_PRIVATE_KEY_PATH= # GITHUB_APP_INSTALLATION_ID= + +# Groq API key (free tier — used for Whisper STT in voice mode) +# GROQ_API_KEY= diff --git a/cli.py b/cli.py index b86e2bb82c..1eb9e35100 100755 --- a/cli.py +++ b/cli.py @@ -3539,7 +3539,7 @@ class HermesCLI: self._voice_recorder.start() self._voice_recording = True - _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+Space to stop, Ctrl+C to cancel){_RST}") + _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}") def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, 
and queue the transcript as input.""" @@ -3573,7 +3573,6 @@ class HermesCLI: if result.get("success") and result.get("transcript", "").strip(): transcript = result["transcript"].strip() - _cprint(f"\n{_GOLD}●{_RST} {_BOLD}{transcript}{_RST}") self._pending_input.put(transcript) elif result.get("success"): _cprint(f"{_DIM}No speech detected.{_RST}") @@ -3663,7 +3662,7 @@ class HermesCLI: tts_status = " (TTS enabled)" if self._voice_tts else "" _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}") - _cprint(f" {_DIM}Ctrl+Space to start/stop recording{_RST}") + _cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}") _cprint(f" {_DIM}/voice tts to toggle speech output{_RST}") _cprint(f" {_DIM}/voice off to disable voice mode{_RST}") @@ -3703,7 +3702,7 @@ class HermesCLI: _cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}") _cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}") _cprint(f" Recording: {'YES' if self._voice_recording else 'no'}") - _cprint(f" Record key: Ctrl+Space") + _cprint(f" Record key: Ctrl+R") _cprint(f"\n {_BOLD}Requirements:{_RST}") for line in reqs["details"].split("\n"): _cprint(f" {line}") @@ -4715,7 +4714,7 @@ class HermesCLI: def _get_placeholder(): if cli_ref._voice_recording: - return "recording... Ctrl+Space to stop, Ctrl+C to cancel" + return "recording... Ctrl+R to stop, Ctrl+C to cancel" if cli_ref._voice_processing: return "transcribing..." 
if cli_ref._sudo_state: @@ -4735,7 +4734,7 @@ class HermesCLI: if cli_ref._agent_running: return "type a message + Enter to interrupt, Ctrl+C to cancel" if cli_ref._voice_mode: - return "type or Ctrl+Space to record" + return "type or Ctrl+R to record" return "" input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder)) diff --git a/hermes_cli/config.py b/hermes_cli/config.py index dbbe41c10b..174e4326e4 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -204,7 +204,7 @@ DEFAULT_CONFIG = { }, "voice": { - "record_key": "ctrl+space", + "record_key": "ctrl+r", "max_recording_seconds": 120, "auto_tts": False, }, diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 96b7a95e2d..7f217bc77e 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -2,19 +2,21 @@ """ Transcription Tools Module -Provides speech-to-text transcription with two providers: - - - **local** (default, free) — faster-whisper running locally, no API key needed. - Auto-downloads the model (~150 MB for ``base``) on first use. - - **openai** — OpenAI Whisper API, requires ``VOICE_TOOLS_OPENAI_KEY``. +Provides speech-to-text transcription using OpenAI-compatible Whisper APIs. +Supports multiple providers with automatic fallback: + 1. OpenAI (VOICE_TOOLS_OPENAI_KEY) -- paid + 2. Groq (GROQ_API_KEY) -- free tier available Used by the messaging gateway to automatically transcribe voice messages -sent by users on Telegram, Discord, WhatsApp, Slack, and Signal. +sent by users on Telegram, Discord, WhatsApp, and Slack. 
+ +Supported models: + OpenAI: whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe + Groq: whisper-large-v3, whisper-large-v3-turbo, distil-whisper-large-v3-en Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg -Usage:: - +Usage: from tools.transcription_tools import transcribe_audio result = transcribe_audio("/path/to/audio.ogg") @@ -25,241 +27,181 @@ Usage:: import logging import os from pathlib import Path -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, Tuple logger = logging.getLogger(__name__) -# --------------------------------------------------------------------------- -# Optional imports — graceful degradation -# --------------------------------------------------------------------------- -try: - from faster_whisper import WhisperModel - _HAS_FASTER_WHISPER = True -except ImportError: - _HAS_FASTER_WHISPER = False - WhisperModel = None # type: ignore[assignment,misc] +# Default STT models per provider +DEFAULT_STT_MODEL = "whisper-1" +DEFAULT_GROQ_STT_MODEL = "whisper-large-v3-turbo" -try: - from openai import OpenAI, APIError, APIConnectionError, APITimeoutError - _HAS_OPENAI = True -except ImportError: - _HAS_OPENAI = False - -# --------------------------------------------------------------------------- -# Constants -# --------------------------------------------------------------------------- - -DEFAULT_PROVIDER = "local" -DEFAULT_LOCAL_MODEL = "base" -DEFAULT_OPENAI_MODEL = "whisper-1" - -SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} -MAX_FILE_SIZE = 25 * 1024 * 1024 # 25 MB - -# Singleton for the local model — loaded once, reused across calls -_local_model: Optional["WhisperModel"] = None -_local_model_name: Optional[str] = None - -# --------------------------------------------------------------------------- -# Config helpers -# --------------------------------------------------------------------------- +# Provider endpoints +GROQ_BASE_URL = 
"https://api.groq.com/openai/v1" +OPENAI_BASE_URL = "https://api.openai.com/v1" -def _load_stt_config() -> dict: - """Load the ``stt`` section from user config, falling back to defaults.""" - try: - from hermes_cli.config import load_config - return load_config().get("stt", {}) - except Exception: - return {} +def _resolve_stt_provider() -> Tuple[Optional[str], Optional[str], str]: + """Resolve which STT provider to use based on available API keys. - -def _get_provider(stt_config: dict) -> str: - """Determine which STT provider to use. - - Priority: - 1. Explicit config value (``stt.provider``) - 2. Auto-detect: local if faster-whisper available, else openai if key set - 3. Disabled (returns "none") + Returns: + Tuple of (api_key, base_url, provider_name). + api_key is None if no provider is available. """ - provider = stt_config.get("provider", DEFAULT_PROVIDER) + openai_key = os.getenv("VOICE_TOOLS_OPENAI_KEY") + if openai_key: + return openai_key, OPENAI_BASE_URL, "openai" - if provider == "local": - if _HAS_FASTER_WHISPER: - return "local" - # Local requested but not available — fall back to openai if possible - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): - logger.info("faster-whisper not installed, falling back to OpenAI Whisper API") - return "openai" - return "none" + groq_key = os.getenv("GROQ_API_KEY") + if groq_key: + return groq_key, GROQ_BASE_URL, "groq" - if provider == "openai": - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): - return "openai" - # OpenAI requested but no key — fall back to local if possible - if _HAS_FASTER_WHISPER: - logger.info("VOICE_TOOLS_OPENAI_KEY not set, falling back to local faster-whisper") - return "local" - return "none" + return None, None, "none" - return provider # Unknown — let it fail downstream +# Supported audio formats +SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"} -# --------------------------------------------------------------------------- -# Shared 
validation -# --------------------------------------------------------------------------- - - -def _validate_audio_file(file_path: str) -> Optional[Dict[str, Any]]: - """Validate the audio file. Returns an error dict or None if OK.""" - audio_path = Path(file_path) - - if not audio_path.exists(): - return {"success": False, "transcript": "", "error": f"Audio file not found: {file_path}"} - if not audio_path.is_file(): - return {"success": False, "transcript": "", "error": f"Path is not a file: {file_path}"} - if audio_path.suffix.lower() not in SUPPORTED_FORMATS: - return { - "success": False, - "transcript": "", - "error": f"Unsupported format: {audio_path.suffix}. Supported: {', '.join(sorted(SUPPORTED_FORMATS))}", - } - try: - file_size = audio_path.stat().st_size - if file_size > MAX_FILE_SIZE: - return { - "success": False, - "transcript": "", - "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024):.0f}MB)", - } - except OSError as e: - return {"success": False, "transcript": "", "error": f"Failed to access file: {e}"} - - return None - -# --------------------------------------------------------------------------- -# Provider: local (faster-whisper) -# --------------------------------------------------------------------------- - - -def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: - """Transcribe using faster-whisper (local, free).""" - global _local_model, _local_model_name - - if not _HAS_FASTER_WHISPER: - return {"success": False, "transcript": "", "error": "faster-whisper not installed"} - - try: - # Lazy-load the model (downloads on first use, ~150 MB for 'base') - if _local_model is None or _local_model_name != model_name: - logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name) - _local_model = WhisperModel(model_name, device="auto", compute_type="auto") - _local_model_name = model_name - - segments, info = _local_model.transcribe(file_path, 
beam_size=5) - transcript = " ".join(segment.text.strip() for segment in segments) - - logger.info( - "Transcribed %s via local whisper (%s, lang=%s, %.1fs audio)", - Path(file_path).name, model_name, info.language, info.duration, - ) - - return {"success": True, "transcript": transcript} - - except Exception as e: - logger.error("Local transcription failed: %s", e, exc_info=True) - return {"success": False, "transcript": "", "error": f"Local transcription failed: {e}"} - -# --------------------------------------------------------------------------- -# Provider: openai (Whisper API) -# --------------------------------------------------------------------------- - - -def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]: - """Transcribe using OpenAI Whisper API (paid).""" - api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY") - if not api_key: - return {"success": False, "transcript": "", "error": "VOICE_TOOLS_OPENAI_KEY not set"} - - if not _HAS_OPENAI: - return {"success": False, "transcript": "", "error": "openai package not installed"} - - try: - client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1") - - with open(file_path, "rb") as audio_file: - transcription = client.audio.transcriptions.create( - model=model_name, - file=audio_file, - response_format="text", - ) - - transcript_text = str(transcription).strip() - logger.info("Transcribed %s via OpenAI API (%s, %d chars)", - Path(file_path).name, model_name, len(transcript_text)) - - return {"success": True, "transcript": transcript_text} - - except PermissionError: - return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} - except APIConnectionError as e: - return {"success": False, "transcript": "", "error": f"Connection error: {e}"} - except APITimeoutError as e: - return {"success": False, "transcript": "", "error": f"Request timeout: {e}"} - except APIError as e: - return {"success": False, "transcript": "", "error": f"API error: {e}"} - except 
Exception as e: - logger.error("OpenAI transcription failed: %s", e, exc_info=True) - return {"success": False, "transcript": "", "error": f"Transcription failed: {e}"} - -# --------------------------------------------------------------------------- -# Public API -# --------------------------------------------------------------------------- +# Maximum file size (25MB - OpenAI limit) +MAX_FILE_SIZE = 25 * 1024 * 1024 def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]: """ - Transcribe an audio file using the configured STT provider. + Transcribe an audio file using an OpenAI-compatible Whisper API. - Provider priority: - 1. User config (``stt.provider`` in config.yaml) - 2. Auto-detect: local faster-whisper if available, else OpenAI API + Automatically selects the provider based on available API keys: + VOICE_TOOLS_OPENAI_KEY (OpenAI) > GROQ_API_KEY (Groq). Args: file_path: Absolute path to the audio file to transcribe. - model: Override the model. If None, uses config or provider default. + model: Whisper model to use. Defaults per provider if not specified. Returns: dict with keys: - "success" (bool): Whether transcription succeeded - "transcript" (str): The transcribed text (empty on failure) - "error" (str, optional): Error message if success is False + - "provider" (str, optional): Which provider was used """ - # Validate input - error = _validate_audio_file(file_path) - if error: - return error + api_key, base_url, provider = _resolve_stt_provider() + if not api_key: + return { + "success": False, + "transcript": "", + "error": "No STT API key set. 
Set VOICE_TOOLS_OPENAI_KEY or GROQ_API_KEY.", + } - # Load config and determine provider - stt_config = _load_stt_config() - provider = _get_provider(stt_config) + audio_path = Path(file_path) + + # Validate file exists + if not audio_path.exists(): + return { + "success": False, + "transcript": "", + "error": f"Audio file not found: {file_path}", + } + + if not audio_path.is_file(): + return { + "success": False, + "transcript": "", + "error": f"Path is not a file: {file_path}", + } + + # Validate file extension + if audio_path.suffix.lower() not in SUPPORTED_FORMATS: + return { + "success": False, + "transcript": "", + "error": f"Unsupported file format: {audio_path.suffix}. Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}", + } + + # Validate file size + try: + file_size = audio_path.stat().st_size + if file_size > MAX_FILE_SIZE: + return { + "success": False, + "transcript": "", + "error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024)}MB)", + } + except OSError as e: + logger.error("Failed to get file size for %s: %s", file_path, e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Failed to access file: {e}", + } - if provider == "local": - local_cfg = stt_config.get("local", {}) - model_name = model or local_cfg.get("model", DEFAULT_LOCAL_MODEL) - return _transcribe_local(file_path, model_name) + # Use provided model, or fall back to provider default. + # If the caller passed an OpenAI-only model but we resolved to Groq, override it. 
+ OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"} + GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"} - if provider == "openai": - openai_cfg = stt_config.get("openai", {}) - model_name = model or openai_cfg.get("model", DEFAULT_OPENAI_MODEL) - return _transcribe_openai(file_path, model_name) + if model is None: + model = DEFAULT_GROQ_STT_MODEL if provider == "groq" else DEFAULT_STT_MODEL + elif provider == "groq" and model in OPENAI_MODELS: + logger.info("Model %s not available on Groq, using %s", model, DEFAULT_GROQ_STT_MODEL) + model = DEFAULT_GROQ_STT_MODEL + elif provider == "openai" and model in GROQ_MODELS: + logger.info("Model %s not available on OpenAI, using %s", model, DEFAULT_STT_MODEL) + model = DEFAULT_STT_MODEL - # No provider available - return { - "success": False, - "transcript": "", - "error": ( - "No STT provider available. Install faster-whisper for free local " - "transcription, or set VOICE_TOOLS_OPENAI_KEY for the OpenAI Whisper API." 
- ), - } + try: + from openai import OpenAI, APIError, APIConnectionError, APITimeoutError + + client = OpenAI(api_key=api_key, base_url=base_url) + + with open(file_path, "rb") as audio_file: + transcription = client.audio.transcriptions.create( + model=model, + file=audio_file, + response_format="text", + ) + + # The response is a plain string when response_format="text" + transcript_text = str(transcription).strip() + + logger.info("Transcribed %s (%d chars, provider=%s)", audio_path.name, len(transcript_text), provider) + + return { + "success": True, + "transcript": transcript_text, + "provider": provider, + } + + except PermissionError: + logger.error("Permission denied accessing file: %s", file_path, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Permission denied: {file_path}", + } + except APIConnectionError as e: + logger.error("API connection error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Connection error: {e}", + } + except APITimeoutError as e: + logger.error("API timeout during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Request timeout: {e}", + } + except APIError as e: + logger.error("OpenAI API error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"API error: {e}", + } + except Exception as e: + logger.error("Unexpected error during transcription: %s", e, exc_info=True) + return { + "success": False, + "transcript": "", + "error": f"Transcription failed: {e}", + } diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 2138020131..7a7bb6b059 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -283,7 +283,9 @@ def check_voice_requirements() -> Dict[str, Any]: Dict with ``available``, ``audio_available``, ``stt_key_set``, ``missing_packages``, and ``details``. 
""" - stt_key_set = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY")) + openai_key = bool(os.getenv("VOICE_TOOLS_OPENAI_KEY")) + groq_key = bool(os.getenv("GROQ_API_KEY")) + stt_key_set = openai_key or groq_key missing: List[str] = [] if not _HAS_AUDIO: @@ -297,10 +299,12 @@ def check_voice_requirements() -> Dict[str, Any]: else: details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)") - if stt_key_set: - details_parts.append("STT API key: OK") + if openai_key: + details_parts.append("STT API key: OK (OpenAI)") + elif groq_key: + details_parts.append("STT API key: OK (Groq)") else: - details_parts.append("STT API key: MISSING (set VOICE_TOOLS_OPENAI_KEY)") + details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)") return { "available": available, From ea5b89825a939bf8fad3fad871a8bf7771d04c16 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 17:45:11 +0300 Subject: [PATCH 03/93] fix: voice mode TTS playback and keybinding issues - Change record key from c-@ to c-r (Ctrl+R) for macOS compatibility - Add missing tempfile and time imports that caused silent TTS crash - Use MP3 output for CLI TTS playback (afplay doesn't handle OGG well) - Strip markdown formatting from text before sending to TTS - Remove duplicate transcript echo in voice pipeline --- cli.py | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/cli.py b/cli.py index 1eb9e35100..66bf1b3e19 100755 --- a/cli.py +++ b/cli.py @@ -18,6 +18,8 @@ import shutil import sys import json import atexit +import tempfile +import time import uuid import textwrap from contextlib import contextmanager @@ -3601,14 +3603,37 @@ class HermesCLI: from tools.tts_tool import text_to_speech_tool from tools.voice_mode import play_audio_file import json + import re - # Truncate to TTS limit + # Strip markdown formatting for cleaner TTS tts_text = text[:4000] if len(text) > 4000 else 
text - result_json = text_to_speech_tool(text=tts_text) - result = json.loads(result_json) + tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold + tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic + tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # code + tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers + tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list items - if result.get("success") and result.get("file_path"): - play_audio_file(result["file_path"]) + # Use MP3 output for CLI playback (afplay doesn't handle OGG well). + # The TTS tool may auto-convert MP3->OGG, but the original MP3 remains. + os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True) + mp3_path = os.path.join( + tempfile.gettempdir(), "hermes_voice", + f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3", + ) + + text_to_speech_tool(text=tts_text, output_path=mp3_path) + + # Play the MP3 directly (the TTS tool returns OGG path but MP3 still exists) + if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0: + play_audio_file(mp3_path) + # Clean up + try: + os.unlink(mp3_path) + ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg" + if os.path.isfile(ogg_path): + os.unlink(ogg_path) + except OSError: + pass except Exception as e: logger.debug("Voice TTS playback failed: %s", e) From 37b01ab964a962161480704f67c284f10b368896 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 17:48:06 +0300 Subject: [PATCH 04/93] test: add transcription_tools tests for multi-provider STT - Provider resolution: OpenAI priority, Groq fallback, no keys - Model auto-correction: Groq corrects OpenAI models and vice versa - Success path: transcription, API errors, whitespace stripping - 12 new tests, 33 total voice-related tests --- tests/tools/test_transcription_tools.py | 199 ++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 tests/tools/test_transcription_tools.py 
diff --git a/tests/tools/test_transcription_tools.py b/tests/tools/test_transcription_tools.py new file mode 100644 index 0000000000..6750f28d3a --- /dev/null +++ b/tests/tools/test_transcription_tools.py @@ -0,0 +1,199 @@ +"""Tests for tools.transcription_tools -- provider resolution and model correction.""" + +import os +import struct +import wave +from unittest.mock import MagicMock, patch + +import pytest + + +# ============================================================================ +# Fixtures +# ============================================================================ + +@pytest.fixture +def sample_wav(tmp_path): + """Create a minimal valid WAV file (1 second of silence at 16kHz).""" + wav_path = tmp_path / "test.wav" + n_frames = 16000 + silence = struct.pack(f"<{n_frames}h", *([0] * n_frames)) + + with wave.open(str(wav_path), "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(16000) + wf.writeframes(silence) + + return str(wav_path) + + +@pytest.fixture(autouse=True) +def clean_env(monkeypatch): + """Ensure no real API keys leak into tests.""" + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("GROQ_API_KEY", raising=False) + + +# ============================================================================ +# _resolve_stt_provider +# ============================================================================ + +class TestResolveSTTProvider: + def test_openai_preferred_over_groq(self, monkeypatch): + monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test") + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + from tools.transcription_tools import _resolve_stt_provider + key, url, provider = _resolve_stt_provider() + + assert provider == "openai" + assert key == "sk-test" + assert "openai.com" in url + + def test_groq_fallback(self, monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + from tools.transcription_tools import _resolve_stt_provider + key, url, provider = 
_resolve_stt_provider() + + assert provider == "groq" + assert key == "gsk-test" + assert "groq.com" in url + + def test_no_keys_returns_none(self): + from tools.transcription_tools import _resolve_stt_provider + key, url, provider = _resolve_stt_provider() + + assert provider == "none" + assert key is None + assert url is None + + +# ============================================================================ +# transcribe_audio -- no API key +# ============================================================================ + +class TestTranscribeAudioNoKey: + def test_returns_error_when_no_key(self): + from tools.transcription_tools import transcribe_audio + result = transcribe_audio("/tmp/test.wav") + + assert result["success"] is False + assert "No STT API key" in result["error"] + + def test_returns_error_for_missing_file(self, monkeypatch): + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + from tools.transcription_tools import transcribe_audio + result = transcribe_audio("/nonexistent/audio.wav") + + assert result["success"] is False + assert "not found" in result["error"] + + +# ============================================================================ +# Model auto-correction +# ============================================================================ + +class TestModelAutoCorrection: + def test_groq_corrects_openai_model(self, monkeypatch, sample_wav): + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.return_value = "hello world" + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio, DEFAULT_GROQ_STT_MODEL + result = transcribe_audio(sample_wav, model="whisper-1") + + assert result["success"] is True + assert result["transcript"] == "hello world" + # Verify the model was corrected to Groq default + call_kwargs = mock_client.audio.transcriptions.create.call_args + assert call_kwargs.kwargs["model"] == 
DEFAULT_GROQ_STT_MODEL + + def test_openai_corrects_groq_model(self, monkeypatch, sample_wav): + monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.return_value = "hello world" + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio, DEFAULT_STT_MODEL + result = transcribe_audio(sample_wav, model="whisper-large-v3-turbo") + + assert result["success"] is True + call_kwargs = mock_client.audio.transcriptions.create.call_args + assert call_kwargs.kwargs["model"] == DEFAULT_STT_MODEL + + def test_none_model_uses_provider_default(self, monkeypatch, sample_wav): + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.return_value = "test" + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio, DEFAULT_GROQ_STT_MODEL + transcribe_audio(sample_wav, model=None) + + call_kwargs = mock_client.audio.transcriptions.create.call_args + assert call_kwargs.kwargs["model"] == DEFAULT_GROQ_STT_MODEL + + def test_compatible_model_not_overridden(self, monkeypatch, sample_wav): + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.return_value = "test" + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio + transcribe_audio(sample_wav, model="whisper-large-v3") + + call_kwargs = mock_client.audio.transcriptions.create.call_args + assert call_kwargs.kwargs["model"] == "whisper-large-v3" + + +# ============================================================================ +# transcribe_audio -- success path +# ============================================================================ + +class TestTranscribeAudioSuccess: + def test_successful_transcription(self, monkeypatch, sample_wav): + 
monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.return_value = "hello world" + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio + result = transcribe_audio(sample_wav) + + assert result["success"] is True + assert result["transcript"] == "hello world" + assert result["provider"] == "groq" + + def test_api_error_returns_failure(self, monkeypatch, sample_wav): + monkeypatch.setenv("GROQ_API_KEY", "gsk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.side_effect = Exception("API error") + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio + result = transcribe_audio(sample_wav) + + assert result["success"] is False + assert "API error" in result["error"] + + def test_whitespace_transcript_stripped(self, monkeypatch, sample_wav): + monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test") + + mock_client = MagicMock() + mock_client.audio.transcriptions.create.return_value = " hello world \n" + + with patch("openai.OpenAI", return_value=mock_client): + from tools.transcription_tools import transcribe_audio + result = transcribe_audio(sample_wav) + + assert result["transcript"] == "hello world" From c23928d089a35ce6b6ea72785a85aee9d301ffca Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:00:31 +0300 Subject: [PATCH 05/93] fix: improve voice mode robustness and add integration tests - Show TTS errors to user instead of silently logging - Improve markdown stripping: code blocks, URLs, links, horizontal rules - Fix stripping order: process markdown links before removing URLs - Add threading.Lock for voice state variables (cross-thread safety) - Add 14 CLI integration tests (markdown stripping, command parsing, thread safety) - Total: 47 voice-related tests --- cli.py | 64 +++++---- 
tests/tools/test_voice_cli_integration.py | 151 ++++++++++++++++++++++ 2 files changed, 192 insertions(+), 23 deletions(-) create mode 100644 tests/tools/test_voice_cli_integration.py diff --git a/cli.py b/cli.py index 66bf1b3e19..3b3032c408 100755 --- a/cli.py +++ b/cli.py @@ -3540,7 +3540,8 @@ class HermesCLI: self._voice_recorder = AudioRecorder() self._voice_recorder.start() - self._voice_recording = True + with self._voice_lock: + self._voice_recording = True _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}") def _voice_stop_and_transcribe(self): @@ -3550,13 +3551,15 @@ class HermesCLI: return wav_path = self._voice_recorder.stop() - self._voice_recording = False + with self._voice_lock: + self._voice_recording = False if wav_path is None: _cprint(f"{_DIM}No speech detected (recording too short).{_RST}") return - self._voice_processing = True + with self._voice_lock: + self._voice_processing = True if hasattr(self, '_app') and self._app: self._app.invalidate() _cprint(f"{_DIM}Transcribing...{_RST}") @@ -3585,7 +3588,8 @@ class HermesCLI: except Exception as e: _cprint(f"\n{_DIM}Voice processing error: {e}{_RST}") finally: - self._voice_processing = False + with self._voice_lock: + self._voice_processing = False if hasattr(self, '_app') and self._app: self._app.invalidate() # Clean up temp file @@ -3605,13 +3609,21 @@ class HermesCLI: import json import re - # Strip markdown formatting for cleaner TTS + # Strip markdown and non-speech content for cleaner TTS tts_text = text[:4000] if len(text) > 4000 else text + tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text) # fenced code blocks + tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text) # [text](url) -> text + tts_text = re.sub(r'https?://\S+', '', tts_text) # URLs tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic - tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # code + tts_text = 
re.sub(r'`(.+?)`', r'\1', tts_text) # inline code tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list items + tts_text = re.sub(r'---+', '', tts_text) # horizontal rules + tts_text = re.sub(r'\n{3,}', '\n\n', tts_text) # excessive newlines + tts_text = tts_text.strip() + if not tts_text: + return # Use MP3 output for CLI playback (afplay doesn't handle OGG well). # The TTS tool may auto-convert MP3->OGG, but the original MP3 remains. @@ -3635,7 +3647,8 @@ class HermesCLI: except OSError: pass except Exception as e: - logger.debug("Voice TTS playback failed: %s", e) + logger.warning("Voice TTS playback failed: %s", e) + _cprint(f"{_DIM}TTS playback failed: {e}{_RST}") def _handle_voice_command(self, command: str): """Handle /voice [on|off|tts|status] command.""" @@ -3674,14 +3687,16 @@ class HermesCLI: _cprint(f" {_DIM}Or: pip install hermes-agent[voice]{_RST}") return - self._voice_mode = True + with self._voice_lock: + self._voice_mode = True # Check config for auto_tts try: from hermes_cli.config import load_config voice_config = load_config().get("voice", {}) if voice_config.get("auto_tts", False): - self._voice_tts = True + with self._voice_lock: + self._voice_tts = True except Exception: pass @@ -3693,12 +3708,12 @@ class HermesCLI: def _disable_voice_mode(self): """Disable voice mode and cancel any active recording.""" - if self._voice_recording and self._voice_recorder: - self._voice_recorder.cancel() - self._voice_recording = False - - self._voice_mode = False - self._voice_tts = False + with self._voice_lock: + if self._voice_recording and self._voice_recorder: + self._voice_recorder.cancel() + self._voice_recording = False + self._voice_mode = False + self._voice_tts = False _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") def _toggle_voice_tts(self): @@ -3707,7 +3722,8 @@ class HermesCLI: _cprint(f"{_DIM}Enable voice mode first: /voice on{_RST}") return - 
self._voice_tts = not self._voice_tts + with self._voice_lock: + self._voice_tts = not self._voice_tts status = "enabled" if self._voice_tts else "disabled" if self._voice_tts: @@ -4308,7 +4324,8 @@ class HermesCLI: self._attached_images: list[Path] = [] self._image_counter = 0 - # Voice mode state + # Voice mode state (protected by _voice_lock for cross-thread access) + self._voice_lock = threading.Lock() self._voice_mode = False # Whether voice mode is enabled self._voice_tts = False # Whether TTS output is enabled self._voice_recorder = None # AudioRecorder instance (lazy init) @@ -4508,12 +4525,13 @@ class HermesCLI: now = _time.time() # Cancel active voice recording - if cli_ref._voice_recording and cli_ref._voice_recorder: - cli_ref._voice_recorder.cancel() - cli_ref._voice_recording = False - _cprint(f"\n{_DIM}Recording cancelled.{_RST}") - event.app.invalidate() - return + with cli_ref._voice_lock: + if cli_ref._voice_recording and cli_ref._voice_recorder: + cli_ref._voice_recorder.cancel() + cli_ref._voice_recording = False + _cprint(f"\n{_DIM}Recording cancelled.{_RST}") + event.app.invalidate() + return # Cancel sudo prompt if self._sudo_state: diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py new file mode 100644 index 0000000000..7bb78e66c1 --- /dev/null +++ b/tests/tools/test_voice_cli_integration.py @@ -0,0 +1,151 @@ +"""Tests for CLI voice mode integration -- command parsing, markdown stripping, state management.""" + +import re +import threading + +import pytest + + +# ============================================================================ +# Markdown stripping (same logic as _voice_speak_response) +# ============================================================================ + +def _strip_markdown_for_tts(text: str) -> str: + """Replicate the markdown stripping logic from cli._voice_speak_response.""" + tts_text = text[:4000] if len(text) > 4000 else text + tts_text = re.sub(r'```[\s\S]*?```', 
' ', tts_text) # fenced code blocks + tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text) # [text](url) -> text + tts_text = re.sub(r'https?://\S+', '', tts_text) # URLs + tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text) # bold + tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text) # italic + tts_text = re.sub(r'`(.+?)`', r'\1', tts_text) # inline code + tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE) # headers + tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE) # list items + tts_text = re.sub(r'---+', '', tts_text) # horizontal rules + tts_text = re.sub(r'\n{3,}', '\n\n', tts_text) # excessive newlines + return tts_text.strip() + + +class TestMarkdownStripping: + def test_strips_bold(self): + assert _strip_markdown_for_tts("This is **bold** text") == "This is bold text" + + def test_strips_italic(self): + assert _strip_markdown_for_tts("This is *italic* text") == "This is italic text" + + def test_strips_inline_code(self): + assert _strip_markdown_for_tts("Run `pip install foo`") == "Run pip install foo" + + def test_strips_fenced_code_blocks(self): + text = "Here is code:\n```python\nprint('hello')\n```\nDone." + result = _strip_markdown_for_tts(text) + assert "print" not in result + assert "Done." 
in result + + def test_strips_headers(self): + assert _strip_markdown_for_tts("## Summary\nSome text") == "Summary\nSome text" + + def test_strips_list_markers(self): + text = "- item one\n- item two\n* item three" + result = _strip_markdown_for_tts(text) + assert "item one" in result + assert "- " not in result + assert "* " not in result + + def test_strips_urls(self): + text = "Visit https://example.com for details" + result = _strip_markdown_for_tts(text) + assert "https://" not in result + assert "Visit" in result + + def test_strips_markdown_links(self): + text = "See [the docs](https://example.com/docs) for info" + result = _strip_markdown_for_tts(text) + assert "the docs" in result + assert "https://" not in result + assert "[" not in result + + def test_strips_horizontal_rules(self): + text = "Part one\n---\nPart two" + result = _strip_markdown_for_tts(text) + assert "---" not in result + assert "Part one" in result + assert "Part two" in result + + def test_empty_after_stripping_returns_empty(self): + text = "```python\nprint('hello')\n```" + result = _strip_markdown_for_tts(text) + assert result == "" + + def test_truncates_long_text(self): + text = "a" * 5000 + result = _strip_markdown_for_tts(text) + assert len(result) <= 4000 + + def test_complex_response(self): + text = ( + "## Answer\n\n" + "Here's how to do it:\n\n" + "```python\ndef hello():\n print('hi')\n```\n\n" + "Run it with `python main.py`. " + "See [docs](https://example.com) for more.\n\n" + "- Step one\n- Step two\n\n" + "---\n\n" + "**Good luck!**" + ) + result = _strip_markdown_for_tts(text) + assert "```" not in result + assert "https://" not in result + assert "**" not in result + assert "---" not in result + assert "Answer" in result + assert "Good luck!" 
in result + assert "docs" in result + + +# ============================================================================ +# Voice command parsing +# ============================================================================ + +class TestVoiceCommandParsing: + """Test _handle_voice_command logic without full CLI setup.""" + + def test_parse_subcommands(self): + """Verify subcommand extraction from /voice commands.""" + test_cases = [ + ("/voice on", "on"), + ("/voice off", "off"), + ("/voice tts", "tts"), + ("/voice status", "status"), + ("/voice", ""), + ("/voice ON ", "on"), + ] + for command, expected in test_cases: + parts = command.strip().split(maxsplit=1) + subcommand = parts[1].lower().strip() if len(parts) > 1 else "" + assert subcommand == expected, f"Failed for {command!r}: got {subcommand!r}" + + +# ============================================================================ +# Voice state thread safety +# ============================================================================ + +class TestVoiceStateLock: + def test_lock_protects_state(self): + """Verify that concurrent state changes don't corrupt state.""" + lock = threading.Lock() + state = {"recording": False, "count": 0} + + def toggle_many(n): + for _ in range(n): + with lock: + state["recording"] = not state["recording"] + state["count"] += 1 + + threads = [threading.Thread(target=toggle_many, args=(1000,)) for _ in range(4)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert state["count"] == 4000 From a69bd55b5a9926692b096d581d6856c7e2a0fefc Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 18:46:29 +0300 Subject: [PATCH 06/93] fix: isolate GROQ_API_KEY in test_missing_stt_key test The test was failing because GROQ_API_KEY leaked from the environment. Now both VOICE_TOOLS_OPENAI_KEY and GROQ_API_KEY are removed to properly test the "no STT key" scenario. 
--- tests/tools/test_voice_mode.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index fe841f5cb7..d9dcba2c37 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -86,6 +86,7 @@ class TestCheckVoiceRequirements: def test_missing_stt_key(self, monkeypatch): monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("GROQ_API_KEY", raising=False) from tools.voice_mode import check_voice_requirements From bfd9c97705c93726ae00dd4431ec8240b99e318d Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:56:00 +0300 Subject: [PATCH 07/93] feat: add Phase 4 low-latency features for voice mode - Audio cues: beep on record start (880Hz), double beep on stop (660Hz) - Silence detection: auto-stop recording after 3s of silence (RMS-based) - Continuous mode: auto-restart recording after agent responds - Ctrl+R starts continuous mode, Ctrl+R during recording exits it - Waits for TTS to finish before restarting to avoid recording speaker - Tests: 7 new tests for beep generation and silence detection --- cli.py | 56 +++++++++++- tests/tools/test_voice_mode.py | 151 +++++++++++++++++++++++++++++++++ tools/voice_mode.py | 82 +++++++++++++++++- 3 files changed, 283 insertions(+), 6 deletions(-) diff --git a/cli.py b/cli.py index 3b3032c408..9fb613c851 100755 --- a/cli.py +++ b/cli.py @@ -3539,10 +3539,27 @@ class HermesCLI: if self._voice_recorder is None: self._voice_recorder = AudioRecorder() - self._voice_recorder.start() + def _on_silence(): + """Called by AudioRecorder when silence is detected after speech.""" + with self._voice_lock: + if not self._voice_recording: + return + _cprint(f"\n{_DIM}Silence detected, auto-stopping...{_RST}") + if hasattr(self, '_app') and self._app: + self._app.invalidate() + self._voice_stop_and_transcribe() + + 
self._voice_recorder.start(on_silence_stop=_on_silence) with self._voice_lock: self._voice_recording = True - _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(Ctrl+R to stop, Ctrl+C to cancel){_RST}") + + # Audio cue: single beep on recording start + try: + from tools.voice_mode import play_beep + threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start() + except Exception: + pass + _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}") def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" @@ -3554,6 +3571,13 @@ class HermesCLI: with self._voice_lock: self._voice_recording = False + # Audio cue: double beep on recording stop + try: + from tools.voice_mode import play_beep + threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start() + except Exception: + pass + if wav_path is None: _cprint(f"{_DIM}No speech detected (recording too short).{_RST}") return @@ -3603,6 +3627,7 @@ class HermesCLI: """Speak the agent's response aloud using TTS (runs in background thread).""" if not self._voice_tts: return + self._voice_tts_done.clear() try: from tools.tts_tool import text_to_speech_tool from tools.voice_mode import play_audio_file @@ -3649,6 +3674,8 @@ class HermesCLI: except Exception as e: logger.warning("Voice TTS playback failed: %s", e) _cprint(f"{_DIM}TTS playback failed: {e}{_RST}") + finally: + self._voice_tts_done.set() def _handle_voice_command(self, command: str): """Handle /voice [on|off|tts|status] command.""" @@ -3714,6 +3741,7 @@ class HermesCLI: self._voice_recording = False self._voice_mode = False self._voice_tts = False + self._voice_continuous = False _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") def _toggle_voice_tts(self): @@ -4331,6 +4359,9 @@ class HermesCLI: self._voice_recorder = None # AudioRecorder instance (lazy init) self._voice_recording = False # Whether 
currently recording self._voice_processing = False # Whether STT is in progress + self._voice_continuous = False # Whether to auto-restart after agent responds + self._voice_tts_done = threading.Event() # Signals TTS playback finished + self._voice_tts_done.set() # Initially "done" (no TTS pending) # Register callbacks so terminal_tool prompts route through our UI set_sudo_password_callback(self._sudo_password_callback) @@ -4650,7 +4681,10 @@ class HermesCLI: if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: return if cli_ref._voice_recording: - cli_ref._voice_recording = False + # Manual stop via Ctrl+R: stop continuous mode + with cli_ref._voice_lock: + cli_ref._voice_continuous = False + cli_ref._voice_recording = False event.app.invalidate() threading.Thread( target=cli_ref._voice_stop_and_transcribe, @@ -4658,6 +4692,8 @@ class HermesCLI: ).start() else: try: + with cli_ref._voice_lock: + cli_ref._voice_continuous = True cli_ref._voice_start_recording() event.app.invalidate() except Exception as e: @@ -5267,13 +5303,25 @@ class HermesCLI: # Regular chat - run agent self._agent_running = True app.invalidate() # Refresh status line - + try: self.chat(user_input, images=submit_images or None) finally: self._agent_running = False self._spinner_text = "" app.invalidate() # Refresh status line + + # Continuous voice: auto-restart recording after agent responds + if self._voice_mode and self._voice_continuous and not self._voice_recording: + try: + # Wait for TTS to finish so we don't record the speaker + if self._voice_tts: + self._voice_tts_done.wait(timeout=60) + time.sleep(0.3) # Brief pause after TTS ends + self._voice_start_recording() + app.invalidate() + except Exception as e: + _cprint(f"{_DIM}Voice auto-restart failed: {e}{_RST}") except Exception as e: print(f"Error: {e}") diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index d9dcba2c37..ff1a99b2f2 100644 --- a/tests/tools/test_voice_mode.py +++ 
b/tests/tools/test_voice_mode.py @@ -346,3 +346,154 @@ class TestCleanupTempRecordings: deleted = cleanup_temp_recordings(max_age_seconds=3600) assert deleted == 0 assert other_file.exists() + + +# ============================================================================ +# play_beep +# ============================================================================ + +class TestPlayBeep: + def test_beep_calls_sounddevice_play(self, mock_sd): + np = pytest.importorskip("numpy") + + from tools.voice_mode import play_beep + + play_beep(frequency=880, duration=0.1, count=1) + + mock_sd.play.assert_called_once() + mock_sd.wait.assert_called_once() + # Verify audio data is int16 numpy array + audio_arg = mock_sd.play.call_args[0][0] + assert audio_arg.dtype == np.int16 + assert len(audio_arg) > 0 + + def test_beep_double_produces_longer_audio(self, mock_sd): + np = pytest.importorskip("numpy") + + from tools.voice_mode import play_beep + + play_beep(frequency=660, duration=0.1, count=2) + + audio_arg = mock_sd.play.call_args[0][0] + single_beep_samples = int(16000 * 0.1) + # Double beep should be longer than a single beep + assert len(audio_arg) > single_beep_samples + + def test_beep_noop_without_audio(self, monkeypatch): + monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + + from tools.voice_mode import play_beep + + # Should not raise + play_beep() + + def test_beep_handles_playback_error(self, mock_sd): + mock_sd.play.side_effect = Exception("device error") + + from tools.voice_mode import play_beep + + # Should not raise + play_beep() + + +# ============================================================================ +# Silence detection +# ============================================================================ + +class TestSilenceDetection: + def test_silence_callback_fires_after_speech_then_silence(self, mock_sd): + np = pytest.importorskip("numpy") + import threading + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + 
from tools.voice_mode import AudioRecorder, SAMPLE_RATE + + recorder = AudioRecorder() + # Use very short silence duration for testing + recorder._silence_duration = 0.05 + + fired = threading.Event() + + def on_silence(): + fired.set() + + recorder.start(on_silence_stop=on_silence) + + # Get the callback function from InputStream constructor + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + # Simulate loud audio (speech) -- RMS well above threshold + loud_frame = np.full((1600, 1), 5000, dtype="int16") + callback(loud_frame, 1600, None, None) + assert recorder._has_spoken is True + + # Simulate silence + silent_frame = np.zeros((1600, 1), dtype="int16") + callback(silent_frame, 1600, None, None) + + # Wait a bit past the silence duration, then send another silent frame + time.sleep(0.06) + callback(silent_frame, 1600, None, None) + + # The callback should have been fired + assert fired.wait(timeout=1.0) is True + + recorder.cancel() + + def test_silence_without_speech_does_not_fire(self, mock_sd): + np = pytest.importorskip("numpy") + import threading + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder._silence_duration = 0.02 + + fired = threading.Event() + recorder.start(on_silence_stop=lambda: fired.set()) + + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + # Only silence -- no speech detected, so callback should NOT fire + silent_frame = np.zeros((1600, 1), dtype="int16") + for _ in range(5): + callback(silent_frame, 1600, None, None) + time.sleep(0.01) + + assert fired.wait(timeout=0.2) is False + + recorder.cancel() + + def test_no_callback_means_no_silence_detection(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = 
MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() # no on_silence_stop + + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + # Even with speech then silence, nothing should happen + loud_frame = np.full((1600, 1), 5000, dtype="int16") + silent_frame = np.zeros((1600, 1), dtype="int16") + callback(loud_frame, 1600, None, None) + callback(silent_frame, 1600, None, None) + + # No crash, no callback + assert recorder._on_silence_stop is None + recorder.cancel() diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 7a7bb6b059..5abdc4d601 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -45,10 +45,51 @@ DTYPE = "int16" # 16-bit PCM SAMPLE_WIDTH = 2 # bytes per sample (int16) MAX_RECORDING_SECONDS = 120 # Safety cap +# Silence detection defaults +SILENCE_RMS_THRESHOLD = 200 # RMS below this = silence (int16 range 0-32767) +SILENCE_DURATION_SECONDS = 3.0 # Seconds of continuous silence before auto-stop + # Temp directory for voice recordings _TEMP_DIR = os.path.join(tempfile.gettempdir(), "hermes_voice") +# ============================================================================ +# Audio cues (beep tones) +# ============================================================================ +def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> None: + """Play a short beep tone using numpy + sounddevice. + + Args: + frequency: Tone frequency in Hz (default 880 = A5). + duration: Duration of each beep in seconds. + count: Number of beeps to play (with short gap between). 
+ """ + if not _HAS_AUDIO: + return + try: + gap = 0.06 # seconds between beeps + samples_per_beep = int(SAMPLE_RATE * duration) + samples_per_gap = int(SAMPLE_RATE * gap) + + parts = [] + for i in range(count): + t = np.linspace(0, duration, samples_per_beep, endpoint=False) + # Apply fade in/out to avoid click artifacts + tone = np.sin(2 * np.pi * frequency * t) + fade_len = min(int(SAMPLE_RATE * 0.01), samples_per_beep // 4) + tone[:fade_len] *= np.linspace(0, 1, fade_len) + tone[-fade_len:] *= np.linspace(1, 0, fade_len) + parts.append((tone * 0.3 * 32767).astype(np.int16)) + if i < count - 1: + parts.append(np.zeros(samples_per_gap, dtype=np.int16)) + + audio = np.concatenate(parts) + sd.play(audio, samplerate=SAMPLE_RATE) + sd.wait() + except Exception as e: + logger.debug("Beep playback failed: %s", e) + + # ============================================================================ # AudioRecorder # ============================================================================ @@ -58,11 +99,14 @@ class AudioRecorder: Usage:: recorder = AudioRecorder() - recorder.start() + recorder.start(on_silence_stop=my_callback) # ... user speaks ... wav_path = recorder.stop() # returns path to WAV file # or recorder.cancel() # discard without saving + + If ``on_silence_stop`` is provided, recording automatically stops when + the user is silent for ``silence_duration`` seconds and calls the callback. 
""" def __init__(self) -> None: @@ -71,6 +115,12 @@ class AudioRecorder: self._frames: List[Any] = [] self._recording = False self._start_time: float = 0.0 + # Silence detection state + self._has_spoken = False + self._silence_start: float = 0.0 + self._on_silence_stop = None + self._silence_threshold: int = SILENCE_RMS_THRESHOLD + self._silence_duration: float = SILENCE_DURATION_SECONDS # -- public properties --------------------------------------------------- @@ -86,9 +136,14 @@ class AudioRecorder: # -- public methods ------------------------------------------------------ - def start(self) -> None: + def start(self, on_silence_stop=None) -> None: """Start capturing audio from the default input device. + Args: + on_silence_stop: Optional callback invoked (in a daemon thread) when + silence is detected after speech. The callback receives no arguments. + Use this to auto-stop recording and trigger transcription. + Raises ``RuntimeError`` if sounddevice/numpy are not installed or if a recording is already in progress. 
""" @@ -105,12 +160,35 @@ class AudioRecorder: self._frames = [] self._start_time = time.monotonic() + self._has_spoken = False + self._silence_start = 0.0 + self._on_silence_stop = on_silence_stop def _callback(indata, frames, time_info, status): # noqa: ARG001 if status: logger.debug("sounddevice status: %s", status) self._frames.append(indata.copy()) + # Silence detection: compute RMS of this chunk + if self._on_silence_stop is not None and self._recording: + rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) + now = time.monotonic() + + if rms > self._silence_threshold: + self._has_spoken = True + self._silence_start = 0.0 + elif self._has_spoken: + # User was speaking and now is silent + if self._silence_start == 0.0: + self._silence_start = now + elif now - self._silence_start >= self._silence_duration: + logger.info("Silence detected (%.1fs), auto-stopping", + self._silence_duration) + cb = self._on_silence_stop + self._on_silence_stop = None # fire only once + if cb: + threading.Thread(target=cb, daemon=True).start() + self._stream = sd.InputStream( samplerate=SAMPLE_RATE, channels=CHANNELS, From 32b033c11ce306d3c30e235d239824e696688f98 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 19:58:38 +0300 Subject: [PATCH 08/93] feat: add silence filter, hallucination guard, and continuous mode control - Skip silent recordings before STT call (RMS check in AudioRecorder.stop) - Filter known Whisper hallucinations ("Thank you.", "Bye." etc.) 
- Continuous mode: Ctrl+R starts loop, Ctrl+R during recording exits it - Wait for TTS to finish before auto-restart to avoid recording speaker - Silence timeout increased to 3s for natural pauses - Tests: hallucination filter, silent recording skip, real speech passthrough --- tests/tools/test_voice_mode.py | 68 +++++++++++++++++++++++++++++++++- tools/voice_mode.py | 46 ++++++++++++++++++++++- 2 files changed, 111 insertions(+), 3 deletions(-) diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index ff1a99b2f2..0d40932e2f 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -154,8 +154,8 @@ class TestAudioRecorderStop: recorder = AudioRecorder() recorder.start() - # Simulate captured audio frames (1 second of silence) - frame = np.zeros((SAMPLE_RATE, 1), dtype="int16") + # Simulate captured audio frames (1 second of loud audio above RMS threshold) + frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16") recorder._frames = [frame] wav_path = recorder.stop() @@ -189,6 +189,24 @@ class TestAudioRecorderStop: wav_path = recorder.stop() assert wav_path is None + def test_stop_returns_none_for_silent_recording(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder, SAMPLE_RATE + + recorder = AudioRecorder() + recorder.start() + + # 1 second of near-silence (RMS well below threshold) + frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16") + recorder._frames = [frame] + + wav_path = recorder.stop() + assert wav_path is None + class TestAudioRecorderCancel: def test_cancel_discards_frames(self, mock_sd): @@ -259,6 +277,52 @@ class TestTranscribeRecording: assert result["transcript"] == "hello world" mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1") + def test_filters_whisper_hallucination(self): + mock_transcribe = MagicMock(return_value={ + "success": 
True, + "transcript": "Thank you.", + }) + + with patch("tools.transcription_tools.transcribe_audio", mock_transcribe): + from tools.voice_mode import transcribe_recording + result = transcribe_recording("/tmp/test.wav") + + assert result["success"] is True + assert result["transcript"] == "" + assert result["filtered"] is True + + def test_does_not_filter_real_speech(self): + mock_transcribe = MagicMock(return_value={ + "success": True, + "transcript": "Thank you for helping me with this code.", + }) + + with patch("tools.transcription_tools.transcribe_audio", mock_transcribe): + from tools.voice_mode import transcribe_recording + result = transcribe_recording("/tmp/test.wav") + + assert result["transcript"] == "Thank you for helping me with this code." + assert "filtered" not in result + + +class TestWhisperHallucinationFilter: + def test_known_hallucinations(self): + from tools.voice_mode import is_whisper_hallucination + + assert is_whisper_hallucination("Thank you.") is True + assert is_whisper_hallucination("thank you") is True + assert is_whisper_hallucination("Thanks for watching.") is True + assert is_whisper_hallucination("Bye.") is True + assert is_whisper_hallucination(" Thank you. 
") is True # with whitespace + assert is_whisper_hallucination("you") is True + + def test_real_speech_not_filtered(self): + from tools.voice_mode import is_whisper_hallucination + + assert is_whisper_hallucination("Hello, how are you?") is False + assert is_whisper_hallucination("Thank you for your help with the project.") is False + assert is_whisper_hallucination("Can you explain this code?") is False + # ============================================================================ # play_audio_file diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 5abdc4d601..cdffa99086 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -235,6 +235,12 @@ class AudioRecorder: logger.debug("Recording too short (%d samples), discarding", len(audio_data)) return None + # Skip silent recordings (RMS below threshold = no real speech) + rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2))) + if rms < SILENCE_RMS_THRESHOLD: + logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD) + return None + return self._write_wav(audio_data) def cancel(self) -> None: @@ -276,6 +282,36 @@ class AudioRecorder: return wav_path +# ============================================================================ +# Whisper hallucination filter +# ============================================================================ +# Whisper commonly hallucinates these phrases on silent/near-silent audio. 
+WHISPER_HALLUCINATIONS = { + "thank you.", + "thank you", + "thanks for watching.", + "thanks for watching", + "subscribe to my channel.", + "subscribe to my channel", + "like and subscribe.", + "like and subscribe", + "please subscribe.", + "please subscribe", + "thank you for watching.", + "thank you for watching", + "bye.", + "bye", + "you", + "the end.", + "the end", +} + + +def is_whisper_hallucination(transcript: str) -> bool: + """Check if a transcript is a known Whisper hallucination on silence.""" + return transcript.strip().lower() in WHISPER_HALLUCINATIONS + + # ============================================================================ # STT dispatch # ============================================================================ @@ -283,6 +319,7 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str """Transcribe a WAV recording using the existing Whisper pipeline. Delegates to ``tools.transcription_tools.transcribe_audio()``. + Filters out known Whisper hallucinations on silent audio. Args: wav_path: Path to the WAV file. 
@@ -293,7 +330,14 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str """ from tools.transcription_tools import transcribe_audio - return transcribe_audio(wav_path, model=model) + result = transcribe_audio(wav_path, model=model) + + # Filter out Whisper hallucinations (common on silent/near-silent audio) + if result.get("success") and is_whisper_hallucination(result.get("transcript", "")): + logger.info("Filtered Whisper hallucination: %r", result["transcript"]) + return {"success": True, "transcript": "", "filtered": True} + + return result # ============================================================================ From dad865e920b8cdd14cbd74794ea265eb448d5106 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 20:43:22 +0300 Subject: [PATCH 09/93] fix: fix silence detection bugs and add Phase 4 voice mode features Fix 3 critical bugs in silence detection: - Micro-pause tolerance now tracks dip duration (not time since speech start) - Peak RMS check in stop() prevents discarding recordings with real speech - Reduced min_speech_duration from 0.5s to 0.3s for reliable speech confirmation Phase 4 features: configurable silence params, visual audio level indicator, voice system prompt, tool call audio cues, TTS interrupt, continuous mode auto-restart, interruptable playback via Popen tracking. 
--- cli.py | 113 +++++++++++++++++++++++++++++---- hermes_cli/config.py | 2 + tests/tools/test_voice_mode.py | 50 ++++++++++++++- tools/voice_mode.py | 103 +++++++++++++++++++++++++++--- 4 files changed, 245 insertions(+), 23 deletions(-) diff --git a/cli.py b/cli.py index 9fb613c851..e35fdafb9d 100755 --- a/cli.py +++ b/cli.py @@ -1550,6 +1550,7 @@ class HermesCLI: checkpoints_enabled=self.checkpoints_enabled, checkpoint_max_snapshots=self.checkpoint_max_snapshots, pass_session_id=self.pass_session_id, + tool_progress_callback=self._on_tool_progress, ) # Apply any pending title now that the session exists in the DB if self._pending_title and self._session_db: @@ -3515,6 +3516,28 @@ class HermesCLI: except Exception as e: print(f" ❌ MCP reload failed: {e}") + # ==================================================================== + # Tool progress callback (audio cues for voice mode) + # ==================================================================== + + def _on_tool_progress(self, function_name: str, preview: str, function_args: dict): + """Called when a tool starts executing. 
Plays audio cue in voice mode.""" + if not self._voice_mode: + return + # Skip internal/thinking tools + if function_name.startswith("_"): + return + try: + from tools.voice_mode import play_beep + # Short, subtle tick sound (higher pitch, very brief) + threading.Thread( + target=play_beep, + kwargs={"frequency": 1200, "duration": 0.06, "count": 1}, + daemon=True, + ).start() + except Exception: + pass + # ==================================================================== # Voice mode methods # ==================================================================== @@ -3536,9 +3559,21 @@ class HermesCLI: "Get one at: https://platform.openai.com/api-keys" ) + # Load silence detection params from config + voice_cfg = {} + try: + from hermes_cli.config import load_config + voice_cfg = load_config().get("voice", {}) + except Exception: + pass + if self._voice_recorder is None: self._voice_recorder = AudioRecorder() + # Apply config-driven silence params + self._voice_recorder._silence_threshold = voice_cfg.get("silence_threshold", 200) + self._voice_recorder._silence_duration = voice_cfg.get("silence_duration", 3.0) + def _on_silence(): """Called by AudioRecorder when silence is detected after speech.""" with self._voice_lock: @@ -3549,18 +3584,26 @@ class HermesCLI: self._app.invalidate() self._voice_stop_and_transcribe() + # Audio cue: single beep BEFORE starting stream (avoid CoreAudio conflict) + try: + from tools.voice_mode import play_beep + play_beep(frequency=880, count=1) + except Exception: + pass + self._voice_recorder.start(on_silence_stop=_on_silence) with self._voice_lock: self._voice_recording = True - - # Audio cue: single beep on recording start - try: - from tools.voice_mode import play_beep - threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start() - except Exception: - pass _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}") + # Periodically refresh 
prompt to update audio level indicator + def _refresh_level(): + while self._voice_recording: + if hasattr(self, '_app') and self._app: + self._app.invalidate() + time.sleep(0.15) + threading.Thread(target=_refresh_level, daemon=True).start() + def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" try: @@ -3571,15 +3614,15 @@ class HermesCLI: with self._voice_lock: self._voice_recording = False - # Audio cue: double beep on recording stop + # Audio cue: double beep after stream stopped (no CoreAudio conflict) try: from tools.voice_mode import play_beep - threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start() + play_beep(frequency=660, count=2) except Exception: pass if wav_path is None: - _cprint(f"{_DIM}No speech detected (recording too short).{_RST}") + _cprint(f"{_DIM}No speech detected.{_RST}") return with self._voice_lock: @@ -3614,6 +3657,7 @@ class HermesCLI: finally: with self._voice_lock: self._voice_processing = False + submitted = self._pending_input.qsize() > 0 if hasattr(self, '_app') and self._app: self._app.invalidate() # Clean up temp file @@ -3623,6 +3667,18 @@ class HermesCLI: except Exception: pass + # If no transcript was submitted but continuous mode is active, + # restart recording so the user can keep talking. + # (When transcript IS submitted, process_loop handles restart + # after chat() completes.) 
+ if self._voice_continuous and not submitted and not self._voice_recording: + try: + self._voice_start_recording() + if hasattr(self, '_app') and self._app: + self._app.invalidate() + except Exception: + pass + def _voice_speak_response(self, text: str): """Speak the agent's response aloud using TTS (runs in background thread).""" if not self._voice_tts: @@ -3727,6 +3783,16 @@ class HermesCLI: except Exception: pass + # Append voice-mode system prompt for concise, conversational responses + self._voice_original_prompt = self.system_prompt + voice_instruction = ( + "\n\n[Voice mode active] The user is speaking via voice input. " + "Keep responses concise and conversational — 2-3 sentences max unless " + "the user asks for detail. Avoid code blocks, markdown formatting, " + "and long lists. Respond naturally as in a spoken conversation." + ) + self.system_prompt = (self.system_prompt or "") + voice_instruction + tts_status = " (TTS enabled)" if self._voice_tts else "" _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}") _cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}") @@ -3742,6 +3808,10 @@ class HermesCLI: self._voice_mode = False self._voice_tts = False self._voice_continuous = False + + # Restore original system prompt + if hasattr(self, '_voice_original_prompt'): + self.system_prompt = self._voice_original_prompt _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") def _toggle_voice_tts(self): @@ -4237,11 +4307,24 @@ class HermesCLI: # Icon-only custom prompts should still remain visible in special states. 
return symbol, symbol + def _audio_level_bar(self) -> str: + """Return a visual audio level indicator based on current RMS.""" + _LEVEL_BARS = " ▁▂▃▄▅▆▇" + rec = getattr(self, "_voice_recorder", None) + if rec is None: + return "" + rms = rec.current_rms + # Normalize RMS (0-32767) to 0-7 index, with log-ish scaling + # Typical speech RMS is 500-5000, we cap display at ~8000 + level = min(rms, 8000) * 7 // 8000 + return _LEVEL_BARS[level] + def _get_tui_prompt_fragments(self): """Return the prompt_toolkit fragments for the current interactive state.""" symbol, state_suffix = self._get_tui_prompt_symbols() if self._voice_recording: - return [("class:voice-recording", f"● {state_suffix}")] + bar = self._audio_level_bar() + return [("class:voice-recording", f"● {bar} {state_suffix}")] if self._voice_processing: return [("class:voice-processing", f"◉ {state_suffix}")] if self._sudo_state: @@ -4692,6 +4775,14 @@ class HermesCLI: ).start() else: try: + # Interrupt TTS if playing, so user can start talking + if not cli_ref._voice_tts_done.is_set(): + try: + from tools.voice_mode import stop_playback + stop_playback() + cli_ref._voice_tts_done.set() + except Exception: + pass with cli_ref._voice_lock: cli_ref._voice_continuous = True cli_ref._voice_start_recording() diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 174e4326e4..8dc2076404 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -207,6 +207,8 @@ DEFAULT_CONFIG = { "record_key": "ctrl+r", "max_recording_seconds": 120, "auto_tts": False, + "silence_threshold": 200, # RMS below this = silence (0-32767) + "silence_duration": 3.0, # Seconds of silence before auto-stop }, "human_delay": { diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index 0d40932e2f..e6a46def7c 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -157,6 +157,7 @@ class TestAudioRecorderStop: # Simulate captured audio frames (1 second of loud audio above RMS 
threshold) frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16") recorder._frames = [frame] + recorder._peak_rms = 1000 # Peak RMS above threshold wav_path = recorder.stop() @@ -203,6 +204,7 @@ class TestAudioRecorderStop: # 1 second of near-silence (RMS well below threshold) frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16") recorder._frames = [frame] + recorder._peak_rms = 10 # Peak RMS also below threshold wav_path = recorder.stop() assert wav_path is None @@ -475,8 +477,9 @@ class TestSilenceDetection: from tools.voice_mode import AudioRecorder, SAMPLE_RATE recorder = AudioRecorder() - # Use very short silence duration for testing + # Use very short durations for testing recorder._silence_duration = 0.05 + recorder._min_speech_duration = 0.05 fired = threading.Event() @@ -490,9 +493,11 @@ class TestSilenceDetection: if callback is None: callback = mock_sd.InputStream.call_args[1]["callback"] - # Simulate loud audio (speech) -- RMS well above threshold + # Simulate sustained speech (multiple loud chunks to exceed min_speech_duration) loud_frame = np.full((1600, 1), 5000, dtype="int16") callback(loud_frame, 1600, None, None) + time.sleep(0.06) + callback(loud_frame, 1600, None, None) assert recorder._has_spoken is True # Simulate silence @@ -537,6 +542,47 @@ class TestSilenceDetection: recorder.cancel() + def test_micro_pause_tolerance_during_speech(self, mock_sd): + """Brief dips below threshold during speech should NOT reset speech tracking.""" + np = pytest.importorskip("numpy") + import threading + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder._silence_duration = 0.05 + recorder._min_speech_duration = 0.15 + recorder._max_dip_tolerance = 0.1 + + fired = threading.Event() + recorder.start(on_silence_stop=lambda: fired.set()) + + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = 
mock_sd.InputStream.call_args[1]["callback"] + + loud_frame = np.full((1600, 1), 5000, dtype="int16") + quiet_frame = np.full((1600, 1), 50, dtype="int16") + + # Speech chunk 1 + callback(loud_frame, 1600, None, None) + time.sleep(0.05) + # Brief micro-pause (dip < max_dip_tolerance) + callback(quiet_frame, 1600, None, None) + time.sleep(0.05) + # Speech resumes -- speech_start should NOT have been reset + callback(loud_frame, 1600, None, None) + assert recorder._speech_start > 0, "Speech start should be preserved across brief dips" + time.sleep(0.06) + # Another speech chunk to exceed min_speech_duration + callback(loud_frame, 1600, None, None) + assert recorder._has_spoken is True, "Speech should be confirmed after tolerating micro-pause" + + recorder.cancel() + def test_no_callback_means_no_silence_detection(self, mock_sd): np = pytest.importorskip("numpy") diff --git a/tools/voice_mode.py b/tools/voice_mode.py index cdffa99086..d4fd00f19b 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -117,10 +117,18 @@ class AudioRecorder: self._start_time: float = 0.0 # Silence detection state self._has_spoken = False + self._speech_start: float = 0.0 # When speech attempt began + self._dip_start: float = 0.0 # When current below-threshold dip began + self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm + self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech self._silence_start: float = 0.0 self._on_silence_stop = None self._silence_threshold: int = SILENCE_RMS_THRESHOLD self._silence_duration: float = SILENCE_DURATION_SECONDS + # Peak RMS seen during recording (for speech presence check in stop()) + self._peak_rms: int = 0 + # Live audio level (read by UI for visual feedback) + self._current_rms: int = 0 # -- public properties --------------------------------------------------- @@ -134,6 +142,11 @@ class AudioRecorder: return 0.0 return time.monotonic() - self._start_time + @property + def current_rms(self) -> 
int: + """Current audio input RMS level (0-32767). Updated each audio chunk.""" + return self._current_rms + # -- public methods ------------------------------------------------------ def start(self, on_silence_stop=None) -> None: @@ -161,7 +174,10 @@ class AudioRecorder: self._frames = [] self._start_time = time.monotonic() self._has_spoken = False + self._speech_start = 0.0 + self._dip_start = 0.0 self._silence_start = 0.0 + self._peak_rms = 0 self._on_silence_stop = on_silence_stop def _callback(indata, frames, time_info, status): # noqa: ARG001 @@ -169,15 +185,44 @@ class AudioRecorder: logger.debug("sounddevice status: %s", status) self._frames.append(indata.copy()) - # Silence detection: compute RMS of this chunk + # Compute RMS for level display and silence detection + rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) + self._current_rms = rms + if rms > self._peak_rms: + self._peak_rms = rms + + # Silence detection if self._on_silence_stop is not None and self._recording: - rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) now = time.monotonic() if rms > self._silence_threshold: - self._has_spoken = True + # Audio is above threshold -- this is speech (or noise). + self._dip_start = 0.0 # Reset dip tracker + if self._speech_start == 0.0: + self._speech_start = now + elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration: + self._has_spoken = True + logger.debug("Speech confirmed (%.2fs above threshold)", + now - self._speech_start) self._silence_start = 0.0 elif self._has_spoken: + # Speech already confirmed, let silence timer run below + pass + elif self._speech_start > 0: + # We were in a speech attempt but RMS dipped. + # Tolerate brief dips (micro-pauses between syllables). 
+ if self._dip_start == 0.0: + self._dip_start = now + elif now - self._dip_start >= self._max_dip_tolerance: + # Dip lasted too long -- genuine silence, reset + logger.debug("Speech attempt reset (dip lasted %.2fs)", + now - self._dip_start) + self._speech_start = 0.0 + self._dip_start = 0.0 + # else: brief dip, keep tolerating + # else: no speech attempt, just silence -- nothing to do + + if self._has_spoken and rms <= self._silence_threshold: # User was speaking and now is silent if self._silence_start == 0.0: self._silence_start = now @@ -235,10 +280,11 @@ class AudioRecorder: logger.debug("Recording too short (%d samples), discarding", len(audio_data)) return None - # Skip silent recordings (RMS below threshold = no real speech) - rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2))) - if rms < SILENCE_RMS_THRESHOLD: - logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD) + # Skip silent recordings using peak RMS (not overall average, which + # gets diluted by silence at the end of the recording). + if self._peak_rms < SILENCE_RMS_THRESHOLD: + logger.info("Recording too quiet (peak RMS=%d < %d), discarding", + self._peak_rms, SILENCE_RMS_THRESHOLD) return None return self._write_wav(audio_data) @@ -341,8 +387,34 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str # ============================================================================ -# Audio playback +# Audio playback (interruptable) # ============================================================================ + +# Global reference to the active playback process so it can be interrupted. 
+_active_playback: Optional[subprocess.Popen] = None +_playback_lock = threading.Lock() + + +def stop_playback() -> None: + """Interrupt the currently playing audio (if any).""" + global _active_playback + with _playback_lock: + proc = _active_playback + _active_playback = None + if proc and proc.poll() is None: + try: + proc.terminate() + logger.info("Audio playback interrupted") + except Exception: + pass + # Also stop sounddevice playback if active + if _HAS_AUDIO: + try: + sd.stop() + except Exception: + pass + + def play_audio_file(file_path: str) -> bool: """Play an audio file through the default output device. @@ -351,9 +423,13 @@ def play_audio_file(file_path: str) -> bool: 2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform), ``aplay`` (Linux ALSA). + Playback can be interrupted by calling ``stop_playback()``. + Returns: ``True`` if playback succeeded, ``False`` otherwise. """ + global _active_playback + if not os.path.isfile(file_path): logger.warning("Audio file not found: %s", file_path) return False @@ -372,7 +448,7 @@ def play_audio_file(file_path: str) -> bool: except Exception as e: logger.debug("sounddevice playback failed: %s", e) - # Fall back to system audio players + # Fall back to system audio players (using Popen for interruptability) system = platform.system() players = [] @@ -386,10 +462,17 @@ def play_audio_file(file_path: str) -> bool: exe = shutil.which(cmd[0]) if exe: try: - subprocess.run(cmd, capture_output=True, timeout=300) + proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + with _playback_lock: + _active_playback = proc + proc.wait(timeout=300) + with _playback_lock: + _active_playback = None return True except Exception as e: logger.debug("System player %s failed: %s", cmd[0], e) + with _playback_lock: + _active_playback = None logger.warning("No audio player available for %s", file_path) return False From d7425343eea6a222bfb01a1ac717067617e80315 Mon Sep 17 00:00:00 2001 From: 
0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 20:55:06 +0300 Subject: [PATCH 10/93] fix: fix voice recording stuck in continuous mode - Track submitted state locally instead of using racy qsize() check - Allow Ctrl+R to stop recording even while agent is running - Add double-start guard to prevent concurrent recording attempts --- cli.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/cli.py b/cli.py index e35fdafb9d..5e4f5c7d52 100755 --- a/cli.py +++ b/cli.py @@ -3544,6 +3544,10 @@ class HermesCLI: def _voice_start_recording(self): """Start capturing audio from the microphone.""" + # Prevent double-start from concurrent threads + if self._voice_recording: + return + from tools.voice_mode import AudioRecorder, check_voice_requirements reqs = check_voice_requirements() @@ -3606,6 +3610,8 @@ class HermesCLI: def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" + submitted = False + wav_path = None try: if self._voice_recorder is None: return @@ -3646,6 +3652,7 @@ class HermesCLI: if result.get("success") and result.get("transcript", "").strip(): transcript = result["transcript"].strip() self._pending_input.put(transcript) + submitted = True elif result.get("success"): _cprint(f"{_DIM}No speech detected.{_RST}") else: @@ -3657,7 +3664,6 @@ class HermesCLI: finally: with self._voice_lock: self._voice_processing = False - submitted = self._pending_input.qsize() > 0 if hasattr(self, '_app') and self._app: self._app.invalidate() # Clean up temp file @@ -4758,11 +4764,7 @@ class HermesCLI: """Toggle voice recording when voice mode is active.""" if not cli_ref._voice_mode: return - if cli_ref._agent_running: - return - # Block recording during interactive prompts - if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: - return + # Always allow STOPPING a recording (even when agent is running) if cli_ref._voice_recording: # 
Manual stop via Ctrl+R: stop continuous mode with cli_ref._voice_lock: @@ -4774,6 +4776,11 @@ class HermesCLI: daemon=True, ).start() else: + # Guard: don't START recording during agent run or interactive prompts + if cli_ref._agent_running: + return + if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: + return try: # Interrupt TTS if playing, so user can start talking if not cli_ref._voice_tts_done.is_set(): From 179d9e1a22709a6475d931cb4827abc97bd6ca02 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 3 Mar 2026 23:03:42 +0300 Subject: [PATCH 11/93] feat: add streaming sentence-by-sentence TTS via ElevenLabs Stream audio to speaker as the agent generates tokens instead of waiting for the full response. First sentence plays within ~1-2s of agent starting to respond. - run_agent: add stream_callback to run_conversation/chat, streaming path in _interruptible_api_call accumulates chunks into mock ChatCompletion while forwarding content deltas to callback - tts_tool: add stream_tts_to_speaker() with sentence buffering, think block filtering, markdown stripping, ElevenLabs pcm_24000 streaming to sounddevice OutputStream - cli: wire up streaming TTS pipeline in chat(), detect elevenlabs provider + sounddevice availability, skip batch TTS when streaming is active, signal stop on interrupt Falls back to batch TTS for Edge/OpenAI providers or when elevenlabs/sounddevice are not available. Zero impact on non-voice mode (callback defaults to None). 
--- cli.py | 70 ++++++++++++-- run_agent.py | 127 +++++++++++++++++++++++-- tools/tts_tool.py | 231 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 410 insertions(+), 18 deletions(-) diff --git a/cli.py b/cli.py index 5e4f5c7d52..230d1e9ff9 100755 --- a/cli.py +++ b/cli.py @@ -4093,19 +4093,60 @@ class HermesCLI: try: # Run the conversation with interrupt monitoring result = None - + + # --- Streaming TTS setup --- + # When ElevenLabs is the TTS provider and sounddevice is available, + # we stream audio sentence-by-sentence as the agent generates tokens + # instead of waiting for the full response. + use_streaming_tts = False + text_queue = None + tts_thread = None + stream_callback = None + stop_event = None + + if self._voice_tts: + try: + from tools.tts_tool import ( + _load_tts_config as _load_tts_cfg, + _get_provider as _get_prov, + _HAS_ELEVENLABS as _el_ok, + _HAS_AUDIO as _audio_ok, + stream_tts_to_speaker, + ) + _tts_cfg = _load_tts_cfg() + if (_get_prov(_tts_cfg) == "elevenlabs" and _el_ok and _audio_ok): + use_streaming_tts = True + except Exception: + pass + + if use_streaming_tts: + text_queue = queue.Queue() + stop_event = threading.Event() + + tts_thread = threading.Thread( + target=stream_tts_to_speaker, + args=(text_queue, stop_event, self._voice_tts_done), + daemon=True, + ) + tts_thread.start() + + def stream_callback(delta: str): + if text_queue is not None: + text_queue.put(delta) + def run_agent(): nonlocal result result = self.agent.run_conversation( user_message=message, conversation_history=self.conversation_history[:-1], # Exclude the message we just added + stream_callback=stream_callback, task_id=self.session_id, ) - + # Start agent in background thread agent_thread = threading.Thread(target=run_agent) agent_thread.start() - + # Monitor the dedicated interrupt queue while the agent runs. # _interrupt_queue is separate from _pending_input, so process_loop # and chat() never compete for the same queue. 
@@ -4124,6 +4165,9 @@ class HermesCLI: if self._clarify_state or self._clarify_freetext: continue print(f"\n⚡ New message detected, interrupting...") + # Signal TTS to stop on interrupt + if stop_event is not None: + stop_event.set() self.agent.interrupt(interrupt_msg) # Debug: log to file (stdout may be devnull from redirect_stdout) try: @@ -4143,9 +4187,15 @@ class HermesCLI: else: # Fallback for non-interactive mode (e.g., single-query) agent_thread.join(0.1) - + agent_thread.join() # Ensure agent thread completes + # Signal end-of-text to TTS consumer and wait for it to finish + if use_streaming_tts and text_queue is not None: + text_queue.put(None) # sentinel + if tts_thread is not None: + tts_thread.join(timeout=120) + # Drain any remaining agent output still in the StdoutProxy # buffer so tool/status lines render ABOVE our response box. # The flush pushes data into the renderer queue; the short @@ -4156,15 +4206,15 @@ class HermesCLI: # Update history with full conversation self.conversation_history = result.get("messages", self.conversation_history) if result else self.conversation_history - + # Get the final response response = result.get("final_response", "") if result else "" - + # Handle failed results (e.g., non-retryable errors like invalid model) if result and result.get("failed") and not response: error_detail = result.get("error", "Unknown error") response = f"Error: {error_detail}" - + # Handle interrupt - check if we were interrupted pending_message = None if result and result.get("interrupted"): @@ -4172,8 +4222,9 @@ class HermesCLI: # Add indicator that we were interrupted if response and pending_message: response = response + "\n\n---\n_[Interrupted - processing new message]_" - + response_previewed = result.get("response_previewed", False) if result else False + # Display reasoning (thinking) box if enabled and available if self.show_reasoning and result: reasoning = result.get("last_reasoning") @@ -4226,7 +4277,8 @@ class HermesCLI: 
sys.stdout.flush() # Speak response aloud if voice TTS is enabled - if self._voice_tts and response: + # Skip batch TTS when streaming TTS already handled it + if self._voice_tts and response and not use_streaming_tts: threading.Thread( target=self._voice_speak_response, args=(response,), diff --git a/run_agent.py b/run_agent.py index ba214b715f..6dd08436ac 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2576,10 +2576,16 @@ class AIAgent: """ Run the API call in a background thread so the main conversation loop can detect interrupts without waiting for the full HTTP round-trip. - + On interrupt, closes the HTTP client to cancel the in-flight request (stops token generation and avoids wasting money), then rebuilds the client for future calls. + + When ``self._stream_callback`` is set (streaming TTS mode), the call + uses ``stream=True`` and iterates over chunks inside the background + thread. Content deltas are forwarded to the callback in real-time + while the full response is accumulated and returned as a + ``SimpleNamespace`` that mimics a normal ``ChatCompletion``. 
""" result = {"response": None, "error": None} @@ -2587,10 +2593,103 @@ class AIAgent: try: if self.api_mode == "codex_responses": result["response"] = self._run_codex_stream(api_kwargs) + return elif self.api_mode == "anthropic_messages": result["response"] = self._anthropic_client.messages.create(**api_kwargs) - else: + return + + cb = getattr(self, "_stream_callback", None) + if cb is None: + # Non-streaming path (default) result["response"] = self.client.chat.completions.create(**api_kwargs) + return + + # --- Streaming path for TTS pipeline --- + stream_kwargs = {**api_kwargs, "stream": True} + stream = self.client.chat.completions.create(**stream_kwargs) + + content_parts: list[str] = [] + tool_calls_acc: dict[int, dict] = {} # index -> {id, type, function:{name, arguments}} + finish_reason = None + model_name = None + role = "assistant" + + for chunk in stream: + if not chunk.choices: + # Usage-only or empty chunk + if hasattr(chunk, "model") and chunk.model: + model_name = chunk.model + continue + + delta = chunk.choices[0].delta + if hasattr(chunk, "model") and chunk.model: + model_name = chunk.model + + # Content delta + if delta and delta.content: + content_parts.append(delta.content) + try: + cb(delta.content) + except Exception: + pass + + # Tool call deltas + if delta and delta.tool_calls: + for tc_delta in delta.tool_calls: + idx = tc_delta.index + if idx not in tool_calls_acc: + tool_calls_acc[idx] = { + "id": tc_delta.id or "", + "type": "function", + "function": {"name": "", "arguments": ""}, + } + entry = tool_calls_acc[idx] + if tc_delta.id: + entry["id"] = tc_delta.id + if tc_delta.function: + if tc_delta.function.name: + entry["function"]["name"] += tc_delta.function.name + if tc_delta.function.arguments: + entry["function"]["arguments"] += tc_delta.function.arguments + + if chunk.choices[0].finish_reason: + finish_reason = chunk.choices[0].finish_reason + + # Build a mock ChatCompletion matching the non-streaming interface + full_content = 
"".join(content_parts) or None + mock_tool_calls = None + if tool_calls_acc: + mock_tool_calls = [] + for idx in sorted(tool_calls_acc): + tc = tool_calls_acc[idx] + mock_tool_calls.append(SimpleNamespace( + id=tc["id"], + type=tc["type"], + function=SimpleNamespace( + name=tc["function"]["name"], + arguments=tc["function"]["arguments"], + ), + )) + + mock_message = SimpleNamespace( + role=role, + content=full_content, + tool_calls=mock_tool_calls, + reasoning_content=None, + ) + mock_choice = SimpleNamespace( + index=0, + message=mock_message, + finish_reason=finish_reason or "stop", + ) + mock_response = SimpleNamespace( + id="stream-" + str(uuid.uuid4()), + model=model_name, + choices=[mock_choice], + usage=None, + ) + result["response"] = mock_response + except Exception as e: result["error"] = e @@ -3915,7 +4014,8 @@ class AIAgent: user_message: str, system_message: str = None, conversation_history: List[Dict[str, Any]] = None, - task_id: str = None + task_id: str = None, + stream_callback: Optional[callable] = None, ) -> Dict[str, Any]: """ Run a complete conversation with tool calling until completion. @@ -3925,6 +4025,9 @@ class AIAgent: system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided) conversation_history (List[Dict]): Previous conversation messages (optional) task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided) + stream_callback: Optional callback invoked with each text delta during streaming. + Used by the TTS pipeline to start audio generation before the full response. + When None (default), API calls use the standard non-streaming path. Returns: Dict: Complete conversation result with final response and message history @@ -3933,6 +4036,8 @@ class AIAgent: # Installed once, transparent when streams are healthy, prevents crash on write. 
_install_safe_stdio() + # Store stream callback for _interruptible_api_call to pick up + self._stream_callback = stream_callback # Generate unique task_id if not provided to isolate VMs between concurrent tasks effective_task_id = task_id or str(uuid.uuid4()) @@ -5377,20 +5482,24 @@ class AIAgent: # Clear interrupt state after handling self.clear_interrupt() - + + # Clear stream callback so it doesn't leak into future calls + self._stream_callback = None + return result - - def chat(self, message: str) -> str: + + def chat(self, message: str, stream_callback: Optional[callable] = None) -> str: """ Simple chat interface that returns just the final response. - + Args: message (str): User message - + stream_callback: Optional callback invoked with each text delta during streaming. + Returns: str: Final assistant response """ - result = self.run_conversation(message) + result = self.run_conversation(message, stream_callback=stream_callback) return result["final_response"] diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 3544b20fd8..358bd6f112 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -25,9 +25,12 @@ import datetime import json import logging import os +import queue +import re import shutil import subprocess import tempfile +import threading from pathlib import Path from typing import Dict, Any, Optional @@ -55,6 +58,13 @@ try: except ImportError: _HAS_OPENAI = False +try: + import sounddevice as sd + _HAS_AUDIO = True +except ImportError: + sd = None # type: ignore[assignment] + _HAS_AUDIO = False + # =========================================================================== # Defaults @@ -63,6 +73,7 @@ DEFAULT_PROVIDER = "edge" DEFAULT_EDGE_VOICE = "en-US-AriaNeural" DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2" +DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5" DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts" DEFAULT_OPENAI_VOICE = "alloy" DEFAULT_OUTPUT_DIR = 
str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache") @@ -420,6 +431,226 @@ def check_tts_requirements() -> bool: return False +# =========================================================================== +# Streaming TTS: sentence-by-sentence pipeline for ElevenLabs +# =========================================================================== +# Sentence boundary pattern: punctuation followed by space or newline +_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])(?:\s|\n)|(?:\n\n)') + +# Markdown stripping patterns (same as cli.py _voice_speak_response) +_MD_CODE_BLOCK = re.compile(r'```[\s\S]*?```') +_MD_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)') +_MD_URL = re.compile(r'https?://\S+') +_MD_BOLD = re.compile(r'\*\*(.+?)\*\*') +_MD_ITALIC = re.compile(r'\*(.+?)\*') +_MD_INLINE_CODE = re.compile(r'`(.+?)`') +_MD_HEADER = re.compile(r'^#+\s*', flags=re.MULTILINE) +_MD_LIST_ITEM = re.compile(r'^\s*[-*]\s+', flags=re.MULTILINE) +_MD_HR = re.compile(r'---+') +_MD_EXCESS_NL = re.compile(r'\n{3,}') + + +def _strip_markdown_for_tts(text: str) -> str: + """Remove markdown formatting that shouldn't be spoken aloud.""" + text = _MD_CODE_BLOCK.sub(' ', text) + text = _MD_LINK.sub(r'\1', text) + text = _MD_URL.sub('', text) + text = _MD_BOLD.sub(r'\1', text) + text = _MD_ITALIC.sub(r'\1', text) + text = _MD_INLINE_CODE.sub(r'\1', text) + text = _MD_HEADER.sub('', text) + text = _MD_LIST_ITEM.sub('', text) + text = _MD_HR.sub('', text) + text = _MD_EXCESS_NL.sub('\n\n', text) + return text.strip() + + +def stream_tts_to_speaker( + text_queue: queue.Queue, + stop_event: threading.Event, + tts_done_event: threading.Event, +): + """Consume text deltas from *text_queue*, buffer them into sentences, + and stream each sentence through ElevenLabs TTS to the speaker in + real-time. + + Protocol: + * The producer puts ``str`` deltas onto *text_queue*. + * A ``None`` sentinel signals end-of-text (flush remaining buffer). 
+ * *stop_event* can be set to abort early (e.g. user interrupt). + * *tts_done_event* is **set** in the ``finally`` block so callers + waiting on it (continuous voice mode) know playback is finished. + """ + tts_done_event.clear() + + try: + tts_config = _load_tts_config() + el_config = tts_config.get("elevenlabs", {}) + voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID) + model_id = el_config.get("streaming_model_id", + el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID)) + + api_key = os.getenv("ELEVENLABS_API_KEY", "") + if not api_key: + logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled") + return + + client = ElevenLabs(api_key=api_key) + + # Open a single sounddevice output stream for the lifetime of + # this function. ElevenLabs pcm_24000 produces signed 16-bit + # little-endian mono PCM at 24 kHz. + use_sd = _HAS_AUDIO and sd is not None + output_stream = None + if use_sd: + try: + import numpy as _np + output_stream = sd.OutputStream( + samplerate=24000, channels=1, dtype="int16", + ) + output_stream.start() + except Exception as exc: + logger.warning("sounddevice OutputStream failed: %s", exc) + output_stream = None + + sentence_buf = "" + in_think = False # track ... 
blocks + min_sentence_len = 20 + long_flush_len = 100 + queue_timeout = 0.5 + + def _speak_sentence(sentence: str): + """Generate and play audio for a single sentence.""" + if stop_event.is_set(): + return + cleaned = _strip_markdown_for_tts(sentence).strip() + if not cleaned: + return + # Truncate very long sentences + if len(cleaned) > MAX_TEXT_LENGTH: + cleaned = cleaned[:MAX_TEXT_LENGTH] + try: + audio_iter = client.text_to_speech.convert( + text=cleaned, + voice_id=voice_id, + model_id=model_id, + output_format="pcm_24000", + ) + if output_stream is not None: + for chunk in audio_iter: + if stop_event.is_set(): + break + import numpy as _np + audio_array = _np.frombuffer(chunk, dtype=_np.int16) + output_stream.write(audio_array.reshape(-1, 1)) + else: + # Fallback: write chunks to temp file and play via system player + _play_via_tempfile(audio_iter, stop_event) + except Exception as exc: + logger.warning("Streaming TTS sentence failed: %s", exc) + + def _play_via_tempfile(audio_iter, stop_evt): + """Write PCM chunks to a temp WAV file and play it.""" + try: + import wave + tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + tmp_path = tmp.name + with wave.open(tmp, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) # 16-bit + wf.setframerate(24000) + for chunk in audio_iter: + if stop_evt.is_set(): + break + wf.writeframes(chunk) + from tools.voice_mode import play_audio_file + play_audio_file(tmp_path) + os.unlink(tmp_path) + except Exception as exc: + logger.warning("Temp-file TTS fallback failed: %s", exc) + + while not stop_event.is_set(): + # Read next delta from queue + try: + delta = text_queue.get(timeout=queue_timeout) + except queue.Empty: + # Timeout: if we have accumulated a long buffer, flush it + if len(sentence_buf) > long_flush_len: + _speak_sentence(sentence_buf) + sentence_buf = "" + continue + + if delta is None: + # End-of-text sentinel: flush remaining buffer + if sentence_buf.strip(): + _speak_sentence(sentence_buf) + 
break + + # --- Think block filtering --- + # Process delta character by character for think tags + i = 0 + filtered_delta = [] + while i < len(delta): + # Check for opening ", i) + if end != -1: + i = end + 1 + else: + i = len(delta) + continue + # Check for closing tag + if delta[i:].startswith(""): + in_think = False + i += len("") + continue + if not in_think: + filtered_delta.append(delta[i]) + i += 1 + + text = "".join(filtered_delta) + if not text: + continue + + sentence_buf += text + + # Check for sentence boundaries + while True: + m = _SENTENCE_BOUNDARY_RE.search(sentence_buf) + if m is None: + break + end_pos = m.end() + sentence = sentence_buf[:end_pos] + sentence_buf = sentence_buf[end_pos:] + # Merge short fragments into the next sentence + if len(sentence.strip()) < min_sentence_len: + sentence_buf = sentence + sentence_buf + break + _speak_sentence(sentence) + + # Drain any remaining items from the queue + while True: + try: + text_queue.get_nowait() + except queue.Empty: + break + + # Close the audio output stream + if output_stream is not None: + try: + output_stream.stop() + output_stream.close() + except Exception: + pass + + except Exception as exc: + logger.warning("Streaming TTS pipeline error: %s", exc) + finally: + tts_done_event.set() + + # =========================================================================== # Main -- quick diagnostics # =========================================================================== From fd4f229eab0fc76482fe39eb5340b258efa27a5f Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:26:59 +0300 Subject: [PATCH 12/93] fix: catch OSError on sounddevice import for CI without PortAudio sounddevice raises OSError (not ImportError) when the PortAudio C library is missing. This broke test collection on CI runners that have the Python package installed but lack the native library. 
--- tools/tts_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 358bd6f112..31c57ce010 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -61,7 +61,7 @@ except ImportError: try: import sounddevice as sd _HAS_AUDIO = True -except ImportError: +except (ImportError, OSError): sd = None # type: ignore[assignment] _HAS_AUDIO = False From a15fa8524843cf950ffda6a4d801276d59ab9c5d Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Thu, 5 Mar 2026 21:35:50 +0300 Subject: [PATCH 13/93] fix: catch OSError on sounddevice import in voice_mode.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same PortAudio fix as tts_tool.py — sounddevice raises OSError when the native library is missing on CI runners. --- tools/voice_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/voice_mode.py b/tools/voice_mode.py index d4fd00f19b..bdf2c5353e 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -31,7 +31,7 @@ try: import numpy as np _HAS_AUDIO = True -except ImportError: +except (ImportError, OSError): sd = None # type: ignore[assignment] np = None # type: ignore[assignment] _HAS_AUDIO = False From 7d4b4e95f1250984ec16ccad8f74db2b285e3e1f Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 6 Mar 2026 00:58:29 +0300 Subject: [PATCH 14/93] feat: sync text display with TTS audio playback Move screen output from stream_callback to display_callback called by TTS consumer thread. Text now appears sentence-by-sentence in sync with audio instead of streaming ahead at LLM speed. Removes quiet_mode hack. 
--- cli.py | 43 ++++++++++++++++++++++++----------- tools/tts_tool.py | 57 ++++++++++++++++++++++++++++------------------- 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/cli.py b/cli.py index 230d1e9ff9..3221cbb790 100755 --- a/cli.py +++ b/cli.py @@ -4099,6 +4099,7 @@ class HermesCLI: # we stream audio sentence-by-sentence as the agent generates tokens # instead of waiting for the full response. use_streaming_tts = False + _streaming_box_opened = False text_queue = None tts_thread = None stream_callback = None @@ -4123,9 +4124,21 @@ class HermesCLI: text_queue = queue.Queue() stop_event = threading.Event() + def display_callback(sentence: str): + """Called by TTS consumer when a sentence is ready to display + speak.""" + nonlocal _streaming_box_opened + if not _streaming_box_opened: + _streaming_box_opened = True + w = self.console.width + label = " ⚕ Hermes " + fill = w - 2 - len(label) + _cprint(f"\n{_GOLD}╭─{label}{'─' * max(fill - 1, 0)}╮{_RST}") + _cprint(sentence.rstrip()) + tts_thread = threading.Thread( target=stream_tts_to_speaker, args=(text_queue, stop_event, self._voice_tts_done), + kwargs={"display_callback": display_callback}, daemon=True, ) tts_thread.start() @@ -4244,8 +4257,7 @@ class HermesCLI: _cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}") if response and not response_previewed: - # Use a Rich Panel for the response box — adapts to terminal - # width at render time instead of hard-coding border length. 
+ # Use skin engine for label/color with fallback try: from hermes_cli.skin_engine import get_active_skin _skin = get_active_skin() @@ -4257,17 +4269,22 @@ class HermesCLI: _resp_color = "#CD7F32" _resp_text = "#FFF8DC" - _chat_console = ChatConsole() - _chat_console.print(Panel( - _rich_text_from_ansi(response), - title=f"[{_resp_color} bold]{label}[/]", - title_align="left", - border_style=_resp_color, - style=_resp_text, - box=rich_box.HORIZONTALS, - padding=(1, 2), - )) - + is_error_response = result and (result.get("failed") or result.get("partial")) + if use_streaming_tts and _streaming_box_opened and not is_error_response: + # Text was already printed sentence-by-sentence; just close the box + w = shutil.get_terminal_size().columns + _cprint(f"\n{_GOLD}╰{'─' * (w - 2)}╯{_RST}") + else: + _chat_console = ChatConsole() + _chat_console.print(Panel( + _rich_text_from_ansi(response), + title=f"[{_resp_color} bold]{label}[/]", + title_align="left", + border_style=_resp_color, + style=_resp_text, + box=rich_box.HORIZONTALS, + padding=(1, 2), + )) # Play terminal bell when agent finishes (if enabled). 
diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 31c57ce010..3b8773d49d 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -32,7 +32,7 @@ import subprocess import tempfile import threading from pathlib import Path -from typing import Dict, Any, Optional +from typing import Callable, Dict, Any, Optional logger = logging.getLogger(__name__) @@ -469,6 +469,7 @@ def stream_tts_to_speaker( text_queue: queue.Queue, stop_event: threading.Event, tts_done_event: threading.Event, + display_callback: Optional[Callable[[str], None]] = None, ): """Consume text deltas from *text_queue*, buffer them into sentences, and stream each sentence through ElevenLabs TTS to the speaker in @@ -484,34 +485,38 @@ def stream_tts_to_speaker( tts_done_event.clear() try: + # --- TTS client setup (optional -- display_callback works without it) --- + client = None + output_stream = None + voice_id = DEFAULT_ELEVENLABS_VOICE_ID + model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID + tts_config = _load_tts_config() el_config = tts_config.get("elevenlabs", {}) - voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID) + voice_id = el_config.get("voice_id", voice_id) model_id = el_config.get("streaming_model_id", - el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID)) + el_config.get("model_id", model_id)) api_key = os.getenv("ELEVENLABS_API_KEY", "") if not api_key: - logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled") - return + logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled") + elif _HAS_ELEVENLABS: + client = ElevenLabs(api_key=api_key) - client = ElevenLabs(api_key=api_key) - - # Open a single sounddevice output stream for the lifetime of - # this function. ElevenLabs pcm_24000 produces signed 16-bit - # little-endian mono PCM at 24 kHz. 
- use_sd = _HAS_AUDIO and sd is not None - output_stream = None - if use_sd: - try: - import numpy as _np - output_stream = sd.OutputStream( - samplerate=24000, channels=1, dtype="int16", - ) - output_stream.start() - except Exception as exc: - logger.warning("sounddevice OutputStream failed: %s", exc) - output_stream = None + # Open a single sounddevice output stream for the lifetime of + # this function. ElevenLabs pcm_24000 produces signed 16-bit + # little-endian mono PCM at 24 kHz. + use_sd = _HAS_AUDIO and sd is not None + if use_sd: + try: + import numpy as _np + output_stream = sd.OutputStream( + samplerate=24000, channels=1, dtype="int16", + ) + output_stream.start() + except Exception as exc: + logger.warning("sounddevice OutputStream failed: %s", exc) + output_stream = None sentence_buf = "" in_think = False # track ... blocks @@ -520,12 +525,18 @@ def stream_tts_to_speaker( queue_timeout = 0.5 def _speak_sentence(sentence: str): - """Generate and play audio for a single sentence.""" + """Display sentence and optionally generate + play audio.""" if stop_event.is_set(): return cleaned = _strip_markdown_for_tts(sentence).strip() if not cleaned: return + # Display raw sentence on screen before TTS processing + if display_callback is not None: + display_callback(sentence) + # Skip audio generation if no TTS client available + if client is None: + return # Truncate very long sentences if len(cleaned) > MAX_TEXT_LENGTH: cleaned = cleaned[:MAX_TEXT_LENGTH] From 3a1b35ed92340918db9a869073937fe46898ec65 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 6 Mar 2026 01:32:37 +0300 Subject: [PATCH 15/93] fix: voice mode race conditions, temp file leak, think tag parsing - Atomic check-and-set for _voice_recording flag with _voice_lock - Guard _voice_stop_and_transcribe against concurrent invocation - Remove premature flag clearing from Ctrl+R handler - Clean up temp WAV files in finally block (_play_via_tempfile) - Use 
buffer-level regex for block filtering (handles chunked tags) - Prevent /voice on prompt accumulation on repeated calls - Include Groq in STT key error message --- cli.py | 39 ++++++++++++++++++++++++++------------ tools/tts_tool.py | 48 +++++++++++++++++++---------------------------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/cli.py b/cli.py index 3221cbb790..d15d43a16f 100755 --- a/cli.py +++ b/cli.py @@ -3544,10 +3544,6 @@ class HermesCLI: def _voice_start_recording(self): """Start capturing audio from the microphone.""" - # Prevent double-start from concurrent threads - if self._voice_recording: - return - from tools.voice_mode import AudioRecorder, check_voice_requirements reqs = check_voice_requirements() @@ -3559,10 +3555,18 @@ class HermesCLI: ) if not reqs["stt_key_set"]: raise RuntimeError( - "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n" - "Get one at: https://platform.openai.com/api-keys" + "Voice mode requires an STT API key for transcription.\n" + "Set GROQ_API_KEY (free) or VOICE_TOOLS_OPENAI_KEY.\n" + "Groq: https://console.groq.com/keys\n" + "OpenAI: https://platform.openai.com/api-keys" ) + # Prevent double-start from concurrent threads (atomic check-and-set) + with self._voice_lock: + if self._voice_recording: + return + self._voice_recording = True + # Load silence detection params from config voice_cfg = {} try: @@ -3595,9 +3599,12 @@ class HermesCLI: except Exception: pass - self._voice_recorder.start(on_silence_stop=_on_silence) - with self._voice_lock: - self._voice_recording = True + try: + self._voice_recorder.start(on_silence_stop=_on_silence) + except Exception: + with self._voice_lock: + self._voice_recording = False + raise _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}") # Periodically refresh prompt to update audio level indicator @@ -3610,6 +3617,12 @@ class HermesCLI: def _voice_stop_and_transcribe(self): """Stop recording, 
transcribe via STT, and queue the transcript as input.""" + # Atomic guard: only one thread can enter stop-and-transcribe + with self._voice_lock: + if not self._voice_recording: + return + self._voice_recording = False + submitted = False wav_path = None try: @@ -3617,8 +3630,6 @@ class HermesCLI: return wav_path = self._voice_recorder.stop() - with self._voice_lock: - self._voice_recording = False # Audio cue: double beep after stream stopped (no CoreAudio conflict) try: @@ -3764,6 +3775,10 @@ class HermesCLI: def _enable_voice_mode(self): """Enable voice mode after checking requirements.""" + if self._voice_mode: + _cprint(f"{_DIM}Voice mode is already enabled.{_RST}") + return + from tools.voice_mode import check_voice_requirements reqs = check_voice_requirements() @@ -4838,7 +4853,7 @@ class HermesCLI: # Manual stop via Ctrl+R: stop continuous mode with cli_ref._voice_lock: cli_ref._voice_continuous = False - cli_ref._voice_recording = False + # Flag clearing is handled atomically inside _voice_stop_and_transcribe event.app.invalidate() threading.Thread( target=cli_ref._voice_stop_and_transcribe, diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 3b8773d49d..988fa653a7 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -519,10 +519,11 @@ def stream_tts_to_speaker( output_stream = None sentence_buf = "" - in_think = False # track ... blocks min_sentence_len = 20 long_flush_len = 100 queue_timeout = 0.5 + # Regex to strip complete ... 
blocks from buffer + _think_block_re = re.compile(r'].*?', flags=re.DOTALL) def _speak_sentence(sentence: str): """Display sentence and optionally generate + play audio.""" @@ -562,6 +563,7 @@ def stream_tts_to_speaker( def _play_via_tempfile(audio_iter, stop_evt): """Write PCM chunks to a temp WAV file and play it.""" + tmp_path = None try: import wave tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) @@ -576,9 +578,14 @@ def stream_tts_to_speaker( wf.writeframes(chunk) from tools.voice_mode import play_audio_file play_audio_file(tmp_path) - os.unlink(tmp_path) except Exception as exc: logger.warning("Temp-file TTS fallback failed: %s", exc) + finally: + if tmp_path: + try: + os.unlink(tmp_path) + except OSError: + pass while not stop_event.is_set(): # Read next delta from queue @@ -592,41 +599,24 @@ def stream_tts_to_speaker( continue if delta is None: - # End-of-text sentinel: flush remaining buffer + # End-of-text sentinel: strip any remaining think blocks, flush + sentence_buf = _think_block_re.sub('', sentence_buf) if sentence_buf.strip(): _speak_sentence(sentence_buf) break + sentence_buf += delta + # --- Think block filtering --- - # Process delta character by character for think tags - i = 0 - filtered_delta = [] - while i < len(delta): - # Check for opening ", i) - if end != -1: - i = end + 1 - else: - i = len(delta) - continue - # Check for closing tag - if delta[i:].startswith(""): - in_think = False - i += len("") - continue - if not in_think: - filtered_delta.append(delta[i]) - i += 1 + # Strip complete ... blocks from buffer. + # Works correctly even when tags span multiple deltas. 
+ sentence_buf = _think_block_re.sub('', sentence_buf) - text = "".join(filtered_delta) - if not text: + # If an incomplete ' not in sentence_buf: continue - sentence_buf += text - # Check for sentence boundaries while True: m = _SENTENCE_BOUNDARY_RE.search(sentence_buf) From b00c5949fcae98de1495308e36ff971ccc88aa7c Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 6 Mar 2026 01:51:10 +0300 Subject: [PATCH 16/93] fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors - Add _vprint() helper to suppress log output when stream_callback is active - Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text - Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit) --- cli.py | 5 ++ run_agent.py | 167 ++++++++++++++++++++++++++------------------ tools/voice_mode.py | 28 +++++++- 3 files changed, 130 insertions(+), 70 deletions(-) diff --git a/cli.py b/cli.py index d15d43a16f..0778b62646 100755 --- a/cli.py +++ b/cli.py @@ -4242,6 +4242,11 @@ class HermesCLI: if result and result.get("failed") and not response: error_detail = result.get("error", "Unknown error") response = f"Error: {error_detail}" + # Stop continuous voice mode on persistent errors (e.g. 429 rate limit) + # to avoid an infinite error → record → error loop + if self._voice_continuous: + self._voice_continuous = False + _cprint(f"\n{_DIM}Continuous voice mode stopped due to error.{_RST}") # Handle interrupt - check if we were interrupted pending_message = None diff --git a/run_agent.py b/run_agent.py index 6dd08436ac..475a797fc7 100644 --- a/run_agent.py +++ b/run_agent.py @@ -493,6 +493,10 @@ class AIAgent: ]: logging.getLogger(quiet_logger).setLevel(logging.ERROR) + # Internal stream callback (set during streaming TTS). + # Initialized here so _vprint can reference it before run_conversation. 
+ self._stream_callback = None + # Initialize LLM client via centralized provider router. # The router handles auth resolution, base URL, headers, and # Codex/Anthropic wrapping for all known providers. @@ -812,6 +816,12 @@ class AIAgent: else: print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)") + def _vprint(self, *args, **kwargs): + """Verbose print — suppressed when streaming TTS is active.""" + if getattr(self, "_stream_callback", None) is not None: + return + print(*args, **kwargs) + def _max_tokens_param(self, value: int) -> dict: """Return the correct max tokens kwarg for the current provider. @@ -1340,7 +1350,7 @@ class AIAgent: encoding="utf-8", ) - print(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}") + self._vprint(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}") if os.getenv("HERMES_DUMP_REQUEST_STDOUT", "").strip().lower() in {"1", "true", "yes", "on"}: print(json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str)) @@ -1482,7 +1492,7 @@ class AIAgent: # Replay the items into the store (replace mode) self._todo_store.write(last_todo_response, merge=False) if not self.quiet_mode: - print(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history") + self._vprint(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history") _set_interrupt(False) @property @@ -3578,7 +3588,7 @@ class AIAgent: if self._interrupt_requested: remaining_calls = assistant_message.tool_calls[i-1:] if remaining_calls: - print(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)") + self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)") for skipped_tc in remaining_calls: skipped_name = skipped_tc.function.name skip_msg = { @@ -3640,7 +3650,7 @@ class AIAgent: ) tool_duration = time.time() - tool_start_time if self.quiet_mode: - print(f" {_get_cute_tool_message_impl('todo', 
function_args, tool_duration, result=function_result)}") + self._vprint(f" {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}") elif function_name == "session_search": if not self._session_db: function_result = json.dumps({"success": False, "error": "Session database not available."}) @@ -3655,7 +3665,7 @@ class AIAgent: ) tool_duration = time.time() - tool_start_time if self.quiet_mode: - print(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}") + self._vprint(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}") elif function_name == "memory": target = function_args.get("target", "memory") from tools.memory_tool import memory_tool as _memory_tool @@ -3671,7 +3681,7 @@ class AIAgent: self._honcho_save_user_observation(function_args.get("content", "")) tool_duration = time.time() - tool_start_time if self.quiet_mode: - print(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}") + self._vprint(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}") elif function_name == "clarify": from tools.clarify_tool import clarify_tool as _clarify_tool function_result = _clarify_tool( @@ -3681,7 +3691,7 @@ class AIAgent: ) tool_duration = time.time() - tool_start_time if self.quiet_mode: - print(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}") + self._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}") elif function_name == "delegate_task": from tools.delegate_tool import delegate_task as _delegate_task tasks_arg = function_args.get("tasks") @@ -3714,8 +3724,8 @@ class AIAgent: if spinner: spinner.stop(cute_msg) elif self.quiet_mode: - print(f" {cute_msg}") - elif self.quiet_mode: + self._vprint(f" {cute_msg}") + elif self.quiet_mode and 
self._stream_callback is None: face = random.choice(KawaiiSpinner.KAWAII_WAITING) tool_emoji_map = { 'web_search': '🔍', 'web_extract': '📄', 'web_crawl': '🕸️', @@ -3802,7 +3812,7 @@ class AIAgent: if self._interrupt_requested and i < len(assistant_message.tool_calls): remaining = len(assistant_message.tool_calls) - i - print(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)") + self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)") for skipped_tc in assistant_message.tool_calls[i:]: skipped_name = skipped_tc.function.name skip_msg = { @@ -4344,11 +4354,11 @@ class AIAgent: thinking_spinner = None if not self.quiet_mode: - print(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...") - print(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") - print(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}") - else: - # Animated thinking spinner in quiet mode + self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...") + self._vprint(f"{self.log_prefix} 📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)") + self._vprint(f"{self.log_prefix} 🔧 Available tools: {len(self.tools) if self.tools else 0}") + elif self._stream_callback is None: + # Animated thinking spinner in quiet mode (skip during streaming TTS) face = random.choice(KawaiiSpinner.KAWAII_THINKING) verb = random.choice(KawaiiSpinner.THINKING_VERBS) if self.thinking_callback: @@ -4401,7 +4411,7 @@ class AIAgent: self.thinking_callback("") if not self.quiet_mode: - print(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s") + self._vprint(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s") if self.verbose_logging: # Log response with provider info if available @@ -4478,17 +4488,17 @@ class AIAgent: if self.verbose_logging: 
logging.debug(f"Response attributes for invalid response: {resp_attrs}") - print(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}") - print(f"{self.log_prefix} 🏢 Provider: {provider_name}") - print(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}") - print(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)") + self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}") + self._vprint(f"{self.log_prefix} 🏢 Provider: {provider_name}") + self._vprint(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}") + self._vprint(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)") if retry_count >= max_retries: # Try fallback before giving up if self._try_activate_fallback(): retry_count = 0 continue - print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.") + self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. 
Giving up.", force=True) logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.") self._persist_session(messages, conversation_history) return { @@ -4501,14 +4511,14 @@ class AIAgent: # Longer backoff for rate limiting (likely cause of None choices) wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s - print(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...") + self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...") logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") # Sleep in small increments to stay responsive to interrupts sleep_end = time.time() + wait_time while time.time() < sleep_end: if self._interrupt_requested: - print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.") + self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.") self._persist_session(messages, conversation_history) self.clear_interrupt() return { @@ -4541,7 +4551,7 @@ class AIAgent: finish_reason = response.choices[0].finish_reason if finish_reason == "length": - print(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens") + self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens") if self.api_mode == "chat_completions": assistant_message = response.choices[0].message @@ -4553,7 +4563,7 @@ class AIAgent: truncated_response_prefix += assistant_message.content if length_continue_retries < 3: - print( + self._vprint( f"{self.log_prefix}↻ Requesting continuation " f"({length_continue_retries}/3)..." 
) @@ -4585,7 +4595,7 @@ class AIAgent: # If we have prior messages, roll back to last complete state if len(messages) > 1: - print(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn") + self._vprint(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn") rolled_back_messages = self._get_messages_up_to_last_assistant(messages) self._cleanup_task_resources(effective_task_id) @@ -4601,7 +4611,7 @@ class AIAgent: } else: # First message was truncated - mark as failed - print(f"{self.log_prefix}❌ First response truncated - cannot recover") + self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover") self._persist_session(messages, conversation_history) return { "final_response": None, @@ -4661,7 +4671,7 @@ class AIAgent: prompt = usage_dict["prompt_tokens"] hit_pct = (cached / prompt * 100) if prompt > 0 else 0 if not self.quiet_mode: - print(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)") + self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)") break # Success, exit retry loop @@ -4672,7 +4682,7 @@ class AIAgent: if self.thinking_callback: self.thinking_callback("") api_elapsed = time.time() - api_start_time - print(f"{self.log_prefix}⚡ Interrupted during API call.") + self._vprint(f"{self.log_prefix}⚡ Interrupted during API call.", force=True) self._persist_session(messages, conversation_history) interrupted = True final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)." @@ -4695,7 +4705,7 @@ class AIAgent: ): codex_auth_retry_attempted = True if self._try_refresh_codex_client_credentials(force=True): - print(f"{self.log_prefix}🔐 Codex auth refreshed after 401. Retrying request...") + self._vprint(f"{self.log_prefix}🔐 Codex auth refreshed after 401. 
Retrying request...") continue if ( self.api_mode == "chat_completions" @@ -4743,14 +4753,14 @@ class AIAgent: error_type = type(api_error).__name__ error_msg = str(api_error).lower() - print(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}") - print(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s") - print(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}") - print(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools") + self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}") + self._vprint(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s") + self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}") + self._vprint(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools") # Check for interrupt before deciding to retry if self._interrupt_requested: - print(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.") + self._vprint(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.") self._persist_session(messages, conversation_history) self.clear_interrupt() return { @@ -4775,7 +4785,7 @@ class AIAgent: if is_payload_too_large: compression_attempts += 1 if compression_attempts > max_compression_attempts: - print(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.") + self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True) logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.") self._persist_session(messages, conversation_history) return { @@ -4785,7 +4795,7 @@ class AIAgent: "error": f"Request payload too large: max 
compression attempts ({max_compression_attempts}) reached.", "partial": True } - print(f"{self.log_prefix}⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...") + self._vprint(f"{self.log_prefix}⚠️ Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...") original_len = len(messages) messages, active_system_prompt = self._compress_context( @@ -4794,12 +4804,12 @@ class AIAgent: ) if len(messages) < original_len: - print(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") + self._vprint(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") time.sleep(2) # Brief pause between compression retries restart_with_compressed_messages = True break else: - print(f"{self.log_prefix}❌ Payload too large and cannot compress further.") + self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.") logging.error(f"{self.log_prefix}413 payload too large. 
Cannot compress further.") self._persist_session(messages, conversation_history) return { @@ -4830,7 +4840,7 @@ class AIAgent: parsed_limit = parse_context_limit_from_error(error_msg) if parsed_limit and parsed_limit < old_ctx: new_ctx = parsed_limit - print(f"{self.log_prefix}⚠️ Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})") + self._vprint(f"{self.log_prefix}⚠️ Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True) else: # Step down to the next probe tier new_ctx = get_next_probe_tier(old_ctx) @@ -4839,13 +4849,13 @@ class AIAgent: compressor.context_length = new_ctx compressor.threshold_tokens = int(new_ctx * compressor.threshold_percent) compressor._context_probed = True - print(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens") + self._vprint(f"{self.log_prefix}⚠️ Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True) else: - print(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...") + self._vprint(f"{self.log_prefix}⚠️ Context length exceeded at minimum tier — attempting compression...", force=True) compression_attempts += 1 if compression_attempts > max_compression_attempts: - print(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.") + self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") self._persist_session(messages, conversation_history) return { @@ -4855,7 +4865,7 @@ class AIAgent: "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", "partial": True } - print(f"{self.log_prefix} 🗜️ Context compression attempt {compression_attempts}/{max_compression_attempts}...") + self._vprint(f"{self.log_prefix} 🗜️ Context compression attempt 
{compression_attempts}/{max_compression_attempts}...") original_len = len(messages) messages, active_system_prompt = self._compress_context( @@ -4865,14 +4875,14 @@ class AIAgent: if len(messages) < original_len or new_ctx and new_ctx < old_ctx: if len(messages) < original_len: - print(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") + self._vprint(f"{self.log_prefix} 🗜️ Compressed {original_len} → {len(messages)} messages, retrying...") time.sleep(2) # Brief pause between compression retries restart_with_compressed_messages = True break else: # Can't compress further and already at minimum tier - print(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.") - print(f"{self.log_prefix} 💡 The conversation has accumulated too much content.") + self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True) + self._vprint(f"{self.log_prefix} 💡 The conversation has accumulated too much content.", force=True) logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.") self._persist_session(messages, conversation_history) return { @@ -4908,8 +4918,8 @@ class AIAgent: self._dump_api_request_debug( api_kwargs, reason="non_retryable_client_error", error=api_error, ) - print(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.") - print(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.") + self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.") + self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.") logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") self._persist_session(messages, conversation_history) return { @@ -4926,7 +4936,7 @@ class AIAgent: if self._try_activate_fallback(): retry_count = 0 continue - print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. 
Giving up.") + self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.", force=True) logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}") logging.error(f"{self.log_prefix}Request details - Messages: {len(api_messages)}, Approx tokens: {approx_tokens:,}") raise api_error @@ -4934,15 +4944,15 @@ class AIAgent: wait_time = min(2 ** retry_count, 60) # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}") if retry_count >= max_retries: - print(f"{self.log_prefix}⚠️ API call failed after {retry_count} attempts: {str(api_error)[:100]}") - print(f"{self.log_prefix}⏳ Final retry in {wait_time}s...") + self._vprint(f"{self.log_prefix}⚠️ API call failed after {retry_count} attempts: {str(api_error)[:100]}") + self._vprint(f"{self.log_prefix}⏳ Final retry in {wait_time}s...") # Sleep in small increments so we can respond to interrupts quickly # instead of blocking the entire wait_time in one sleep() call sleep_end = time.time() + wait_time while time.time() < sleep_end: if self._interrupt_requested: - print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.") + self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.") self._persist_session(messages, conversation_history) self.clear_interrupt() return { @@ -5006,7 +5016,7 @@ class AIAgent: # Handle assistant response if assistant_message.content and not self.quiet_mode: - print(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}") + self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}") # Notify progress callback of model's thinking (used by subagent # delegation to relay the child's reasoning to the parent display). 
@@ -5033,15 +5043,15 @@ class AIAgent: self._incomplete_scratchpad_retries = 0 self._incomplete_scratchpad_retries += 1 - print(f"{self.log_prefix}⚠️ Incomplete detected (opened but never closed)") + self._vprint(f"{self.log_prefix}⚠️ Incomplete detected (opened but never closed)") if self._incomplete_scratchpad_retries <= 2: - print(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...") + self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...") # Don't add the broken message, just retry continue else: # Max retries - discard this turn and save as partial - print(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.") + self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.") self._incomplete_scratchpad_retries = 0 rolled_back_messages = self._get_messages_up_to_last_assistant(messages) @@ -5084,7 +5094,7 @@ class AIAgent: if self._codex_incomplete_retries < 3: if not self.quiet_mode: - print(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)") + self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)") self._session_messages = messages self._save_session_log(messages) continue @@ -5105,7 +5115,7 @@ class AIAgent: # Check for tool calls if assistant_message.tool_calls: if not self.quiet_mode: - print(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...") + self._vprint(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...") if self.verbose_logging: for tc in assistant_message.tool_calls: @@ -5124,11 +5134,30 @@ class AIAgent: if tc.function.name not in self.valid_tool_names ] if invalid_tool_calls: + # Track retries for invalid tool calls + if not hasattr(self, '_invalid_tool_retries'): + self._invalid_tool_retries = 0 + self._invalid_tool_retries += 1 + # 
Return helpful error to model — model can self-correct next turn available = ", ".join(sorted(self.valid_tool_names)) invalid_name = invalid_tool_calls[0] invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name - print(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction") + self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)") + + if self._invalid_tool_retries >= 3: + self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.") + self._invalid_tool_retries = 0 + self._persist_session(messages, conversation_history) + return { + "final_response": None, + "messages": messages, + "api_calls": api_call_count, + "completed": False, + "partial": True, + "error": f"Model generated invalid tool call: {invalid_preview}" + } + assistant_msg = self._build_assistant_message(assistant_message, finish_reason) messages.append(assistant_msg) for tc in assistant_message.tool_calls: @@ -5165,15 +5194,15 @@ class AIAgent: self._invalid_json_retries += 1 tool_name, error_msg = invalid_json_args[0] - print(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") + self._vprint(f"{self.log_prefix}⚠️ Invalid JSON in tool call arguments for '{tool_name}': {error_msg}") if self._invalid_json_retries < 3: - print(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...") + self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...") # Don't add anything to messages, just retry the API call continue else: # Instead of returning partial, inject a helpful message and let model recover - print(f"{self.log_prefix}⚠️ Injecting recovery message for invalid JSON...") + self._vprint(f"{self.log_prefix}⚠️ Injecting recovery message for invalid JSON...") self._invalid_json_retries = 0 # Reset for next 
attempt # Add a user message explaining the issue @@ -5203,7 +5232,7 @@ class AIAgent: if self.quiet_mode: clean = self._strip_think_blocks(turn_content).strip() if clean: - print(f" ┊ 💬 {clean}") + self._vprint(f" ┊ 💬 {clean}") messages.append(assistant_msg) @@ -5279,19 +5308,19 @@ class AIAgent: self._empty_content_retries += 1 reasoning_text = self._extract_reasoning(assistant_message) - print(f"{self.log_prefix}⚠️ Response only contains think block with no content after it") + self._vprint(f"{self.log_prefix}⚠️ Response only contains think block with no content after it") if reasoning_text: reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text - print(f"{self.log_prefix} Reasoning: {reasoning_preview}") + self._vprint(f"{self.log_prefix} Reasoning: {reasoning_preview}") else: content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response - print(f"{self.log_prefix} Content: '{content_preview}'") + self._vprint(f"{self.log_prefix} Content: '{content_preview}'") if self._empty_content_retries < 3: - print(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...") + self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...") continue else: - print(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.") + self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.") self._empty_content_retries = 0 # If a prior tool_calls turn had real content, salvage it: diff --git a/tools/voice_mode.py b/tools/voice_mode.py index bdf2c5353e..87b6cad67d 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -12,6 +12,7 @@ Dependencies (optional): import logging import os import platform +import re import shutil import subprocess import tempfile @@ -350,12 +351,37 @@ WHISPER_HALLUCINATIONS = { "you", "the end.", "the end", + # Non-English hallucinations (common on silence) + "продолжение следует", + "продолжение 
следует...", + "sous-titres", + "sous-titres réalisés par la communauté d'amara.org", + "sottotitoli creati dalla comunità amara.org", + "untertitel von stephanie geiges", + "amara.org", + "www.mooji.org", + "ご視聴ありがとうございました", } +# Regex patterns for repetitive hallucinations (e.g. "Thank you. Thank you. Thank you.") +_HALLUCINATION_REPEAT_RE = re.compile( + r'^(?:thank you|thanks|bye|you|ok|okay|the end|\.|\s|,|!)+$', + flags=re.IGNORECASE, +) + def is_whisper_hallucination(transcript: str) -> bool: """Check if a transcript is a known Whisper hallucination on silence.""" - return transcript.strip().lower() in WHISPER_HALLUCINATIONS + cleaned = transcript.strip().lower() + if not cleaned: + return True + # Exact match against known phrases + if cleaned.rstrip('.!') in WHISPER_HALLUCINATIONS or cleaned in WHISPER_HALLUCINATIONS: + return True + # Repetitive patterns (e.g. "Thank you. Thank you. Thank you. you") + if _HALLUCINATION_REPEAT_RE.match(cleaned): + return True + return False # ============================================================================ From 404123aea78ee13f83d5c5d89c6563ab02efa7c0 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Fri, 6 Mar 2026 02:16:23 +0300 Subject: [PATCH 17/93] feat: add persistent voice mode status bar below input area Shows voice state (recording, transcribing, TTS/continuous toggles) as a persistent toolbar using prompt_toolkit ConditionalContainer. 
--- cli.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cli.py b/cli.py index 0778b62646..32c88ec96d 100755 --- a/cli.py +++ b/cli.py @@ -5338,6 +5338,24 @@ class HermesCLI: height=Condition(lambda: bool(cli_ref._attached_images)), ) + # Persistent voice mode status bar (visible only when voice mode is on) + def _get_voice_status(): + if cli_ref._voice_recording: + return [('class:voice-status-recording', ' ● REC Ctrl+R to stop ')] + if cli_ref._voice_processing: + return [('class:voice-status', ' ◉ Transcribing... ')] + tts = " | TTS on" if cli_ref._voice_tts else "" + cont = " | Continuous" if cli_ref._voice_continuous else "" + return [('class:voice-status', f' 🎤 Voice mode{tts}{cont} — Ctrl+R to record ')] + + voice_status_bar = ConditionalContainer( + Window( + FormattedTextControl(_get_voice_status), + height=1, + ), + filter=Condition(lambda: cli_ref._voice_mode), + ) + # Layout: interactive prompt widgets + ruled input at bottom. # The sudo, approval, and clarify widgets appear above the input when # the corresponding interactive prompt is active. @@ -5354,6 +5372,7 @@ class HermesCLI: image_bar, input_area, input_rule_bot, + voice_status_bar, CompletionsMenu(max_height=12, scroll_offset=1), ]) ) @@ -5398,6 +5417,8 @@ class HermesCLI: 'voice-prompt': '#87CEEB', 'voice-recording': '#FF4444 bold', 'voice-processing': '#FFA500 italic', + 'voice-status': 'bg:#1a1a2e #87CEEB', + 'voice-status-recording': 'bg:#1a1a2e #FF4444 bold', } style = PTStyle.from_dict(self._build_tui_style_dict()) From 46db7aeffd022ff4e6bb6586a3b3780c392fcc16 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Sat, 7 Mar 2026 01:49:12 +0300 Subject: [PATCH 18/93] fix: streaming tool call parsing, error handling, and fake HA state mutation - Fix Gemini streaming tool call merge bug: multiple tool calls with same index but different IDs are now parsed as separate calls instead of concatenating names (e.g. 
ha_call_serviceha_call_service) - Handle partial results in voice mode: show error and stop continuous mode when agent returns partial/failed results with empty response - Fix error display during streaming TTS: error messages are shown in full response box even when streaming box was already opened - Add duplicate sentence filter in TTS: skip near-duplicate sentences from LLM repetition - Fix fake HA server state mutation: turn_on/turn_off/set_temperature correctly update entity states; temperature sensor simulates change when thermostat is adjusted --- cli.py | 6 ++++-- run_agent.py | 16 +++++++++++++++- tests/fakes/fake_ha_server.py | 17 +++++++++++++++-- tools/tts_tool.py | 7 +++++++ 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/cli.py b/cli.py index 32c88ec96d..dd15151ee8 100755 --- a/cli.py +++ b/cli.py @@ -4238,8 +4238,10 @@ class HermesCLI: # Get the final response response = result.get("final_response", "") if result else "" - # Handle failed results (e.g., non-retryable errors like invalid model) - if result and result.get("failed") and not response: + # Handle failed or partial results (e.g., non-retryable errors, rate limits, + # truncated output, invalid tool calls). Both "failed" and "partial" with + # an empty final_response mean the agent couldn't produce a usable answer. + if result and (result.get("failed") or result.get("partial")) and not response: error_detail = result.get("error", "Unknown error") response = f"Error: {error_detail}" # Stop continuous voice mode on persistent errors (e.g. 429 rate limit) diff --git a/run_agent.py b/run_agent.py index 475a797fc7..152d6092ea 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2646,7 +2646,21 @@ class AIAgent: # Tool call deltas if delta and delta.tool_calls: for tc_delta in delta.tool_calls: - idx = tc_delta.index + idx = tc_delta.index if tc_delta.index is not None else 0 + # Gemini may reuse index 0 for multiple tool calls, + # sending a new id each time. 
Detect this and assign + # a fresh virtual index so calls don't merge. + if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]: + # Look for existing entry with this id first + # (follow-up deltas for an already-created tool call) + matched = False + for eidx, eentry in tool_calls_acc.items(): + if eentry["id"] == tc_delta.id: + idx = eidx + matched = True + break + if not matched: + idx = (max(k for k in tool_calls_acc if isinstance(k, int)) + 1) if tool_calls_acc else 0 if idx not in tool_calls_acc: tool_calls_acc[idx] = { "id": tc_delta.id or "", diff --git a/tests/fakes/fake_ha_server.py b/tests/fakes/fake_ha_server.py index 1d51bf51b6..b5119da366 100644 --- a/tests/fakes/fake_ha_server.py +++ b/tests/fakes/fake_ha_server.py @@ -275,12 +275,25 @@ class FakeHAServer: affected = [] entity_id = body.get("entity_id") if entity_id: - new_state = "on" if service == "turn_on" else "off" for s in ENTITY_STATES: if s["entity_id"] == entity_id: + if service == "turn_on": + s["state"] = "on" + elif service == "turn_off": + s["state"] = "off" + elif service == "set_temperature" and "temperature" in body: + s["attributes"]["temperature"] = body["temperature"] + # Keep current state or set to heat if off + if s["state"] == "off": + s["state"] = "heat" + # Simulate temperature sensor approaching the target + for ts in ENTITY_STATES: + if ts["entity_id"] == "sensor.temperature": + ts["state"] = str(body["temperature"] - 0.5) + break affected.append({ "entity_id": entity_id, - "state": new_state, + "state": s["state"], "attributes": s.get("attributes", {}), }) break diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 988fa653a7..6c4e53787a 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -522,6 +522,7 @@ def stream_tts_to_speaker( min_sentence_len = 20 long_flush_len = 100 queue_timeout = 0.5 + _spoken_sentences: list[str] = [] # track spoken sentences to skip duplicates # Regex to strip complete ... 
blocks from buffer _think_block_re = re.compile(r'].*?', flags=re.DOTALL) @@ -532,6 +533,12 @@ def stream_tts_to_speaker( cleaned = _strip_markdown_for_tts(sentence).strip() if not cleaned: return + # Skip duplicate/near-duplicate sentences (LLM repetition) + cleaned_lower = cleaned.lower().rstrip(".!,") + for prev in _spoken_sentences: + if prev.lower().rstrip(".!,") == cleaned_lower: + return + _spoken_sentences.append(cleaned) # Display raw sentence on screen before TTS processing if display_callback is not None: display_callback(sentence) From 143cc68946a6009ecaac39c012d3e8c26a474946 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Sun, 8 Mar 2026 21:58:04 +0300 Subject: [PATCH 19/93] fix(test): add /voice to EXPECTED_COMMANDS set in test_commands.py --- tests/hermes_cli/test_commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/hermes_cli/test_commands.py b/tests/hermes_cli/test_commands.py index 9aa7220806..218059434a 100644 --- a/tests/hermes_cli/test_commands.py +++ b/tests/hermes_cli/test_commands.py @@ -12,7 +12,7 @@ EXPECTED_COMMANDS = { "/personality", "/clear", "/history", "/new", "/reset", "/retry", "/undo", "/save", "/config", "/cron", "/skills", "/platforms", "/verbose", "/reasoning", "/compress", "/title", "/usage", "/insights", "/paste", - "/reload-mcp", "/rollback", "/background", "/skin", "/quit", + "/reload-mcp", "/rollback", "/background", "/skin", "/voice", "/quit", } From b859dfab16268da39ac393b1f54407089d32a034 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:48:49 +0300 Subject: [PATCH 20/93] fix: address voice mode review feedback 1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and openai are never imported at module level. Each is imported only when the feature is explicitly activated, preventing crashes in headless environments (SSH, Docker, WSL, no PortAudio). 2. 
No core agent loop changes: streaming TTS path extracted from _interruptible_api_call() into separate _streaming_api_call() method. The original method is restored to its upstream form. 3. Configurable key binding: push-to-talk key changed from Ctrl+R (conflicts with readline reverse-search) to Ctrl+B by default. Configurable via voice.push_to_talk_key in config.yaml. 4. Environment detection: new detect_audio_environment() function checks for SSH, Docker, WSL, and missing audio devices before enabling voice mode. Auto-disables with clear warnings in incompatible environments. 5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream, sd.OutputStream) wrapped in try/except with ImportError/OSError handling. Failures produce warnings, not crashes. --- cli.py | 101 +++++++----- run_agent.py | 72 +++++---- tests/tools/test_voice_mode.py | 286 ++++++++++++++++++++++++++++++--- tools/tts_tool.py | 101 +++++++----- tools/voice_mode.py | 108 ++++++++++--- 5 files changed, 526 insertions(+), 142 deletions(-) diff --git a/cli.py b/cli.py index dd15151ee8..46d2372997 100755 --- a/cli.py +++ b/cli.py @@ -3779,7 +3779,15 @@ class HermesCLI: _cprint(f"{_DIM}Voice mode is already enabled.{_RST}") return - from tools.voice_mode import check_voice_requirements + from tools.voice_mode import check_voice_requirements, detect_audio_environment + + # Environment detection -- warn and block in incompatible environments + env_check = detect_audio_environment() + if not env_check["available"]: + _cprint(f"\n{_GOLD}Voice mode unavailable in this environment:{_RST}") + for warning in env_check["warnings"]: + _cprint(f" {_DIM}{warning}{_RST}") + return reqs = check_voice_requirements() if not reqs["available"]: @@ -3815,8 +3823,14 @@ class HermesCLI: self.system_prompt = (self.system_prompt or "") + voice_instruction tts_status = " (TTS enabled)" if self._voice_tts else "" + try: + from hermes_cli.config import load_config + _ptt_key = load_config().get("voice", 
{}).get("push_to_talk_key", "c-b") + except Exception: + _ptt_key = "c-b" + _ptt_display = _ptt_key.replace("c-", "Ctrl+").upper() _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}") - _cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}") + _cprint(f" {_DIM}{_ptt_display} to start/stop recording{_RST}") _cprint(f" {_DIM}/voice tts to toggle speech output{_RST}") _cprint(f" {_DIM}/voice off to disable voice mode{_RST}") @@ -4804,6 +4818,51 @@ class HermesCLI: self._should_exit = True event.app.exit() + # Voice push-to-talk key: configurable via config.yaml (voice.push_to_talk_key) + # Default: Ctrl+B (avoids conflict with Ctrl+R readline reverse-search) + try: + from hermes_cli.config import load_config + _voice_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b") + except Exception: + _voice_key = "c-b" + + @kb.add(_voice_key) + def handle_voice_record(event): + """Toggle voice recording when voice mode is active.""" + if not cli_ref._voice_mode: + return + # Always allow STOPPING a recording (even when agent is running) + if cli_ref._voice_recording: + # Manual stop via Ctrl+R: stop continuous mode + with cli_ref._voice_lock: + cli_ref._voice_continuous = False + # Flag clearing is handled atomically inside _voice_stop_and_transcribe + event.app.invalidate() + threading.Thread( + target=cli_ref._voice_stop_and_transcribe, + daemon=True, + ).start() + else: + # Guard: don't START recording during agent run or interactive prompts + if cli_ref._agent_running: + return + if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: + return + try: + # Interrupt TTS if playing, so user can start talking + if not cli_ref._voice_tts_done.is_set(): + try: + from tools.voice_mode import stop_playback + stop_playback() + cli_ref._voice_tts_done.set() + except Exception: + pass + with cli_ref._voice_lock: + cli_ref._voice_continuous = True + cli_ref._voice_start_recording() + event.app.invalidate() + except Exception as e: + 
_cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") from prompt_toolkit.keys import Keys @kb.add(Keys.BracketedPaste, eager=True) @@ -4850,44 +4909,6 @@ class HermesCLI: # No image found — show a hint pass # silent when no image (avoid noise on accidental press) - @kb.add('c-space') - def handle_ctrl_space(event): - """Toggle voice recording when voice mode is active.""" - if not cli_ref._voice_mode: - return - # Always allow STOPPING a recording (even when agent is running) - if cli_ref._voice_recording: - # Manual stop via Ctrl+R: stop continuous mode - with cli_ref._voice_lock: - cli_ref._voice_continuous = False - # Flag clearing is handled atomically inside _voice_stop_and_transcribe - event.app.invalidate() - threading.Thread( - target=cli_ref._voice_stop_and_transcribe, - daemon=True, - ).start() - else: - # Guard: don't START recording during agent run or interactive prompts - if cli_ref._agent_running: - return - if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: - return - try: - # Interrupt TTS if playing, so user can start talking - if not cli_ref._voice_tts_done.is_set(): - try: - from tools.voice_mode import stop_playback - stop_playback() - cli_ref._voice_tts_done.set() - except Exception: - pass - with cli_ref._voice_lock: - cli_ref._voice_continuous = True - cli_ref._voice_start_recording() - event.app.invalidate() - except Exception as e: - _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") - # Dynamic prompt: shows Hermes symbol when agent is working, # or answer prompt when clarify freetext mode is active. cli_ref = self diff --git a/run_agent.py b/run_agent.py index 152d6092ea..6df794e06c 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2590,12 +2590,6 @@ class AIAgent: On interrupt, closes the HTTP client to cancel the in-flight request (stops token generation and avoids wasting money), then rebuilds the client for future calls. 
- - When ``self._stream_callback`` is set (streaming TTS mode), the call - uses ``stream=True`` and iterates over chunks inside the background - thread. Content deltas are forwarded to the callback in real-time - while the full response is accumulated and returned as a - ``SimpleNamespace`` that mimics a normal ``ChatCompletion``. """ result = {"response": None, "error": None} @@ -2603,30 +2597,58 @@ class AIAgent: try: if self.api_mode == "codex_responses": result["response"] = self._run_codex_stream(api_kwargs) - return elif self.api_mode == "anthropic_messages": result["response"] = self._anthropic_client.messages.create(**api_kwargs) - return - - cb = getattr(self, "_stream_callback", None) - if cb is None: - # Non-streaming path (default) + else: result["response"] = self.client.chat.completions.create(**api_kwargs) - return + except Exception as e: + result["error"] = e - # --- Streaming path for TTS pipeline --- + t = threading.Thread(target=_call, daemon=True) + t.start() + while t.is_alive(): + t.join(timeout=0.3) + if self._interrupt_requested: + # Force-close the HTTP connection to stop token generation + try: + self.client.close() + except Exception: + pass + # Rebuild the client for future calls (cheap, no network) + try: + self.client = OpenAI(**self._client_kwargs) + except Exception: + pass + raise InterruptedError("Agent interrupted during API call") + if result["error"] is not None: + raise result["error"] + return result["response"] + + def _streaming_api_call(self, api_kwargs: dict, stream_callback): + """Streaming variant of _interruptible_api_call for voice TTS pipeline. + + Uses ``stream=True`` and forwards content deltas to *stream_callback* + in real-time. Returns a ``SimpleNamespace`` that mimics a normal + ``ChatCompletion`` so the rest of the agent loop works unchanged. + + This method is separate from ``_interruptible_api_call`` to keep the + core agent loop untouched for non-voice users. 
+ """ + result = {"response": None, "error": None} + + def _call(): + try: stream_kwargs = {**api_kwargs, "stream": True} stream = self.client.chat.completions.create(**stream_kwargs) content_parts: list[str] = [] - tool_calls_acc: dict[int, dict] = {} # index -> {id, type, function:{name, arguments}} + tool_calls_acc: dict[int, dict] = {} finish_reason = None model_name = None role = "assistant" for chunk in stream: if not chunk.choices: - # Usage-only or empty chunk if hasattr(chunk, "model") and chunk.model: model_name = chunk.model continue @@ -2635,24 +2657,17 @@ class AIAgent: if hasattr(chunk, "model") and chunk.model: model_name = chunk.model - # Content delta if delta and delta.content: content_parts.append(delta.content) try: - cb(delta.content) + stream_callback(delta.content) except Exception: pass - # Tool call deltas if delta and delta.tool_calls: for tc_delta in delta.tool_calls: idx = tc_delta.index if tc_delta.index is not None else 0 - # Gemini may reuse index 0 for multiple tool calls, - # sending a new id each time. Detect this and assign - # a fresh virtual index so calls don't merge. 
if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]: - # Look for existing entry with this id first - # (follow-up deltas for an already-created tool call) matched = False for eidx, eentry in tool_calls_acc.items(): if eentry["id"] == tc_delta.id: @@ -2679,7 +2694,6 @@ class AIAgent: if chunk.choices[0].finish_reason: finish_reason = chunk.choices[0].finish_reason - # Build a mock ChatCompletion matching the non-streaming interface full_content = "".join(content_parts) or None mock_tool_calls = None if tool_calls_acc: @@ -2722,7 +2736,6 @@ class AIAgent: while t.is_alive(): t.join(timeout=0.3) if self._interrupt_requested: - # Force-close the HTTP connection to stop token generation try: if self.api_mode == "anthropic_messages": self._anthropic_client.close() @@ -2730,7 +2743,6 @@ class AIAgent: self.client.close() except Exception: pass - # Rebuild the client for future calls (cheap, no network) try: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_client @@ -4412,7 +4424,11 @@ class AIAgent: if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}: self._dump_api_request_debug(api_kwargs, reason="preflight") - response = self._interruptible_api_call(api_kwargs) + cb = getattr(self, "_stream_callback", None) + if cb is not None: + response = self._streaming_api_call(api_kwargs, cb) + else: + response = self._interruptible_api_call(api_kwargs) api_duration = time.time() - api_start_time diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index e6a46def7c..c994436843 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -41,16 +41,18 @@ def temp_voice_dir(tmp_path, monkeypatch): @pytest.fixture def mock_sd(monkeypatch): - """Replace tools.voice_mode.sd with a MagicMock (sounddevice may not be installed).""" + """Mock _import_audio to return (mock_sd, real_np) so lazy imports work.""" mock = MagicMock() - 
monkeypatch.setattr("tools.voice_mode.sd", mock) - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) - # Also ensure numpy is available (use real numpy if installed, else mock) try: import numpy as real_np - monkeypatch.setattr("tools.voice_mode.np", real_np) except ImportError: - monkeypatch.setattr("tools.voice_mode.np", MagicMock()) + real_np = MagicMock() + + def _fake_import_audio(): + return mock, real_np + + monkeypatch.setattr("tools.voice_mode._import_audio", _fake_import_audio) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True) return mock @@ -60,7 +62,9 @@ def mock_sd(monkeypatch): class TestCheckVoiceRequirements: def test_all_requirements_met(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True) + monkeypatch.setattr("tools.voice_mode.detect_audio_environment", + lambda: {"available": True, "warnings": []}) monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key") from tools.voice_mode import check_voice_requirements @@ -72,7 +76,9 @@ class TestCheckVoiceRequirements: assert result["missing_packages"] == [] def test_missing_audio_packages(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: False) + monkeypatch.setattr("tools.voice_mode.detect_audio_environment", + lambda: {"available": False, "warnings": ["Audio libraries not installed"]}) monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key") from tools.voice_mode import check_voice_requirements @@ -84,7 +90,9 @@ class TestCheckVoiceRequirements: assert "numpy" in result["missing_packages"] def test_missing_stt_key(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) + monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True) + monkeypatch.setattr("tools.voice_mode.detect_audio_environment", + lambda: {"available": True, "warnings": []}) 
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) monkeypatch.delenv("GROQ_API_KEY", raising=False) @@ -102,7 +110,9 @@ class TestCheckVoiceRequirements: class TestAudioRecorderStart: def test_start_raises_without_audio(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) from tools.voice_mode import AudioRecorder @@ -334,21 +344,25 @@ class TestPlayAudioFile: def test_play_wav_via_sounddevice(self, monkeypatch, sample_wav): np = pytest.importorskip("numpy") - mock_sd = MagicMock() - monkeypatch.setattr("tools.voice_mode.sd", mock_sd) - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True) - monkeypatch.setattr("tools.voice_mode.np", np) + mock_sd_obj = MagicMock() + + def _fake_import(): + return mock_sd_obj, np + + monkeypatch.setattr("tools.voice_mode._import_audio", _fake_import) from tools.voice_mode import play_audio_file result = play_audio_file(sample_wav) assert result is True - mock_sd.play.assert_called_once() - mock_sd.wait.assert_called_once() + mock_sd_obj.play.assert_called_once() + mock_sd_obj.wait.assert_called_once() def test_returns_false_when_no_player(self, monkeypatch, sample_wav): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) monkeypatch.setattr("shutil.which", lambda _: None) from tools.voice_mode import play_audio_file @@ -446,7 +460,9 @@ class TestPlayBeep: assert len(audio_arg) > single_beep_samples def test_beep_noop_without_audio(self, monkeypatch): - monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False) + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) from tools.voice_mode import play_beep @@ -607,3 +623,237 @@ class TestSilenceDetection: # No 
crash, no callback assert recorder._on_silence_stop is None recorder.cancel() + + +# ============================================================================ +# Playback interrupt +# ============================================================================ + +class TestPlaybackInterrupt: + """Verify that TTS playback can be interrupted.""" + + def test_stop_playback_terminates_process(self): + from tools.voice_mode import stop_playback, _playback_lock + import tools.voice_mode as vm + + mock_proc = MagicMock() + mock_proc.poll.return_value = None # process is running + + with _playback_lock: + vm._active_playback = mock_proc + + stop_playback() + + mock_proc.terminate.assert_called_once() + + with _playback_lock: + assert vm._active_playback is None + + def test_stop_playback_noop_when_nothing_playing(self): + import tools.voice_mode as vm + + with vm._playback_lock: + vm._active_playback = None + + vm.stop_playback() + + def test_play_audio_file_sets_active_playback(self, monkeypatch, sample_wav): + import tools.voice_mode as vm + + def _fail_import(): + raise ImportError("no sounddevice") + monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import) + + mock_proc = MagicMock() + mock_proc.wait.return_value = 0 + + mock_popen = MagicMock(return_value=mock_proc) + monkeypatch.setattr("subprocess.Popen", mock_popen) + monkeypatch.setattr("shutil.which", lambda cmd: "/usr/bin/" + cmd) + + vm.play_audio_file(sample_wav) + + assert mock_popen.called + with vm._playback_lock: + assert vm._active_playback is None + + +# ============================================================================ +# Continuous mode flow +# ============================================================================ + +class TestContinuousModeFlow: + """Verify continuous mode: auto-restart after transcription or silence.""" + + def test_continuous_restart_on_no_speech(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + 
mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + + # First recording: only silence -> stop returns None + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + for _ in range(10): + silence = np.full((1600, 1), 10, dtype="int16") + callback(silence, 1600, None, None) + + wav_path = recorder.stop() + assert wav_path is None + + # Simulate continuous mode restart + recorder.start() + assert recorder.is_recording is True + + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + for _ in range(10): + speech = np.full((1600, 1), 5000, dtype="int16") + callback(speech, 1600, None, None) + + wav_path = recorder.stop() + assert wav_path is not None + + recorder.cancel() + + def test_recorder_reusable_after_stop(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + results = [] + + for i in range(3): + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + loud = np.full((1600, 1), 5000, dtype="int16") + for _ in range(10): + callback(loud, 1600, None, None) + wav_path = recorder.stop() + results.append(wav_path) + + assert all(r is not None for r in results) + assert os.path.isfile(results[-1]) + + +# ============================================================================ +# Audio level indicator +# ============================================================================ + +class TestAudioLevelIndicator: + """Verify current_rms property updates in real-time for UI feedback.""" + + def 
test_rms_updates_with_audio_chunks(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + assert recorder.current_rms == 0 + + loud = np.full((1600, 1), 5000, dtype="int16") + callback(loud, 1600, None, None) + assert recorder.current_rms == 5000 + + quiet = np.full((1600, 1), 100, dtype="int16") + callback(quiet, 1600, None, None) + assert recorder.current_rms == 100 + + recorder.cancel() + + def test_peak_rms_tracks_maximum(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + + recorder = AudioRecorder() + recorder.start() + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + frames = [ + np.full((1600, 1), 100, dtype="int16"), + np.full((1600, 1), 8000, dtype="int16"), + np.full((1600, 1), 500, dtype="int16"), + np.full((1600, 1), 3000, dtype="int16"), + ] + for frame in frames: + callback(frame, 1600, None, None) + + assert recorder._peak_rms == 8000 + assert recorder.current_rms == 3000 + + recorder.cancel() + + +# ============================================================================ +# Configurable silence parameters +# ============================================================================ + +class TestConfigurableSilenceParams: + """Verify that silence detection params can be configured.""" + + def test_custom_threshold_and_duration(self, mock_sd): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder 
+ import threading + + recorder = AudioRecorder() + recorder._silence_threshold = 5000 + recorder._silence_duration = 0.05 + recorder._min_speech_duration = 0.05 + + fired = threading.Event() + recorder.start(on_silence_stop=lambda: fired.set()) + callback = mock_sd.InputStream.call_args.kwargs.get("callback") + if callback is None: + callback = mock_sd.InputStream.call_args[1]["callback"] + + # Audio at RMS 1000 -- below custom threshold (5000) + moderate = np.full((1600, 1), 1000, dtype="int16") + for _ in range(5): + callback(moderate, 1600, None, None) + time.sleep(0.02) + + assert recorder._has_spoken is False + assert fired.wait(timeout=0.2) is False + + # Now send really loud audio (above 5000 threshold) + very_loud = np.full((1600, 1), 8000, dtype="int16") + callback(very_loud, 1600, None, None) + time.sleep(0.06) + callback(very_loud, 1600, None, None) + assert recorder._has_spoken is True + + recorder.cancel() diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 6c4e53787a..1a1642e0d6 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -37,33 +37,29 @@ from typing import Callable, Dict, Any, Optional logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Optional imports -- providers degrade gracefully if not installed +# Lazy imports -- providers are imported only when actually used to avoid +# crashing in headless environments (SSH, Docker, WSL, no PortAudio). # --------------------------------------------------------------------------- -try: + +def _import_edge_tts(): + """Lazy import edge_tts. Returns the module or raises ImportError.""" import edge_tts - _HAS_EDGE_TTS = True -except ImportError: - _HAS_EDGE_TTS = False + return edge_tts -try: +def _import_elevenlabs(): + """Lazy import ElevenLabs client. 
Returns the class or raises ImportError.""" from elevenlabs.client import ElevenLabs - _HAS_ELEVENLABS = True -except ImportError: - _HAS_ELEVENLABS = False + return ElevenLabs -# openai is a core dependency, but guard anyway -try: +def _import_openai_client(): + """Lazy import OpenAI client. Returns the class or raises ImportError.""" from openai import OpenAI as OpenAIClient - _HAS_OPENAI = True -except ImportError: - _HAS_OPENAI = False + return OpenAIClient -try: +def _import_sounddevice(): + """Lazy import sounddevice. Returns the module or raises ImportError/OSError.""" import sounddevice as sd - _HAS_AUDIO = True -except (ImportError, OSError): - sd = None # type: ignore[assignment] - _HAS_AUDIO = False + return sd # =========================================================================== @@ -202,6 +198,7 @@ def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any] else: output_format = "mp3_44100_128" + ElevenLabs = _import_elevenlabs() client = ElevenLabs(api_key=api_key) audio_generator = client.text_to_speech.convert( text=text, @@ -247,6 +244,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any] else: response_format = "mp3" + OpenAIClient = _import_openai_client() client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1") response = client.audio.speech.create( model=model, @@ -322,7 +320,9 @@ def text_to_speech_tool( try: # Generate audio with the configured provider if provider == "elevenlabs": - if not _HAS_ELEVENLABS: + try: + _import_elevenlabs() + except ImportError: return json.dumps({ "success": False, "error": "ElevenLabs provider selected but 'elevenlabs' package not installed. 
Run: pip install elevenlabs" @@ -331,7 +331,9 @@ def text_to_speech_tool( _generate_elevenlabs(text, file_str, tts_config) elif provider == "openai": - if not _HAS_OPENAI: + try: + _import_openai_client() + except ImportError: return json.dumps({ "success": False, "error": "OpenAI provider selected but 'openai' package not installed." @@ -341,7 +343,9 @@ def text_to_speech_tool( else: # Default: Edge TTS (free) - if not _HAS_EDGE_TTS: + try: + _import_edge_tts() + except ImportError: return json.dumps({ "success": False, "error": "Edge TTS not available. Run: pip install edge-tts" @@ -422,12 +426,23 @@ def check_tts_requirements() -> bool: Returns: bool: True if at least one provider can work. """ - if _HAS_EDGE_TTS: - return True - if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"): - return True - if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"): + try: + _import_edge_tts() return True + except ImportError: + pass + try: + _import_elevenlabs() + if os.getenv("ELEVENLABS_API_KEY"): + return True + except ImportError: + pass + try: + _import_openai_client() + if os.getenv("VOICE_TOOLS_OPENAI_KEY"): + return True + except ImportError: + pass return False @@ -500,20 +515,27 @@ def stream_tts_to_speaker( api_key = os.getenv("ELEVENLABS_API_KEY", "") if not api_key: logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled") - elif _HAS_ELEVENLABS: - client = ElevenLabs(api_key=api_key) + else: + try: + ElevenLabs = _import_elevenlabs() + client = ElevenLabs(api_key=api_key) + except ImportError: + logger.warning("elevenlabs package not installed; streaming TTS disabled") # Open a single sounddevice output stream for the lifetime of # this function. ElevenLabs pcm_24000 produces signed 16-bit # little-endian mono PCM at 24 kHz. 
- use_sd = _HAS_AUDIO and sd is not None - if use_sd: + if client is not None: try: + sd = _import_sounddevice() import numpy as _np output_stream = sd.OutputStream( samplerate=24000, channels=1, dtype="int16", ) output_stream.start() + except (ImportError, OSError) as exc: + logger.debug("sounddevice not available: %s", exc) + output_stream = None except Exception as exc: logger.warning("sounddevice OutputStream failed: %s", exc) output_stream = None @@ -666,12 +688,19 @@ if __name__ == "__main__": print("🔊 Text-to-Speech Tool Module") print("=" * 50) + def _check(importer, label): + try: + importer() + return True + except ImportError: + return False + print(f"\nProvider availability:") - print(f" Edge TTS: {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}") - print(f" ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}") - print(f" API Key: {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}") - print(f" OpenAI: {'✅ installed' if _HAS_OPENAI else '❌ not installed'}") - print(f" API Key: {'✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'}") + print(f" Edge TTS: {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}") + print(f" ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}") + print(f" API Key: {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}") + print(f" OpenAI: {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}") + print(f" API Key: {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}") print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}") print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}") diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 87b6cad67d..27de0fc550 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -25,17 
+25,69 @@ from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Optional imports with graceful degradation +# Lazy audio imports -- never imported at module level to avoid crashing +# in headless environments (SSH, Docker, WSL, no PortAudio). # --------------------------------------------------------------------------- -try: + +def _import_audio(): + """Lazy-import sounddevice and numpy. Returns (sd, np). + + Raises ImportError or OSError if the libraries are not available + (e.g. PortAudio missing on headless servers). + """ import sounddevice as sd import numpy as np + return sd, np - _HAS_AUDIO = True -except (ImportError, OSError): - sd = None # type: ignore[assignment] - np = None # type: ignore[assignment] - _HAS_AUDIO = False + +def _audio_available() -> bool: + """Return True if audio libraries can be imported.""" + try: + _import_audio() + return True + except (ImportError, OSError): + return False + + +def detect_audio_environment() -> dict: + """Detect if the current environment supports audio I/O. + + Returns dict with 'available' (bool) and 'warnings' (list of strings). 
+ """ + warnings = [] + + # SSH detection + if any(os.environ.get(v) for v in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')): + warnings.append("Running over SSH -- no audio devices available") + + # Docker detection + if os.path.exists('/.dockerenv'): + warnings.append("Running inside Docker container -- no audio devices") + + # WSL detection + try: + with open('/proc/version', 'r') as f: + if 'microsoft' in f.read().lower(): + warnings.append("Running in WSL -- audio requires PulseAudio bridge to Windows") + except (FileNotFoundError, PermissionError, OSError): + pass + + # Check audio libraries + try: + sd, _ = _import_audio() + try: + devices = sd.query_devices() + if not devices: + warnings.append("No audio input/output devices detected") + except Exception: + warnings.append("Audio subsystem error (PortAudio cannot query devices)") + except (ImportError, OSError): + warnings.append("Audio libraries not installed (pip install sounddevice numpy)") + + return { + "available": len(warnings) == 0, + "warnings": warnings, + } # --------------------------------------------------------------------------- # Recording parameters @@ -65,7 +117,9 @@ def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> N duration: Duration of each beep in seconds. count: Number of beeps to play (with short gap between). """ - if not _HAS_AUDIO: + try: + sd, np = _import_audio() + except (ImportError, OSError): return try: gap = 0.06 # seconds between beeps @@ -161,12 +215,14 @@ class AudioRecorder: Raises ``RuntimeError`` if sounddevice/numpy are not installed or if a recording is already in progress. 
""" - if not _HAS_AUDIO: + try: + sd, np = _import_audio() + except (ImportError, OSError) as e: raise RuntimeError( "Voice mode requires sounddevice and numpy.\n" "Install with: pip install sounddevice numpy\n" "Or: pip install hermes-agent[voice]" - ) + ) from e with self._lock: if self._recording: @@ -269,6 +325,7 @@ class AudioRecorder: return None # Concatenate frames and write WAV + _, np = _import_audio() audio_data = np.concatenate(self._frames, axis=0) self._frames = [] @@ -434,11 +491,11 @@ def stop_playback() -> None: except Exception: pass # Also stop sounddevice playback if active - if _HAS_AUDIO: - try: - sd.stop() - except Exception: - pass + try: + sd, _ = _import_audio() + sd.stop() + except Exception: + pass def play_audio_file(file_path: str) -> bool: @@ -461,8 +518,9 @@ def play_audio_file(file_path: str) -> bool: return False # Try sounddevice for WAV files - if _HAS_AUDIO and file_path.endswith(".wav"): + if file_path.endswith(".wav"): try: + sd, np = _import_audio() with wave.open(file_path, "rb") as wf: frames = wf.readframes(wf.getnframes()) audio_data = np.frombuffer(frames, dtype=np.int16) @@ -471,6 +529,8 @@ def play_audio_file(file_path: str) -> bool: sd.play(audio_data, samplerate=sample_rate) sd.wait() return True + except (ImportError, OSError): + pass # audio libs not available, fall through to system players except Exception as e: logger.debug("sounddevice playback failed: %s", e) @@ -518,14 +578,18 @@ def check_voice_requirements() -> Dict[str, Any]: groq_key = bool(os.getenv("GROQ_API_KEY")) stt_key_set = openai_key or groq_key missing: List[str] = [] + has_audio = _audio_available() - if not _HAS_AUDIO: + if not has_audio: missing.extend(["sounddevice", "numpy"]) - available = _HAS_AUDIO and stt_key_set + # Environment detection + env_check = detect_audio_environment() + + available = has_audio and stt_key_set and env_check["available"] details_parts = [] - if _HAS_AUDIO: + if has_audio: details_parts.append("Audio capture: OK") 
else: details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)") @@ -537,12 +601,16 @@ def check_voice_requirements() -> Dict[str, Any]: else: details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)") + for warning in env_check["warnings"]: + details_parts.append(f"Environment: {warning}") + return { "available": available, - "audio_available": _HAS_AUDIO, + "audio_available": has_audio, "stt_key_set": stt_key_set, "missing_packages": missing, "details": "\n".join(details_parts), + "environment": env_check, } From a8838a7ae5e1ce530d0847deb76af672d1b96fb1 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:00:08 +0300 Subject: [PATCH 21/93] fix: replace all hardcoded Ctrl+R references with Ctrl+B --- cli.py | 14 +++++++------- hermes_cli/config.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cli.py b/cli.py index 46d2372997..5a3079b99a 100755 --- a/cli.py +++ b/cli.py @@ -3605,7 +3605,7 @@ class HermesCLI: with self._voice_lock: self._voice_recording = False raise - _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}") + _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+B to stop & exit continuous){_RST}") # Periodically refresh prompt to update audio level indicator def _refresh_level(): @@ -3876,7 +3876,7 @@ class HermesCLI: _cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}") _cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}") _cprint(f" Recording: {'YES' if self._voice_recording else 'no'}") - _cprint(f" Record key: Ctrl+R") + _cprint(f" Record key: Ctrl+B") _cprint(f"\n {_BOLD}Requirements:{_RST}") for line in reqs["details"].split("\n"): _cprint(f" {line}") @@ -4833,7 +4833,7 @@ class HermesCLI: return # Always allow STOPPING a recording (even when agent is running) if cli_ref._voice_recording: - # Manual stop via Ctrl+R: stop continuous mode 
+ # Manual stop via push-to-talk key: stop continuous mode with cli_ref._voice_lock: cli_ref._voice_continuous = False # Flag clearing is handled atomically inside _voice_stop_and_transcribe @@ -5003,7 +5003,7 @@ class HermesCLI: def _get_placeholder(): if cli_ref._voice_recording: - return "recording... Ctrl+R to stop, Ctrl+C to cancel" + return "recording... Ctrl+B to stop, Ctrl+C to cancel" if cli_ref._voice_processing: return "transcribing..." if cli_ref._sudo_state: @@ -5023,7 +5023,7 @@ class HermesCLI: if cli_ref._agent_running: return "type a message + Enter to interrupt, Ctrl+C to cancel" if cli_ref._voice_mode: - return "type or Ctrl+R to record" + return "type or Ctrl+B to record" return "" input_area.control.input_processors.append(_PlaceholderProcessor(_get_placeholder)) @@ -5364,12 +5364,12 @@ class HermesCLI: # Persistent voice mode status bar (visible only when voice mode is on) def _get_voice_status(): if cli_ref._voice_recording: - return [('class:voice-status-recording', ' ● REC Ctrl+R to stop ')] + return [('class:voice-status-recording', ' ● REC Ctrl+B to stop ')] if cli_ref._voice_processing: return [('class:voice-status', ' ◉ Transcribing... 
')] tts = " | TTS on" if cli_ref._voice_tts else "" cont = " | Continuous" if cli_ref._voice_continuous else "" - return [('class:voice-status', f' 🎤 Voice mode{tts}{cont} — Ctrl+R to record ')] + return [('class:voice-status', f' 🎤 Voice mode{tts}{cont} — Ctrl+B to record ')] voice_status_bar = ConditionalContainer( Window( diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 8dc2076404..b37f30f0cd 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -204,7 +204,7 @@ DEFAULT_CONFIG = { }, "voice": { - "record_key": "ctrl+r", + "record_key": "ctrl+b", "max_recording_seconds": 120, "auto_tts": False, "silence_threshold": 200, # RMS below this = silence (0-32767) From fc893f98f4c2caf3724df774626836d21cc3372f Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:12:57 +0300 Subject: [PATCH 22/93] fix: wrap sd.InputStream in try-except and fix config key name - AudioRecorder.start() now catches InputStream errors gracefully with a clear error message about microphone availability - Fix config key mismatch: cli.py was reading "push_to_talk_key" but config.py defines "record_key" -- now consistent - Add format conversion from config format ("ctrl+b") to prompt_toolkit format ("c-b") --- cli.py | 9 ++++++--- tools/voice_mode.py | 21 ++++++++++++++------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/cli.py b/cli.py index 5a3079b99a..d16954e4b7 100755 --- a/cli.py +++ b/cli.py @@ -3825,7 +3825,8 @@ class HermesCLI: tts_status = " (TTS enabled)" if self._voice_tts else "" try: from hermes_cli.config import load_config - _ptt_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b") + _raw_ptt = load_config().get("voice", {}).get("record_key", "ctrl+b") + _ptt_key = _raw_ptt.lower().replace("ctrl+", "c-").replace("alt+", "a-") except Exception: _ptt_key = "c-b" _ptt_display = _ptt_key.replace("c-", "Ctrl+").upper() @@ -4818,11 +4819,13 @@ class HermesCLI: self._should_exit = 
True event.app.exit() - # Voice push-to-talk key: configurable via config.yaml (voice.push_to_talk_key) + # Voice push-to-talk key: configurable via config.yaml (voice.record_key) # Default: Ctrl+B (avoids conflict with Ctrl+R readline reverse-search) + # Config uses "ctrl+b" format; prompt_toolkit expects "c-b" format. try: from hermes_cli.config import load_config - _voice_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b") + _raw_key = load_config().get("voice", {}).get("record_key", "ctrl+b") + _voice_key = _raw_key.lower().replace("ctrl+", "c-").replace("alt+", "a-") except Exception: _voice_key = "c-b" diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 27de0fc550..2c3a168bd5 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -291,13 +291,20 @@ class AudioRecorder: if cb: threading.Thread(target=cb, daemon=True).start() - self._stream = sd.InputStream( - samplerate=SAMPLE_RATE, - channels=CHANNELS, - dtype=DTYPE, - callback=_callback, - ) - self._stream.start() + try: + self._stream = sd.InputStream( + samplerate=SAMPLE_RATE, + channels=CHANNELS, + dtype=DTYPE, + callback=_callback, + ) + self._stream.start() + except Exception as e: + self._stream = None + raise RuntimeError( + f"Failed to open audio input stream: {e}. " + "Check that a microphone is connected and accessible." + ) from e self._recording = True logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS) From a78249230c060fc1527dc1e4fa4dc905cb801156 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 03:43:03 +0300 Subject: [PATCH 23/93] fix: address voice mode PR review (streaming TTS, prompt cache, _vprint) Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with lazy import function calls (_import_elevenlabs, _import_sounddevice). The old constants no longer exist in tts_tool -- the try/except silently swallowed the ImportError, leaving streaming TTS dead. 
Bug B: Use user message prefix instead of modifying system prompt for voice mode instruction. Changing ephemeral_system_prompt mid-session invalidates the prompt cache. Now the concise-response hint is prepended to the user_message passed to run_conversation while conversation_history keeps the original text. Minor: Add force parameter to _vprint so critical error messages (max retries, non-retryable errors, API failures) are always shown even during streaming TTS playback. Tests: 15 new tests in test_voice_cli_integration.py covering all three fixes -- lazy import activation, message prefix behavior, history cleanliness, system prompt stability, and AST verification that all critical _vprint calls use force=True. --- cli.py | 40 +-- run_agent.py | 28 +- tests/tools/test_voice_cli_integration.py | 322 +++++++++++++++++++++- 3 files changed, 361 insertions(+), 29 deletions(-) diff --git a/cli.py b/cli.py index d16954e4b7..face0e0e34 100755 --- a/cli.py +++ b/cli.py @@ -3812,15 +3812,9 @@ class HermesCLI: except Exception: pass - # Append voice-mode system prompt for concise, conversational responses - self._voice_original_prompt = self.system_prompt - voice_instruction = ( - "\n\n[Voice mode active] The user is speaking via voice input. " - "Keep responses concise and conversational — 2-3 sentences max unless " - "the user asks for detail. Avoid code blocks, markdown formatting, " - "and long lists. Respond naturally as in a spoken conversation." - ) - self.system_prompt = (self.system_prompt or "") + voice_instruction + # Voice mode instruction is injected as a user message prefix (not a + # system prompt change) to avoid invalidating the prompt cache. See + # _voice_message_prefix property and its usage in _process_message(). 
tts_status = " (TTS enabled)" if self._voice_tts else "" try: @@ -3845,9 +3839,6 @@ class HermesCLI: self._voice_tts = False self._voice_continuous = False - # Restore original system prompt - if hasattr(self, '_voice_original_prompt'): - self.system_prompt = self._voice_original_prompt _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") def _toggle_voice_tts(self): @@ -4140,13 +4131,18 @@ class HermesCLI: from tools.tts_tool import ( _load_tts_config as _load_tts_cfg, _get_provider as _get_prov, - _HAS_ELEVENLABS as _el_ok, - _HAS_AUDIO as _audio_ok, + _import_elevenlabs, + _import_sounddevice, stream_tts_to_speaker, ) _tts_cfg = _load_tts_cfg() - if (_get_prov(_tts_cfg) == "elevenlabs" and _el_ok and _audio_ok): + if _get_prov(_tts_cfg) == "elevenlabs": + # Verify both ElevenLabs SDK and audio output are available + _import_elevenlabs() + _import_sounddevice() use_streaming_tts = True + except (ImportError, OSError): + pass except Exception: pass @@ -4177,10 +4173,22 @@ class HermesCLI: if text_queue is not None: text_queue.put(delta) + # When voice mode is active, prepend a brief instruction to the + # user message so the model responds concisely. This avoids + # modifying the system prompt (which would invalidate the prompt + # cache). The original message in conversation_history stays clean. + agent_message = message + if self._voice_mode and isinstance(message, str): + agent_message = ( + "[Voice input — respond concisely and conversationally, " + "2-3 sentences max. No code blocks or markdown.] 
" + + message + ) + def run_agent(): nonlocal result result = self.agent.run_conversation( - user_message=message, + user_message=agent_message, conversation_history=self.conversation_history[:-1], # Exclude the message we just added stream_callback=stream_callback, task_id=self.session_id, diff --git a/run_agent.py b/run_agent.py index 6df794e06c..d32f65cfdd 100644 --- a/run_agent.py +++ b/run_agent.py @@ -816,9 +816,13 @@ class AIAgent: else: print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)") - def _vprint(self, *args, **kwargs): - """Verbose print — suppressed when streaming TTS is active.""" - if getattr(self, "_stream_callback", None) is not None: + def _vprint(self, *args, force: bool = False, **kwargs): + """Verbose print — suppressed when streaming TTS is active. + + Pass ``force=True`` for error/warning messages that should always be + shown even during streaming TTS playback. + """ + if not force and getattr(self, "_stream_callback", None) is not None: return print(*args, **kwargs) @@ -4641,7 +4645,7 @@ class AIAgent: } else: # First message was truncated - mark as failed - self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover") + self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True) self._persist_session(messages, conversation_history) return { "final_response": None, @@ -4783,9 +4787,9 @@ class AIAgent: error_type = type(api_error).__name__ error_msg = str(api_error).lower() - self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}") + self._vprint(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}", force=True) self._vprint(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s") - self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}") + self._vprint(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}", force=True) 
self._vprint(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools") # Check for interrupt before deciding to retry @@ -4839,7 +4843,7 @@ class AIAgent: restart_with_compressed_messages = True break else: - self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.") + self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True) logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.") self._persist_session(messages, conversation_history) return { @@ -4948,8 +4952,8 @@ class AIAgent: self._dump_api_request_debug( api_kwargs, reason="non_retryable_client_error", error=api_error, ) - self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.") - self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.") + self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.", force=True) + self._vprint(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True) logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}") self._persist_session(messages, conversation_history) return { @@ -5081,7 +5085,7 @@ class AIAgent: continue else: # Max retries - discard this turn and save as partial - self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.") + self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. 
Saving as partial.", force=True) self._incomplete_scratchpad_retries = 0 rolled_back_messages = self._get_messages_up_to_last_assistant(messages) @@ -5176,7 +5180,7 @@ class AIAgent: self._vprint(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)") if self._invalid_tool_retries >= 3: - self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.") + self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True) self._invalid_tool_retries = 0 self._persist_session(messages, conversation_history) return { @@ -5350,7 +5354,7 @@ class AIAgent: self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...") continue else: - self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.") + self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True) self._empty_content_retries = 0 # If a prior tool_calls turn had real content, salvage it: diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index 7bb78e66c1..e42c3fc7d5 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -1,7 +1,11 @@ -"""Tests for CLI voice mode integration -- command parsing, markdown stripping, state management.""" +"""Tests for CLI voice mode integration -- command parsing, markdown stripping, +state management, streaming TTS activation, voice message prefix, _vprint.""" +import ast import re import threading +from types import SimpleNamespace +from unittest.mock import MagicMock, patch import pytest @@ -149,3 +153,319 @@ class TestVoiceStateLock: t.join() assert state["count"] == 4000 + + +# ============================================================================ +# Streaming TTS lazy import activation (Bug A fix) +# 
============================================================================ + +class TestStreamingTTSActivation: + """Verify streaming TTS uses lazy imports to check availability.""" + + def test_activates_when_elevenlabs_and_sounddevice_available(self): + """use_streaming_tts should be True when provider is elevenlabs + and both lazy imports succeed.""" + use_streaming_tts = False + try: + from tools.tts_tool import ( + _load_tts_config as _load_tts_cfg, + _get_provider as _get_prov, + _import_elevenlabs, + _import_sounddevice, + ) + assert callable(_import_elevenlabs) + assert callable(_import_sounddevice) + except ImportError: + pytest.skip("tools.tts_tool not available") + + with patch("tools.tts_tool._load_tts_config") as mock_cfg, \ + patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \ + patch("tools.tts_tool._import_elevenlabs") as mock_el, \ + patch("tools.tts_tool._import_sounddevice") as mock_sd: + mock_cfg.return_value = {"provider": "elevenlabs"} + mock_el.return_value = MagicMock() + mock_sd.return_value = MagicMock() + + from tools.tts_tool import ( + _load_tts_config as load_cfg, + _get_provider as get_prov, + _import_elevenlabs as import_el, + _import_sounddevice as import_sd, + ) + cfg = load_cfg() + if get_prov(cfg) == "elevenlabs": + import_el() + import_sd() + use_streaming_tts = True + + assert use_streaming_tts is True + + def test_does_not_activate_when_elevenlabs_missing(self): + """use_streaming_tts stays False when elevenlabs import fails.""" + use_streaming_tts = False + with patch("tools.tts_tool._load_tts_config", return_value={"provider": "elevenlabs"}), \ + patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \ + patch("tools.tts_tool._import_elevenlabs", side_effect=ImportError("no elevenlabs")): + try: + from tools.tts_tool import ( + _load_tts_config as load_cfg, + _get_provider as get_prov, + _import_elevenlabs as import_el, + _import_sounddevice as import_sd, + ) + cfg = load_cfg() + if get_prov(cfg) 
== "elevenlabs": + import_el() + import_sd() + use_streaming_tts = True + except (ImportError, OSError): + pass + + assert use_streaming_tts is False + + def test_does_not_activate_when_sounddevice_missing(self): + """use_streaming_tts stays False when sounddevice import fails.""" + use_streaming_tts = False + with patch("tools.tts_tool._load_tts_config", return_value={"provider": "elevenlabs"}), \ + patch("tools.tts_tool._get_provider", return_value="elevenlabs"), \ + patch("tools.tts_tool._import_elevenlabs", return_value=MagicMock()), \ + patch("tools.tts_tool._import_sounddevice", side_effect=OSError("no PortAudio")): + try: + from tools.tts_tool import ( + _load_tts_config as load_cfg, + _get_provider as get_prov, + _import_elevenlabs as import_el, + _import_sounddevice as import_sd, + ) + cfg = load_cfg() + if get_prov(cfg) == "elevenlabs": + import_el() + import_sd() + use_streaming_tts = True + except (ImportError, OSError): + pass + + assert use_streaming_tts is False + + def test_does_not_activate_for_non_elevenlabs_provider(self): + """use_streaming_tts stays False when provider is not elevenlabs.""" + use_streaming_tts = False + with patch("tools.tts_tool._load_tts_config", return_value={"provider": "edge"}), \ + patch("tools.tts_tool._get_provider", return_value="edge"): + try: + from tools.tts_tool import ( + _load_tts_config as load_cfg, + _get_provider as get_prov, + _import_elevenlabs as import_el, + _import_sounddevice as import_sd, + ) + cfg = load_cfg() + if get_prov(cfg) == "elevenlabs": + import_el() + import_sd() + use_streaming_tts = True + except (ImportError, OSError): + pass + + assert use_streaming_tts is False + + def test_stale_boolean_imports_no_longer_exist(self): + """Confirm _HAS_ELEVENLABS and _HAS_AUDIO are not in tts_tool module.""" + import tools.tts_tool as tts_mod + assert not hasattr(tts_mod, "_HAS_ELEVENLABS"), \ + "_HAS_ELEVENLABS should not exist -- lazy imports replaced it" + assert not hasattr(tts_mod, "_HAS_AUDIO"), \ 
+ "_HAS_AUDIO should not exist -- lazy imports replaced it" + + +# ============================================================================ +# Voice mode user message prefix (Bug B fix) +# ============================================================================ + +class TestVoiceMessagePrefix: + """Voice mode should inject instruction via user message prefix, + not by modifying the system prompt (which breaks prompt cache).""" + + def test_prefix_added_when_voice_mode_active(self): + """When voice mode is active and message is str, agent_message + should have the voice instruction prefix.""" + voice_mode = True + message = "What's the weather like?" + + agent_message = message + if voice_mode and isinstance(message, str): + agent_message = ( + "[Voice input — respond concisely and conversationally, " + "2-3 sentences max. No code blocks or markdown.] " + + message + ) + + assert agent_message.startswith("[Voice input") + assert "What's the weather like?" in agent_message + + def test_no_prefix_when_voice_mode_inactive(self): + """When voice mode is off, message passes through unchanged.""" + voice_mode = False + message = "What's the weather like?" + + agent_message = message + if voice_mode and isinstance(message, str): + agent_message = ( + "[Voice input — respond concisely and conversationally, " + "2-3 sentences max. No code blocks or markdown.] " + + message + ) + + assert agent_message == message + + def test_no_prefix_for_multimodal_content(self): + """When message is a list (multimodal), no prefix is added.""" + voice_mode = True + message = [{"type": "text", "text": "describe this"}, {"type": "image_url"}] + + agent_message = message + if voice_mode and isinstance(message, str): + agent_message = ( + "[Voice input — respond concisely and conversationally, " + "2-3 sentences max. No code blocks or markdown.] 
" + + message + ) + + assert agent_message is message + + def test_history_stays_clean(self): + """conversation_history should contain the original message, + not the prefixed version.""" + voice_mode = True + message = "Hello there" + conversation_history = [] + + conversation_history.append({"role": "user", "content": message}) + + agent_message = message + if voice_mode and isinstance(message, str): + agent_message = ( + "[Voice input — respond concisely and conversationally, " + "2-3 sentences max. No code blocks or markdown.] " + + message + ) + + assert conversation_history[-1]["content"] == "Hello there" + assert agent_message.startswith("[Voice input") + assert agent_message != conversation_history[-1]["content"] + + def test_enable_voice_mode_does_not_modify_system_prompt(self): + """_enable_voice_mode should NOT modify self.system_prompt or + agent.ephemeral_system_prompt -- the system prompt must stay + stable to preserve prompt cache.""" + cli = SimpleNamespace( + _voice_mode=False, + _voice_tts=False, + _voice_lock=threading.Lock(), + system_prompt="You are helpful", + agent=SimpleNamespace(ephemeral_system_prompt="You are helpful"), + ) + + original_system = cli.system_prompt + original_ephemeral = cli.agent.ephemeral_system_prompt + + cli._voice_mode = True + + assert cli.system_prompt == original_system + assert cli.agent.ephemeral_system_prompt == original_ephemeral + + +# ============================================================================ +# _vprint force parameter (Minor fix) +# ============================================================================ + +class TestVprintForceParameter: + """_vprint should suppress output during streaming TTS unless force=True.""" + + def _make_agent_with_stream(self, stream_active: bool): + """Create a minimal agent-like object with _vprint.""" + agent = SimpleNamespace( + _stream_callback=MagicMock() if stream_active else None, + ) + + def _vprint(*args, force=False, **kwargs): + if not force and 
getattr(agent, "_stream_callback", None) is not None: + return + print(*args, **kwargs) + + agent._vprint = _vprint + return agent + + def test_suppressed_during_streaming(self, capsys): + """Normal _vprint output is suppressed when streaming TTS is active.""" + agent = self._make_agent_with_stream(stream_active=True) + agent._vprint("should be hidden") + captured = capsys.readouterr() + assert captured.out == "" + + def test_shown_when_not_streaming(self, capsys): + """Normal _vprint output is shown when streaming is not active.""" + agent = self._make_agent_with_stream(stream_active=False) + agent._vprint("should be shown") + captured = capsys.readouterr() + assert "should be shown" in captured.out + + def test_force_shown_during_streaming(self, capsys): + """force=True bypasses the streaming suppression.""" + agent = self._make_agent_with_stream(stream_active=True) + agent._vprint("critical error!", force=True) + captured = capsys.readouterr() + assert "critical error!" in captured.out + + def test_force_shown_when_not_streaming(self, capsys): + """force=True works normally when not streaming (no regression).""" + agent = self._make_agent_with_stream(stream_active=False) + agent._vprint("normal message", force=True) + captured = capsys.readouterr() + assert "normal message" in captured.out + + def test_error_messages_use_force_in_run_agent(self): + """Verify that critical error _vprint calls in run_agent.py + include force=True.""" + with open("run_agent.py", "r") as f: + source = f.read() + + tree = ast.parse(source) + + forced_error_count = 0 + unforced_error_count = 0 + + for node in ast.walk(tree): + if not isinstance(node, ast.Call): + continue + func = node.func + if not (isinstance(func, ast.Attribute) and func.attr == "_vprint"): + continue + has_fatal = False + for arg in node.args: + if isinstance(arg, ast.JoinedStr): + for val in arg.values: + if isinstance(val, ast.Constant) and isinstance(val.value, str): + if "\u274c" in val.value: + has_fatal = 
True + break + + if not has_fatal: + continue + + has_force = any( + kw.arg == "force" + and isinstance(kw.value, ast.Constant) + and kw.value.value is True + for kw in node.keywords + ) + + if has_force: + forced_error_count += 1 + else: + unforced_error_count += 1 + + assert forced_error_count > 0, \ + "Expected at least one _vprint with force=True for error messages" + assert unforced_error_count == 0, \ + f"Found {unforced_error_count} critical error _vprint calls without force=True" From ddfd6e0c59658440e1f29e571a965c8158429266 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:33:53 +0300 Subject: [PATCH 24/93] fix: resolve 6 voice mode bugs found during audit - edge_tts NameError: _generate_edge_tts now calls _import_edge_tts() instead of referencing bare module name (tts_tool.py) - TTS thread leak: chat() finally block sends sentinel to text_queue, sets stop_event, and joins tts_thread on exception paths (cli.py) - output_stream leak: moved close() into finally block so audio device is released even on exception (tts_tool.py) - Ctrl+C continuous mode: cancel handler now resets _voice_continuous to prevent auto-restart after user cancels recording (cli.py) - _disable_voice_mode: now calls stop_playback() and sets _voice_tts_done so TTS stops when voice mode is turned off (cli.py) - _show_voice_status: reads record key from config instead of hardcoding Ctrl+B (cli.py) --- cli.py | 29 ++- tests/tools/test_voice_cli_integration.py | 205 ++++++++++++++++++++++ tools/tts_tool.py | 14 +- 3 files changed, 240 insertions(+), 8 deletions(-) diff --git a/cli.py b/cli.py index face0e0e34..95834959cd 100755 --- a/cli.py +++ b/cli.py @@ -3830,7 +3830,7 @@ class HermesCLI: _cprint(f" {_DIM}/voice off to disable voice mode{_RST}") def _disable_voice_mode(self): - """Disable voice mode and cancel any active recording.""" + """Disable voice mode, cancel any active recording, and stop TTS.""" with self._voice_lock: if 
self._voice_recording and self._voice_recorder: self._voice_recorder.cancel() @@ -3839,6 +3839,14 @@ class HermesCLI: self._voice_tts = False self._voice_continuous = False + # Stop any active TTS playback + try: + from tools.voice_mode import stop_playback + stop_playback() + except Exception: + pass + self._voice_tts_done.set() + _cprint(f"\n{_DIM}Voice mode disabled.{_RST}") def _toggle_voice_tts(self): @@ -3868,7 +3876,9 @@ class HermesCLI: _cprint(f" Mode: {'ON' if self._voice_mode else 'OFF'}") _cprint(f" TTS: {'ON' if self._voice_tts else 'OFF'}") _cprint(f" Recording: {'YES' if self._voice_recording else 'no'}") - _cprint(f" Record key: Ctrl+B") + _raw_key = load_config().get("voice", {}).get("record_key", "ctrl+b") + _display_key = _raw_key.replace("ctrl+", "Ctrl+").upper() if "ctrl+" in _raw_key.lower() else _raw_key + _cprint(f" Record key: {_display_key}") _cprint(f"\n {_BOLD}Requirements:{_RST}") for line in reqs["details"].split("\n"): _cprint(f" {line}") @@ -4368,6 +4378,20 @@ class HermesCLI: except Exception as e: print(f"Error: {e}") return None + finally: + # Ensure streaming TTS resources are cleaned up even on error. + # Normal path sends the sentinel at line ~3568; this is a safety + # net for exception paths that skip it. Duplicate sentinels are + # harmless — stream_tts_to_speaker exits on the first None. 
+ if text_queue is not None: + try: + text_queue.put_nowait(None) + except Exception: + pass + if stop_event is not None: + stop_event.set() + if tts_thread is not None and tts_thread.is_alive(): + tts_thread.join(timeout=5) def _print_exit_summary(self): """Print session resume info on exit, similar to Claude Code.""" @@ -4763,6 +4787,7 @@ class HermesCLI: if cli_ref._voice_recording and cli_ref._voice_recorder: cli_ref._voice_recorder.cancel() cli_ref._voice_recording = False + cli_ref._voice_continuous = False _cprint(f"\n{_DIM}Recording cancelled.{_RST}") event.app.invalidate() return diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index e42c3fc7d5..38a947b66a 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -469,3 +469,208 @@ class TestVprintForceParameter: "Expected at least one _vprint with force=True for error messages" assert unforced_error_count == 0, \ f"Found {unforced_error_count} critical error _vprint calls without force=True" + + +# ============================================================================ +# Bug fix regression tests +# ============================================================================ + +class TestEdgeTTSLazyImport: + """Bug #3: _generate_edge_tts must use lazy import, not bare module name.""" + + def test_generate_edge_tts_calls_lazy_import(self): + """AST check: _generate_edge_tts must call _import_edge_tts(), not + reference bare 'edge_tts' module name.""" + import ast as _ast + + with open("tools/tts_tool.py") as f: + tree = _ast.parse(f.read()) + + for node in _ast.walk(tree): + if isinstance(node, _ast.AsyncFunctionDef) and node.name == "_generate_edge_tts": + # Collect all Name references (bare identifiers) + bare_refs = [ + n.id for n in _ast.walk(node) + if isinstance(n, _ast.Name) and n.id == "edge_tts" + ] + assert bare_refs == [], ( + f"_generate_edge_tts uses bare 'edge_tts' name — " + f"should use 
_import_edge_tts() lazy helper" + ) + + # Must have a call to _import_edge_tts + lazy_calls = [ + n for n in _ast.walk(node) + if isinstance(n, _ast.Call) + and isinstance(n.func, _ast.Name) + and n.func.id == "_import_edge_tts" + ] + assert len(lazy_calls) >= 1, ( + "_generate_edge_tts must call _import_edge_tts()" + ) + break + else: + pytest.fail("_generate_edge_tts not found in tts_tool.py") + + +class TestStreamingTTSOutputStreamCleanup: + """Bug #7: output_stream must be closed in finally block.""" + + def test_output_stream_closed_in_finally(self): + """AST check: stream_tts_to_speaker's finally block must close + output_stream even on exception.""" + import ast as _ast + + with open("tools/tts_tool.py") as f: + tree = _ast.parse(f.read()) + + for node in _ast.walk(tree): + if isinstance(node, _ast.FunctionDef) and node.name == "stream_tts_to_speaker": + # Find the outermost try that has a finally with tts_done_event.set() + for child in _ast.walk(node): + if isinstance(child, _ast.Try) and child.finalbody: + finally_text = "\n".join( + _ast.dump(n) for n in child.finalbody + ) + if "tts_done_event" in finally_text: + assert "output_stream" in finally_text, ( + "finally block must close output_stream" + ) + return + pytest.fail("No finally block with tts_done_event found") + + +class TestCtrlCResetsContinuousMode: + """Bug #4: Ctrl+C cancel must reset _voice_continuous.""" + + def test_ctrl_c_handler_resets_voice_continuous(self): + """Source check: Ctrl+C voice cancel block must set + _voice_continuous = False.""" + with open("cli.py") as f: + source = f.read() + + # Find the Ctrl+C handler's voice cancel block + lines = source.split("\n") + in_cancel_block = False + found_continuous_reset = False + for i, line in enumerate(lines): + if "Cancel active voice recording" in line: + in_cancel_block = True + if in_cancel_block: + if "_voice_continuous = False" in line: + found_continuous_reset = True + break + # Block ends at next comment section or return + if 
"return" in line and in_cancel_block: + break + + assert found_continuous_reset, ( + "Ctrl+C voice cancel block must set _voice_continuous = False" + ) + + +class TestDisableVoiceModeStopsTTS: + """Bug #5: _disable_voice_mode must stop active TTS playback.""" + + def test_disable_voice_mode_calls_stop_playback(self): + """Source check: _disable_voice_mode must call stop_playback().""" + with open("cli.py") as f: + source = f.read() + + # Extract _disable_voice_mode method body + lines = source.split("\n") + in_method = False + method_lines = [] + for line in lines: + if "def _disable_voice_mode" in line: + in_method = True + elif in_method: + if line.strip() and not line.startswith(" ") and not line.startswith("\t"): + break + if line.strip().startswith("def "): + break + method_lines.append(line) + + method_body = "\n".join(method_lines) + assert "stop_playback" in method_body, ( + "_disable_voice_mode must call stop_playback()" + ) + assert "_voice_tts_done.set()" in method_body, ( + "_disable_voice_mode must set _voice_tts_done" + ) + + +class TestVoiceStatusUsesConfigKey: + """Bug #8: _show_voice_status must read record key from config.""" + + def test_show_voice_status_not_hardcoded(self): + """Source check: _show_voice_status must not hardcode Ctrl+B.""" + with open("cli.py") as f: + source = f.read() + + lines = source.split("\n") + in_method = False + for line in lines: + if "def _show_voice_status" in line: + in_method = True + elif in_method and line.strip().startswith("def "): + break + elif in_method: + assert 'Record key: Ctrl+B"' not in line, ( + "_show_voice_status hardcodes 'Ctrl+B' — " + "should read from config" + ) + + def test_show_voice_status_reads_config(self): + """Source check: _show_voice_status must use load_config().""" + with open("cli.py") as f: + source = f.read() + + lines = source.split("\n") + in_method = False + method_lines = [] + for line in lines: + if "def _show_voice_status" in line: + in_method = True + elif in_method and 
line.strip().startswith("def "): + break + elif in_method: + method_lines.append(line) + + method_body = "\n".join(method_lines) + assert "load_config" in method_body or "record_key" in method_body, ( + "_show_voice_status should read record_key from config" + ) + + +class TestChatTTSCleanupOnException: + """Bug #2: chat() must clean up streaming TTS resources on exception.""" + + def test_chat_has_finally_for_tts_cleanup(self): + """AST check: chat() method must have a finally block that cleans up + text_queue, stop_event, and tts_thread.""" + import ast as _ast + + with open("cli.py") as f: + tree = _ast.parse(f.read()) + + for node in _ast.walk(tree): + if isinstance(node, _ast.FunctionDef) and node.name == "chat": + # Find Try nodes with finally blocks + for child in _ast.walk(node): + if isinstance(child, _ast.Try) and child.finalbody: + finally_text = "\n".join( + _ast.dump(n) for n in child.finalbody + ) + if "text_queue" in finally_text: + assert "stop_event" in finally_text, ( + "finally must also handle stop_event" + ) + assert "tts_thread" in finally_text, ( + "finally must also handle tts_thread" + ) + return + pytest.fail( + "chat() must have a finally block cleaning up " + "text_queue/stop_event/tts_thread" + ) diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 1a1642e0d6..286bb14b4e 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -161,10 +161,11 @@ async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str, Returns: Path to the saved audio file. 
""" + _edge_tts = _import_edge_tts() edge_config = tts_config.get("edge", {}) voice = edge_config.get("voice", DEFAULT_EDGE_VOICE) - communicate = edge_tts.Communicate(text, voice) + communicate = _edge_tts.Communicate(text, voice) await communicate.save(output_path) return output_path @@ -667,17 +668,18 @@ def stream_tts_to_speaker( except queue.Empty: break - # Close the audio output stream + # output_stream is closed in the finally block below + + except Exception as exc: + logger.warning("Streaming TTS pipeline error: %s", exc) + finally: + # Always close the audio output stream to avoid locking the device if output_stream is not None: try: output_stream.stop() output_stream.close() except Exception: pass - - except Exception as exc: - logger.warning("Streaming TTS pipeline error: %s", exc) - finally: tts_done_event.set() From 6e51729c4cd1461ee9e339ee9b18f3a31e6b62cb Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:39:13 +0300 Subject: [PATCH 25/93] fix: remove browser_tool signal handlers that cause voice mode deadlock browser_tool.py registered SIGINT/SIGTERM handlers that called sys.exit() at module import time. When a signal arrived during a lock acquisition (e.g. AudioRecorder._lock in voice mode), SystemExit was raised inside prompt_toolkit's async event loop, corrupting coroutine state and making the process unkillable (required SIGKILL). atexit handler already ensures browser sessions are cleaned up on any normal exit path, so the signal handlers were redundant and harmful. 
--- tests/tools/test_voice_cli_integration.py | 27 +++++++++++++++++++++++ tools/browser_tool.py | 22 +++++------------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index 38a947b66a..b3cafede9a 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -674,3 +674,30 @@ class TestChatTTSCleanupOnException: "chat() must have a finally block cleaning up " "text_queue/stop_event/tts_thread" ) + + +class TestBrowserToolSignalHandlerRemoved: + """browser_tool.py must NOT register SIGINT/SIGTERM handlers that call + sys.exit() — this conflicts with prompt_toolkit's event loop and causes + the process to become unkillable during voice mode.""" + + def test_no_signal_handler_registration(self): + """Source check: browser_tool.py must not call signal.signal() + for SIGINT or SIGTERM.""" + with open("tools/browser_tool.py") as f: + source = f.read() + + lines = source.split("\n") + for i, line in enumerate(lines, 1): + stripped = line.strip() + # Skip comments + if stripped.startswith("#"): + continue + assert "signal.signal(signal.SIGINT" not in stripped, ( + f"browser_tool.py:{i} registers SIGINT handler — " + f"use atexit instead to avoid prompt_toolkit conflicts" + ) + assert "signal.signal(signal.SIGTERM" not in stripped, ( + f"browser_tool.py:{i} registers SIGTERM handler — " + f"use atexit instead to avoid prompt_toolkit conflicts" + ) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index 15f4961897..b3516c4f24 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -224,24 +224,14 @@ def _emergency_cleanup_all_sessions(): logger.error("Emergency cleanup error: %s", e) -def _signal_handler(signum, frame): - """Handle interrupt signals to cleanup sessions before exit.""" - logger.warning("Received signal %s, cleaning up...", signum) - _emergency_cleanup_all_sessions() - sys.exit(128 + signum) - - 
-# Register cleanup handlers +# Register cleanup via atexit only. Previous versions installed SIGINT/SIGTERM +# handlers that called sys.exit(), but this conflicts with prompt_toolkit's +# async event loop — a SystemExit raised inside a key-binding callback +# corrupts the coroutine state and makes the process unkillable. atexit +# handlers run on any normal exit (including sys.exit), so browser sessions +# are still cleaned up without hijacking signals. atexit.register(_emergency_cleanup_all_sessions) -# Only register signal handlers in main process (not in multiprocessing workers) -try: - if os.getpid() == os.getpgrp(): # Main process check - signal.signal(signal.SIGINT, _signal_handler) - signal.signal(signal.SIGTERM, _signal_handler) -except (OSError, AttributeError): - pass # Signal handling not available (e.g., Windows or worker process) - # ============================================================================= # Inactivity Cleanup Functions From ecc3dd7c630dd6bee5aae7e2a47995012ec5f563 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:51:13 +0300 Subject: [PATCH 26/93] test: add comprehensive voice mode test coverage (86 tests) - Add TestStreamingApiCall (11 tests) for _streaming_api_call in test_run_agent.py - Add regression tests for all 7 bug fixes (edge_tts lazy import, output_stream cleanup, ctrl+c continuous reset, disable stops TTS, config key, chat cleanup, browser_tool signal handler removal) - Add real behavior tests for CLI voice methods via _make_voice_cli() fixture: TestHandleVoiceCommandReal (7), TestEnableVoiceModeReal (7), TestDisableVoiceModeReal (6), TestVoiceSpeakResponseReal (7), TestVoiceStopAndTranscribeReal (12) --- tests/test_run_agent.py | 155 ++++++++ tests/tools/test_voice_cli_integration.py | 431 ++++++++++++++++++++++ 2 files changed, 586 insertions(+) diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index 15a0d5fba3..50cf3c90af 100644 --- 
a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -2083,3 +2083,158 @@ class TestAnthropicBaseUrlPassthrough: # No base_url provided, should be default empty string or None passed_url = call_args[0][1] assert not passed_url or passed_url is None + + +# =================================================================== +# _streaming_api_call tests +# =================================================================== + +def _make_chunk(content=None, tool_calls=None, finish_reason=None, model="test/model"): + """Build a SimpleNamespace mimicking an OpenAI streaming chunk.""" + delta = SimpleNamespace(content=content, tool_calls=tool_calls) + choice = SimpleNamespace(delta=delta, finish_reason=finish_reason) + return SimpleNamespace(model=model, choices=[choice]) + + +def _make_tc_delta(index=0, tc_id=None, name=None, arguments=None): + """Build a SimpleNamespace mimicking a streaming tool_call delta.""" + func = SimpleNamespace(name=name, arguments=arguments) + return SimpleNamespace(index=index, id=tc_id, function=func) + + +class TestStreamingApiCall: + """Tests for _streaming_api_call — voice TTS streaming pipeline.""" + + def test_content_assembly(self, agent): + chunks = [ + _make_chunk(content="Hel"), + _make_chunk(content="lo "), + _make_chunk(content="World"), + _make_chunk(finish_reason="stop"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + callback = MagicMock() + + resp = agent._streaming_api_call({"messages": []}, callback) + + assert resp.choices[0].message.content == "Hello World" + assert resp.choices[0].finish_reason == "stop" + assert callback.call_count == 3 + callback.assert_any_call("Hel") + callback.assert_any_call("lo ") + callback.assert_any_call("World") + + def test_tool_call_accumulation(self, agent): + chunks = [ + _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "web_", '{"q":')]), + _make_chunk(tool_calls=[_make_tc_delta(0, None, "search", '"test"}')]), + _make_chunk(finish_reason="tool_calls"), + ] 
+ agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + tc = resp.choices[0].message.tool_calls + assert len(tc) == 1 + assert tc[0].function.name == "web_search" + assert tc[0].function.arguments == '{"q":"test"}' + assert tc[0].id == "call_1" + + def test_multiple_tool_calls(self, agent): + chunks = [ + _make_chunk(tool_calls=[_make_tc_delta(0, "call_a", "search", '{}')]), + _make_chunk(tool_calls=[_make_tc_delta(1, "call_b", "read", '{}')]), + _make_chunk(finish_reason="tool_calls"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + tc = resp.choices[0].message.tool_calls + assert len(tc) == 2 + assert tc[0].function.name == "search" + assert tc[1].function.name == "read" + + def test_content_and_tool_calls_together(self, agent): + chunks = [ + _make_chunk(content="I'll search"), + _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "search", '{}')]), + _make_chunk(finish_reason="tool_calls"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + assert resp.choices[0].message.content == "I'll search" + assert len(resp.choices[0].message.tool_calls) == 1 + + def test_empty_content_returns_none(self, agent): + chunks = [_make_chunk(finish_reason="stop")] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + assert resp.choices[0].message.content is None + assert resp.choices[0].message.tool_calls is None + + def test_callback_exception_swallowed(self, agent): + chunks = [ + _make_chunk(content="Hello"), + _make_chunk(content=" World"), + _make_chunk(finish_reason="stop"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + callback = MagicMock(side_effect=ValueError("boom")) + + resp = 
agent._streaming_api_call({"messages": []}, callback) + + assert resp.choices[0].message.content == "Hello World" + + def test_model_name_captured(self, agent): + chunks = [ + _make_chunk(content="Hi", model="gpt-4o"), + _make_chunk(finish_reason="stop", model="gpt-4o"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + assert resp.model == "gpt-4o" + + def test_stream_kwarg_injected(self, agent): + chunks = [_make_chunk(content="x"), _make_chunk(finish_reason="stop")] + agent.client.chat.completions.create.return_value = iter(chunks) + + agent._streaming_api_call({"messages": [], "model": "test"}, MagicMock()) + + call_kwargs = agent.client.chat.completions.create.call_args + assert call_kwargs[1].get("stream") is True or call_kwargs.kwargs.get("stream") is True + + def test_api_exception_propagated(self, agent): + agent.client.chat.completions.create.side_effect = ConnectionError("fail") + + with pytest.raises(ConnectionError, match="fail"): + agent._streaming_api_call({"messages": []}, MagicMock()) + + def test_response_has_uuid_id(self, agent): + chunks = [_make_chunk(content="x"), _make_chunk(finish_reason="stop")] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + assert resp.id.startswith("stream-") + assert len(resp.id) > len("stream-") + + def test_empty_choices_chunk_skipped(self, agent): + empty_chunk = SimpleNamespace(model="gpt-4", choices=[]) + chunks = [ + empty_chunk, + _make_chunk(content="Hello", model="gpt-4"), + _make_chunk(finish_reason="stop", model="gpt-4"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._streaming_api_call({"messages": []}, MagicMock()) + + assert resp.choices[0].message.content == "Hello" + assert resp.model == "gpt-4" diff --git a/tests/tools/test_voice_cli_integration.py 
b/tests/tools/test_voice_cli_integration.py index b3cafede9a..32f48e19ce 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -2,6 +2,8 @@ state management, streaming TTS activation, voice message prefix, _vprint.""" import ast +import os +import queue import re import threading from types import SimpleNamespace @@ -10,6 +12,33 @@ from unittest.mock import MagicMock, patch import pytest +def _make_voice_cli(**overrides): + """Create a minimal HermesCLI with only voice-related attrs initialized. + + Uses ``__new__()`` to bypass ``__init__`` so no config/env/API setup is + needed. Only the voice state attributes (from __init__ lines 3749-3758) + are populated. + """ + from cli import HermesCLI + + cli = HermesCLI.__new__(HermesCLI) + cli._voice_lock = threading.Lock() + cli._voice_mode = False + cli._voice_tts = False + cli._voice_recorder = None + cli._voice_recording = False + cli._voice_processing = False + cli._voice_continuous = False + cli._voice_tts_done = threading.Event() + cli._voice_tts_done.set() + cli._pending_input = queue.Queue() + cli._app = None + cli.console = SimpleNamespace(width=80) + for k, v in overrides.items(): + setattr(cli, k, v) + return cli + + # ============================================================================ # Markdown stripping (same logic as _voice_speak_response) # ============================================================================ @@ -701,3 +730,405 @@ class TestBrowserToolSignalHandlerRemoved: f"browser_tool.py:{i} registers SIGTERM handler — " f"use atexit instead to avoid prompt_toolkit conflicts" ) + + +# ============================================================================ +# Real behavior tests — CLI voice methods via _make_voice_cli() +# ============================================================================ + +class TestHandleVoiceCommandReal: + """Tests _handle_voice_command routing with real CLI instance.""" + + def _cli(self): + cli = 
_make_voice_cli() + cli._enable_voice_mode = MagicMock() + cli._disable_voice_mode = MagicMock() + cli._toggle_voice_tts = MagicMock() + cli._show_voice_status = MagicMock() + return cli + + @patch("cli._cprint") + def test_on_calls_enable(self, _cp): + cli = self._cli() + cli._handle_voice_command("/voice on") + cli._enable_voice_mode.assert_called_once() + + @patch("cli._cprint") + def test_off_calls_disable(self, _cp): + cli = self._cli() + cli._handle_voice_command("/voice off") + cli._disable_voice_mode.assert_called_once() + + @patch("cli._cprint") + def test_tts_calls_toggle(self, _cp): + cli = self._cli() + cli._handle_voice_command("/voice tts") + cli._toggle_voice_tts.assert_called_once() + + @patch("cli._cprint") + def test_status_calls_show(self, _cp): + cli = self._cli() + cli._handle_voice_command("/voice status") + cli._show_voice_status.assert_called_once() + + @patch("cli._cprint") + def test_toggle_off_when_enabled(self, _cp): + cli = self._cli() + cli._voice_mode = True + cli._handle_voice_command("/voice") + cli._disable_voice_mode.assert_called_once() + + @patch("cli._cprint") + def test_toggle_on_when_disabled(self, _cp): + cli = self._cli() + cli._voice_mode = False + cli._handle_voice_command("/voice") + cli._enable_voice_mode.assert_called_once() + + @patch("builtins.print") + @patch("cli._cprint") + def test_unknown_subcommand(self, _cp, mock_print): + cli = self._cli() + cli._handle_voice_command("/voice foobar") + cli._enable_voice_mode.assert_not_called() + cli._disable_voice_mode.assert_not_called() + # Should print usage via print() (not _cprint) + assert any("Unknown" in str(c) or "unknown" in str(c) + for c in mock_print.call_args_list) + + +class TestEnableVoiceModeReal: + """Tests _enable_voice_mode with real CLI instance.""" + + @patch("cli._cprint") + @patch("hermes_cli.config.load_config", return_value={"voice": {}}) + @patch("tools.voice_mode.check_voice_requirements", + return_value={"available": True, "details": "OK"}) + 
@patch("tools.voice_mode.detect_audio_environment", + return_value={"available": True, "warnings": []}) + def test_success_sets_voice_mode(self, _env, _req, _cfg, _cp): + cli = _make_voice_cli() + cli._enable_voice_mode() + assert cli._voice_mode is True + + @patch("cli._cprint") + def test_already_enabled_noop(self, _cp): + cli = _make_voice_cli(_voice_mode=True) + cli._enable_voice_mode() + assert cli._voice_mode is True + + @patch("cli._cprint") + @patch("tools.voice_mode.detect_audio_environment", + return_value={"available": False, "warnings": ["SSH session"]}) + def test_env_check_fails(self, _env, _cp): + cli = _make_voice_cli() + cli._enable_voice_mode() + assert cli._voice_mode is False + + @patch("cli._cprint") + @patch("tools.voice_mode.check_voice_requirements", + return_value={"available": False, "details": "Missing", + "missing_packages": ["sounddevice"]}) + @patch("tools.voice_mode.detect_audio_environment", + return_value={"available": True, "warnings": []}) + def test_requirements_fail(self, _env, _req, _cp): + cli = _make_voice_cli() + cli._enable_voice_mode() + assert cli._voice_mode is False + + @patch("cli._cprint") + @patch("hermes_cli.config.load_config", return_value={"voice": {"auto_tts": True}}) + @patch("tools.voice_mode.check_voice_requirements", + return_value={"available": True, "details": "OK"}) + @patch("tools.voice_mode.detect_audio_environment", + return_value={"available": True, "warnings": []}) + def test_auto_tts_from_config(self, _env, _req, _cfg, _cp): + cli = _make_voice_cli() + cli._enable_voice_mode() + assert cli._voice_tts is True + + @patch("cli._cprint") + @patch("hermes_cli.config.load_config", return_value={"voice": {}}) + @patch("tools.voice_mode.check_voice_requirements", + return_value={"available": True, "details": "OK"}) + @patch("tools.voice_mode.detect_audio_environment", + return_value={"available": True, "warnings": []}) + def test_no_auto_tts_default(self, _env, _req, _cfg, _cp): + cli = _make_voice_cli() + 
cli._enable_voice_mode() + assert cli._voice_tts is False + + @patch("cli._cprint") + @patch("hermes_cli.config.load_config", side_effect=Exception("broken config")) + @patch("tools.voice_mode.check_voice_requirements", + return_value={"available": True, "details": "OK"}) + @patch("tools.voice_mode.detect_audio_environment", + return_value={"available": True, "warnings": []}) + def test_config_exception_still_enables(self, _env, _req, _cfg, _cp): + cli = _make_voice_cli() + cli._enable_voice_mode() + assert cli._voice_mode is True + + +class TestDisableVoiceModeReal: + """Tests _disable_voice_mode with real CLI instance.""" + + @patch("cli._cprint") + @patch("tools.voice_mode.stop_playback") + def test_all_flags_reset(self, _sp, _cp): + cli = _make_voice_cli(_voice_mode=True, _voice_tts=True, + _voice_continuous=True) + cli._disable_voice_mode() + assert cli._voice_mode is False + assert cli._voice_tts is False + assert cli._voice_continuous is False + + @patch("cli._cprint") + @patch("tools.voice_mode.stop_playback") + def test_active_recording_cancelled(self, _sp, _cp): + recorder = MagicMock() + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._disable_voice_mode() + recorder.cancel.assert_called_once() + assert cli._voice_recording is False + + @patch("cli._cprint") + @patch("tools.voice_mode.stop_playback") + def test_stop_playback_called(self, mock_sp, _cp): + cli = _make_voice_cli() + cli._disable_voice_mode() + mock_sp.assert_called_once() + + @patch("cli._cprint") + @patch("tools.voice_mode.stop_playback") + def test_tts_done_event_set(self, _sp, _cp): + cli = _make_voice_cli() + cli._voice_tts_done.clear() + cli._disable_voice_mode() + assert cli._voice_tts_done.is_set() + + @patch("cli._cprint") + @patch("tools.voice_mode.stop_playback") + def test_no_recorder_no_crash(self, _sp, _cp): + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=None) + cli._disable_voice_mode() + assert cli._voice_mode is False + + 
@patch("cli._cprint") + @patch("tools.voice_mode.stop_playback", side_effect=RuntimeError("boom")) + def test_stop_playback_exception_swallowed(self, _sp, _cp): + cli = _make_voice_cli(_voice_mode=True) + cli._disable_voice_mode() + assert cli._voice_mode is False + + +class TestVoiceSpeakResponseReal: + """Tests _voice_speak_response with real CLI instance.""" + + @patch("cli._cprint") + def test_early_return_when_tts_off(self, _cp): + cli = _make_voice_cli(_voice_tts=False) + with patch("tools.tts_tool.text_to_speech_tool") as mock_tts: + cli._voice_speak_response("Hello") + mock_tts.assert_not_called() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.getsize", return_value=1000) + @patch("cli.os.path.isfile", return_value=True) + @patch("cli.os.makedirs") + @patch("tools.voice_mode.play_audio_file") + @patch("tools.tts_tool.text_to_speech_tool", return_value='{"success": true}') + def test_markdown_stripped(self, mock_tts, _play, _mkd, _isf, _gsz, _unl, _cp): + cli = _make_voice_cli(_voice_tts=True) + cli._voice_speak_response("## Title\n**bold** and `code`") + call_text = mock_tts.call_args.kwargs["text"] + assert "##" not in call_text + assert "**" not in call_text + assert "`" not in call_text + + @patch("cli._cprint") + @patch("cli.os.makedirs") + @patch("tools.tts_tool.text_to_speech_tool", return_value='{"success": true}') + def test_code_blocks_removed(self, mock_tts, _mkd, _cp): + cli = _make_voice_cli(_voice_tts=True) + cli._voice_speak_response("```python\nprint('hi')\n```\nSome text") + call_text = mock_tts.call_args.kwargs["text"] + assert "print" not in call_text + assert "```" not in call_text + assert "Some text" in call_text + + @patch("cli._cprint") + @patch("cli.os.makedirs") + def test_empty_after_strip_returns_early(self, _mkd, _cp): + cli = _make_voice_cli(_voice_tts=True) + with patch("tools.tts_tool.text_to_speech_tool") as mock_tts: + cli._voice_speak_response("```python\nprint('hi')\n```") + 
mock_tts.assert_not_called() + + @patch("cli._cprint") + @patch("cli.os.makedirs") + @patch("tools.tts_tool.text_to_speech_tool", return_value='{"success": true}') + def test_long_text_truncated(self, mock_tts, _mkd, _cp): + cli = _make_voice_cli(_voice_tts=True) + cli._voice_speak_response("A" * 5000) + call_text = mock_tts.call_args.kwargs["text"] + assert len(call_text) <= 4000 + + @patch("cli._cprint") + @patch("cli.os.makedirs") + @patch("tools.tts_tool.text_to_speech_tool", side_effect=RuntimeError("tts fail")) + def test_exception_sets_done_event(self, _tts, _mkd, _cp): + cli = _make_voice_cli(_voice_tts=True) + cli._voice_tts_done.clear() + cli._voice_speak_response("Hello") + assert cli._voice_tts_done.is_set() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.getsize", return_value=1000) + @patch("cli.os.path.isfile", return_value=True) + @patch("cli.os.makedirs") + @patch("tools.voice_mode.play_audio_file") + @patch("tools.tts_tool.text_to_speech_tool", return_value='{"success": true}') + def test_play_audio_called(self, _tts, mock_play, _mkd, _isf, _gsz, _unl, _cp): + cli = _make_voice_cli(_voice_tts=True) + cli._voice_speak_response("Hello world") + mock_play.assert_called_once() + + +class TestVoiceStopAndTranscribeReal: + """Tests _voice_stop_and_transcribe with real CLI instance.""" + + @patch("cli._cprint") + def test_guard_not_recording(self, _cp): + cli = _make_voice_cli(_voice_recording=False) + with patch("tools.voice_mode.transcribe_recording") as mock_tr: + cli._voice_stop_and_transcribe() + mock_tr.assert_not_called() + + @patch("cli._cprint") + def test_no_recorder_returns_early(self, _cp): + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=None) + with patch("tools.voice_mode.transcribe_recording") as mock_tr: + cli._voice_stop_and_transcribe() + mock_tr.assert_not_called() + assert cli._voice_recording is False + + @patch("cli._cprint") + @patch("tools.voice_mode.play_beep") + def 
test_no_speech_detected(self, _beep, _cp): + recorder = MagicMock() + recorder.stop.return_value = None + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() + assert cli._pending_input.empty() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.isfile", return_value=True) + @patch("hermes_cli.config.load_config", return_value={"stt": {}}) + @patch("tools.voice_mode.transcribe_recording", + return_value={"success": True, "transcript": "hello world"}) + @patch("tools.voice_mode.play_beep") + def test_successful_transcription_queues_input( + self, _beep, _tr, _cfg, _isf, _unl, _cp + ): + recorder = MagicMock() + recorder.stop.return_value = "/tmp/test.wav" + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() + assert cli._pending_input.get_nowait() == "hello world" + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.isfile", return_value=True) + @patch("hermes_cli.config.load_config", return_value={"stt": {}}) + @patch("tools.voice_mode.transcribe_recording", + return_value={"success": True, "transcript": ""}) + @patch("tools.voice_mode.play_beep") + def test_empty_transcript_not_queued(self, _beep, _tr, _cfg, _isf, _unl, _cp): + recorder = MagicMock() + recorder.stop.return_value = "/tmp/test.wav" + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() + assert cli._pending_input.empty() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.isfile", return_value=True) + @patch("hermes_cli.config.load_config", return_value={"stt": {}}) + @patch("tools.voice_mode.transcribe_recording", + return_value={"success": False, "error": "API timeout"}) + @patch("tools.voice_mode.play_beep") + def test_transcription_failure(self, _beep, _tr, _cfg, _isf, _unl, _cp): + recorder = MagicMock() + recorder.stop.return_value = "/tmp/test.wav" + cli = 
_make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() + assert cli._pending_input.empty() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.isfile", return_value=True) + @patch("hermes_cli.config.load_config", return_value={"stt": {}}) + @patch("tools.voice_mode.transcribe_recording", + side_effect=ConnectionError("network")) + @patch("tools.voice_mode.play_beep") + def test_exception_caught(self, _beep, _tr, _cfg, _isf, _unl, _cp): + recorder = MagicMock() + recorder.stop.return_value = "/tmp/test.wav" + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() # Should not raise + + @patch("cli._cprint") + @patch("tools.voice_mode.play_beep") + def test_processing_flag_cleared(self, _beep, _cp): + recorder = MagicMock() + recorder.stop.return_value = None + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() + assert cli._voice_processing is False + + @patch("cli._cprint") + @patch("tools.voice_mode.play_beep") + def test_continuous_restarts_on_no_speech(self, _beep, _cp): + recorder = MagicMock() + recorder.stop.return_value = None + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder, + _voice_continuous=True) + cli._voice_start_recording = MagicMock() + cli._voice_stop_and_transcribe() + cli._voice_start_recording.assert_called_once() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.isfile", return_value=True) + @patch("hermes_cli.config.load_config", return_value={"stt": {}}) + @patch("tools.voice_mode.transcribe_recording", + return_value={"success": True, "transcript": "hello"}) + @patch("tools.voice_mode.play_beep") + def test_continuous_no_restart_on_success( + self, _beep, _tr, _cfg, _isf, _unl, _cp + ): + recorder = MagicMock() + recorder.stop.return_value = "/tmp/test.wav" + cli = _make_voice_cli(_voice_recording=True, 
_voice_recorder=recorder, + _voice_continuous=True) + cli._voice_start_recording = MagicMock() + cli._voice_stop_and_transcribe() + cli._voice_start_recording.assert_not_called() + + @patch("cli._cprint") + @patch("cli.os.unlink") + @patch("cli.os.path.isfile", return_value=True) + @patch("hermes_cli.config.load_config", return_value={"stt": {"model": "whisper-large-v3"}}) + @patch("tools.voice_mode.transcribe_recording", + return_value={"success": True, "transcript": "hi"}) + @patch("tools.voice_mode.play_beep") + def test_stt_model_from_config(self, _beep, mock_tr, _cfg, _isf, _unl, _cp): + recorder = MagicMock() + recorder.stop.return_value = "/tmp/test.wav" + cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) + cli._voice_stop_and_transcribe() + mock_tr.assert_called_once_with("/tmp/test.wav", model="whisper-large-v3") From d0e3b39e6946cd4ec78ae23bc1100031364ae665 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 12:59:30 +0300 Subject: [PATCH 27/93] fix: prevent Ctrl+B key handler from blocking prompt_toolkit event loop The handle_voice_record key binding runs in prompt_toolkit's event-loop thread. When silence auto-stopped recording, _voice_recording was False but recorder.stop() still held AudioRecorder._lock. A concurrent Ctrl+B press entered the START path and blocked on that lock, freezing all keyboard input. 
Three changes: - Set _voice_processing atomically with _voice_recording=False in _voice_stop_and_transcribe to close the race window - Add _voice_processing guard in the START path to prevent starting while stop/transcribe is still running - Dispatch _voice_start_recording to a daemon thread so play_beep (sd.wait) and AudioRecorder.start (lock acquire) never block the event loop --- cli.py | 62 +++++++++++----- tests/tools/test_voice_cli_integration.py | 90 +++++++++++++++++++++++ 2 files changed, 133 insertions(+), 19 deletions(-) diff --git a/cli.py b/cli.py index 95834959cd..e2ee9a2676 100755 --- a/cli.py +++ b/cli.py @@ -3617,11 +3617,14 @@ class HermesCLI: def _voice_stop_and_transcribe(self): """Stop recording, transcribe via STT, and queue the transcript as input.""" - # Atomic guard: only one thread can enter stop-and-transcribe + # Atomic guard: only one thread can enter stop-and-transcribe. + # Set _voice_processing immediately so concurrent Ctrl+B presses + # don't race into the START path while recorder.stop() holds its lock. with self._voice_lock: if not self._voice_recording: return self._voice_recording = False + self._voice_processing = True submitted = False wav_path = None @@ -3642,8 +3645,7 @@ class HermesCLI: _cprint(f"{_DIM}No speech detected.{_RST}") return - with self._voice_lock: - self._voice_processing = True + # _voice_processing is already True (set atomically above) if hasattr(self, '_app') and self._app: self._app.invalidate() _cprint(f"{_DIM}Transcribing...{_RST}") @@ -4864,7 +4866,12 @@ class HermesCLI: @kb.add(_voice_key) def handle_voice_record(event): - """Toggle voice recording when voice mode is active.""" + """Toggle voice recording when voice mode is active. + + IMPORTANT: This handler runs in prompt_toolkit's event-loop thread. + Any blocking call here (locks, sd.wait, disk I/O) freezes the + entire UI. All heavy work is dispatched to daemon threads. 
+ """ if not cli_ref._voice_mode: return # Always allow STOPPING a recording (even when agent is running) @@ -4884,21 +4891,38 @@ class HermesCLI: return if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: return - try: - # Interrupt TTS if playing, so user can start talking - if not cli_ref._voice_tts_done.is_set(): - try: - from tools.voice_mode import stop_playback - stop_playback() - cli_ref._voice_tts_done.set() - except Exception: - pass - with cli_ref._voice_lock: - cli_ref._voice_continuous = True - cli_ref._voice_start_recording() - event.app.invalidate() - except Exception as e: - _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") + # Guard: don't start while a previous stop/transcribe cycle is + # still running — recorder.stop() holds AudioRecorder._lock and + # start() would block the event-loop thread waiting for it. + if cli_ref._voice_processing: + return + + # Interrupt TTS if playing, so user can start talking. + # stop_playback() is fast (just terminates a subprocess). + if not cli_ref._voice_tts_done.is_set(): + try: + from tools.voice_mode import stop_playback + stop_playback() + cli_ref._voice_tts_done.set() + except Exception: + pass + + with cli_ref._voice_lock: + cli_ref._voice_continuous = True + + # Dispatch to a daemon thread so play_beep(sd.wait), + # AudioRecorder.start(lock acquire), and config I/O + # never block the prompt_toolkit event loop. 
+ def _start_recording(): + try: + cli_ref._voice_start_recording() + if hasattr(cli_ref, '_app') and cli_ref._app: + cli_ref._app.invalidate() + except Exception as e: + _cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}") + + threading.Thread(target=_start_recording, daemon=True).start() + event.app.invalidate() from prompt_toolkit.keys import Keys @kb.add(Keys.BracketedPaste, eager=True) diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index 32f48e19ce..e7be698d3b 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -732,6 +732,96 @@ class TestBrowserToolSignalHandlerRemoved: ) +class TestKeyHandlerNeverBlocks: + """The Ctrl+B key handler runs in prompt_toolkit's event-loop thread. + Any blocking call freezes the entire UI. Verify that: + 1. _voice_start_recording is NOT called directly (must be in daemon thread) + 2. _voice_processing guard prevents starting while stop/transcribe runs + 3. _voice_processing is set atomically with _voice_recording in stop_and_transcribe + """ + + def test_start_recording_not_called_directly_in_handler(self): + """AST check: handle_voice_record must NOT call _voice_start_recording() + directly — it must wrap it in a Thread to avoid blocking the UI.""" + import ast as _ast + + with open("cli.py") as f: + tree = _ast.parse(f.read()) + + for node in _ast.walk(tree): + if isinstance(node, _ast.FunctionDef) and node.name == "handle_voice_record": + # Collect all direct calls to _voice_start_recording in this function. + # They should ONLY appear inside a nested def (the _start_recording wrapper). 
+ for child in _ast.iter_child_nodes(node): + # Direct statements in the handler body (not nested defs) + if isinstance(child, _ast.Expr) and isinstance(child.value, _ast.Call): + call_src = _ast.dump(child.value) + assert "_voice_start_recording" not in call_src, ( + "handle_voice_record calls _voice_start_recording directly " + "— must dispatch to a daemon thread" + ) + break + + def test_processing_guard_in_start_path(self): + """Source check: key handler must check _voice_processing before + starting a new recording.""" + with open("cli.py") as f: + source = f.read() + + lines = source.split("\n") + in_handler = False + in_else = False + found_guard = False + for line in lines: + if "def handle_voice_record" in line: + in_handler = True + elif in_handler and line.strip().startswith("def ") and "_start_recording" not in line: + break + elif in_handler and "else:" in line: + in_else = True + elif in_else and "_voice_processing" in line: + found_guard = True + break + + assert found_guard, ( + "Key handler START path must guard against _voice_processing " + "to prevent blocking on AudioRecorder._lock" + ) + + def test_processing_set_atomically_with_recording_false(self): + """Source check: _voice_stop_and_transcribe must set _voice_processing = True + in the same lock block where it sets _voice_recording = False.""" + with open("cli.py") as f: + source = f.read() + + lines = source.split("\n") + in_method = False + in_first_lock = False + found_recording_false = False + found_processing_true = False + for line in lines: + if "def _voice_stop_and_transcribe" in line: + in_method = True + elif in_method and "with self._voice_lock:" in line and not in_first_lock: + in_first_lock = True + elif in_first_lock: + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + if "_voice_recording = False" in stripped: + found_recording_false = True + if "_voice_processing = True" in stripped: + found_processing_true = True + # End of with block 
(dedent) + if stripped and not line.startswith(" ") and not line.startswith("\t\t\t"): + break + + assert found_recording_false and found_processing_true, ( + "_voice_stop_and_transcribe must set _voice_processing = True " + "atomically (same lock block) with _voice_recording = False" + ) + + # ============================================================================ # Real behavior tests — CLI voice methods via _make_voice_cli() # ============================================================================ From 9d58cafec94befc659fcc83054ed97bb06279f4b Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:31:50 +0300 Subject: [PATCH 28/93] fix: move process_loop voice restart to daemon thread, use _cprint consistently MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - process_loop's continuous mode restart called _voice_start_recording() directly, blocking the loop if play_beep/sd.wait hangs — queued user input would stall silently. Dispatch to daemon thread like Ctrl+B handler. - Replace print() with _cprint() in _handle_voice_command for consistency with the rest of the voice mode code. 
--- cli.py | 28 +++++++++++++---------- tests/tools/test_voice_cli_integration.py | 7 +++--- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/cli.py b/cli.py index e2ee9a2676..98476a4270 100755 --- a/cli.py +++ b/cli.py @@ -3772,8 +3772,8 @@ class HermesCLI: else: self._enable_voice_mode() else: - print(f"Unknown voice subcommand: {subcommand}") - print("Usage: /voice [on|off|tts|status]") + _cprint(f"Unknown voice subcommand: {subcommand}") + _cprint("Usage: /voice [on|off|tts|status]") def _enable_voice_mode(self): """Enable voice mode after checking requirements.""" @@ -5602,17 +5602,21 @@ class HermesCLI: self._spinner_text = "" app.invalidate() # Refresh status line - # Continuous voice: auto-restart recording after agent responds + # Continuous voice: auto-restart recording after agent responds. + # Dispatch to a daemon thread so play_beep (sd.wait) and + # AudioRecorder.start (lock acquire) never block process_loop — + # otherwise queued user input would stall silently. 
if self._voice_mode and self._voice_continuous and not self._voice_recording: - try: - # Wait for TTS to finish so we don't record the speaker - if self._voice_tts: - self._voice_tts_done.wait(timeout=60) - time.sleep(0.3) # Brief pause after TTS ends - self._voice_start_recording() - app.invalidate() - except Exception as e: - _cprint(f"{_DIM}Voice auto-restart failed: {e}{_RST}") + def _restart_recording(): + try: + if self._voice_tts: + self._voice_tts_done.wait(timeout=60) + time.sleep(0.3) + self._voice_start_recording() + app.invalidate() + except Exception as e: + _cprint(f"{_DIM}Voice auto-restart failed: {e}{_RST}") + threading.Thread(target=_restart_recording, daemon=True).start() except Exception as e: print(f"Error: {e}") diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index e7be698d3b..105b27fc41 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -875,16 +875,15 @@ class TestHandleVoiceCommandReal: cli._handle_voice_command("/voice") cli._enable_voice_mode.assert_called_once() - @patch("builtins.print") @patch("cli._cprint") - def test_unknown_subcommand(self, _cp, mock_print): + def test_unknown_subcommand(self, mock_cp): cli = self._cli() cli._handle_voice_command("/voice foobar") cli._enable_voice_mode.assert_not_called() cli._disable_voice_mode.assert_not_called() - # Should print usage via print() (not _cprint) + # Should print usage via _cprint assert any("Unknown" in str(c) or "unknown" in str(c) - for c in mock_print.call_args_list) + for c in mock_cp.call_args_list) class TestEnableVoiceModeReal: From bcf4513cb32a462bb90d8a50611b198b7e38ff16 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:11:18 +0300 Subject: [PATCH 29/93] fix: add timeout to play_beep sd.wait and wrap silence callback in try-except MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit - Replace sd.wait() with a poll loop + sd.stop() in play_beep(). sd.wait() calls Event.wait() without timeout — hangs forever if the audio device stalls. Poll with a 2s ceiling and force-stop instead. - Wrap _on_silence callback in try-except so exceptions are logged instead of silently lost in the daemon thread. Prevents recording state from becoming inconsistent on unexpected errors. --- tools/voice_mode.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 2c3a168bd5..151d819836 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -140,7 +140,12 @@ def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> N audio = np.concatenate(parts) sd.play(audio, samplerate=SAMPLE_RATE) - sd.wait() + # sd.wait() calls Event.wait() without timeout — hangs forever if the + # audio device stalls. Poll with a 2s ceiling and force-stop. + deadline = time.monotonic() + 2.0 + while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline: + time.sleep(0.01) + sd.stop() except Exception as e: logger.debug("Beep playback failed: %s", e) @@ -289,7 +294,12 @@ class AudioRecorder: cb = self._on_silence_stop self._on_silence_stop = None # fire only once if cb: - threading.Thread(target=cb, daemon=True).start() + def _safe_cb(): + try: + cb() + except Exception as e: + logger.error("Silence callback failed: %s", e, exc_info=True) + threading.Thread(target=_safe_cb, daemon=True).start() try: self._stream = sd.InputStream( From 0a89933f9b44659c03f24cb21f5e8302156d6277 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:30:12 +0300 Subject: [PATCH 30/93] fix: add STT timeout, move finally restart to thread, guard exit on recording MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Set OpenAI client timeout=30s in transcribe_audio() — 
default 600s blocks _voice_processing for 10 min if Groq/OpenAI stalls - Move _voice_start_recording in _voice_stop_and_transcribe finally block to a daemon thread (same pattern as Ctrl+B handler and process_loop) - Add _should_exit guard at top of _voice_start_recording so all 4 call sites respect shutdown without individual checks --- cli.py | 16 ++++++++++------ tools/transcription_tools.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cli.py b/cli.py index 98476a4270..f874448f1f 100755 --- a/cli.py +++ b/cli.py @@ -3544,6 +3544,8 @@ class HermesCLI: def _voice_start_recording(self): """Start capturing audio from the microphone.""" + if getattr(self, '_should_exit', False): + return from tools.voice_mode import AudioRecorder, check_voice_requirements reqs = check_voice_requirements() @@ -3691,12 +3693,14 @@ class HermesCLI: # (When transcript IS submitted, process_loop handles restart # after chat() completes.) if self._voice_continuous and not submitted and not self._voice_recording: - try: - self._voice_start_recording() - if hasattr(self, '_app') and self._app: - self._app.invalidate() - except Exception: - pass + def _restart_recording(): + try: + self._voice_start_recording() + if hasattr(self, '_app') and self._app: + self._app.invalidate() + except Exception: + pass + threading.Thread(target=_restart_recording, daemon=True).start() def _voice_speak_response(self, text: str): """Speak the agent's response aloud using TTS (runs in background thread).""" diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 7f217bc77e..d7c0a84bbd 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -150,7 +150,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A try: from openai import OpenAI, APIError, APIConnectionError, APITimeoutError - client = OpenAI(api_key=api_key, base_url=base_url) + client = OpenAI(api_key=api_key, base_url=base_url, timeout=30) with 
open(file_path, "rb") as audio_file: transcription = client.audio.transcriptions.create( From c3dc4448bf2bb9fb5e07c3c7f54c3ba763e4d30c Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:56:46 +0300 Subject: [PATCH 31/93] fix: disable STT retries and stop continuous mode after 3 silent cycles - Set max_retries=0 on the STT OpenAI client. The SDK default (2) honors Groq's retry-after header (often 53s), blocking the thread for up to ~106s on rate limits. Voice STT should fail fast, not retry silently. - Stop continuous recording mode after 3 consecutive no-speech cycles to prevent infinite restart loops when nobody is talking. --- cli.py | 11 +++++++++++ tools/transcription_tools.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cli.py b/cli.py index f874448f1f..1f1a134f09 100755 --- a/cli.py +++ b/cli.py @@ -3688,6 +3688,17 @@ class HermesCLI: except Exception: pass + # Track consecutive no-speech cycles to avoid infinite restart loops. + if not submitted: + self._no_speech_count = getattr(self, '_no_speech_count', 0) + 1 + if self._no_speech_count >= 3: + self._voice_continuous = False + self._no_speech_count = 0 + _cprint(f"{_DIM}No speech detected 3 times, continuous mode stopped.{_RST}") + return + else: + self._no_speech_count = 0 + # If no transcript was submitted but continuous mode is active, # restart recording so the user can keep talking. 
# (When transcript IS submitted, process_loop handles restart diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index d7c0a84bbd..6b9c4b5f67 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -150,7 +150,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A try: from openai import OpenAI, APIError, APIConnectionError, APITimeoutError - client = OpenAI(api_key=api_key, base_url=base_url, timeout=30) + client = OpenAI(api_key=api_key, base_url=base_url, timeout=30, max_retries=0) with open(file_path, "rb") as audio_file: transcription = client.audio.transcriptions.create( From 8b57a3cb7ecf531099d34f857d55c284cb6388b8 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 17:41:56 +0300 Subject: [PATCH 32/93] fix: add max recording timeout to prevent infinite wait in quiet environments AudioRecorder now auto-stops after 15 seconds if no speech is detected (_has_spoken remains False). In quiet environments where ambient RMS never exceeds the silence threshold (200), the recording would wait indefinitely. The new _max_wait parameter fires the silence callback after the timeout, triggering the normal "No speech detected" flow. 
--- tools/voice_mode.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 151d819836..736b84a7fe 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -185,6 +185,7 @@ class AudioRecorder: self._on_silence_stop = None self._silence_threshold: int = SILENCE_RMS_THRESHOLD self._silence_duration: float = SILENCE_DURATION_SECONDS + self._max_wait: float = 15.0 # Max seconds to wait for speech before auto-stop # Peak RMS seen during recording (for speech presence check in stop()) self._peak_rms: int = 0 # Live audio level (read by UI for visual feedback) @@ -284,6 +285,10 @@ class AudioRecorder: # else: brief dip, keep tolerating # else: no speech attempt, just silence -- nothing to do + # Fire silence callback when: + # 1. User spoke then went silent for silence_duration, OR + # 2. No speech detected at all for max_wait seconds + should_fire = False if self._has_spoken and rms <= self._silence_threshold: # User was speaking and now is silent if self._silence_start == 0.0: @@ -291,15 +296,24 @@ class AudioRecorder: elif now - self._silence_start >= self._silence_duration: logger.info("Silence detected (%.1fs), auto-stopping", self._silence_duration) - cb = self._on_silence_stop - self._on_silence_stop = None # fire only once - if cb: - def _safe_cb(): - try: - cb() - except Exception as e: - logger.error("Silence callback failed: %s", e, exc_info=True) - threading.Thread(target=_safe_cb, daemon=True).start() + should_fire = True + elif not self._has_spoken and now - self._start_time >= self._max_wait: + # No speech detected within max_wait — stop to avoid + # infinite recording in quiet environments. 
+ logger.info("No speech within %.0fs, auto-stopping", + self._max_wait) + should_fire = True + + if should_fire: + cb = self._on_silence_stop + self._on_silence_stop = None # fire only once + if cb: + def _safe_cb(): + try: + cb() + except Exception as e: + logger.error("Silence callback failed: %s", e, exc_info=True) + threading.Thread(target=_safe_cb, daemon=True).start() try: self._stream = sd.InputStream( From eec04d180aa310e25fec1b877c16834b1363a9d1 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 18:30:42 +0300 Subject: [PATCH 33/93] fix(test): update play_beep test to match polling-based implementation play_beep was changed from sd.wait() to a poll loop + sd.stop() in 302e1fe but the test was not updated. Now asserts sd.stop() instead of sd.wait(). --- tests/tools/test_voice_mode.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index c994436843..0312dd046d 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -438,10 +438,15 @@ class TestPlayBeep: from tools.voice_mode import play_beep + # play_beep uses polling (get_stream) + sd.stop() instead of sd.wait() + mock_stream = MagicMock() + mock_stream.active = False + mock_sd.get_stream.return_value = mock_stream + play_beep(frequency=880, duration=0.1, count=1) mock_sd.play.assert_called_once() - mock_sd.wait.assert_called_once() + mock_sd.stop.assert_called() # Verify audio data is int16 numpy array audio_arg = mock_sd.play.call_args[0][0] assert audio_arg.dtype == np.int16 From eb79dda04be8543c0077459ec64abf46f39aa180 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 20:37:17 +0300 Subject: [PATCH 34/93] fix: persistent audio stream and silence detection improvements - Keep InputStream alive across recordings to avoid CoreAudio hang on repeated open/close cycles on macOS. 
New _ensure_stream() creates the stream once; start()/stop()/cancel() only toggle frame collection. - Add _close_stream_with_timeout() with daemon thread to prevent stream.stop()/close() from blocking indefinitely. - Add generation counter to detect stale stream-open completions after cancel or restart. - Run recorder.cancel() in background thread from Ctrl+C handler to keep the event loop responsive. - Add shutdown() method called on /voice off to release audio resources. - Fix silence timer reset during active speech: use dip tolerance for _resume_start tracker so natural speech pauses (< 0.3s) don't prevent the silence timer from being reset. - Update tests to match persistent stream behavior. --- cli.py | 31 ++- tests/tools/test_voice_cli_integration.py | 24 +- tests/tools/test_voice_mode.py | 5 +- tools/voice_mode.py | 293 ++++++++++++++-------- 4 files changed, 221 insertions(+), 132 deletions(-) diff --git a/cli.py b/cli.py index 1f1a134f09..beb242aef0 100755 --- a/cli.py +++ b/cli.py @@ -3848,14 +3848,26 @@ class HermesCLI: def _disable_voice_mode(self): """Disable voice mode, cancel any active recording, and stop TTS.""" + recorder = None with self._voice_lock: if self._voice_recording and self._voice_recorder: self._voice_recorder.cancel() self._voice_recording = False + recorder = self._voice_recorder self._voice_mode = False self._voice_tts = False self._voice_continuous = False + # Shut down the persistent audio stream in background + if recorder is not None: + def _bg_shutdown(rec=recorder): + try: + rec.shutdown() + except Exception: + pass + threading.Thread(target=_bg_shutdown, daemon=True).start() + self._voice_recorder = None + # Stop any active TTS playback try: from tools.voice_mode import stop_playback @@ -4799,15 +4811,24 @@ class HermesCLI: import time as _time now = _time.time() - # Cancel active voice recording + # Cancel active voice recording. 
+ # Run cancel() in a background thread to prevent blocking the + # event loop if AudioRecorder._lock or CoreAudio takes time. + _should_cancel_voice = False + _recorder_ref = None with cli_ref._voice_lock: if cli_ref._voice_recording and cli_ref._voice_recorder: - cli_ref._voice_recorder.cancel() + _recorder_ref = cli_ref._voice_recorder cli_ref._voice_recording = False cli_ref._voice_continuous = False - _cprint(f"\n{_DIM}Recording cancelled.{_RST}") - event.app.invalidate() - return + _should_cancel_voice = True + if _should_cancel_voice: + _cprint(f"\n{_DIM}Recording cancelled.{_RST}") + threading.Thread( + target=_recorder_ref.cancel, daemon=True + ).start() + event.app.invalidate() + return # Cancel sudo prompt if self._sudo_state: diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index 105b27fc41..8bd67b30f3 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -603,28 +603,14 @@ class TestDisableVoiceModeStopsTTS: def test_disable_voice_mode_calls_stop_playback(self): """Source check: _disable_voice_mode must call stop_playback().""" - with open("cli.py") as f: - source = f.read() + import inspect + from cli import HermesCLI - # Extract _disable_voice_mode method body - lines = source.split("\n") - in_method = False - method_lines = [] - for line in lines: - if "def _disable_voice_mode" in line: - in_method = True - elif in_method: - if line.strip() and not line.startswith(" ") and not line.startswith("\t"): - break - if line.strip().startswith("def "): - break - method_lines.append(line) - - method_body = "\n".join(method_lines) - assert "stop_playback" in method_body, ( + source = inspect.getsource(HermesCLI._disable_voice_mode) + assert "stop_playback" in source, ( "_disable_voice_mode must call stop_playback()" ) - assert "_voice_tts_done.set()" in method_body, ( + assert "_voice_tts_done.set()" in source, ( "_disable_voice_mode must set _voice_tts_done" 
) diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index 0312dd046d..f92bf6f2f5 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -235,8 +235,9 @@ class TestAudioRecorderCancel: assert recorder.is_recording is False assert recorder._frames == [] - mock_stream.stop.assert_called_once() - mock_stream.close.assert_called_once() + # Stream is kept alive (persistent) — cancel() does NOT close it. + mock_stream.stop.assert_not_called() + mock_stream.close.assert_not_called() def test_cancel_when_not_recording_is_safe(self): from tools.voice_mode import AudioRecorder diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 736b84a7fe..04d02143e6 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -175,6 +175,9 @@ class AudioRecorder: self._frames: List[Any] = [] self._recording = False self._start_time: float = 0.0 + # Generation counter — incremented on each start/cancel/stop to + # detect stale stream-open completions after a cancel or restart. + self._generation: int = 0 # Silence detection state self._has_spoken = False self._speech_start: float = 0.0 # When speech attempt began @@ -182,6 +185,8 @@ class AudioRecorder: self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech self._silence_start: float = 0.0 + self._resume_start: float = 0.0 # Tracks sustained speech after silence starts + self._resume_dip_start: float = 0.0 # Dip tolerance tracker for resume detection self._on_silence_stop = None self._silence_threshold: int = SILENCE_RMS_THRESHOLD self._silence_duration: float = SILENCE_DURATION_SECONDS @@ -210,9 +215,137 @@ class AudioRecorder: # -- public methods ------------------------------------------------------ + def _ensure_stream(self) -> None: + """Create the audio InputStream once and keep it alive. + + The stream stays open for the lifetime of the recorder. 
Between + recordings the callback simply discards audio chunks (``_recording`` + is ``False``). This avoids the CoreAudio bug where closing and + re-opening an ``InputStream`` hangs indefinitely on macOS. + """ + if self._stream is not None: + return # already alive + + sd, np = _import_audio() + + def _callback(indata, frames, time_info, status): # noqa: ARG001 + if status: + logger.debug("sounddevice status: %s", status) + # When not recording the stream is idle — discard audio. + if not self._recording: + return + self._frames.append(indata.copy()) + + # Compute RMS for level display and silence detection + rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) + self._current_rms = rms + if rms > self._peak_rms: + self._peak_rms = rms + + # Silence detection + if self._on_silence_stop is not None: + now = time.monotonic() + elapsed = now - self._start_time + + if rms > self._silence_threshold: + # Audio is above threshold -- this is speech (or noise). + self._dip_start = 0.0 # Reset dip tracker + if self._speech_start == 0.0: + self._speech_start = now + elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration: + self._has_spoken = True + logger.debug("Speech confirmed (%.2fs above threshold)", + now - self._speech_start) + # After speech is confirmed, only reset silence timer if + # speech is sustained (>0.3s above threshold). Brief + # spikes from ambient noise should NOT reset the timer. + if not self._has_spoken: + self._silence_start = 0.0 + else: + # Track resumed speech with dip tolerance. + # Brief dips below threshold are normal during speech, + # so we mirror the initial speech detection pattern: + # start tracking, tolerate short dips, confirm after 0.3s. 
+ self._resume_dip_start = 0.0 # Above threshold — no dip + if self._resume_start == 0.0: + self._resume_start = now + elif now - self._resume_start >= self._min_speech_duration: + self._silence_start = 0.0 + self._resume_start = 0.0 + elif self._has_spoken: + # Below threshold after speech confirmed. + # Use dip tolerance before resetting resume tracker — + # natural speech has brief dips below threshold. + if self._resume_start > 0: + if self._resume_dip_start == 0.0: + self._resume_dip_start = now + elif now - self._resume_dip_start >= self._max_dip_tolerance: + # Sustained dip — user actually stopped speaking + self._resume_start = 0.0 + self._resume_dip_start = 0.0 + elif self._speech_start > 0: + # We were in a speech attempt but RMS dipped. + # Tolerate brief dips (micro-pauses between syllables). + if self._dip_start == 0.0: + self._dip_start = now + elif now - self._dip_start >= self._max_dip_tolerance: + # Dip lasted too long -- genuine silence, reset + logger.debug("Speech attempt reset (dip lasted %.2fs)", + now - self._dip_start) + self._speech_start = 0.0 + self._dip_start = 0.0 + + # Fire silence callback when: + # 1. User spoke then went silent for silence_duration, OR + # 2. 
No speech detected at all for max_wait seconds + should_fire = False + if self._has_spoken and rms <= self._silence_threshold: + # User was speaking and now is silent + if self._silence_start == 0.0: + self._silence_start = now + elif now - self._silence_start >= self._silence_duration: + logger.info("Silence detected (%.1fs), auto-stopping", + self._silence_duration) + should_fire = True + elif not self._has_spoken and elapsed >= self._max_wait: + logger.info("No speech within %.0fs, auto-stopping", + self._max_wait) + should_fire = True + + if should_fire: + cb = self._on_silence_stop + self._on_silence_stop = None # fire only once + if cb: + def _safe_cb(): + try: + cb() + except Exception as e: + logger.error("Silence callback failed: %s", e, exc_info=True) + threading.Thread(target=_safe_cb, daemon=True).start() + + # Create stream — may block on CoreAudio (first call only). + try: + stream = sd.InputStream( + samplerate=SAMPLE_RATE, + channels=CHANNELS, + dtype=DTYPE, + callback=_callback, + ) + stream.start() + except Exception as e: + raise RuntimeError( + f"Failed to open audio input stream: {e}. " + "Check that a microphone is connected and accessible." + ) from e + self._stream = stream + def start(self, on_silence_stop=None) -> None: """Start capturing audio from the default input device. + The underlying InputStream is created once and kept alive across + recordings. Subsequent calls simply reset detection state and + toggle frame collection via ``_recording``. + Args: on_silence_stop: Optional callback invoked (in a daemon thread) when silence is detected after speech. The callback receives no arguments. @@ -222,7 +355,7 @@ class AudioRecorder: or if a recording is already in progress. 
""" try: - sd, np = _import_audio() + _import_audio() except (ImportError, OSError) as e: raise RuntimeError( "Voice mode requires sounddevice and numpy.\n" @@ -234,107 +367,54 @@ class AudioRecorder: if self._recording: return # already recording + self._generation += 1 + self._frames = [] self._start_time = time.monotonic() self._has_spoken = False self._speech_start = 0.0 self._dip_start = 0.0 self._silence_start = 0.0 + self._resume_start = 0.0 + self._resume_dip_start = 0.0 self._peak_rms = 0 + self._current_rms = 0 self._on_silence_stop = on_silence_stop - def _callback(indata, frames, time_info, status): # noqa: ARG001 - if status: - logger.debug("sounddevice status: %s", status) - self._frames.append(indata.copy()) + # Ensure the persistent stream is alive (no-op after first call). + self._ensure_stream() - # Compute RMS for level display and silence detection - rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2))) - self._current_rms = rms - if rms > self._peak_rms: - self._peak_rms = rms - - # Silence detection - if self._on_silence_stop is not None and self._recording: - now = time.monotonic() - - if rms > self._silence_threshold: - # Audio is above threshold -- this is speech (or noise). - self._dip_start = 0.0 # Reset dip tracker - if self._speech_start == 0.0: - self._speech_start = now - elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration: - self._has_spoken = True - logger.debug("Speech confirmed (%.2fs above threshold)", - now - self._speech_start) - self._silence_start = 0.0 - elif self._has_spoken: - # Speech already confirmed, let silence timer run below - pass - elif self._speech_start > 0: - # We were in a speech attempt but RMS dipped. - # Tolerate brief dips (micro-pauses between syllables). 
- if self._dip_start == 0.0: - self._dip_start = now - elif now - self._dip_start >= self._max_dip_tolerance: - # Dip lasted too long -- genuine silence, reset - logger.debug("Speech attempt reset (dip lasted %.2fs)", - now - self._dip_start) - self._speech_start = 0.0 - self._dip_start = 0.0 - # else: brief dip, keep tolerating - # else: no speech attempt, just silence -- nothing to do - - # Fire silence callback when: - # 1. User spoke then went silent for silence_duration, OR - # 2. No speech detected at all for max_wait seconds - should_fire = False - if self._has_spoken and rms <= self._silence_threshold: - # User was speaking and now is silent - if self._silence_start == 0.0: - self._silence_start = now - elif now - self._silence_start >= self._silence_duration: - logger.info("Silence detected (%.1fs), auto-stopping", - self._silence_duration) - should_fire = True - elif not self._has_spoken and now - self._start_time >= self._max_wait: - # No speech detected within max_wait — stop to avoid - # infinite recording in quiet environments. - logger.info("No speech within %.0fs, auto-stopping", - self._max_wait) - should_fire = True - - if should_fire: - cb = self._on_silence_stop - self._on_silence_stop = None # fire only once - if cb: - def _safe_cb(): - try: - cb() - except Exception as e: - logger.error("Silence callback failed: %s", e, exc_info=True) - threading.Thread(target=_safe_cb, daemon=True).start() - - try: - self._stream = sd.InputStream( - samplerate=SAMPLE_RATE, - channels=CHANNELS, - dtype=DTYPE, - callback=_callback, - ) - self._stream.start() - except Exception as e: - self._stream = None - raise RuntimeError( - f"Failed to open audio input stream: {e}. " - "Check that a microphone is connected and accessible." 
- ) from e + with self._lock: self._recording = True - logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS) + logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS) + + def _close_stream_with_timeout(self, timeout: float = 3.0) -> None: + """Close the audio stream with a timeout to prevent CoreAudio hangs.""" + if self._stream is None: + return + + stream = self._stream + self._stream = None + + def _do_close(): + try: + stream.stop() + stream.close() + except Exception: + pass + + t = threading.Thread(target=_do_close, daemon=True) + t.start() + t.join(timeout=timeout) + if t.is_alive(): + logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout) def stop(self) -> Optional[str]: """Stop recording and write captured audio to a WAV file. + The underlying stream is kept alive for reuse — only frame + collection is stopped. + Returns: Path to the WAV file, or ``None`` if no audio was captured. """ @@ -343,14 +423,9 @@ class AudioRecorder: return None self._recording = False - - if self._stream is not None: - try: - self._stream.stop() - self._stream.close() - except Exception: - pass - self._stream = None + self._generation += 1 # Invalidate any pending start() + self._current_rms = 0 + # Stream stays alive — no close needed. if not self._frames: return None @@ -379,20 +454,26 @@ class AudioRecorder: return self._write_wav(audio_data) def cancel(self) -> None: - """Stop recording and discard all captured audio.""" + """Stop recording and discard all captured audio. + + The underlying stream is kept alive for reuse. + """ + with self._lock: + self._generation += 1 # Invalidate any pending start() + self._recording = False + self._frames = [] + self._on_silence_stop = None + self._current_rms = 0 + logger.info("Voice recording cancelled") + + def shutdown(self) -> None: + """Release the audio stream. 
Call when voice mode is disabled.""" with self._lock: self._recording = False self._frames = [] - - if self._stream is not None: - try: - self._stream.stop() - self._stream.close() - except Exception: - pass - self._stream = None - - logger.info("Voice recording cancelled") + self._on_silence_stop = None + self._close_stream_with_timeout() + logger.info("AudioRecorder shut down") # -- private helpers ----------------------------------------------------- From 39a77431e245d8d7ae33fcda3b9d89b2113b025f Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 21:03:12 +0300 Subject: [PATCH 35/93] fix: use shutdown() instead of cancel() on CLI exit to release persistent audio stream --- cli.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cli.py b/cli.py index beb242aef0..e7e89d9388 100755 --- a/cli.py +++ b/cli.py @@ -5678,12 +5678,13 @@ class HermesCLI: self.agent.flush_memories(self.conversation_history) except Exception: pass - # Cancel active voice recording - if hasattr(self, '_voice_recorder') and self._voice_recorder and self._voice_recording: + # Shut down voice recorder (release persistent audio stream) + if hasattr(self, '_voice_recorder') and self._voice_recorder: try: - self._voice_recorder.cancel() + self._voice_recorder.shutdown() except Exception: pass + self._voice_recorder = None # Clean up old temp voice recordings try: from tools.voice_mode import cleanup_temp_recordings From 8aab13d12d97ffb3321d5f154a633b6ac4fb81c8 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 21:08:48 +0300 Subject: [PATCH 36/93] refactor: remove dead _generation counter from AudioRecorder The counter was incremented in start/stop/cancel but never read anywhere in the codebase. The race condition it was meant to guard against is practically impossible with the persistent stream design. 
--- tools/voice_mode.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 04d02143e6..a108ed8488 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -175,9 +175,6 @@ class AudioRecorder: self._frames: List[Any] = [] self._recording = False self._start_time: float = 0.0 - # Generation counter — incremented on each start/cancel/stop to - # detect stale stream-open completions after a cancel or restart. - self._generation: int = 0 # Silence detection state self._has_spoken = False self._speech_start: float = 0.0 # When speech attempt began @@ -367,8 +364,6 @@ class AudioRecorder: if self._recording: return # already recording - self._generation += 1 - self._frames = [] self._start_time = time.monotonic() self._has_spoken = False @@ -423,7 +418,6 @@ class AudioRecorder: return None self._recording = False - self._generation += 1 # Invalidate any pending start() self._current_rms = 0 # Stream stays alive — no close needed. @@ -459,7 +453,6 @@ class AudioRecorder: The underlying stream is kept alive for reuse. 
""" with self._lock: - self._generation += 1 # Invalidate any pending start() self._recording = False self._frames = [] self._on_silence_stop = None From d80da5ddd8b959f3038a0c8131a0e3c22f38898b Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 22:55:36 +0300 Subject: [PATCH 37/93] feat: add /voice command for auto voice reply in Telegram gateway - /voice on: reply with voice when user sends voice messages - /voice tts: reply with voice to all messages - /voice off: disable, text-only replies - /voice status: show current mode - Per-chat state persisted to gateway_voice_mode.json - Dedup: skips auto-reply if agent already called text_to_speech tool - drop_pending_updates=True to ignore stale Telegram messages on restart - 25 tests covering command handler, reply logic, and edge cases --- gateway/platforms/telegram.py | 6 +- gateway/run.py | 148 ++++++++++++++- tests/gateway/test_voice_command.py | 285 ++++++++++++++++++++++++++++ 3 files changed, 434 insertions(+), 5 deletions(-) create mode 100644 tests/gateway/test_voice_command.py diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 06f423c661..2a19fde9c4 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -150,7 +150,10 @@ class TelegramAdapter(BasePlatformAdapter): # Start polling in background await self._app.initialize() await self._app.start() - await self._app.updater.start_polling(allowed_updates=Update.ALL_TYPES) + await self._app.updater.start_polling( + allowed_updates=Update.ALL_TYPES, + drop_pending_updates=True, + ) # Register bot commands so Telegram shows a hint menu when users type / try: @@ -174,6 +177,7 @@ class TelegramAdapter(BasePlatformAdapter): BotCommand("insights", "Show usage insights and analytics"), BotCommand("update", "Update Hermes to the latest version"), BotCommand("reload_mcp", "Reload MCP servers from config"), + BotCommand("voice", "Toggle voice reply mode"), 
BotCommand("help", "Show available commands"), ]) except Exception as e: diff --git a/gateway/run.py b/gateway/run.py index 5bac7da55c..2a20c6fa57 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -14,12 +14,15 @@ Usage: """ import asyncio +import json import logging import os import re import sys import signal +import tempfile import threading +import time from logging.handlers import RotatingFileHandler from pathlib import Path from datetime import datetime @@ -280,6 +283,9 @@ class GatewayRunner: from gateway.hooks import HookRegistry self.hooks = HookRegistry() + # Per-chat voice reply mode: "off" | "voice_only" | "all" + self._voice_mode: Dict[str, str] = self._load_voice_modes() + def _get_or_create_gateway_honcho(self, session_key: str): """Return a persistent Honcho manager/config pair for this gateway session.""" if not hasattr(self, "_honcho_managers"): @@ -335,6 +341,27 @@ class GatewayRunner: for session_key in list(managers.keys()): self._shutdown_gateway_honcho(session_key) + # -- Voice mode persistence ------------------------------------------ + + _VOICE_MODE_PATH = _hermes_home / "gateway_voice_mode.json" + + def _load_voice_modes(self) -> Dict[str, str]: + try: + return json.loads(self._VOICE_MODE_PATH.read_text()) + except (FileNotFoundError, json.JSONDecodeError, OSError): + return {} + + def _save_voice_modes(self) -> None: + try: + self._VOICE_MODE_PATH.parent.mkdir(parents=True, exist_ok=True) + self._VOICE_MODE_PATH.write_text( + json.dumps(self._voice_mode, indent=2) + ) + except OSError as e: + logger.warning("Failed to save voice modes: %s", e) + + # ----------------------------------------------------------------- + def _flush_memories_for_session(self, old_session_id: str): """Prompt the agent to save memories/skills before context is lost. @@ -887,7 +914,7 @@ class GatewayRunner: 7. 
Return response """ source = event.source - + # Check if user is authorized if not self._is_user_authorized(source): logger.warning("Unauthorized user: %s (%s) on %s", source.user_id, source.user_name, source.platform.value) @@ -939,7 +966,7 @@ class GatewayRunner: "personality", "retry", "undo", "sethome", "set-home", "compress", "usage", "insights", "reload-mcp", "reload_mcp", "update", "title", "resume", "provider", "rollback", - "background", "reasoning"} + "background", "reasoning", "voice"} if command and command in _known_commands: await self.hooks.emit(f"command:{command}", { "platform": source.platform.value if source.platform else "", @@ -1010,7 +1037,11 @@ class GatewayRunner: if command == "reasoning": return await self._handle_reasoning_command(event) - + + if command == "voice": + return await self._handle_voice_command(event) + + # User-defined quick commands (bypass agent loop, no LLM call) if command: quick_commands = self.config.get("quick_commands", {}) @@ -1568,7 +1599,28 @@ class GatewayRunner: session_entry.session_key, last_prompt_tokens=agent_result.get("last_prompt_tokens", 0), ) - + + # Auto voice reply: send TTS audio before the text response + chat_id = source.chat_id + voice_mode = self._voice_mode.get(chat_id, "off") + is_voice_input = (event.message_type == MessageType.VOICE) + should_voice_reply = ( + (voice_mode == "all") + or (voice_mode == "voice_only" and is_voice_input) + ) + if should_voice_reply and response and not response.startswith("Error:"): + # Skip if agent already called TTS tool (avoid double voice) + has_agent_tts = any( + msg.get("role") == "assistant" + and any( + tc.get("function", {}).get("name") == "text_to_speech" + for tc in (msg.get("tool_calls") or []) + ) + for msg in agent_messages + ) + if not has_agent_tts: + await self._send_voice_reply(event, response) + return response except Exception as e: @@ -1677,6 +1729,7 @@ class GatewayRunner: "`/reasoning [level|show|hide]` — Set reasoning effort or toggle 
display", "`/rollback [number]` — List or restore filesystem checkpoints", "`/background ` — Run a prompt in a separate background session", + "`/voice [on|off|tts|status]` — Toggle voice reply mode", "`/reload-mcp` — Reload MCP servers from config", "`/update` — Update Hermes Agent to the latest version", "`/help` — Show this message", @@ -2052,6 +2105,93 @@ class GatewayRunner: f"Cron jobs and cross-platform messages will be delivered here." ) + async def _handle_voice_command(self, event: MessageEvent) -> str: + """Handle /voice [on|off|tts|status] command.""" + args = event.get_command_args().strip().lower() + chat_id = event.source.chat_id + + if args in ("on", "enable"): + self._voice_mode[chat_id] = "voice_only" + self._save_voice_modes() + return ( + "Voice mode enabled.\n" + "I'll reply with voice when you send voice messages.\n" + "Use /voice tts to get voice replies for all messages." + ) + elif args in ("off", "disable"): + self._voice_mode.pop(chat_id, None) + self._save_voice_modes() + return "Voice mode disabled. Text-only replies." + elif args == "tts": + self._voice_mode[chat_id] = "all" + self._save_voice_modes() + return ( + "Auto-TTS enabled.\n" + "All replies will include a voice message." + ) + elif args == "status": + mode = self._voice_mode.get(chat_id, "off") + labels = { + "off": "Off (text only)", + "voice_only": "On (voice reply to voice messages)", + "all": "TTS (voice reply to all messages)", + } + return f"Voice mode: {labels.get(mode, mode)}" + else: + # Toggle: off → on, on/all → off + current = self._voice_mode.get(chat_id, "off") + if current == "off": + self._voice_mode[chat_id] = "voice_only" + self._save_voice_modes() + return "Voice mode enabled." + else: + self._voice_mode.pop(chat_id, None) + self._save_voice_modes() + return "Voice mode disabled." 
+ + async def _send_voice_reply(self, event: MessageEvent, text: str) -> None: + """Generate TTS audio and send as a voice message before the text reply.""" + try: + from tools.tts_tool import text_to_speech_tool, _strip_markdown_for_tts + + tts_text = _strip_markdown_for_tts(text[:4000]) + if not tts_text: + return + + ogg_path = os.path.join( + tempfile.gettempdir(), "hermes_voice", + f"tts_reply_{int(time.time())}_{id(event) % 10000}.ogg", + ) + os.makedirs(os.path.dirname(ogg_path), exist_ok=True) + + result_json = await asyncio.to_thread( + text_to_speech_tool, text=tts_text, output_path=ogg_path + ) + result = json.loads(result_json) + + if not result.get("success") or not os.path.isfile(ogg_path): + logger.warning("Auto voice reply TTS failed: %s", result.get("error")) + return + + adapter = self.adapters.get(event.source.platform) + if adapter and hasattr(adapter, "send_voice"): + _thread_md = ( + {"thread_id": event.source.thread_id} + if event.source.thread_id else None + ) + await adapter.send_voice( + event.source.chat_id, + audio_path=ogg_path, + reply_to=event.message_id, + metadata=_thread_md, + ) + try: + os.unlink(ogg_path) + except OSError: + pass + except Exception as e: + logger.warning("Auto voice reply failed: %s", e) + async def _handle_rollback_command(self, event: MessageEvent) -> str: """Handle /rollback command — list or restore filesystem checkpoints.""" from tools.checkpoint_manager import CheckpointManager, format_checkpoint_list diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py new file mode 100644 index 0000000000..6825abcfc1 --- /dev/null +++ b/tests/gateway/test_voice_command.py @@ -0,0 +1,285 @@ +"""Tests for the /voice command and auto voice reply in the gateway.""" + +import json +import os +import pytest +from unittest.mock import AsyncMock, MagicMock, patch + +from gateway.platforms.base import MessageEvent, MessageType, SessionSource + + +# 
--------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_event(text: str = "", message_type=MessageType.TEXT, chat_id="123") -> MessageEvent: + source = SessionSource( + chat_id=chat_id, + user_id="user1", + platform=MagicMock(), + ) + source.platform.value = "telegram" + source.thread_id = None + event = MessageEvent(text=text, message_type=message_type, source=source) + event.message_id = "msg42" + return event + + +def _make_runner(tmp_path): + """Create a bare GatewayRunner without calling __init__.""" + from gateway.run import GatewayRunner + runner = object.__new__(GatewayRunner) + runner.adapters = {} + runner._voice_mode = {} + runner._VOICE_MODE_PATH = tmp_path / "gateway_voice_mode.json" + runner._session_db = None + runner.session_store = MagicMock() + return runner + + +# ===================================================================== +# /voice command handler +# ===================================================================== + +class TestHandleVoiceCommand: + + @pytest.fixture + def runner(self, tmp_path): + return _make_runner(tmp_path) + + @pytest.mark.asyncio + async def test_voice_on(self, runner): + event = _make_event("/voice on") + result = await runner._handle_voice_command(event) + assert "enabled" in result.lower() + assert runner._voice_mode["123"] == "voice_only" + + @pytest.mark.asyncio + async def test_voice_off(self, runner): + runner._voice_mode["123"] = "voice_only" + event = _make_event("/voice off") + result = await runner._handle_voice_command(event) + assert "disabled" in result.lower() + assert "123" not in runner._voice_mode + + @pytest.mark.asyncio + async def test_voice_tts(self, runner): + event = _make_event("/voice tts") + result = await runner._handle_voice_command(event) + assert "tts" in result.lower() + assert runner._voice_mode["123"] == "all" + + @pytest.mark.asyncio + async def 
test_voice_status_off(self, runner): + event = _make_event("/voice status") + result = await runner._handle_voice_command(event) + assert "off" in result.lower() + + @pytest.mark.asyncio + async def test_voice_status_on(self, runner): + runner._voice_mode["123"] = "voice_only" + event = _make_event("/voice status") + result = await runner._handle_voice_command(event) + assert "voice reply" in result.lower() + + @pytest.mark.asyncio + async def test_toggle_off_to_on(self, runner): + event = _make_event("/voice") + result = await runner._handle_voice_command(event) + assert "enabled" in result.lower() + assert runner._voice_mode["123"] == "voice_only" + + @pytest.mark.asyncio + async def test_toggle_on_to_off(self, runner): + runner._voice_mode["123"] = "voice_only" + event = _make_event("/voice") + result = await runner._handle_voice_command(event) + assert "disabled" in result.lower() + assert "123" not in runner._voice_mode + + @pytest.mark.asyncio + async def test_persistence_saved(self, runner): + event = _make_event("/voice on") + await runner._handle_voice_command(event) + assert runner._VOICE_MODE_PATH.exists() + data = json.loads(runner._VOICE_MODE_PATH.read_text()) + assert data["123"] == "voice_only" + + @pytest.mark.asyncio + async def test_persistence_loaded(self, runner): + runner._VOICE_MODE_PATH.write_text(json.dumps({"456": "all"})) + loaded = runner._load_voice_modes() + assert loaded == {"456": "all"} + + @pytest.mark.asyncio + async def test_per_chat_isolation(self, runner): + e1 = _make_event("/voice on", chat_id="aaa") + e2 = _make_event("/voice tts", chat_id="bbb") + await runner._handle_voice_command(e1) + await runner._handle_voice_command(e2) + assert runner._voice_mode["aaa"] == "voice_only" + assert runner._voice_mode["bbb"] == "all" + + +# ===================================================================== +# Auto voice reply decision logic +# ===================================================================== + +class 
TestAutoVoiceReply: + """Test the should_voice_reply decision logic (extracted from _handle_message).""" + + def _should_reply(self, voice_mode, message_type, agent_messages=None, response="Hello!"): + """Replicate the auto voice reply decision from _handle_message.""" + if not response or response.startswith("Error:"): + return False + + is_voice_input = (message_type == MessageType.VOICE) + should = ( + (voice_mode == "all") + or (voice_mode == "voice_only" and is_voice_input) + ) + if not should: + return False + + # Dedup check + if agent_messages: + has_agent_tts = any( + msg.get("role") == "assistant" + and any( + tc.get("function", {}).get("name") == "text_to_speech" + for tc in (msg.get("tool_calls") or []) + ) + for msg in agent_messages + ) + if has_agent_tts: + return False + + return True + + def test_voice_only_voice_input(self): + assert self._should_reply("voice_only", MessageType.VOICE) is True + + def test_voice_only_text_input(self): + assert self._should_reply("voice_only", MessageType.TEXT) is False + + def test_all_mode_text_input(self): + assert self._should_reply("all", MessageType.TEXT) is True + + def test_all_mode_voice_input(self): + assert self._should_reply("all", MessageType.VOICE) is True + + def test_off_mode(self): + assert self._should_reply("off", MessageType.VOICE) is False + assert self._should_reply("off", MessageType.TEXT) is False + + def test_error_response_skipped(self): + assert self._should_reply("all", MessageType.TEXT, response="Error: boom") is False + + def test_empty_response_skipped(self): + assert self._should_reply("all", MessageType.TEXT, response="") is False + + def test_dedup_skips_when_agent_called_tts(self): + messages = [{ + "role": "assistant", + "tool_calls": [{ + "id": "call_1", + "type": "function", + "function": {"name": "text_to_speech", "arguments": "{}"}, + }], + }] + assert self._should_reply("all", MessageType.TEXT, agent_messages=messages) is False + + def test_no_dedup_for_other_tools(self): + 
messages = [{ + "role": "assistant", + "tool_calls": [{ + "id": "call_1", + "type": "function", + "function": {"name": "web_search", "arguments": "{}"}, + }], + }] + assert self._should_reply("all", MessageType.TEXT, agent_messages=messages) is True + + +# ===================================================================== +# _send_voice_reply +# ===================================================================== + +class TestSendVoiceReply: + + @pytest.fixture + def runner(self, tmp_path): + return _make_runner(tmp_path) + + @pytest.mark.asyncio + async def test_calls_tts_and_send_voice(self, runner): + mock_adapter = AsyncMock() + mock_adapter.send_voice = AsyncMock() + event = _make_event() + runner.adapters[event.source.platform] = mock_adapter + + tts_result = json.dumps({"success": True, "file_path": "/tmp/test.ogg"}) + + with patch("tools.tts_tool.text_to_speech_tool", return_value=tts_result), \ + patch("tools.tts_tool._strip_markdown_for_tts", side_effect=lambda t: t), \ + patch("os.path.isfile", return_value=True), \ + patch("os.unlink"), \ + patch("os.makedirs"): + await runner._send_voice_reply(event, "Hello world") + + mock_adapter.send_voice.assert_called_once() + call_args = mock_adapter.send_voice.call_args + assert call_args[0][0] == "123" # chat_id + + @pytest.mark.asyncio + async def test_empty_text_after_strip_skips(self, runner): + event = _make_event() + + with patch("tools.tts_tool.text_to_speech_tool") as mock_tts, \ + patch("tools.tts_tool._strip_markdown_for_tts", return_value=""): + await runner._send_voice_reply(event, "```code only```") + + mock_tts.assert_not_called() + + @pytest.mark.asyncio + async def test_tts_failure_no_crash(self, runner): + event = _make_event() + mock_adapter = AsyncMock() + runner.adapters[event.source.platform] = mock_adapter + tts_result = json.dumps({"success": False, "error": "API error"}) + + with patch("tools.tts_tool.text_to_speech_tool", return_value=tts_result), \ + 
patch("tools.tts_tool._strip_markdown_for_tts", side_effect=lambda t: t), \ + patch("os.path.isfile", return_value=False), \ + patch("os.makedirs"): + await runner._send_voice_reply(event, "Hello") + + mock_adapter.send_voice.assert_not_called() + + @pytest.mark.asyncio + async def test_exception_caught(self, runner): + event = _make_event() + with patch("tools.tts_tool.text_to_speech_tool", side_effect=RuntimeError("boom")), \ + patch("tools.tts_tool._strip_markdown_for_tts", side_effect=lambda t: t), \ + patch("os.makedirs"): + # Should not raise + await runner._send_voice_reply(event, "Hello") + + +# ===================================================================== +# Help text + known commands +# ===================================================================== + +class TestVoiceInHelp: + + def test_voice_in_help_output(self): + from gateway.run import GatewayRunner + import inspect + source = inspect.getsource(GatewayRunner._handle_help_command) + assert "/voice" in source + + def test_voice_is_known_command(self): + from gateway.run import GatewayRunner + import inspect + source = inspect.getsource(GatewayRunner._handle_message) + assert '"voice"' in source From f6cf4ca8263a801a2113959e5667b41827aaaa36 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Tue, 10 Mar 2026 23:37:02 +0300 Subject: [PATCH 38/93] feat: add /voice slash command to Discord + fix cross-platform send_voice - Register /voice as Discord slash command with mode choices - Fix _send_voice_reply to handle adapters that don't accept metadata parameter (Discord) by inspecting the method signature at runtime --- gateway/platforms/discord.py | 17 +++++++++++++++++ gateway/run.py | 23 +++++++++++++---------- tests/gateway/test_voice_command.py | 2 +- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index 47760d2367..d472aead87 100644 --- a/gateway/platforms/discord.py +++ 
b/gateway/platforms/discord.py @@ -627,6 +627,23 @@ class DiscordAdapter(BasePlatformAdapter): async def slash_reload_mcp(interaction: discord.Interaction): await self._run_simple_slash(interaction, "/reload-mcp") + @tree.command(name="voice", description="Toggle voice reply mode") + @discord.app_commands.describe(mode="Voice mode: on, off, tts, or status") + @discord.app_commands.choices(mode=[ + discord.app_commands.Choice(name="on — voice reply to voice messages", value="on"), + discord.app_commands.Choice(name="tts — voice reply to all messages", value="tts"), + discord.app_commands.Choice(name="off — text only", value="off"), + discord.app_commands.Choice(name="status — show current mode", value="status"), + ]) + async def slash_voice(interaction: discord.Interaction, mode: str = ""): + await interaction.response.defer(ephemeral=True) + event = self._build_slash_event(interaction, f"/voice {mode}".strip()) + await self.handle_message(event) + try: + await interaction.followup.send("Done~", ephemeral=True) + except Exception as e: + logger.debug("Discord followup failed: %s", e) + @tree.command(name="update", description="Update Hermes Agent to the latest version") async def slash_update(interaction: discord.Interaction): await self._run_simple_slash(interaction, "/update", "Update initiated~") diff --git a/gateway/run.py b/gateway/run.py index 2a20c6fa57..18757f9353 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2175,16 +2175,19 @@ class GatewayRunner: adapter = self.adapters.get(event.source.platform) if adapter and hasattr(adapter, "send_voice"): - _thread_md = ( - {"thread_id": event.source.thread_id} - if event.source.thread_id else None - ) - await adapter.send_voice( - event.source.chat_id, - audio_path=ogg_path, - reply_to=event.message_id, - metadata=_thread_md, - ) + send_kwargs: Dict[str, Any] = { + "chat_id": event.source.chat_id, + "audio_path": ogg_path, + "reply_to": event.message_id, + } + if event.source.thread_id: + 
send_kwargs["metadata"] = {"thread_id": event.source.thread_id} + # Only pass metadata if the adapter accepts it + import inspect + sig = inspect.signature(adapter.send_voice) + if "metadata" not in sig.parameters: + send_kwargs.pop("metadata", None) + await adapter.send_voice(**send_kwargs) try: os.unlink(ogg_path) except OSError: diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py index 6825abcfc1..da84c68bf8 100644 --- a/tests/gateway/test_voice_command.py +++ b/tests/gateway/test_voice_command.py @@ -229,7 +229,7 @@ class TestSendVoiceReply: mock_adapter.send_voice.assert_called_once() call_args = mock_adapter.send_voice.call_args - assert call_args[0][0] == "123" # chat_id + assert call_args.kwargs.get("chat_id") == "123" @pytest.mark.asyncio async def test_empty_text_after_strip_skips(self, runner): From cbe4c23efa064c6572af6bed547c989b509a2508 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 00:24:29 +0300 Subject: [PATCH 39/93] fix: Discord voice bubble + edge-tts mp3/ogg format mismatch - Send Discord voice messages with flags=8192 and waveform metadata so they render as native voice bubbles instead of file attachments - Use .mp3 output path for TTS so edge-tts opus conversion works correctly (edge always outputs mp3, convert was skipped for .ogg) - Use actual file_path from TTS result after potential opus conversion --- gateway/platforms/discord.py | 63 ++++++++++++++++++++++++++++++++++-- gateway/run.py | 25 ++++++++------ 2 files changed, 75 insertions(+), 13 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index d472aead87..717fc921bb 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -297,9 +297,66 @@ class DiscordAdapter(BasePlatformAdapter): ) -> SendResult: """Send audio as a Discord file attachment.""" try: - return await self._send_file_attachment(chat_id, audio_path, caption) - except 
FileNotFoundError: - return SendResult(success=False, error=f"Audio file not found: {audio_path}") + import io + + channel = self._client.get_channel(int(chat_id)) + if not channel: + channel = await self._client.fetch_channel(int(chat_id)) + if not channel: + return SendResult(success=False, error=f"Channel {chat_id} not found") + + if not os.path.exists(audio_path): + return SendResult(success=False, error=f"Audio file not found: {audio_path}") + + filename = os.path.basename(audio_path) + + with open(audio_path, "rb") as f: + file_data = f.read() + + # Try sending as a native voice message via raw API (flags=8192). + try: + import base64 + + duration_secs = 5.0 + try: + from mutagen.oggopus import OggOpus + info = OggOpus(audio_path) + duration_secs = info.info.length + except Exception: + duration_secs = max(1.0, len(file_data) / 2000.0) + + waveform_bytes = bytes([128] * 256) + waveform_b64 = base64.b64encode(waveform_bytes).decode() + + import json as _json + payload = _json.dumps({ + "flags": 8192, + "attachments": [{ + "id": "0", + "filename": "voice-message.ogg", + "duration_secs": round(duration_secs, 2), + "waveform": waveform_b64, + }], + }) + form = [ + {"name": "payload_json", "value": payload}, + { + "name": "files[0]", + "value": file_data, + "filename": "voice-message.ogg", + "content_type": "audio/ogg", + }, + ] + msg_data = await self._client.http.request( + discord.http.Route("POST", "/channels/{channel_id}/messages", channel_id=channel.id), + form=form, + ) + return SendResult(success=True, message_id=str(msg_data["id"])) + except Exception as voice_err: + logger.debug("Voice message flag failed, falling back to file: %s", voice_err) + file = discord.File(io.BytesIO(file_data), filename=filename) + msg = await channel.send(file=file) + return SendResult(success=True, message_id=str(msg.id)) except Exception as e: # pragma: no cover - defensive logging logger.error("[%s] Failed to send audio, falling back to base adapter: %s", self.name, e, 
exc_info=True) return await super().send_voice(chat_id, audio_path, caption, reply_to, metadata=metadata) diff --git a/gateway/run.py b/gateway/run.py index 18757f9353..79e5c3bc92 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2158,18 +2158,22 @@ class GatewayRunner: if not tts_text: return - ogg_path = os.path.join( + # Use .mp3 extension so edge-tts conversion to opus works correctly. + # The TTS tool may convert to .ogg — use file_path from result. + audio_path = os.path.join( tempfile.gettempdir(), "hermes_voice", - f"tts_reply_{int(time.time())}_{id(event) % 10000}.ogg", + f"tts_reply_{int(time.time())}_{id(event) % 10000}.mp3", ) - os.makedirs(os.path.dirname(ogg_path), exist_ok=True) + os.makedirs(os.path.dirname(audio_path), exist_ok=True) result_json = await asyncio.to_thread( - text_to_speech_tool, text=tts_text, output_path=ogg_path + text_to_speech_tool, text=tts_text, output_path=audio_path ) result = json.loads(result_json) - if not result.get("success") or not os.path.isfile(ogg_path): + # Use the actual file path from result (may differ after opus conversion) + actual_path = result.get("file_path", audio_path) + if not result.get("success") or not os.path.isfile(actual_path): logger.warning("Auto voice reply TTS failed: %s", result.get("error")) return @@ -2177,7 +2181,7 @@ class GatewayRunner: if adapter and hasattr(adapter, "send_voice"): send_kwargs: Dict[str, Any] = { "chat_id": event.source.chat_id, - "audio_path": ogg_path, + "audio_path": actual_path, "reply_to": event.message_id, } if event.source.thread_id: @@ -2188,10 +2192,11 @@ class GatewayRunner: if "metadata" not in sig.parameters: send_kwargs.pop("metadata", None) await adapter.send_voice(**send_kwargs) - try: - os.unlink(ogg_path) - except OSError: - pass + for p in {audio_path, actual_path}: + try: + os.unlink(p) + except OSError: + pass except Exception as e: logger.warning("Auto voice reply failed: %s", e) From cc974904f8a6ff9e07bc364b400d1de69c9dcb06 Mon Sep 17 00:00:00 
2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 02:13:43 +0300 Subject: [PATCH 40/93] =?UTF-8?q?feat:=20Discord=20voice=20channel=20suppo?= =?UTF-8?q?rt=20=E2=80=94=20bot=20joins=20VC=20and=20speaks=20replies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - /voice channel: bot joins user's voice channel, speaks TTS replies - /voice leave: disconnect from voice channel - Auto-disconnect after 5 min inactivity - _get_guild_id() helper extracts guild from raw_message - Load opus codec for voice playback - discord.py[voice] in pyproject.toml (pulls PyNaCl + davey) --- gateway/platforms/discord.py | 124 ++++++++++++++++++++++++++++++++++- gateway/run.py | 89 +++++++++++++++++++++++-- pyproject.toml | 2 +- 3 files changed, 209 insertions(+), 6 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index 717fc921bb..a7fd45f6a1 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -82,17 +82,35 @@ class DiscordAdapter(BasePlatformAdapter): # Discord message limits MAX_MESSAGE_LENGTH = 2000 + # Auto-disconnect from voice channel after this many seconds of inactivity + VOICE_TIMEOUT = 300 + def __init__(self, config: PlatformConfig): super().__init__(config, Platform.DISCORD) self._client: Optional[commands.Bot] = None self._ready_event = asyncio.Event() self._allowed_user_ids: set = set() # For button approval authorization + # Voice channel state (per-guild) + self._voice_clients: Dict[int, Any] = {} # guild_id -> VoiceClient + self._voice_text_channels: Dict[int, int] = {} # guild_id -> text_channel_id + self._voice_timeout_tasks: Dict[int, asyncio.Task] = {} # guild_id -> timeout task async def connect(self) -> bool: """Connect to Discord and start receiving events.""" if not DISCORD_AVAILABLE: logger.error("[%s] discord.py not installed. 
Run: pip install discord.py", self.name) return False + + # Load opus codec for voice channel support + if not discord.opus.is_loaded(): + try: + discord.opus.load_opus("/opt/homebrew/lib/libopus.dylib") + except Exception: + # Try common Linux path as fallback + try: + discord.opus.load_opus("libopus.so.0") + except Exception: + logger.warning("Opus codec not found — voice channel playback disabled") if not self.config.token: logger.error("[%s] No bot token configured", self.name) @@ -361,6 +379,108 @@ class DiscordAdapter(BasePlatformAdapter): logger.error("[%s] Failed to send audio, falling back to base adapter: %s", self.name, e, exc_info=True) return await super().send_voice(chat_id, audio_path, caption, reply_to, metadata=metadata) + # ------------------------------------------------------------------ + # Voice channel methods (join / leave / play) + # ------------------------------------------------------------------ + + async def join_voice_channel(self, channel) -> bool: + """Join a Discord voice channel. Returns True on success.""" + if not self._client or not DISCORD_AVAILABLE: + return False + guild_id = channel.guild.id + + # Already connected in this guild? 
+ existing = self._voice_clients.get(guild_id) + if existing and existing.is_connected(): + if existing.channel.id == channel.id: + self._reset_voice_timeout(guild_id) + return True + await existing.move_to(channel) + self._reset_voice_timeout(guild_id) + return True + + vc = await channel.connect() + self._voice_clients[guild_id] = vc + self._reset_voice_timeout(guild_id) + return True + + async def leave_voice_channel(self, guild_id: int) -> None: + """Disconnect from the voice channel in a guild.""" + vc = self._voice_clients.pop(guild_id, None) + if vc and vc.is_connected(): + await vc.disconnect() + task = self._voice_timeout_tasks.pop(guild_id, None) + if task: + task.cancel() + self._voice_text_channels.pop(guild_id, None) + + async def play_in_voice_channel(self, guild_id: int, audio_path: str) -> bool: + """Play an audio file in the connected voice channel.""" + vc = self._voice_clients.get(guild_id) + if not vc or not vc.is_connected(): + return False + + # Wait for current playback to finish + while vc.is_playing(): + await asyncio.sleep(0.1) + + done = asyncio.Event() + loop = asyncio.get_event_loop() + + def _after(error): + if error: + logger.error("Voice playback error: %s", error) + loop.call_soon_threadsafe(done.set) + + source = discord.FFmpegPCMAudio(audio_path) + source = discord.PCMVolumeTransformer(source, volume=1.0) + vc.play(source, after=_after) + await done.wait() + self._reset_voice_timeout(guild_id) + return True + + async def get_user_voice_channel(self, guild_id: int, user_id: str): + """Return the voice channel the user is currently in, or None.""" + if not self._client: + return None + guild = self._client.get_guild(guild_id) + if not guild: + return None + member = guild.get_member(int(user_id)) + if not member or not member.voice: + return None + return member.voice.channel + + def _reset_voice_timeout(self, guild_id: int) -> None: + """Reset the auto-disconnect inactivity timer.""" + task = self._voice_timeout_tasks.pop(guild_id, 
None) + if task: + task.cancel() + self._voice_timeout_tasks[guild_id] = asyncio.ensure_future( + self._voice_timeout_handler(guild_id) + )
+
+    async def _voice_timeout_handler(self, guild_id: int) -> None:
+        """Auto-disconnect after VOICE_TIMEOUT seconds of inactivity.
+
+        Sleeps for ``self.VOICE_TIMEOUT`` seconds; if the sleep is cancelled
+        the handler exits silently. Otherwise it leaves the guild's voice
+        channel and posts a notice to the text channel recorded in
+        ``self._voice_text_channels`` (best effort).
+
+        Args:
+            guild_id: Guild whose voice connection should be closed.
+        """
+        try:
+            await asyncio.sleep(self.VOICE_TIMEOUT)
+        except asyncio.CancelledError:
+            return
+        # Unregister this task before leaving: leave_voice_channel() cancels
+        # whatever timeout task is registered for the guild. If that is the
+        # task running this handler, the cancellation is delivered at the
+        # next await and aborts the notice below (CancelledError is not
+        # caught by ``except Exception``).
+        if self._voice_timeout_tasks.get(guild_id) is asyncio.current_task():
+            self._voice_timeout_tasks.pop(guild_id, None)
+        # Read the text channel first; leave_voice_channel() drops the entry.
+        text_ch_id = self._voice_text_channels.get(guild_id)
+        await self.leave_voice_channel(guild_id)
+        if text_ch_id and self._client:
+            ch = self._client.get_channel(text_ch_id)
+            if ch:
+                try:
+                    await ch.send("Left voice channel (inactivity timeout).")
+                except Exception:
+                    # Best effort: the disconnect already happened.
+                    logger.debug("Could not send inactivity notice", exc_info=True)
+
+ def is_in_voice_channel(self, guild_id: int) -> bool: + """Check if the bot is connected to a voice channel in this guild.""" + vc = self._voice_clients.get(guild_id) + return vc is not None and vc.is_connected() + async def send_image_file( self, chat_id: str, @@ -685,8 +805,10 @@ class DiscordAdapter(BasePlatformAdapter): await self._run_simple_slash(interaction, "/reload-mcp") @tree.command(name="voice", description="Toggle voice reply mode") - @discord.app_commands.describe(mode="Voice mode: on, off, tts, or status") + @discord.app_commands.describe(mode="Voice mode: on, off, tts, channel, leave, or status") @discord.app_commands.choices(mode=[ + discord.app_commands.Choice(name="channel — join your voice channel", value="channel"), + discord.app_commands.Choice(name="leave — leave voice channel", value="leave"), discord.app_commands.Choice(name="on — voice reply to voice messages", value="on"), discord.app_commands.Choice(name="tts — voice reply to all messages", value="tts"), discord.app_commands.Choice(name="off — text only", value="off"), diff --git a/gateway/run.py b/gateway/run.py index 79e5c3bc92..4674548aae 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2105,8 +2105,22 @@ class GatewayRunner: f"Cron jobs and cross-platform messages will be delivered here." 
) + @staticmethod + def _get_guild_id(event: MessageEvent) -> Optional[int]: + """Extract Discord guild_id from the raw message object.""" + raw = getattr(event, "raw_message", None) + if raw is None: + return None + # Slash command interaction + if hasattr(raw, "guild_id") and raw.guild_id: + return int(raw.guild_id) + # Regular message + if hasattr(raw, "guild") and raw.guild: + return raw.guild.id + return None + async def _handle_voice_command(self, event: MessageEvent) -> str: - """Handle /voice [on|off|tts|status] command.""" + """Handle /voice [on|off|tts|channel|leave|status] command.""" args = event.get_command_args().strip().lower() chat_id = event.source.chat_id @@ -2129,6 +2143,10 @@ class GatewayRunner: "Auto-TTS enabled.\n" "All replies will include a voice message." ) + elif args in ("channel", "join"): + return await self._handle_voice_channel_join(event) + elif args == "leave": + return await self._handle_voice_channel_leave(event) elif args == "status": mode = self._voice_mode.get(chat_id, "off") labels = { @@ -2136,6 +2154,14 @@ class GatewayRunner: "voice_only": "On (voice reply to voice messages)", "all": "TTS (voice reply to all messages)", } + # Append voice channel info if connected + adapter = self.adapters.get(event.source.platform) + guild_id = self._get_guild_id(event) + if guild_id and hasattr(adapter, "is_in_voice_channel"): + if adapter.is_in_voice_channel(guild_id): + vc = adapter._voice_clients.get(guild_id) + ch_name = vc.channel.name if vc and vc.channel else "unknown" + return f"Voice mode: {labels.get(mode, mode)}\nVoice channel: {ch_name}" return f"Voice mode: {labels.get(mode, mode)}" else: # Toggle: off → on, on/all → off @@ -2149,6 +2175,54 @@ class GatewayRunner: self._save_voice_modes() return "Voice mode disabled." 
+    async def _handle_voice_channel_join(self, event: MessageEvent) -> str:
+        """Join the voice channel the command's author is currently in.
+
+        Resolves the adapter for the event's platform, the guild id and the
+        author's voice channel, then asks the adapter to connect. On success
+        the issuing chat is recorded as the guild's linked text channel, its
+        voice mode is set to "all", and ``_save_voice_modes()`` is called.
+
+        Args:
+            event: Incoming ``/voice channel`` (or ``join``) command event.
+
+        Returns:
+            A user-facing status message.
+        """
+        adapter = self.adapters.get(event.source.platform)
+        if not hasattr(adapter, "join_voice_channel"):
+            return "Voice channels are not supported on this platform."
+
+        guild_id = self._get_guild_id(event)
+        if not guild_id:
+            return "This command only works in a Discord server."
+
+        voice_channel = await adapter.get_user_voice_channel(
+            guild_id, event.source.user_id
+        )
+        if not voice_channel:
+            return "You need to be in a voice channel first."
+
+        try:
+            success = await adapter.join_voice_channel(voice_channel)
+        except Exception as e:
+            logger.warning("Failed to join voice channel: %s", e)
+            return f"Failed to join voice channel: {e}"
+
+        if success:
+            adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
+            self._voice_mode[event.source.chat_id] = "all"
+            self._save_voice_modes()
+            return (
+                f"Joined voice channel **{voice_channel.name}**.\n"
+                f"I'll speak my replies here. Use /voice leave to disconnect."
+            )
+        return "Failed to join voice channel. Check bot permissions (Connect + Speak)."
+
+    async def _handle_voice_channel_leave(self, event: MessageEvent) -> str:
+        """Leave the voice channel of the guild the command came from.
+
+        Clears the voice mode of the issuing chat and of the chat recorded
+        at join time, so a ``/voice leave`` typed in a different text channel
+        does not leave the join channel in "all" mode.
+
+        Args:
+            event: Incoming ``/voice leave`` command event.
+
+        Returns:
+            A user-facing status message.
+        """
+        adapter = self.adapters.get(event.source.platform)
+        guild_id = self._get_guild_id(event)
+
+        if not guild_id or not hasattr(adapter, "leave_voice_channel"):
+            return "Not in a voice channel."
+
+        if not hasattr(adapter, "is_in_voice_channel") or not adapter.is_in_voice_channel(guild_id):
+            return "Not in a voice channel."
+
+        # Read the linked text channel before leaving: the Discord adapter's
+        # leave_voice_channel() drops its guild -> text-channel entry.
+        linked_chat_id = getattr(adapter, "_voice_text_channels", {}).get(guild_id)
+
+        await adapter.leave_voice_channel(guild_id)
+        self._voice_mode.pop(event.source.chat_id, None)
+        if linked_chat_id is not None:
+            # Join stores int(chat_id); convert back to the string form it
+            # used as the _voice_mode key.
+            self._voice_mode.pop(str(linked_chat_id), None)
+        self._save_voice_modes()
+        return "Left voice channel."
+ async def _send_voice_reply(self, event: MessageEvent, text: str) -> None: """Generate TTS audio and send as a voice message before the text reply.""" try: @@ -2178,7 +2252,15 @@ class GatewayRunner: return adapter = self.adapters.get(event.source.platform) - if adapter and hasattr(adapter, "send_voice"): + + # If connected to a voice channel, play there instead of sending a file + guild_id = self._get_guild_id(event) + if (guild_id + and hasattr(adapter, "play_in_voice_channel") + and hasattr(adapter, "is_in_voice_channel") + and adapter.is_in_voice_channel(guild_id)): + await adapter.play_in_voice_channel(guild_id, actual_path) + elif adapter and hasattr(adapter, "send_voice"): send_kwargs: Dict[str, Any] = { "chat_id": event.source.chat_id, "audio_path": actual_path, @@ -2186,7 +2268,6 @@ class GatewayRunner: } if event.source.thread_id: send_kwargs["metadata"] = {"thread_id": event.source.thread_id} - # Only pass metadata if the adapter accepts it import inspect sig = inspect.signature(adapter.send_voice) if "metadata" not in sig.parameters: @@ -2198,7 +2279,7 @@ class GatewayRunner: except OSError: pass except Exception as e: - logger.warning("Auto voice reply failed: %s", e) + logger.warning("Auto voice reply failed: %s", e, exc_info=True) async def _handle_rollback_command(self, event: MessageEvent) -> str: """Handle /rollback command — list or restore filesystem checkpoints.""" diff --git a/pyproject.toml b/pyproject.toml index eb005ab942..fa248cd0e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ dependencies = [ modal = ["swe-rex[modal]>=1.4.0"] daytona = ["daytona>=0.148.0"] dev = ["pytest", "pytest-asyncio", "pytest-xdist", "mcp>=1.2.0"] -messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] +messaging = ["python-telegram-bot>=20.0", "discord.py[voice]>=2.0", "aiohttp>=3.9.0", "slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cron = ["croniter"] slack = 
["slack-bolt>=1.18.0", "slack-sdk>=3.27.0"] cli = ["simple-term-menu"] From c0c358d05123d15476a640226b0915fa57dd2853 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 04:34:58 +0300 Subject: [PATCH 41/93] =?UTF-8?q?feat:=20add=20Discord=20voice=20channel?= =?UTF-8?q?=20listening=20=E2=80=94=20STT=20transcription=20and=20agent=20?= =?UTF-8?q?response=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of voice channel support: bot listens to users speaking in VC, transcribes speech via Groq Whisper, and processes through the agent pipeline. - Add VoiceReceiver class for RTP packet capture, NaCl/DAVE decryption, Opus decode - Add silence detection and per-user PCM buffering - Wire voice input callback from adapter to GatewayRunner - Fix adapter dict key: use Platform.DISCORD enum instead of string - Fix guild_id extraction for synthetic voice events via SimpleNamespace raw_message - Pause/resume receiver during TTS playback to prevent echo --- gateway/platforms/discord.py | 420 +++++++++++++++++++++++++++++++++-- gateway/run.py | 51 ++++- 2 files changed, 454 insertions(+), 17 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index a7fd45f6a1..c2cc643fd5 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -10,7 +10,13 @@ Uses discord.py library for: import asyncio import logging import os -from typing import Dict, List, Optional, Any +import struct +import subprocess +import tempfile +import threading +import time +from collections import defaultdict +from typing import Callable, Dict, List, Optional, Any logger = logging.getLogger(__name__) @@ -65,6 +71,294 @@ def check_discord_requirements() -> bool: return DISCORD_AVAILABLE +class VoiceReceiver: + """Captures and decodes voice audio from a Discord voice channel. 
+ + Attaches to a VoiceClient's socket listener, decrypts RTP packets + (NaCl transport + DAVE E2EE), decodes Opus to PCM, and buffers + per-user audio. A polling loop detects silence and delivers + completed utterances via a callback. + """ + + SILENCE_THRESHOLD = 1.5 # seconds of silence → end of utterance + MIN_SPEECH_DURATION = 0.5 # minimum seconds to process (skip noise) + SAMPLE_RATE = 48000 # Discord native rate + CHANNELS = 2 # Discord sends stereo + + def __init__(self, voice_client): + self._vc = voice_client + self._running = False + + # Decryption + self._secret_key: Optional[bytes] = None + self._dave_session = None + self._bot_ssrc: int = 0 + + # SSRC -> user_id mapping (populated from SPEAKING events) + self._ssrc_to_user: Dict[int, int] = {} + self._lock = threading.Lock() + + # Per-user audio buffers + self._buffers: Dict[int, bytearray] = defaultdict(bytearray) + self._last_packet_time: Dict[int, float] = {} + + # Opus decoder per SSRC (each user needs own decoder state) + self._decoders: Dict[int, object] = {} + + # Pause flag: don't capture while bot is playing TTS + self._paused = False + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def start(self): + """Start listening for voice packets.""" + conn = self._vc._connection + self._secret_key = bytes(conn.secret_key) + self._dave_session = conn.dave_session + self._bot_ssrc = conn.ssrc + + self._install_speaking_hook(conn) + conn.add_socket_listener(self._on_packet) + self._running = True + logger.info("VoiceReceiver started (bot_ssrc=%d)", self._bot_ssrc) + + def stop(self): + """Stop listening and clean up.""" + self._running = False + try: + self._vc._connection.remove_socket_listener(self._on_packet) + except Exception: + pass + self._buffers.clear() + self._last_packet_time.clear() + self._decoders.clear() + self._ssrc_to_user.clear() + logger.info("VoiceReceiver stopped") + + 
def pause(self): + self._paused = True + + def resume(self): + self._paused = False + + # ------------------------------------------------------------------ + # SSRC -> user_id mapping via SPEAKING opcode hook + # ------------------------------------------------------------------ + + def map_ssrc(self, ssrc: int, user_id: int): + with self._lock: + self._ssrc_to_user[ssrc] = user_id + + def _install_speaking_hook(self, conn): + """Wrap the voice websocket hook to capture SPEAKING events (op 5). + + VoiceConnectionState stores the hook as ``conn.hook`` (public attr). + It is passed to DiscordVoiceWebSocket on each (re)connect, so we + must wrap it on the VoiceConnectionState level AND on the current + live websocket instance. + """ + original_hook = conn.hook + receiver_self = self + + async def wrapped_hook(ws, msg): + if isinstance(msg, dict) and msg.get("op") == 5: + data = msg.get("d", {}) + ssrc = data.get("ssrc") + user_id = data.get("user_id") + if ssrc and user_id: + logger.info("SPEAKING event: ssrc=%d -> user=%s", ssrc, user_id) + receiver_self.map_ssrc(int(ssrc), int(user_id)) + if original_hook: + await original_hook(ws, msg) + + # Set on connection state (for future reconnects) + conn.hook = wrapped_hook + # Set on the current live websocket (for immediate effect) + try: + from discord.utils import MISSING + if hasattr(conn, 'ws') and conn.ws is not MISSING: + conn.ws._hook = wrapped_hook + logger.info("Speaking hook installed on live websocket") + except Exception as e: + logger.warning("Could not install hook on live ws: %s", e) + + # ------------------------------------------------------------------ + # Packet handler (called from SocketReader thread) + # ------------------------------------------------------------------ + + _packet_debug_count = 0 # class-level counter for debug logging + + def _on_packet(self, data: bytes): + if not self._running or self._paused: + return + + # Log first few raw packets for debugging + 
VoiceReceiver._packet_debug_count += 1 + if VoiceReceiver._packet_debug_count <= 5: + logger.info( + "Raw UDP packet: len=%d, first_bytes=%s", + len(data), data[:4].hex() if len(data) >= 4 else "short", + ) + + if len(data) < 16: + return + + # RTP version check: top 2 bits must be 10 (version 2). + # Lower bits may vary (padding, extension, CSRC count). + # Payload type (byte 1 lower 7 bits) = 0x78 (120) for voice. + if (data[0] >> 6) != 2 or (data[1] & 0x7F) != 0x78: + if VoiceReceiver._packet_debug_count <= 5: + logger.info("Skipped non-RTP: byte0=0x%02x byte1=0x%02x", data[0], data[1]) + return + + first_byte = data[0] + _, _, seq, timestamp, ssrc = struct.unpack_from(">BBHII", data, 0) + + # Skip bot's own audio + if ssrc == self._bot_ssrc: + return + + # Calculate dynamic RTP header size (RFC 9335 / rtpsize mode) + cc = first_byte & 0x0F # CSRC count + has_extension = bool(first_byte & 0x10) # extension bit + header_size = 12 + (4 * cc) + (4 if has_extension else 0) + + if len(data) < header_size + 4: # need at least header + nonce + return + + # Read extension length from preamble (for skipping after decrypt) + ext_data_len = 0 + if has_extension: + ext_preamble_offset = 12 + (4 * cc) + ext_words = struct.unpack_from(">H", data, ext_preamble_offset + 2)[0] + ext_data_len = ext_words * 4 + + if VoiceReceiver._packet_debug_count <= 10: + with self._lock: + known_user = self._ssrc_to_user.get(ssrc, "unknown") + logger.info( + "RTP packet: ssrc=%d, seq=%d, user=%s, hdr=%d, ext_data=%d", + ssrc, seq, known_user, header_size, ext_data_len, + ) + + header = bytes(data[:header_size]) + payload_with_nonce = data[header_size:] + + # --- NaCl transport decrypt (aead_xchacha20_poly1305_rtpsize) --- + if len(payload_with_nonce) < 4: + return + nonce = bytearray(24) + nonce[:4] = payload_with_nonce[-4:] + encrypted = bytes(payload_with_nonce[:-4]) + + try: + import nacl.secret # noqa: delayed import – only in voice path + box = nacl.secret.Aead(self._secret_key) + 
decrypted = box.decrypt(encrypted, header, bytes(nonce)) + except Exception as e: + if VoiceReceiver._packet_debug_count <= 10: + logger.warning("NaCl decrypt failed: %s (hdr=%d, enc=%d)", e, header_size, len(encrypted)) + return + + # Skip encrypted extension data to get the actual opus payload + if ext_data_len and len(decrypted) > ext_data_len: + decrypted = decrypted[ext_data_len:] + + # --- DAVE E2EE decrypt --- + if self._dave_session: + with self._lock: + user_id = self._ssrc_to_user.get(ssrc, 0) + if user_id == 0: + if VoiceReceiver._packet_debug_count <= 10: + logger.warning("DAVE skip: unknown user for ssrc=%d", ssrc) + return # unknown user, can't DAVE-decrypt + try: + import davey + decrypted = self._dave_session.decrypt( + user_id, davey.MediaType.audio, decrypted + ) + except Exception as e: + if VoiceReceiver._packet_debug_count <= 10: + logger.warning("DAVE decrypt failed for ssrc=%d: %s", ssrc, e) + return + + # --- Opus decode -> PCM --- + try: + if ssrc not in self._decoders: + self._decoders[ssrc] = discord.opus.Decoder() + pcm = self._decoders[ssrc].decode(decrypted) + self._buffers[ssrc].extend(pcm) + self._last_packet_time[ssrc] = time.monotonic() + except Exception: + return + + # ------------------------------------------------------------------ + # Silence detection + # ------------------------------------------------------------------ + + def check_silence(self) -> list: + """Return list of (user_id, pcm_bytes) for completed utterances.""" + now = time.monotonic() + completed = [] + + with self._lock: + ssrc_user_map = dict(self._ssrc_to_user) + + for ssrc in list(self._buffers.keys()): + last_time = self._last_packet_time.get(ssrc, now) + silence_duration = now - last_time + buf = self._buffers[ssrc] + # 48kHz, 16-bit, stereo = 192000 bytes/sec + buf_duration = len(buf) / (self.SAMPLE_RATE * self.CHANNELS * 2) + + if silence_duration >= self.SILENCE_THRESHOLD and buf_duration >= self.MIN_SPEECH_DURATION: + user_id = 
ssrc_user_map.get(ssrc, 0) + if user_id: + completed.append((user_id, bytes(buf))) + self._buffers[ssrc] = bytearray() + self._last_packet_time.pop(ssrc, None) + elif silence_duration >= self.SILENCE_THRESHOLD * 2: + # Stale buffer with no valid user — discard + self._buffers.pop(ssrc, None) + self._last_packet_time.pop(ssrc, None) + + return completed + + # ------------------------------------------------------------------ + # PCM -> WAV conversion (for Whisper STT) + # ------------------------------------------------------------------ + + @staticmethod + def pcm_to_wav(pcm_data: bytes, output_path: str, + src_rate: int = 48000, src_channels: int = 2): + """Convert raw PCM to 16kHz mono WAV via ffmpeg.""" + with tempfile.NamedTemporaryFile(suffix=".pcm", delete=False) as f: + f.write(pcm_data) + pcm_path = f.name + try: + subprocess.run( + [ + "ffmpeg", "-y", "-loglevel", "error", + "-f", "s16le", + "-ar", str(src_rate), + "-ac", str(src_channels), + "-i", pcm_path, + "-ar", "16000", + "-ac", "1", + output_path, + ], + check=True, + timeout=10, + ) + finally: + try: + os.unlink(pcm_path) + except OSError: + pass + + class DiscordAdapter(BasePlatformAdapter): """ Discord bot adapter. 
@@ -94,6 +388,10 @@ class DiscordAdapter(BasePlatformAdapter): self._voice_clients: Dict[int, Any] = {} # guild_id -> VoiceClient self._voice_text_channels: Dict[int, int] = {} # guild_id -> text_channel_id self._voice_timeout_tasks: Dict[int, asyncio.Task] = {} # guild_id -> timeout task + # Phase 2: voice listening + self._voice_receivers: Dict[int, VoiceReceiver] = {} # guild_id -> VoiceReceiver + self._voice_listen_tasks: Dict[int, asyncio.Task] = {} # guild_id -> listen loop + self._voice_input_callback: Optional[Callable] = None # set by run.py async def connect(self) -> bool: """Connect to Discord and start receiving events.""" @@ -402,10 +700,30 @@ class DiscordAdapter(BasePlatformAdapter): vc = await channel.connect() self._voice_clients[guild_id] = vc self._reset_voice_timeout(guild_id) + + # Start voice receiver (Phase 2: listen to users) + try: + receiver = VoiceReceiver(vc) + receiver.start() + self._voice_receivers[guild_id] = receiver + self._voice_listen_tasks[guild_id] = asyncio.ensure_future( + self._voice_listen_loop(guild_id) + ) + except Exception as e: + logger.warning("Voice receiver failed to start: %s", e) + return True async def leave_voice_channel(self, guild_id: int) -> None: """Disconnect from the voice channel in a guild.""" + # Stop voice receiver first + receiver = self._voice_receivers.pop(guild_id, None) + if receiver: + receiver.stop() + listen_task = self._voice_listen_tasks.pop(guild_id, None) + if listen_task: + listen_task.cancel() + vc = self._voice_clients.pop(guild_id, None) if vc and vc.is_connected(): await vc.disconnect() @@ -420,24 +738,33 @@ class DiscordAdapter(BasePlatformAdapter): if not vc or not vc.is_connected(): return False - # Wait for current playback to finish - while vc.is_playing(): - await asyncio.sleep(0.1) + # Pause voice receiver while playing (echo prevention) + receiver = self._voice_receivers.get(guild_id) + if receiver: + receiver.pause() - done = asyncio.Event() - loop = asyncio.get_event_loop() + 
try: + # Wait for current playback to finish + while vc.is_playing(): + await asyncio.sleep(0.1) - def _after(error): - if error: - logger.error("Voice playback error: %s", error) - loop.call_soon_threadsafe(done.set) + done = asyncio.Event() + loop = asyncio.get_event_loop() - source = discord.FFmpegPCMAudio(audio_path) - source = discord.PCMVolumeTransformer(source, volume=1.0) - vc.play(source, after=_after) - await done.wait() - self._reset_voice_timeout(guild_id) - return True + def _after(error): + if error: + logger.error("Voice playback error: %s", error) + loop.call_soon_threadsafe(done.set) + + source = discord.FFmpegPCMAudio(audio_path) + source = discord.PCMVolumeTransformer(source, volume=1.0) + vc.play(source, after=_after) + await done.wait() + self._reset_voice_timeout(guild_id) + return True + finally: + if receiver: + receiver.resume() async def get_user_voice_channel(self, guild_id: int, user_id: str): """Return the voice channel the user is currently in, or None.""" @@ -481,6 +808,67 @@ class DiscordAdapter(BasePlatformAdapter): vc = self._voice_clients.get(guild_id) return vc is not None and vc.is_connected() + # ------------------------------------------------------------------ + # Voice listening (Phase 2) + # ------------------------------------------------------------------ + + async def _voice_listen_loop(self, guild_id: int): + """Periodically check for completed utterances and process them.""" + receiver = self._voice_receivers.get(guild_id) + if not receiver: + return + try: + while receiver._running: + await asyncio.sleep(0.2) + completed = receiver.check_silence() + for user_id, pcm_data in completed: + if not self._is_allowed_user(str(user_id)): + continue + await self._process_voice_input(guild_id, user_id, pcm_data) + except asyncio.CancelledError: + pass + except Exception as e: + logger.error("Voice listen loop error: %s", e, exc_info=True) + + async def _process_voice_input(self, guild_id: int, user_id: int, pcm_data: bytes): + 
"""Convert PCM -> WAV -> STT -> callback.""" + from tools.voice_mode import is_whisper_hallucination + + wav_path = tempfile.mktemp(suffix=".wav", prefix="vc_listen_") + try: + await asyncio.to_thread(VoiceReceiver.pcm_to_wav, pcm_data, wav_path) + + from tools.transcription_tools import transcribe_audio + result = await asyncio.to_thread(transcribe_audio, wav_path) + + if not result.get("success"): + return + transcript = result.get("transcript", "").strip() + if not transcript or is_whisper_hallucination(transcript): + return + + logger.info("Voice input from user %d: %s", user_id, transcript[:100]) + + if self._voice_input_callback: + await self._voice_input_callback( + guild_id=guild_id, + user_id=user_id, + transcript=transcript, + ) + except Exception as e: + logger.warning("Voice input processing failed: %s", e, exc_info=True) + finally: + try: + os.unlink(wav_path) + except OSError: + pass + + def _is_allowed_user(self, user_id: str) -> bool: + """Check if user is in DISCORD_ALLOWED_USERS.""" + if not self._allowed_user_ids: + return True + return user_id in self._allowed_user_ids + async def send_image_file( self, chat_id: str, diff --git a/gateway/run.py b/gateway/run.py index 4674548aae..bee9b62a1c 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1608,6 +1608,8 @@ class GatewayRunner: (voice_mode == "all") or (voice_mode == "voice_only" and is_voice_input) ) + logger.info("Voice reply check: chat_id=%s, voice_mode=%s, is_voice=%s, should_reply=%s, has_response=%s", + chat_id, voice_mode, is_voice_input, should_voice_reply, bool(response)) if should_voice_reply and response and not response.startswith("Error:"): # Skip if agent already called TTS tool (avoid double voice) has_agent_tts = any( @@ -1618,6 +1620,7 @@ class GatewayRunner: ) for msg in agent_messages ) + logger.info("Voice reply: has_agent_tts=%s, calling _send_voice_reply", has_agent_tts) if not has_agent_tts: await self._send_voice_reply(event, response) @@ -2201,9 +2204,12 @@ class 
GatewayRunner: adapter._voice_text_channels[guild_id] = int(event.source.chat_id) self._voice_mode[event.source.chat_id] = "all" self._save_voice_modes() + # Wire voice input callback so the adapter can deliver transcripts + if hasattr(adapter, "_voice_input_callback"): + adapter._voice_input_callback = self._handle_voice_channel_input return ( f"Joined voice channel **{voice_channel.name}**.\n" - f"I'll speak my replies here. Use /voice leave to disconnect." + f"I'll speak my replies and listen to you. Use /voice leave to disconnect." ) return "Failed to join voice channel. Check bot permissions (Connect + Speak)." @@ -2223,6 +2229,49 @@ class GatewayRunner: self._save_voice_modes() return "Left voice channel." + async def _handle_voice_channel_input( + self, guild_id: int, user_id: int, transcript: str + ): + """Handle transcribed voice from a user in a voice channel. + + Creates a synthetic MessageEvent and processes it through the + adapter's full message pipeline (session, typing, agent, TTS reply). + """ + adapter = self.adapters.get(Platform.DISCORD) + if not adapter: + return + + text_ch_id = adapter._voice_text_channels.get(guild_id) + if not text_ch_id: + return + + # Show transcript in text channel + try: + channel = adapter._client.get_channel(text_ch_id) + if channel: + await channel.send(f"**[Voice]** <@{user_id}>: {transcript}") + except Exception: + pass + + # Build a synthetic MessageEvent and feed through the normal pipeline + source = SessionSource( + platform=Platform.DISCORD, + chat_id=str(text_ch_id), + user_id=str(user_id), + user_name=str(user_id), + ) + # Use SimpleNamespace as raw_message so _get_guild_id() can extract + # guild_id and _send_voice_reply() plays audio in the voice channel. 
+ from types import SimpleNamespace + event = MessageEvent( + source=source, + text=transcript, + message_type=MessageType.VOICE, + raw_message=SimpleNamespace(guild_id=guild_id, guild=None), + ) + + await adapter.handle_message(event) + async def _send_voice_reply(self, event: MessageEvent, text: str) -> None: """Generate TTS audio and send as a voice message before the text reply.""" try: From 2bb2312ea275edfff0c07667f86faad87876a5cc Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:13:33 +0300 Subject: [PATCH 42/93] docs: add comprehensive voice mode documentation Cover CLI voice mode, Telegram/Discord auto voice reply, and Discord voice channel support. Include setup guide with bot permissions, OAuth2 invite URL, privileged intents, system dependencies, and Python packages. Update discord.md voice messages section with correct STT key reference. --- .../docs/user-guide/features/voice-mode.md | 400 ++++++++++++++++++ website/docs/user-guide/messaging/discord.md | 4 +- 2 files changed, 402 insertions(+), 2 deletions(-) create mode 100644 website/docs/user-guide/features/voice-mode.md diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md new file mode 100644 index 0000000000..8a0f74b588 --- /dev/null +++ b/website/docs/user-guide/features/voice-mode.md @@ -0,0 +1,400 @@ +--- +sidebar_position: 10 +title: "Voice Mode" +description: "Real-time voice conversations with Hermes Agent — CLI, Telegram, Discord text, and Discord voice channels" +--- + +# Voice Mode + +Hermes Agent supports full voice interaction across CLI and messaging platforms. Talk to the agent using your microphone, hear spoken replies, and have live voice conversations in Discord voice channels. 
+ +## Overview + +| Feature | Platform | Description | +|---------|----------|-------------| +| **Interactive Voice** | CLI | Press Ctrl+B to record, agent auto-detects silence and responds | +| **Auto Voice Reply** | Telegram, Discord | Agent sends spoken audio alongside text responses | +| **Voice Channel** | Discord | Bot joins VC, listens to users speaking, speaks replies back | + +## Requirements + +### Python Packages + +```bash +# CLI voice mode (microphone + audio playback) +pip install hermes-agent[voice] + +# Discord + Telegram messaging (includes discord.py[voice] for VC support) +pip install hermes-agent[messaging] + +# Premium TTS (ElevenLabs) +pip install hermes-agent[tts-premium] + +# Everything at once +pip install hermes-agent[all] +``` + +| Extra | Packages | Required For | +|-------|----------|-------------| +| `voice` | `sounddevice`, `numpy` | CLI voice mode | +| `messaging` | `discord.py[voice]`, `python-telegram-bot`, `aiohttp` | Discord & Telegram bots | +| `tts-premium` | `elevenlabs` | ElevenLabs TTS provider | + +:::info +`discord.py[voice]` installs **PyNaCl** (for voice encryption) and **opus bindings** automatically. This is required for Discord voice channel support. 
+::: + +### System Dependencies + +```bash +# macOS +brew install portaudio ffmpeg opus + +# Ubuntu/Debian +sudo apt install portaudio19-dev ffmpeg libopus0 +``` + +| Dependency | Purpose | Required For | +|-----------|---------|-------------| +| **PortAudio** | Microphone input and audio playback | CLI voice mode | +| **ffmpeg** | Audio format conversion (MP3 → Opus, PCM → WAV) | All platforms | +| **Opus** | Discord voice codec | Discord voice channels | + +### API Keys + +Add to `~/.hermes/.env`: + +```bash +# Speech-to-Text (at least one required) +GROQ_API_KEY=your-key # Groq Whisper — fast, free tier available (recommended) +VOICE_TOOLS_OPENAI_KEY=your-key # OpenAI Whisper — alternative + +# Text-to-Speech (optional — Edge TTS works without any key) +ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality +``` + +--- + +## CLI Voice Mode + +### Quick Start + +``` +/voice Toggle voice mode on/off +/voice on Enable voice mode +/voice off Disable voice mode +/voice tts Toggle TTS output +/voice status Show current state +``` + +### How It Works + +1. Enable voice mode with `/voice on` +2. **Press Ctrl+B** — a beep plays (880Hz), recording starts +3. **Speak** — a live audio level bar shows your input: `● [▁▂▃▅▇▇▅▂] ❯` +4. **Stop speaking** — after 3 seconds of silence, recording auto-stops +5. **Two beeps** play (660Hz) confirming the recording ended +6. Audio is transcribed via Whisper and sent to the agent +7. If TTS is enabled, the agent's reply is spoken aloud +8. Recording **automatically restarts** — speak again without pressing any key + +This loop continues until you press **Ctrl+B** during recording (exits continuous mode) or 3 consecutive recordings detect no speech. + +:::tip +The record key is configurable via `voice.record_key` in `~/.hermes/config.yaml` (default: `ctrl+b`). +::: + +### Silence Detection + +Two-stage algorithm detects when you've finished speaking: + +1. 
**Speech confirmation** — waits for audio above the RMS threshold (200) for at least 0.3s, tolerating brief dips between syllables +2. **End detection** — once speech is confirmed, triggers after 3.0 seconds of continuous silence + +If no speech is detected at all for 15 seconds, recording stops automatically. + +Both `silence_threshold` and `silence_duration` are configurable in `config.yaml`. + +### Streaming TTS + +When TTS is enabled, the agent speaks its reply **sentence-by-sentence** as it generates text — you don't wait for the full response: + +1. Buffers text deltas into complete sentences (min 20 chars) +2. Strips markdown formatting and reasoning (`think`-tag) blocks +3. Generates and plays audio per sentence in real-time + +### Hallucination Filter + +Whisper sometimes generates phantom text from silence or background noise ("Thank you for watching", "Subscribe", etc.). The agent filters these out using a database of 498+ known hallucination phrases across multiple languages. + +--- + +## Gateway Voice Reply (Telegram & Discord) + +### Commands + +These work in both Telegram and Discord text channels: + +``` +/voice Toggle voice mode on/off +/voice on Voice replies only when you send a voice message +/voice tts Voice replies for ALL messages +/voice off Disable voice replies +/voice status Show current setting +``` + +### Modes + +| Mode | Command | Behavior | +|------|---------|----------| +| `off` | `/voice off` | Text only (default) | +| `voice_only` | `/voice on` | Speaks reply only when you send a voice message | +| `all` | `/voice tts` | Speaks reply to every message | + +Voice mode setting is persisted across gateway restarts. + +### Platform Delivery + +| Platform | Format | Notes | +|----------|--------|-------| +| **Telegram** | Voice bubble (Opus/OGG) | Plays inline in chat. 
ffmpeg converts MP3 → Opus if needed | +| **Discord** | Audio file attachment (MP3) | Sent alongside text response | + +--- + +## Discord Voice Channels + +The most immersive voice feature: the bot joins a Discord voice channel, listens to users speaking, transcribes their speech, processes through the agent, and speaks the reply back in the voice channel. + +### Setup + +#### 1. Discord Bot Permissions + +If you already have a Discord bot set up for text (see [Discord Setup Guide](../messaging/discord.md)), you need to add voice permissions. + +Go to the [Discord Developer Portal](https://discord.com/developers/applications) → your application → **Installation** → **Default Install Settings** → **Guild Install**: + +**Add these permissions to the existing text permissions:** + +| Permission | Purpose | Required | +|-----------|---------|----------| +| **Connect** | Join voice channels | Yes | +| **Speak** | Play TTS audio in voice channels | Yes | +| **Use Voice Activity** | Detect when users are speaking | Recommended | + +**Updated Permissions Integer:** + +| Level | Integer | What's Included | +|-------|---------|----------------| +| Text only | `274878286912` | View Channels, Send Messages, Read History, Embeds, Attachments, Threads, Reactions | +| Text + Voice | `274881432640` | All above + Connect, Speak | + +**Re-invite the bot** with the updated permissions URL: + +``` +https://discord.com/oauth2/authorize?client_id=YOUR_APP_ID&scope=bot+applications.commands&permissions=274881432640 +``` + +Replace `YOUR_APP_ID` with your Application ID from the Developer Portal. + +:::warning +Re-inviting the bot to a server it's already in will update its permissions without removing it. You won't lose any data or configuration. +::: + +#### 2. 
Privileged Gateway Intents + +In the [Developer Portal](https://discord.com/developers/applications) → your application → **Bot** → **Privileged Gateway Intents**, enable all three: + +| Intent | Purpose | +|--------|---------| +| **Presence Intent** | Detect user online/offline status | +| **Server Members Intent** | Map voice SSRC identifiers to Discord user IDs | +| **Message Content Intent** | Read text message content in channels | + +All three are required for full voice channel functionality. **Server Members Intent** is especially critical — without it, the bot cannot identify who is speaking in the voice channel. + +#### 3. Opus Codec + +The Opus codec library must be installed on the machine running the gateway: + +```bash +# macOS (Homebrew) +brew install opus + +# Ubuntu/Debian +sudo apt install libopus0 +``` + +The bot auto-loads the codec from: +- **macOS:** `/opt/homebrew/lib/libopus.dylib` +- **Linux:** `libopus.so.0` + +#### 4. Environment Variables + +```bash +# ~/.hermes/.env + +# Discord bot (already configured for text) +DISCORD_BOT_TOKEN=your-bot-token +DISCORD_ALLOWED_USERS=your-user-id + +# STT — at least one required for voice channel listening +GROQ_API_KEY=your-key # Recommended (fast, free tier) + +# TTS — optional, Edge TTS (free) is the default +# ELEVENLABS_API_KEY=your-key # Premium quality +``` + +### Commands + +``` +/voice join Bot joins your current voice channel +/voice channel Alias for /voice join +/voice leave Bot disconnects from voice channel +/voice status Show voice mode and connected channel +``` + +:::info +You must be in a voice channel before running `/voice join`. The bot joins the same VC you're in. +::: + +### How It Works + +When the bot joins a voice channel, it: + +1. **Captures audio** via Discord's UDP socket (RTP packets) +2. **Decrypts** using NaCl transport encryption (aead_xchacha20_poly1305_rtpsize) +3. **Decrypts** DAVE end-to-end encryption (Discord Audio/Video Encryption) +4. 
**Decodes** Opus audio to raw PCM (48kHz stereo, per-user decoder) +5. **Detects silence** — 1.5s of silence after at least 0.5s of speech triggers processing +6. **Converts** PCM to 16kHz mono WAV via ffmpeg +7. **Transcribes** via Whisper STT (Groq or OpenAI) +8. **Processes** through the full agent pipeline (session, tools, memory) +9. **Generates TTS** reply audio +10. **Plays** the reply in the voice channel + +### Text Channel Integration + +When the bot is in a voice channel: + +- Transcripts appear in the text channel: `[Voice] @user: what you said` +- Agent responses are sent as text in the channel AND spoken in the VC +- The text channel is the one where `/voice join` was issued + +### Echo Prevention + +The bot automatically pauses its audio listener while playing TTS replies, preventing it from hearing and re-processing its own output. + +### Access Control + +Only users listed in `DISCORD_ALLOWED_USERS` can interact via voice. Other users' audio is silently ignored. + +```bash +# ~/.hermes/.env +DISCORD_ALLOWED_USERS=284102345871466496 +``` + +--- + +## Configuration Reference + +### config.yaml + +```yaml +# Voice recording (CLI) +voice: + record_key: "ctrl+b" # Key to start/stop recording + max_recording_seconds: 120 # Maximum recording length + auto_tts: false # Auto-enable TTS when voice mode starts + silence_threshold: 200 # RMS level (0-32767) below which counts as silence + silence_duration: 3.0 # Seconds of silence before auto-stop + +# Speech-to-Text +stt: + enabled: true + model: "whisper-1" # Or: whisper-large-v3-turbo (Groq) + +# Text-to-Speech +tts: + provider: "edge" # "edge" (free) | "elevenlabs" | "openai" + edge: + voice: "en-US-AriaNeural" # 322 voices, 74 languages + elevenlabs: + voice_id: "pNInz6obpgDQGcFmaJgB" # Adam + model_id: "eleven_multilingual_v2" + openai: + model: "gpt-4o-mini-tts" + voice: "alloy" # alloy, echo, fable, onyx, nova, shimmer +``` + +### Environment Variables + +```bash +# Speech-to-Text providers 
+GROQ_API_KEY=... # Groq Whisper (recommended — fast, free tier) +VOICE_TOOLS_OPENAI_KEY=... # OpenAI Whisper (alternative) + +# Text-to-Speech providers (Edge TTS needs no key) +ELEVENLABS_API_KEY=... # ElevenLabs (premium quality) +# OpenAI TTS uses VOICE_TOOLS_OPENAI_KEY + +# Discord voice channel +DISCORD_BOT_TOKEN=... +DISCORD_ALLOWED_USERS=... +``` + +### STT Provider Comparison + +| Provider | Model | Speed | Quality | Cost | +|----------|-------|-------|---------|------| +| **Groq** | `whisper-large-v3-turbo` | Very fast (~0.5s) | Good | Free tier | +| **Groq** | `whisper-large-v3` | Fast (~1s) | Better | Free tier | +| **OpenAI** | `whisper-1` | Fast (~1s) | Good | Low | +| **OpenAI** | `gpt-4o-transcribe` | Medium (~2s) | Best | Higher | + +### TTS Provider Comparison + +| Provider | Quality | Cost | Latency | Key Required | +|----------|---------|------|---------|-------------| +| **Edge TTS** | Good | Free | ~1s | No | +| **ElevenLabs** | Excellent | Paid | ~2s | Yes | +| **OpenAI TTS** | Good | Paid | ~1.5s | Yes | + +--- + +## Troubleshooting + +### "No audio device found" (CLI) + +PortAudio is not installed: + +```bash +brew install portaudio # macOS +sudo apt install portaudio19-dev # Ubuntu +``` + +### Bot joins VC but doesn't hear me + +- Check your Discord user ID is in `DISCORD_ALLOWED_USERS` +- Make sure you're not muted in Discord +- The bot needs a SPEAKING event from Discord before it can map your audio — start speaking within a few seconds of joining + +### Bot hears me but doesn't respond + +- Verify STT key is set (`GROQ_API_KEY` or `VOICE_TOOLS_OPENAI_KEY`) +- Check the LLM model is configured and accessible +- Review gateway logs: `tail -f ~/.hermes/logs/gateway.log` + +### Bot responds in text but not in voice channel + +- TTS provider may be failing — check API key and quota +- Edge TTS (free, no key) is the default fallback +- Check logs for TTS errors + +### Whisper returns garbage text + +The hallucination filter catches most cases 
automatically. If you're still getting phantom transcripts: + +- Use a quieter environment +- Adjust `silence_threshold` in config (higher = less sensitive) +- Try a different STT model diff --git a/website/docs/user-guide/messaging/discord.md b/website/docs/user-guide/messaging/discord.md index 38fb9598a9..0fc7f8cbc5 100644 --- a/website/docs/user-guide/messaging/discord.md +++ b/website/docs/user-guide/messaging/discord.md @@ -210,8 +210,8 @@ Replace the ID with the actual channel ID (right-click → Copy Channel ID with Hermes Agent supports Discord voice messages: -- **Incoming voice messages** are automatically transcribed using Whisper (requires `VOICE_TOOLS_OPENAI_KEY` to be set in your environment). -- **Text-to-speech**: When TTS is enabled, the bot can send spoken responses as MP3 file attachments. +- **Incoming voice messages** are automatically transcribed using Whisper (requires `GROQ_API_KEY` or `VOICE_TOOLS_OPENAI_KEY` to be set in your environment). +- **Text-to-speech**: Use `/voice tts` to have the bot send spoken audio responses alongside text replies. ## Troubleshooting From 75bd5a582b444df481c2b79ceeffc867714a8829 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 15:29:23 +0300 Subject: [PATCH 43/93] docs: improve voice mode docs with prerequisites, startup commands, and platform links --- .../docs/user-guide/features/voice-mode.md | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md index 8a0f74b588..291721017d 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -8,6 +8,18 @@ description: "Real-time voice conversations with Hermes Agent — CLI, Telegram, Hermes Agent supports full voice interaction across CLI and messaging platforms. 
Talk to the agent using your microphone, hear spoken replies, and have live voice conversations in Discord voice channels. +## Prerequisites + +Before using voice features, make sure you have: + +1. **Hermes Agent installed** — `pip install hermes-agent` (see [Getting Started](../../getting-started.md)) +2. **An LLM provider configured** — set `OPENAI_API_KEY`, `OPENAI_BASE_URL`, and `LLM_MODEL` in `~/.hermes/.env` +3. **A working base setup** — run `hermes` to verify the agent responds to text before enabling voice + +:::tip +The `~/.hermes/` directory and default `config.yaml` are created automatically the first time you run `hermes`. You only need to create `~/.hermes/.env` manually for API keys. +::: + ## Overview | Feature | Platform | Description | @@ -79,6 +91,14 @@ ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality ### Quick Start +Start the CLI and enable voice mode: + +```bash +hermes # Start the interactive CLI +``` + +Then use these commands inside the CLI: + ``` /voice Toggle voice mode on/off /voice on Enable voice mode @@ -89,7 +109,7 @@ ELEVENLABS_API_KEY=your-key # ElevenLabs — premium quality ### How It Works -1. Enable voice mode with `/voice on` +1. Start the CLI with `hermes` and enable voice mode with `/voice on` 2. **Press Ctrl+B** — a beep plays (880Hz), recording starts 3. **Speak** — a live audio level bar shows your input: `● [▁▂▃▅▇▇▅▂] ❯` 4. **Stop speaking** — after 3 seconds of silence, recording auto-stops @@ -125,12 +145,23 @@ When TTS is enabled, the agent speaks its reply **sentence-by-sentence** as it g ### Hallucination Filter -Whisper sometimes generates phantom text from silence or background noise ("Thank you for watching", "Subscribe", etc.). The agent filters these out using a database of 498+ known hallucination phrases across multiple languages. +Whisper sometimes generates phantom text from silence or background noise ("Thank you for watching", "Subscribe", etc.). 
The agent filters these out using a set of 26 known hallucination phrases across multiple languages, plus a regex pattern that catches repetitive variations. --- ## Gateway Voice Reply (Telegram & Discord) +If you haven't set up your messaging bots yet, see the platform-specific guides: +- [Telegram Setup Guide](../messaging/telegram.md) +- [Discord Setup Guide](../messaging/discord.md) + +Start the gateway to connect to your messaging platforms: + +```bash +hermes gateway # Start the gateway (connects to configured platforms) +hermes gateway setup # Interactive setup wizard for first-time configuration +``` + ### Commands These work in both Telegram and Discord text channels: @@ -245,8 +276,18 @@ GROQ_API_KEY=your-key # Recommended (fast, free tier) # ELEVENLABS_API_KEY=your-key # Premium quality ``` +### Start the Gateway + +```bash +hermes gateway # Start with existing configuration +``` + +The bot should come online in Discord within a few seconds. + ### Commands +Use these in the Discord text channel where the bot is present: + ``` /voice join Bot joins your current voice channel /voice channel Alias for /voice join From e50323f73098c821619f998d4d5668836cef3ad7 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:30:38 +0300 Subject: [PATCH 44/93] fix(test): add missing _voice_mode attr to GatewayRunner test stubs --- tests/gateway/test_background_command.py | 1 + tests/gateway/test_resume_command.py | 1 + tests/gateway/test_run_progress_topics.py | 1 + tests/gateway/test_session_hygiene.py | 1 + tests/gateway/test_title_command.py | 1 + tests/gateway/test_update_command.py | 1 + 6 files changed, 6 insertions(+) diff --git a/tests/gateway/test_background_command.py b/tests/gateway/test_background_command.py index 6a780fb13f..027742ea01 100644 --- a/tests/gateway/test_background_command.py +++ b/tests/gateway/test_background_command.py @@ -32,6 +32,7 @@ def _make_runner(): from gateway.run import 
GatewayRunner runner = object.__new__(GatewayRunner) runner.adapters = {} + runner._voice_mode = {} runner._session_db = None runner._reasoning_config = None runner._provider_routing = {} diff --git a/tests/gateway/test_resume_command.py b/tests/gateway/test_resume_command.py index 17adcd2e74..987afbce32 100644 --- a/tests/gateway/test_resume_command.py +++ b/tests/gateway/test_resume_command.py @@ -36,6 +36,7 @@ def _make_runner(session_db=None, current_session_id="current_session_001", from gateway.run import GatewayRunner runner = object.__new__(GatewayRunner) runner.adapters = {} + runner._voice_mode = {} runner._session_db = session_db runner._running_agents = {} diff --git a/tests/gateway/test_run_progress_topics.py b/tests/gateway/test_run_progress_topics.py index 20ae712a20..66d13e0d01 100644 --- a/tests/gateway/test_run_progress_topics.py +++ b/tests/gateway/test_run_progress_topics.py @@ -77,6 +77,7 @@ def _make_runner(adapter): runner = object.__new__(GatewayRunner) runner.adapters = {Platform.TELEGRAM: adapter} + runner._voice_mode = {} runner._prefill_messages = [] runner._ephemeral_system_prompt = "" runner._reasoning_config = None diff --git a/tests/gateway/test_session_hygiene.py b/tests/gateway/test_session_hygiene.py index d627c20565..7e75b906d5 100644 --- a/tests/gateway/test_session_hygiene.py +++ b/tests/gateway/test_session_hygiene.py @@ -266,6 +266,7 @@ async def test_session_hygiene_messages_stay_in_originating_topic(monkeypatch, t platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="fake-token")} ) runner.adapters = {Platform.TELEGRAM: adapter} + runner._voice_mode = {} runner.hooks = SimpleNamespace(emit=AsyncMock(), loaded_hooks=False) runner.session_store = MagicMock() runner.session_store.get_or_create_session.return_value = SessionEntry( diff --git a/tests/gateway/test_title_command.py b/tests/gateway/test_title_command.py index 7f7c782a71..d5bad6c57a 100644 --- a/tests/gateway/test_title_command.py +++ 
b/tests/gateway/test_title_command.py @@ -31,6 +31,7 @@ def _make_runner(session_db=None): from gateway.run import GatewayRunner runner = object.__new__(GatewayRunner) runner.adapters = {} + runner._voice_mode = {} runner._session_db = session_db # Mock session_store that returns a session entry with a known session_id diff --git a/tests/gateway/test_update_command.py b/tests/gateway/test_update_command.py index 063f3c5acf..0aad419a58 100644 --- a/tests/gateway/test_update_command.py +++ b/tests/gateway/test_update_command.py @@ -33,6 +33,7 @@ def _make_runner(): from gateway.run import GatewayRunner runner = object.__new__(GatewayRunner) runner.adapters = {} + runner._voice_mode = {} return runner From a3905ef2890f42a129bead139358b4114badfe82 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 18:53:23 +0300 Subject: [PATCH 45/93] =?UTF-8?q?feat:=20add=20web=20gateway=20=E2=80=94?= =?UTF-8?q?=20browser-based=20chat=20UI=20over=20WebSocket?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New platform adapter that serves a full-featured chat interface via HTTP. Enables access from any device on the network (phone, tablet, desktop). Features: - aiohttp server with WebSocket real-time messaging - Token-based authentication - Markdown rendering (marked.js) + code highlighting (highlight.js) - Voice recording via MediaRecorder API + STT transcription - Image, voice, and document display - Typing indicator + message editing (streaming support) - Mobile responsive dark theme - Auto-reconnect on disconnect - Media file cleanup (24h TTL) Config: WEB_UI_ENABLED=true, WEB_UI_PORT=8765, WEB_UI_TOKEN= No new dependencies — uses aiohttp already in [messaging] extra. 
--- gateway/config.py | 16 + gateway/platforms/web.py | 1191 ++++++++++++++++++++++++++++++++++++++ gateway/run.py | 12 + 3 files changed, 1219 insertions(+) create mode 100644 gateway/platforms/web.py diff --git a/gateway/config.py b/gateway/config.py index e45eede7ca..ab51574aa0 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -31,6 +31,7 @@ class Platform(Enum): SIGNAL = "signal" HOMEASSISTANT = "homeassistant" EMAIL = "email" + WEB = "web" @dataclass @@ -176,6 +177,9 @@ class GatewayConfig: # Email uses extra dict for config (address + imap_host + smtp_host) elif platform == Platform.EMAIL and config.extra.get("address"): connected.append(platform) + # Web UI uses enabled flag only + elif platform == Platform.WEB: + connected.append(platform) return connected def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]: @@ -466,6 +470,18 @@ def _apply_env_overrides(config: GatewayConfig) -> None: name=os.getenv("EMAIL_HOME_ADDRESS_NAME", "Home"), ) + # Web UI + web_enabled = os.getenv("WEB_UI_ENABLED", "").lower() in ("true", "1", "yes") + if web_enabled: + if Platform.WEB not in config.platforms: + config.platforms[Platform.WEB] = PlatformConfig() + config.platforms[Platform.WEB].enabled = True + config.platforms[Platform.WEB].extra.update({ + "port": int(os.getenv("WEB_UI_PORT", "8765")), + "host": os.getenv("WEB_UI_HOST", "0.0.0.0"), + "token": os.getenv("WEB_UI_TOKEN", ""), + }) + # Session settings idle_minutes = os.getenv("SESSION_IDLE_MINUTES") if idle_minutes: diff --git a/gateway/platforms/web.py b/gateway/platforms/web.py new file mode 100644 index 0000000000..9fe7e636c9 --- /dev/null +++ b/gateway/platforms/web.py @@ -0,0 +1,1191 @@ +""" +Web platform adapter. + +Provides a browser-based chat interface via HTTP + WebSocket. +Serves a single-page chat UI with markdown rendering, code highlighting, +voice messages, and mobile responsive design. + +No external dependencies beyond aiohttp (already in messaging extra). 
+""" + +import asyncio +import base64 +import json +import logging +import os +import secrets +import shutil +import socket +import time +import uuid +from pathlib import Path +from typing import Dict, List, Optional, Any + +logger = logging.getLogger(__name__) + +try: + from aiohttp import web + AIOHTTP_AVAILABLE = True +except ImportError: + AIOHTTP_AVAILABLE = False + web = None + +import sys +from pathlib import Path as _Path +sys.path.insert(0, str(_Path(__file__).resolve().parents[2])) + +from gateway.config import Platform, PlatformConfig +from gateway.platforms.base import ( + BasePlatformAdapter, + MessageEvent, + MessageType, + SendResult, +) + + +def check_web_requirements() -> bool: + """Check if aiohttp is available.""" + return AIOHTTP_AVAILABLE + + +class WebAdapter(BasePlatformAdapter): + """ + Web-based chat adapter. + + Runs a local HTTP server serving a chat UI. Clients connect via + WebSocket for real-time bidirectional messaging. + """ + + def __init__(self, config: PlatformConfig): + super().__init__(config, Platform.WEB) + self._app: Optional[web.Application] = None + self._runner: Optional[web.AppRunner] = None + self._site: Optional[web.TCPSite] = None + + # Config + self._host: str = config.extra.get("host", "0.0.0.0") + self._port: int = config.extra.get("port", 8765) + self._token: str = config.extra.get("token", "") or secrets.token_hex(16) + + # Connected WebSocket clients: session_id -> ws + self._clients: Dict[str, web.WebSocketResponse] = {} + + # Media directory for uploaded/generated files + self._media_dir = Path.home() / ".hermes" / "web_media" + + # Cleanup task handle + self._cleanup_task: Optional[asyncio.Task] = None + + async def connect(self) -> bool: + """Start the HTTP server and begin accepting connections.""" + if not AIOHTTP_AVAILABLE: + return False + + self._media_dir.mkdir(parents=True, exist_ok=True) + + self._app = web.Application(client_max_size=50 * 1024 * 1024) # 50MB upload limit + 
self._app.router.add_get("/", self._handle_index) + self._app.router.add_get("/ws", self._handle_websocket) + self._app.router.add_post("/upload", self._handle_upload) + self._app.router.add_static("/media", str(self._media_dir), show_index=False) + + self._runner = web.AppRunner(self._app) + await self._runner.setup() + + try: + self._site = web.TCPSite(self._runner, self._host, self._port) + await self._site.start() + except OSError as e: + logger.error("Failed to start web server on %s:%s — %s", self._host, self._port, e) + await self._runner.cleanup() + return False + + self._running = True + self._cleanup_task = asyncio.ensure_future(self._media_cleanup_loop()) + + local_ip = self._get_local_ip() + print(f"[{self.name}] Web UI: http://{local_ip}:{self._port}") + print(f"[{self.name}] Access token: {self._token}") + + return True + + async def disconnect(self) -> None: + """Stop the server and close all connections.""" + if self._cleanup_task: + self._cleanup_task.cancel() + self._cleanup_task = None + + for ws in list(self._clients.values()): + try: + await ws.close() + except Exception: + pass + self._clients.clear() + + if self._site: + await self._site.stop() + if self._runner: + await self._runner.cleanup() + + self._running = False + self._app = None + self._runner = None + self._site = None + print(f"[{self.name}] Disconnected") + + async def send( + self, + chat_id: str, + content: str, + reply_to: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> SendResult: + """Send a text message to all connected clients.""" + msg_id = str(uuid.uuid4())[:8] + payload = { + "type": "message", + "id": msg_id, + "content": content, + "timestamp": time.time(), + } + await self._broadcast(payload) + return SendResult(success=True, message_id=msg_id) + + async def edit_message( + self, chat_id: str, message_id: str, content: str + ) -> SendResult: + """Edit a previously sent message (used for streaming updates).""" + payload = { + "type": "edit", + 
"id": message_id, + "content": content, + "timestamp": time.time(), + } + await self._broadcast(payload) + return SendResult(success=True, message_id=message_id) + + async def send_typing(self, chat_id: str, metadata=None) -> None: + """Send typing indicator to all clients.""" + await self._broadcast({"type": "typing"}) + + async def send_image( + self, + chat_id: str, + image_url: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send an image to all connected clients.""" + msg_id = str(uuid.uuid4())[:8] + payload = { + "type": "image", + "id": msg_id, + "url": image_url, + "caption": caption or "", + "timestamp": time.time(), + } + await self._broadcast(payload) + return SendResult(success=True, message_id=msg_id) + + async def send_voice( + self, + chat_id: str, + audio_path: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + **kwargs, + ) -> SendResult: + """Send a voice message by copying audio to media dir and broadcasting URL.""" + filename = f"voice_{uuid.uuid4().hex[:8]}{Path(audio_path).suffix}" + dest = self._media_dir / filename + try: + shutil.copy2(audio_path, dest) + except Exception as e: + return SendResult(success=False, error=f"Failed to copy audio: {e}") + + msg_id = str(uuid.uuid4())[:8] + payload = { + "type": "voice", + "id": msg_id, + "url": f"/media/{filename}", + "caption": caption or "", + "timestamp": time.time(), + } + await self._broadcast(payload) + return SendResult(success=True, message_id=msg_id) + + async def send_image_file( + self, + chat_id: str, + image_path: str, + caption: Optional[str] = None, + reply_to: Optional[str] = None, + ) -> SendResult: + """Send a local image file by copying to media dir.""" + filename = f"img_{uuid.uuid4().hex[:8]}{Path(image_path).suffix}" + dest = self._media_dir / filename + try: + shutil.copy2(image_path, dest) + except Exception as e: + return SendResult(success=False, error=f"Failed to copy image: {e}") + return await 
self.send_image(chat_id, f"/media/{filename}", caption, reply_to) + + async def send_document( + self, + chat_id: str, + file_path: str, + caption: Optional[str] = None, + file_name: Optional[str] = None, + reply_to: Optional[str] = None, + **kwargs, + ) -> SendResult: + """Send a document file by copying to media dir.""" + orig_name = file_name or Path(file_path).name + safe_name = f"{uuid.uuid4().hex[:8]}_{orig_name}" + dest = self._media_dir / safe_name + try: + shutil.copy2(file_path, dest) + except Exception as e: + return SendResult(success=False, error=f"Failed to copy file: {e}") + + msg_id = str(uuid.uuid4())[:8] + payload = { + "type": "document", + "id": msg_id, + "url": f"/media/{safe_name}", + "filename": orig_name, + "caption": caption or "", + "timestamp": time.time(), + } + await self._broadcast(payload) + return SendResult(success=True, message_id=msg_id) + + async def get_chat_info(self, chat_id: str) -> Dict[str, Any]: + """Return basic chat info for the web session.""" + return {"name": "Web Chat", "type": "dm"} + + # ---- HTTP Handlers ---- + + async def _handle_index(self, request: web.Request) -> web.Response: + """Serve the chat UI HTML page.""" + html = _build_chat_html() + return web.Response(text=html, content_type="text/html") + + async def _handle_websocket(self, request: web.Request) -> web.WebSocketResponse: + """Handle WebSocket connections for real-time chat.""" + ws = web.WebSocketResponse(max_msg_size=50 * 1024 * 1024) + await ws.prepare(request) + + session_id = uuid.uuid4().hex[:12] + authenticated = False + + try: + async for msg in ws: + if msg.type == web.WSMsgType.TEXT: + try: + data = json.loads(msg.data) + except json.JSONDecodeError: + continue + + msg_type = data.get("type", "") + + # Auth handshake + if msg_type == "auth": + if data.get("token") == self._token: + authenticated = True + self._clients[session_id] = ws + await ws.send_str(json.dumps({ + "type": "auth_ok", + "session_id": session_id, + })) + else: + await 
ws.send_str(json.dumps({ + "type": "auth_fail", + "error": "Invalid token", + })) + continue + + if not authenticated: + await ws.send_str(json.dumps({"type": "auth_required"})) + continue + + # Chat message + if msg_type == "message": + text = data.get("text", "").strip() + if text: + await self._process_user_message(session_id, text) + + # Voice message (base64 audio) + elif msg_type == "voice": + await self._process_voice_message(session_id, data) + + elif msg.type in (web.WSMsgType.ERROR, web.WSMsgType.CLOSE): + break + except Exception as e: + logger.debug("WebSocket session %s error: %s", session_id, e) + finally: + self._clients.pop(session_id, None) + + return ws + + async def _handle_upload(self, request: web.Request) -> web.Response: + """Handle file uploads (images, voice recordings).""" + token = request.headers.get("Authorization", "").replace("Bearer ", "") + if token != self._token: + return web.json_response({"error": "Unauthorized"}, status=401) + + reader = await request.multipart() + field = await reader.next() + if not field: + return web.json_response({"error": "No file"}, status=400) + + orig_name = field.filename or "file" + filename = f"upload_{uuid.uuid4().hex[:8]}_{orig_name}" + dest = self._media_dir / filename + + with open(dest, "wb") as f: + while True: + chunk = await field.read_chunk() + if not chunk: + break + f.write(chunk) + + return web.json_response({"url": f"/media/{filename}", "filename": filename}) + + # ---- Message Processing ---- + + async def _process_user_message(self, session_id: str, text: str) -> None: + """Build MessageEvent from user text and feed to handler.""" + msg_type = MessageType.COMMAND if text.startswith("/") else MessageType.TEXT + + source = self.build_source( + chat_id="web", + chat_name="Web Chat", + chat_type="dm", + user_id=session_id, + user_name="Web User", + ) + + event = MessageEvent( + text=text, + message_type=msg_type, + source=source, + message_id=uuid.uuid4().hex[:8], + ) + + if 
self._message_handler: + await self.handle_message(event) + + async def _process_voice_message(self, session_id: str, data: dict) -> None: + """Decode base64 voice audio, transcribe via STT, and process as message.""" + import tempfile + + audio_b64 = data.get("audio", "") + if not audio_b64: + return + + audio_bytes = base64.b64decode(audio_b64) + fmt = data.get("format", "webm") + tmp_path = os.path.join( + tempfile.gettempdir(), + f"web_voice_{uuid.uuid4().hex[:8]}.{fmt}", + ) + + with open(tmp_path, "wb") as f: + f.write(audio_bytes) + + try: + from tools.transcription_tools import transcribe_audio + result = await asyncio.to_thread(transcribe_audio, tmp_path) + + if not result.get("success"): + await self._send_to_session(session_id, { + "type": "error", + "error": f"Transcription failed: {result.get('error', 'Unknown')}", + }) + return + + transcript = result.get("transcript", "").strip() + if not transcript: + return + + # Show transcript to user + await self._send_to_session(session_id, { + "type": "transcript", + "text": transcript, + }) + + # Process as voice message + source = self.build_source( + chat_id="web", + chat_name="Web Chat", + chat_type="dm", + user_id=session_id, + user_name="Web User", + ) + event = MessageEvent( + text=transcript, + message_type=MessageType.VOICE, + source=source, + message_id=uuid.uuid4().hex[:8], + media_urls=[tmp_path], + media_types=[f"audio/{fmt}"], + ) + if self._message_handler: + await self.handle_message(event) + except Exception as e: + logger.warning("Voice processing failed: %s", e, exc_info=True) + finally: + try: + os.unlink(tmp_path) + except OSError: + pass + + # ---- Internal Utilities ---- + + async def _broadcast(self, payload: dict) -> None: + """Send JSON payload to all connected WebSocket clients.""" + data = json.dumps(payload) + dead: List[str] = [] + for sid, ws in self._clients.items(): + try: + await ws.send_str(data) + except Exception: + dead.append(sid) + for sid in dead: + 
self._clients.pop(sid, None) + + async def _send_to_session(self, session_id: str, payload: dict) -> None: + """Send a message to a specific client session.""" + ws = self._clients.get(session_id) + if ws: + try: + await ws.send_str(json.dumps(payload)) + except Exception: + self._clients.pop(session_id, None) + + async def _media_cleanup_loop(self) -> None: + """Periodically delete old media files (older than 24h).""" + try: + while self._running: + await asyncio.sleep(3600) + cutoff = time.time() - 86400 + removed = 0 + for f in self._media_dir.iterdir(): + if f.is_file() and f.stat().st_mtime < cutoff: + try: + f.unlink() + removed += 1 + except OSError: + pass + if removed: + logger.debug("Web media cleanup: removed %d old file(s)", removed) + except asyncio.CancelledError: + pass + + @staticmethod + def _get_local_ip() -> str: + """Get the machine's LAN IP address.""" + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + ip = s.getsockname()[0] + s.close() + return ip + except Exception: + return "127.0.0.1" + + +# --------------------------------------------------------------------------- +# Chat UI HTML +# --------------------------------------------------------------------------- + +def _build_chat_html() -> str: + """Build the complete single-page chat UI as an HTML string.""" + return ''' + + + + +Hermes + + + + + + + + +
+

Hermes

+

Enter access token to connect

+ + +
Invalid token. Try again.
+
+ + +
+
+
Hermes
+ Connected +
+
+
+
+
+ + + +
+
+ + + +''' diff --git a/gateway/run.py b/gateway/run.py index bee9b62a1c..73bde75d42 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -829,6 +829,13 @@ class GatewayRunner: return None return EmailAdapter(config) + elif platform == Platform.WEB: + from gateway.platforms.web import WebAdapter, check_web_requirements + if not check_web_requirements(): + logger.warning("Web: aiohttp not installed. Run: pip install aiohttp") + return None + return WebAdapter(config) + return None def _is_user_authorized(self, source: SessionSource) -> bool: @@ -848,6 +855,11 @@ class GatewayRunner: if source.platform == Platform.HOMEASSISTANT: return True + # Web UI users are authenticated via token at the WebSocket level. + # No additional allowlist check needed. + if source.platform == Platform.WEB: + return True + user_id = source.user_id if not user_id: return False From 4e3b14dc692b148ef30c533c3aaea9346437cc83 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 18:54:06 +0300 Subject: [PATCH 46/93] docs: add Web UI config to .env.example --- .env.example | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.env.example b/.env.example index 3d3ad1de96..7423bf5429 100644 --- a/.env.example +++ b/.env.example @@ -213,6 +213,13 @@ VOICE_TOOLS_OPENAI_KEY= # EMAIL_ALLOWED_USERS=your@email.com # EMAIL_HOME_ADDRESS=your@email.com +# Web UI (browser-based chat interface on local network) +# Access from phone/tablet/desktop at http://:8765 +# WEB_UI_ENABLED=false +# WEB_UI_PORT=8765 +# WEB_UI_HOST=0.0.0.0 +# WEB_UI_TOKEN= # Auto-generated if empty + # Gateway-wide: allow ALL users without an allowlist (default: false = deny) # Only set to true if you intentionally want open access. 
# GATEWAY_ALLOW_ALL_USERS=false From ddfbc22b7c99ea2f0a17a6d5954e7d642b4105fe Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 19:01:54 +0300 Subject: [PATCH 47/93] feat: add /remote-control command to start web UI on demand Type /remote-control from any platform (Telegram, Discord, etc.) to instantly start the web UI without restarting the gateway. - Auto-generates access token if not provided - Shows URL + token in response - Optional: /remote-control [port] [token] - Reports status if already running - Added to /help command list --- gateway/run.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 73bde75d42..16a0db8502 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -978,7 +978,7 @@ class GatewayRunner: "personality", "retry", "undo", "sethome", "set-home", "compress", "usage", "insights", "reload-mcp", "reload_mcp", "update", "title", "resume", "provider", "rollback", - "background", "reasoning", "voice"} + "background", "reasoning", "voice", "remote-control", "remote_control"} if command and command in _known_commands: await self.hooks.emit(f"command:{command}", { "platform": source.platform.value if source.platform else "", @@ -1053,6 +1053,9 @@ class GatewayRunner: if command == "voice": return await self._handle_voice_command(event) + if command in ("remote-control", "remote_control"): + return await self._handle_remote_control_command(event) + # User-defined quick commands (bypass agent loop, no LLM call) if command: @@ -1745,6 +1748,7 @@ class GatewayRunner: "`/rollback [number]` — List or restore filesystem checkpoints", "`/background ` — Run a prompt in a separate background session", "`/voice [on|off|tts|status]` — Toggle voice reply mode", + "`/remote-control [port] [token]` — Start web UI for remote access", "`/reload-mcp` — Reload MCP servers from config", "`/update` — Update Hermes 
Agent to the latest version", "`/help` — Show this message", @@ -2401,6 +2405,58 @@ class GatewayRunner: ) return f"❌ {result['error']}" + async def _handle_remote_control_command(self, event: MessageEvent) -> str: + """Handle /remote-control — start or show the web UI for remote access.""" + from gateway.config import Platform, PlatformConfig + + # Already running? + if Platform.WEB in self.adapters: + adapter = self.adapters[Platform.WEB] + local_ip = adapter._get_local_ip() + return ( + f"Web UI already running.\n" + f"URL: http://{local_ip}:{adapter._port}\n" + f"Token: {adapter._token}" + ) + + # Start web adapter on the fly + try: + from gateway.platforms.web import WebAdapter, check_web_requirements + if not check_web_requirements(): + return "Web UI requires aiohttp. Run: pip install aiohttp" + + args = event.get_command_args().strip() + port = 8765 + token = "" + for part in args.split(): + if part.isdigit(): + port = int(part) + elif part and not part.startswith("-"): + token = part + + web_config = PlatformConfig( + enabled=True, + extra={"port": port, "host": "0.0.0.0", "token": token}, + ) + adapter = WebAdapter(web_config) + adapter.set_message_handler(self._handle_message) + + success = await adapter.connect() + if not success: + return f"Failed to start Web UI on port {port}. Port may be in use." + + self.adapters[Platform.WEB] = adapter + local_ip = adapter._get_local_ip() + return ( + f"Web UI started!\n" + f"URL: http://{local_ip}:{adapter._port}\n" + f"Token: {adapter._token}\n" + f"Open this URL on your phone or any device on the same network." + ) + except Exception as e: + logger.error("Failed to start web UI: %s", e, exc_info=True) + return f"Failed to start Web UI: {e}" + async def _handle_background_command(self, event: MessageEvent) -> str: """Handle /background — run a prompt in a separate background session. 
From 536be3e0f6ce9dc740a23c26a8393f144adc7490 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 19:15:12 +0300 Subject: [PATCH 48/93] fix: show correct LAN IP when VPN is active Detect all network interfaces instead of relying on UDP trick which returns VPN IP. Prefers 192.168.x.x/10.x.x.x over VPN ranges. Shows all available IPs in console output. --- gateway/platforms/web.py | 60 ++++++++++++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/gateway/platforms/web.py b/gateway/platforms/web.py index 9fe7e636c9..e8a94892c0 100644 --- a/gateway/platforms/web.py +++ b/gateway/platforms/web.py @@ -103,8 +103,12 @@ class WebAdapter(BasePlatformAdapter): self._running = True self._cleanup_task = asyncio.ensure_future(self._media_cleanup_loop()) - local_ip = self._get_local_ip() - print(f"[{self.name}] Web UI: http://{local_ip}:{self._port}") + all_ips = self._get_local_ips() + primary_ip = self._get_local_ip() + print(f"[{self.name}] Web UI: http://{primary_ip}:{self._port}") + for ip in all_ips: + if ip != primary_ip: + print(f"[{self.name}] also: http://{ip}:{self._port}") print(f"[{self.name}] Access token: {self._token}") return True @@ -484,16 +488,50 @@ class WebAdapter(BasePlatformAdapter): pass @staticmethod - def _get_local_ip() -> str: - """Get the machine's LAN IP address.""" + def _get_local_ips() -> List[str]: + """Get all non-loopback IPv4 addresses on this machine.""" + ips = [] try: - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) - ip = s.getsockname()[0] - s.close() - return ip - except Exception: - return "127.0.0.1" + import netifaces + for iface in netifaces.interfaces(): + addrs = netifaces.ifaddresses(iface).get(netifaces.AF_INET, []) + for addr in addrs: + ip = addr.get("addr", "") + if ip and not ip.startswith("127."): + ips.append(ip) + except ImportError: + # Fallback: parse ifconfig output + import subprocess + 
try: + out = subprocess.check_output(["ifconfig"], text=True, timeout=5) + for line in out.splitlines(): + line = line.strip() + if line.startswith("inet ") and "127.0.0.1" not in line: + parts = line.split() + if len(parts) >= 2: + ips.append(parts[1]) + except Exception: + pass + if not ips: + # Last resort: UDP trick (may return VPN IP) + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + ips.append(s.getsockname()[0]) + s.close() + except Exception: + ips.append("127.0.0.1") + return ips + + @staticmethod + def _get_local_ip() -> str: + """Get the most likely LAN IP address.""" + ips = WebAdapter._get_local_ips() + # Prefer 192.168.x.x or 10.x.x.x over VPN ranges like 172.16.x.x + for ip in ips: + if ip.startswith("192.168.") or ip.startswith("10."): + return ip + return ips[0] if ips else "127.0.0.1" # --------------------------------------------------------------------------- From db51cfa60ed028711a748a64df7733170a6726e7 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 19:24:40 +0300 Subject: [PATCH 49/93] docs: add Web UI setup guide and update gateway docs - New web.md with full setup, features, security, and troubleshooting - Update index.md: architecture diagram, platform table, commands, links --- website/docs/user-guide/messaging/index.md | 42 ++-- website/docs/user-guide/messaging/web.md | 214 +++++++++++++++++++++ 2 files changed, 236 insertions(+), 20 deletions(-) create mode 100644 website/docs/user-guide/messaging/web.md diff --git a/website/docs/user-guide/messaging/index.md b/website/docs/user-guide/messaging/index.md index 2aa2605e6f..b018cb9d92 100644 --- a/website/docs/user-guide/messaging/index.md +++ b/website/docs/user-guide/messaging/index.md @@ -1,12 +1,12 @@ --- sidebar_position: 1 title: "Messaging Gateway" -description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, or Home Assistant — architecture and setup 
overview" +description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, Home Assistant, or your browser — architecture and setup overview" --- # Messaging Gateway -Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, or Home Assistant. The gateway is a single background process that connects to all your configured platforms, handles sessions, runs cron jobs, and delivers voice messages. +Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, Home Assistant, or your browser. The gateway is a single background process that connects to all your configured platforms, handles sessions, runs cron jobs, and delivers voice messages. ## Architecture @@ -15,24 +15,24 @@ Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, or Home │ Hermes Gateway │ ├───────────────────────────────────────────────────────────────────────────────┤ │ │ -│ ┌──────────┐ ┌─────────┐ ┌──────────┐ ┌───────┐ ┌───────┐ ┌───────┐ ┌────┐│ -│ │ Telegram │ │ Discord │ │ WhatsApp │ │ Slack │ │Signal │ │ Email │ │ HA ││ -│ │ Adapter │ │ Adapter │ │ Adapter │ │Adapter│ │Adapter│ │Adapter│ │Adpt││ -│ └────┬─────┘ └────┬────┘ └────┬─────┘ └──┬────┘ └──┬────┘ └──┬────┘ └─┬──┘│ -│ │ │ │ │ │ │ │ │ -│ └─────────────┴───────────┴───────────┴─────────┴─────────┴────────┘ │ -│ │ │ -│ ┌────────▼────────┐ │ -│ │ Session Store │ │ -│ │ (per-chat) │ │ -│ └────────┬────────┘ │ -│ │ │ -│ ┌────────▼────────┐ │ -│ │ AIAgent │ │ -│ │ (run_agent) │ │ -│ └─────────────────┘ │ -│ │ -└───────────────────────────────────────────────────────────────────────────────┘ +│ ┌──────────┐ ┌─────────┐ ┌──────────┐ ┌───────┐ ┌───────┐ ┌───────┐ ┌────┐ ┌─────┐│ +│ │ Telegram │ │ Discord │ │ WhatsApp │ │ Slack │ │Signal │ │ Email │ │ HA │ │ Web ││ +│ │ Adapter │ │ Adapter │ │ Adapter │ │Adapter│ │Adapter│ │Adapter│ │Adpt│ │Adpt ││ +│ └────┬─────┘ └────┬────┘ └────┬─────┘ └──┬────┘ └──┬────┘ └──┬────┘ └─┬──┘ └──┬──┘│ +│ │ │ │ │ │ │ │ │ │ +│ 
└─────────────┴───────────┴───────────┴─────────┴─────────┴────────┴───────┘ │ +│ │ │ +│ ┌────────▼────────┐ │ +│ │ Session Store │ │ +│ │ (per-chat) │ │ +│ └────────┬────────┘ │ +│ │ │ +│ ┌────────▼────────┐ │ +│ │ AIAgent │ │ +│ │ (run_agent) │ │ +│ └─────────────────┘ │ +│ │ +└───────────────────────────────────────────────────────────────────────────────────────┘ ``` Each platform adapter receives messages, routes them through a per-chat session store, and dispatches them to the AIAgent for processing. The gateway also runs the cron scheduler, ticking every 60 seconds to execute any due jobs. @@ -81,6 +81,7 @@ hermes gateway status # Check service status | `/background ` | Run a prompt in a separate background session | | `/reload-mcp` | Reload MCP servers from config | | `/update` | Update Hermes Agent to the latest version | +| `/remote-control [port] [token]` | Start web UI for remote access | | `/help` | Show available commands | | `/` | Invoke any installed skill | @@ -220,3 +221,4 @@ Each platform has its own toolset: - [Signal Setup](signal.md) - [Email Setup](email.md) - [Home Assistant Integration](homeassistant.md) +- [Web UI Setup](web.md) diff --git a/website/docs/user-guide/messaging/web.md b/website/docs/user-guide/messaging/web.md new file mode 100644 index 0000000000..1bd76dfb27 --- /dev/null +++ b/website/docs/user-guide/messaging/web.md @@ -0,0 +1,214 @@ +--- +sidebar_position: 8 +title: "Web UI" +description: "Access Hermes from any browser on your network — phone, tablet, or desktop" +--- + +# Web UI Setup + +Access Hermes from any browser on your local network. Open the URL on your phone, tablet, or another computer — no app install, no third-party account needed. + +:::info No External Dependencies +The Web adapter uses `aiohttp`, which is already included in the `[messaging]` extra. No additional packages or external services are required. 
+::: + +## Overview + +| Component | Value | +|-----------|-------| +| **Library** | `aiohttp` (HTTP + WebSocket) | +| **Connection** | Local network (LAN) | +| **Auth** | Token-based (auto-generated or custom) | +| **Features** | Markdown, code highlighting, voice messages, images, mobile responsive | + +--- + +## Quick Start + +### Option 1: On-Demand via Command + +Start the gateway normally, then type from any connected platform (Telegram, Discord, etc.): + +``` +/remote-control +``` + +The bot replies with the URL and access token. Open the URL on your phone. + +You can also specify a custom port and token: + +``` +/remote-control 9000 mysecrettoken +``` + +### Option 2: Auto-Start with Gateway + +Add to `~/.hermes/.env`: + +```bash +WEB_UI_ENABLED=true +WEB_UI_PORT=8765 # default: 8765 +WEB_UI_TOKEN=mytoken # auto-generated if empty +``` + +Start the gateway: + +```bash +hermes gateway +``` + +The web UI starts automatically alongside your other platforms. + +--- + +## Step 1: Configure + +Add to `~/.hermes/.env`: + +```bash +# Enable Web UI +WEB_UI_ENABLED=true + +# Port to listen on (default: 8765) +WEB_UI_PORT=8765 + +# Bind address (default: 0.0.0.0 = all interfaces, for LAN access) +# Set to 127.0.0.1 for localhost-only access +WEB_UI_HOST=0.0.0.0 + +# Access token (leave empty to auto-generate on each startup) +WEB_UI_TOKEN=your-secret-token +``` + +## Step 2: Start the Gateway + +```bash +hermes gateway +``` + +You'll see output like: + +``` +[Web] Web UI: http://192.168.1.106:8765 +[Web] Access token: your-secret-token +``` + +## Step 3: Open in Browser + +1. Open the URL shown in the console on any device on the same network +2. Enter the access token +3. Start chatting + +--- + +## Features + +### Markdown & Code Highlighting + +Bot responses render full GitHub-flavored Markdown with syntax-highlighted code blocks powered by highlight.js. + +### Voice Messages + +Click the microphone button to record a voice message. 
The audio is transcribed via Whisper STT and sent to the agent. If voice mode is enabled (`/voice tts`), the bot replies with audio playback in the browser. + +### Images & Files + +- Images display inline in the chat +- Documents show as download links +- Generated images from the agent appear automatically + +### Mobile Responsive + +The UI adapts to phone screens — full chat experience with touch-friendly input and buttons. + +### Typing Indicator + +Shows an animated indicator while the agent is processing your message. + +### Auto-Reconnect + +If the connection drops (server restart, network change), the client automatically reconnects with exponential backoff. + +--- + +## Firewall & Network + +### macOS Firewall + +macOS may block incoming connections by default. If devices on your network can't connect: + +1. **System Settings** > **Network** > **Firewall** +2. Either disable the firewall temporarily, or add Python to the allowed apps + +### Localhost Only + +To restrict access to the local machine only: + +```bash +WEB_UI_HOST=127.0.0.1 +``` + +### Remote Access (Outside LAN) + +The Web UI is designed for local network access. For access from outside your network, use a tunnel: + +```bash +# Using ngrok +ngrok http 8765 + +# Using Cloudflare Tunnel +cloudflared tunnel --url http://localhost:8765 + +# Using Tailscale (recommended — encrypted, no port forwarding) +# Install Tailscale on both devices, then access via Tailscale IP +``` + +--- + +## Security + +- **Token authentication** — every WebSocket connection must authenticate with the correct token before sending messages +- **No data leaves your network** — the server runs locally, chat data stays on your machine +- **No HTTPS by default** — traffic is unencrypted on the LAN. 
Use a reverse proxy or tunnel for encryption +- **File uploads** require the auth token in the `Authorization` header +- **Media cleanup** — uploaded and generated files are automatically deleted after 24 hours + +--- + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `WEB_UI_ENABLED` | `false` | Enable the web gateway | +| `WEB_UI_PORT` | `8765` | HTTP server port | +| `WEB_UI_HOST` | `0.0.0.0` | Bind address (`0.0.0.0` = LAN, `127.0.0.1` = localhost) | +| `WEB_UI_TOKEN` | (auto) | Access token. Auto-generated if empty. | + +--- + +## Troubleshooting + +### "Server not found" on phone + +- Verify both devices are on the **same WiFi network** +- Check if macOS firewall is blocking (see Firewall section above) +- Try the IP address shown in console output, not `localhost` +- If using VPN, the console shows all available IPs — try each one + +### Port already in use + +Change the port in `.env`: + +```bash +WEB_UI_PORT=9000 +``` + +### Voice recording not working + +- Browser must support `MediaRecorder` API (Chrome, Firefox, Safari 14.5+) +- HTTPS is required for microphone access on non-localhost origins +- On localhost (`127.0.0.1`), HTTP works fine for microphone + +### CDN resources not loading + +The UI loads `marked.js` and `highlight.js` from CDN. If you're offline or behind a restrictive proxy, markdown rendering and code highlighting won't work but basic chat still functions. 
From d3e09df01aaafda9bfa106c4e6c0b88f2be5b26f Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 20:16:57 +0300 Subject: [PATCH 50/93] feat: add voice conversation support and futuristic UI redesign - Auto-TTS: voice messages get spoken response (audio first, then text) - STT: Groq Whisper fallback when VOICE_TOOLS_OPENAI_KEY not set - Futuristic UI: glassmorphism, centered container, purple theme, glow effects - Voice bubble: custom waveform player with seek and progress - Invisible TTS playback via play_tts() method (no audio file in chat) - Add hermes-web toolset with full tool access - Register Platform.WEB in toolset/config maps - Update docs for voice conversation feature --- gateway/platforms/base.py | 48 ++- gateway/platforms/web.py | 377 +++++++++++++++++++---- gateway/run.py | 8 +- toolsets.py | 8 +- website/docs/user-guide/messaging/web.md | 6 +- 5 files changed, 369 insertions(+), 78 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 718f997959..71e97285eb 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -537,6 +537,20 @@ class BasePlatformAdapter(ABC): text = f"{caption}\n{text}" return await self.send(chat_id=chat_id, content=text, reply_to=reply_to) + async def play_tts( + self, + chat_id: str, + audio_path: str, + **kwargs, + ) -> SendResult: + """ + Play auto-TTS audio for voice replies. + + Override in subclasses for invisible playback (e.g. Web UI). + Default falls back to send_voice (shows audio player). 
+ """ + return await self.send_voice(chat_id=chat_id, audio_path=audio_path, **kwargs) + async def send_video( self, chat_id: str, @@ -718,7 +732,31 @@ class BasePlatformAdapter(ABC): if images: logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response)) - # Send the text portion first (if any remains after extractions) + # Auto-TTS: if voice message, generate audio FIRST (before sending text) + _tts_path = None + if event.message_type == MessageType.VOICE and text_content and not media_files: + try: + from tools.tts_tool import text_to_speech_tool, check_tts_requirements + if check_tts_requirements(): + import json as _json + speech_text = re.sub(r'[*_`#\[\]()]', '', text_content)[:4000] + tts_result_str = await asyncio.to_thread( + text_to_speech_tool, text=speech_text + ) + tts_data = _json.loads(tts_result_str) + _tts_path = tts_data.get("file_path") + except Exception as tts_err: + logger.warning("[%s] Auto-TTS failed: %s", self.name, tts_err) + + # Play TTS audio before text (voice-first experience) + if _tts_path and Path(_tts_path).exists(): + await self.play_tts( + chat_id=event.source.chat_id, + audio_path=_tts_path, + metadata=_thread_metadata, + ) + + # Send the text portion if text_content: logger.info("[%s] Sending response (%d chars) to %s", self.name, len(text_content), event.source.chat_id) result = await self.send( @@ -727,7 +765,7 @@ class BasePlatformAdapter(ABC): reply_to=event.message_id, metadata=_thread_metadata, ) - + # Log send failures (don't raise - user already saw tool progress) if not result.success: print(f"[{self.name}] Failed to send response: {result.error}") @@ -740,10 +778,10 @@ class BasePlatformAdapter(ABC): ) if not fallback_result.success: print(f"[{self.name}] Fallback send also failed: {fallback_result.error}") - + # Human-like pacing delay between text and media human_delay = self._get_human_delay() - + # Send extracted images as native attachments if images: 
logger.info("[%s] Extracted %d image(s) to send as attachments", self.name, len(images)) @@ -771,7 +809,7 @@ class BasePlatformAdapter(ABC): logger.error("[%s] Failed to send image: %s", self.name, img_result.error) except Exception as img_err: logger.error("[%s] Error sending image: %s", self.name, img_err, exc_info=True) - + # Send extracted media files — route by file type _AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'} _VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'} diff --git a/gateway/platforms/web.py b/gateway/platforms/web.py index e8a94892c0..741ec62d9c 100644 --- a/gateway/platforms/web.py +++ b/gateway/platforms/web.py @@ -218,6 +218,27 @@ class WebAdapter(BasePlatformAdapter): await self._broadcast(payload) return SendResult(success=True, message_id=msg_id) + async def play_tts( + self, + chat_id: str, + audio_path: str, + **kwargs, + ) -> SendResult: + """Play TTS audio invisibly — no bubble in chat, just audio playback.""" + filename = f"tts_{uuid.uuid4().hex[:8]}{Path(audio_path).suffix}" + dest = self._media_dir / filename + try: + shutil.copy2(audio_path, dest) + except Exception as e: + return SendResult(success=False, error=f"Failed to copy audio: {e}") + + payload = { + "type": "play_audio", + "url": f"/media/{filename}", + } + await self._broadcast(payload) + return SendResult(success=True) + async def send_image_file( self, chat_id: str, @@ -551,27 +572,36 @@ def _build_chat_html() -> str: - - - - -
-

Hermes

-

Enter access token to connect

- - -
Invalid token. Try again.
-
- - -
-
-
Hermes
- Connected -
-
-
-
-
- - - -
-
- - - -''' diff --git a/gateway/run.py b/gateway/run.py index 173b2551e9..75449d6295 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -829,13 +829,6 @@ class GatewayRunner: return None return EmailAdapter(config) - elif platform == Platform.WEB: - from gateway.platforms.web import WebAdapter, check_web_requirements - if not check_web_requirements(): - logger.warning("Web: aiohttp not installed. Run: pip install aiohttp") - return None - return WebAdapter(config) - return None def _is_user_authorized(self, source: SessionSource) -> bool: @@ -855,11 +848,6 @@ class GatewayRunner: if source.platform == Platform.HOMEASSISTANT: return True - # Web UI users are authenticated via token at the WebSocket level. - # No additional allowlist check needed. - if source.platform == Platform.WEB: - return True - user_id = source.user_id if not user_id: return False @@ -978,7 +966,7 @@ class GatewayRunner: "personality", "retry", "undo", "sethome", "set-home", "compress", "usage", "insights", "reload-mcp", "reload_mcp", "update", "title", "resume", "provider", "rollback", - "background", "reasoning", "voice", "remote-control", "remote_control"} + "background", "reasoning", "voice"} if command and command in _known_commands: await self.hooks.emit(f"command:{command}", { "platform": source.platform.value if source.platform else "", @@ -1053,10 +1041,6 @@ class GatewayRunner: if command == "voice": return await self._handle_voice_command(event) - if command in ("remote-control", "remote_control"): - return await self._handle_remote_control_command(event) - - # User-defined quick commands (bypass agent loop, no LLM call) if command: quick_commands = self.config.get("quick_commands", {}) @@ -1741,7 +1725,6 @@ class GatewayRunner: "`/rollback [number]` — List or restore filesystem checkpoints", "`/background ` — Run a prompt in a separate background session", "`/voice [on|off|tts|status]` — Toggle voice reply mode", - "`/remote-control [port] [token]` — Start web UI for remote 
access", "`/reload-mcp` — Reload MCP servers from config", "`/update` — Update Hermes Agent to the latest version", "`/help` — Show this message", @@ -2415,10 +2398,6 @@ class GatewayRunner: } if event.source.thread_id: send_kwargs["metadata"] = {"thread_id": event.source.thread_id} - import inspect - sig = inspect.signature(adapter.send_voice) - if "metadata" not in sig.parameters: - send_kwargs.pop("metadata", None) await adapter.send_voice(**send_kwargs) except Exception as e: logger.warning("Auto voice reply failed: %s", e, exc_info=True) @@ -2488,62 +2467,6 @@ class GatewayRunner: ) return f"❌ {result['error']}" - async def _handle_remote_control_command(self, event: MessageEvent) -> str: - """Handle /remote-control — start or show the web UI for remote access.""" - from gateway.config import Platform, PlatformConfig - - is_dm = event.source and event.source.chat_type == "dm" - - # Already running? - if Platform.WEB in self.adapters: - adapter = self.adapters[Platform.WEB] - local_ip = adapter._get_local_ip() - token_display = adapter._token if is_dm else "(hidden — use in DM to see token)" - return ( - f"Web UI already running.\n" - f"URL: http://{local_ip}:{adapter._port}\n" - f"Token: {token_display}" - ) - - # Start web adapter on the fly - try: - from gateway.platforms.web import WebAdapter, check_web_requirements - if not check_web_requirements(): - return "Web UI requires aiohttp. Run: pip install aiohttp" - - args = event.get_command_args().strip() - port = 8765 - token = "" - for part in args.split(): - if part.isdigit(): - port = int(part) - elif part and not part.startswith("-"): - token = part - - web_config = PlatformConfig( - enabled=True, - extra={"port": port, "host": "127.0.0.1", "token": token}, - ) - adapter = WebAdapter(web_config) - adapter.set_message_handler(self._handle_message) - - success = await adapter.connect() - if not success: - return f"Failed to start Web UI on port {port}. Port may be in use." 
- - self.adapters[Platform.WEB] = adapter - local_ip = adapter._get_local_ip() - token_display = adapter._token if is_dm else "(hidden — use in DM to see token)" - return ( - f"Web UI started!\n" - f"URL: http://{local_ip}:{adapter._port}\n" - f"Token: {token_display}\n" - f"Open this URL on your phone or any device on the same network." - ) - except Exception as e: - logger.error("Failed to start web UI: %s", e, exc_info=True) - return f"Failed to start Web UI: {e}" - async def _handle_background_command(self, event: MessageEvent) -> str: """Handle /background — run a prompt in a separate background session. @@ -2607,7 +2530,6 @@ class GatewayRunner: Platform.SIGNAL: "hermes-signal", Platform.HOMEASSISTANT: "hermes-homeassistant", Platform.EMAIL: "hermes-email", - Platform.WEB: "hermes-web", } platform_toolsets_config = {} try: @@ -2629,7 +2551,6 @@ class GatewayRunner: Platform.SIGNAL: "signal", Platform.HOMEASSISTANT: "homeassistant", Platform.EMAIL: "email", - Platform.WEB: "web", }.get(source.platform, "telegram") config_toolsets = platform_toolsets_config.get(platform_config_key) @@ -3517,7 +3438,6 @@ class GatewayRunner: Platform.SIGNAL: "hermes-signal", Platform.HOMEASSISTANT: "hermes-homeassistant", Platform.EMAIL: "hermes-email", - Platform.WEB: "hermes-web", } # Try to load platform_toolsets from config @@ -3542,7 +3462,6 @@ class GatewayRunner: Platform.SIGNAL: "signal", Platform.HOMEASSISTANT: "homeassistant", Platform.EMAIL: "email", - Platform.WEB: "web", }.get(source.platform, "telegram") # Use config override if present (list of toolsets), otherwise hardcoded default diff --git a/gateway/session.py b/gateway/session.py index 3e42db4fe3..86e42b5950 100644 --- a/gateway/session.py +++ b/gateway/session.py @@ -383,7 +383,11 @@ class SessionStore: with open(sessions_file, "r", encoding="utf-8") as f: data = json.load(f) for key, entry_data in data.items(): - self._entries[key] = SessionEntry.from_dict(entry_data) + try: + self._entries[key] = 
SessionEntry.from_dict(entry_data) + except (ValueError, KeyError): + # Skip entries with unknown/removed platform values + continue except Exception as e: print(f"[gateway] Warning: Failed to load sessions: {e}") diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py index 45595d35b1..4925f28459 100644 --- a/tests/gateway/test_voice_command.py +++ b/tests/gateway/test_voice_command.py @@ -390,33 +390,6 @@ class TestDiscordPlayTtsSkip: # Web play_tts sends play_audio (not voice bubble) # ===================================================================== -class TestWebPlayTts: - """Web adapter play_tts sends invisible play_audio, not a voice bubble.""" - - @pytest.mark.asyncio - async def test_play_tts_sends_play_audio(self, tmp_path): - from gateway.platforms.web import WebAdapter - from gateway.config import PlatformConfig - - config = PlatformConfig(enabled=True, extra={ - "port": 0, "host": "127.0.0.1", "token": "tok", - }) - adapter = WebAdapter(config) - adapter._broadcast = AsyncMock() - adapter._media_dir = tmp_path / "media" - adapter._media_dir.mkdir() - - audio_file = tmp_path / "test.ogg" - audio_file.write_bytes(b"fake audio") - - result = await adapter.play_tts(chat_id="web", audio_path=str(audio_file)) - assert result.success is True - - payload = adapter._broadcast.call_args[0][0] - assert payload["type"] == "play_audio" - assert "/media/" in payload["url"] - - # ===================================================================== # Help text + known commands # ===================================================================== diff --git a/tests/gateway/test_web.py b/tests/gateway/test_web.py deleted file mode 100644 index 141b0fa107..0000000000 --- a/tests/gateway/test_web.py +++ /dev/null @@ -1,926 +0,0 @@ -"""Tests for the Web UI gateway platform adapter. - -Covers: -1. Platform enum exists with correct value -2. Config loading from env vars via _apply_env_overrides -3. 
WebAdapter init and config parsing (port, host, token) -4. Token auto-generation when not provided -5. check_web_requirements function -6. HTTP server start/stop (connect/disconnect) -7. Auth screen served on GET / -8. Media directory creation and cleanup -9. WebSocket auth handshake (auth_ok / auth_fail) -10. WebSocket message routing (text, voice) -11. Auto-TTS play_tts sends invisible playback -12. Authorization bypass (Web platform always authorized) -13. Toolset registration (hermes-web in toolset maps) -14. LAN IP detection (_get_local_ip / _get_local_ips) -15. Security: path traversal sanitization -16. Security: media endpoint authentication -17. Security: hmac.compare_digest for token comparison -18. Security: DOMPurify XSS prevention -19. Security: default bind to 127.0.0.1 -20. Security: /remote-control token hiding in group chats -21. Network: VPN/multi-interface IP detection edge cases -22. Network: startup message token exposure -""" - -import asyncio -import json -import os -import unittest -from pathlib import Path -from unittest.mock import patch, MagicMock, AsyncMock - -import pytest - -from gateway.config import GatewayConfig, Platform, PlatformConfig, _apply_env_overrides -from gateway.platforms.base import SendResult - - -# =========================================================================== -# 1. Platform Enum -# =========================================================================== - - -class TestPlatformEnum(unittest.TestCase): - """Verify WEB is in the Platform enum.""" - - def test_web_in_platform_enum(self): - self.assertEqual(Platform.WEB.value, "web") - - def test_web_distinct_from_others(self): - platforms = [p.value for p in Platform] - self.assertIn("web", platforms) - self.assertEqual(platforms.count("web"), 1) - - -# =========================================================================== -# 2. 
Config loading from env vars -# =========================================================================== - - -class TestConfigEnvOverrides(unittest.TestCase): - """Verify web UI config is loaded from environment variables.""" - - @patch.dict(os.environ, { - "WEB_UI_ENABLED": "true", - "WEB_UI_PORT": "9000", - "WEB_UI_HOST": "127.0.0.1", - "WEB_UI_TOKEN": "mytoken", - }, clear=False) - def test_web_config_loaded_from_env(self): - config = GatewayConfig() - _apply_env_overrides(config) - self.assertIn(Platform.WEB, config.platforms) - self.assertTrue(config.platforms[Platform.WEB].enabled) - self.assertEqual(config.platforms[Platform.WEB].extra["port"], 9000) - self.assertEqual(config.platforms[Platform.WEB].extra["host"], "127.0.0.1") - self.assertEqual(config.platforms[Platform.WEB].extra["token"], "mytoken") - - @patch.dict(os.environ, { - "WEB_UI_ENABLED": "true", - "WEB_UI_TOKEN": "", - "WEB_UI_HOST": "", - }, clear=False) - def test_web_defaults(self): - config = GatewayConfig() - _apply_env_overrides(config) - self.assertIn(Platform.WEB, config.platforms) - self.assertEqual(config.platforms[Platform.WEB].extra["port"], 8765) - self.assertEqual(config.platforms[Platform.WEB].extra["host"], "127.0.0.1") - self.assertEqual(config.platforms[Platform.WEB].extra["token"], "") - - @patch.dict(os.environ, {}, clear=True) - def test_web_not_loaded_without_env(self): - config = GatewayConfig() - _apply_env_overrides(config) - self.assertNotIn(Platform.WEB, config.platforms) - - @patch.dict(os.environ, {"WEB_UI_ENABLED": "false"}, clear=False) - def test_web_not_loaded_when_disabled(self): - config = GatewayConfig() - _apply_env_overrides(config) - self.assertNotIn(Platform.WEB, config.platforms) - - -# =========================================================================== -# 3. 
WebAdapter init -# =========================================================================== - - -class TestWebAdapterInit: - """Test adapter initialization and config parsing.""" - - def _make_adapter(self, **extra): - from gateway.platforms.web import WebAdapter - defaults = {"port": 8765, "host": "0.0.0.0", "token": ""} - defaults.update(extra) - config = PlatformConfig(enabled=True, extra=defaults) - return WebAdapter(config) - - def test_default_port(self): - adapter = self._make_adapter() - assert adapter._port == 8765 - - def test_custom_port(self): - adapter = self._make_adapter(port=9999) - assert adapter._port == 9999 - - def test_custom_host(self): - adapter = self._make_adapter(host="127.0.0.1") - assert adapter._host == "127.0.0.1" - - def test_explicit_token(self): - adapter = self._make_adapter(token="secret123") - assert adapter._token == "secret123" - - def test_auto_generated_token(self): - adapter = self._make_adapter(token="") - assert len(adapter._token) > 0 - assert adapter._token != "" - - def test_name_property(self): - adapter = self._make_adapter() - assert adapter.name == "Web" - - -# =========================================================================== -# 4. check_web_requirements -# =========================================================================== - - -class TestCheckRequirements: - def test_aiohttp_available(self): - from gateway.platforms.web import check_web_requirements - # aiohttp is installed in the test env - assert check_web_requirements() is True - - -# =========================================================================== -# 5. 
HTTP server connect/disconnect -# =========================================================================== - - -def _get_free_port(): - """Get a free port from the OS.""" - import socket - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("127.0.0.1", 0)) - return s.getsockname()[1] - - -class TestServerLifecycle: - """Test that the aiohttp server starts and stops correctly.""" - - def _make_adapter(self): - from gateway.platforms.web import WebAdapter - port = _get_free_port() - config = PlatformConfig(enabled=True, extra={ - "port": port, "host": "127.0.0.1", "token": "test", - }) - return WebAdapter(config) - - @pytest.mark.asyncio - async def test_connect_starts_server(self): - adapter = self._make_adapter() - try: - result = await adapter.connect() - assert result is True - assert adapter._runner is not None - finally: - await adapter.disconnect() - - @pytest.mark.asyncio - async def test_disconnect_stops_server(self): - adapter = self._make_adapter() - await adapter.connect() - await adapter.disconnect() - assert adapter._runner is None or True # cleanup done - - @pytest.mark.asyncio - async def test_serves_html_on_get(self): - import aiohttp - adapter = self._make_adapter() - try: - await adapter.connect() - port = adapter._port - async with aiohttp.ClientSession() as session: - async with session.get(f"http://127.0.0.1:{port}/") as resp: - assert resp.status == 200 - text = await resp.text() - assert "Hermes" in text - assert "= 1 - - -# =========================================================================== -# 13. 
play_tts base class fallback -# =========================================================================== - - -class TestPlayTtsBaseFallback: - """Test that base class play_tts falls back to send_voice.""" - - @pytest.mark.asyncio - async def test_base_play_tts_calls_send_voice(self): - """Web adapter overrides play_tts; verify it sends play_audio not voice.""" - from gateway.platforms.web import WebAdapter - config = PlatformConfig(enabled=True, extra={ - "port": 8765, "host": "127.0.0.1", "token": "tok", - }) - adapter = WebAdapter(config) - adapter._broadcast = AsyncMock() - adapter._media_dir = Path("/tmp/test_media") - adapter._media_dir.mkdir(exist_ok=True) - - import tempfile - with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f: - f.write(b"fake") - tmp = f.name - try: - result = await adapter.play_tts(chat_id="test", audio_path=tmp) - assert result.success is True - payload = adapter._broadcast.call_args[0][0] - assert payload["type"] == "play_audio" - finally: - os.unlink(tmp) - - -# =========================================================================== -# 14. Media directory management -# =========================================================================== - - -class TestMediaDirectory: - """Test media directory is created on adapter init.""" - - def test_media_dir_created(self, tmp_path): - from gateway.platforms.web import WebAdapter - config = PlatformConfig(enabled=True, extra={ - "port": 8765, "host": "127.0.0.1", "token": "tok", - }) - adapter = WebAdapter(config) - assert adapter._media_dir.exists() or True # may use default path - - -# =========================================================================== -# 15. 
Security: Path traversal sanitization -# =========================================================================== - - -class TestPathTraversalSanitization: - """Upload filenames with traversal sequences are sanitized.""" - - def test_path_name_strips_traversal(self): - """Path.name strips directory traversal from filenames.""" - assert Path("../../../etc/passwd").name == "passwd" - assert Path("normal_file.txt").name == "normal_file.txt" - assert Path("/absolute/path/file.txt").name == "file.txt" - - @pytest.mark.asyncio - async def test_upload_produces_safe_filename(self): - import aiohttp - from gateway.platforms.web import WebAdapter - - port = _get_free_port() - config = PlatformConfig(enabled=True, extra={ - "port": port, "host": "127.0.0.1", "token": "tok", - }) - adapter = WebAdapter(config) - try: - await adapter.connect() - async with aiohttp.ClientSession() as session: - data = aiohttp.FormData() - data.add_field("file", b"test content", - filename="safe_file.txt", - content_type="application/octet-stream") - async with session.post( - f"http://127.0.0.1:{port}/upload", - data=data, - headers={"Authorization": "Bearer tok"}, - ) as resp: - assert resp.status == 200 - result = await resp.json() - assert result["filename"].startswith("upload_") - assert "safe_file.txt" in result["filename"] - # File must be inside media dir, not escaped - assert result["url"].startswith("/media/") - finally: - await adapter.disconnect() - - def test_sanitize_in_source_code(self): - """Verify source code uses Path().name for filename sanitization.""" - import inspect - from gateway.platforms.web import WebAdapter - source = inspect.getsource(WebAdapter._handle_upload) - assert "Path(" in source and ".name" in source - - -# =========================================================================== -# 16. 
Security: Media endpoint authentication -# =========================================================================== - - -class TestMediaEndpointAuth: - """Media files require a valid token query parameter.""" - - @pytest.mark.asyncio - async def test_media_without_token_returns_401(self): - import aiohttp - from gateway.platforms.web import WebAdapter - - port = _get_free_port() - config = PlatformConfig(enabled=True, extra={ - "port": port, "host": "127.0.0.1", "token": "secret", - }) - adapter = WebAdapter(config) - try: - await adapter.connect() - async with aiohttp.ClientSession() as session: - async with session.get( - f"http://127.0.0.1:{port}/media/test.txt" - ) as resp: - assert resp.status == 401 - - finally: - await adapter.disconnect() - - @pytest.mark.asyncio - async def test_media_with_wrong_token_returns_401(self): - import aiohttp - from gateway.platforms.web import WebAdapter - - port = _get_free_port() - config = PlatformConfig(enabled=True, extra={ - "port": port, "host": "127.0.0.1", "token": "secret", - }) - adapter = WebAdapter(config) - try: - await adapter.connect() - async with aiohttp.ClientSession() as session: - async with session.get( - f"http://127.0.0.1:{port}/media/test.txt?token=wrong" - ) as resp: - assert resp.status == 401 - finally: - await adapter.disconnect() - - @pytest.mark.asyncio - async def test_media_with_valid_token_serves_file(self): - import aiohttp - from gateway.platforms.web import WebAdapter - - port = _get_free_port() - config = PlatformConfig(enabled=True, extra={ - "port": port, "host": "127.0.0.1", "token": "secret", - }) - adapter = WebAdapter(config) - try: - await adapter.connect() - # Create a test file in the media directory - test_file = adapter._media_dir / "testfile.txt" - test_file.write_text("hello") - - async with aiohttp.ClientSession() as session: - async with session.get( - f"http://127.0.0.1:{port}/media/testfile.txt?token=secret" - ) as resp: - assert resp.status == 200 - body = await 
resp.text() - assert body == "hello" - finally: - await adapter.disconnect() - - @pytest.mark.asyncio - async def test_media_path_traversal_in_url_blocked(self): - import aiohttp - from gateway.platforms.web import WebAdapter - - port = _get_free_port() - config = PlatformConfig(enabled=True, extra={ - "port": port, "host": "127.0.0.1", "token": "secret", - }) - adapter = WebAdapter(config) - try: - await adapter.connect() - async with aiohttp.ClientSession() as session: - async with session.get( - f"http://127.0.0.1:{port}/media/..%2F..%2Fetc%2Fpasswd?token=secret" - ) as resp: - assert resp.status == 404 - finally: - await adapter.disconnect() - - -# =========================================================================== -# 17. Security: hmac.compare_digest for token comparison -# =========================================================================== - - -class TestHmacTokenComparison: - """Verify source code uses hmac.compare_digest, not == / !=.""" - - def test_no_equality_operator_for_token(self): - import inspect - from gateway.platforms.web import WebAdapter - source = inspect.getsource(WebAdapter) - # There should be no `== self._token` or `!= self._token` in the source - assert "== self._token" not in source, \ - "Token comparison must use hmac.compare_digest, not ==" - assert "!= self._token" not in source, \ - "Token comparison must use hmac.compare_digest, not !=" - - def test_hmac_compare_digest_used(self): - import inspect - from gateway.platforms.web import WebAdapter - source = inspect.getsource(WebAdapter) - assert "hmac.compare_digest" in source - - -# =========================================================================== -# 18. 
Security: DOMPurify XSS prevention -# =========================================================================== - - -class TestDomPurifyPresent: - """HTML template includes DOMPurify for XSS prevention.""" - - def test_dompurify_script_included(self): - from gateway.platforms.web import _build_chat_html - html = _build_chat_html() - assert "dompurify" in html.lower() - assert "DOMPurify.sanitize" in html - - def test_marked_output_sanitized(self): - from gateway.platforms.web import _build_chat_html - html = _build_chat_html() - assert "DOMPurify.sanitize(marked.parse(" in html - - -# =========================================================================== -# 19. Security: default bind to localhost -# =========================================================================== - - -class TestDefaultBindLocalhost: - """Default host should be 127.0.0.1, not 0.0.0.0.""" - - def test_adapter_default_host(self): - from gateway.platforms.web import WebAdapter - config = PlatformConfig(enabled=True, extra={}) - adapter = WebAdapter(config) - assert adapter._host == "127.0.0.1" - - @patch.dict(os.environ, {"WEB_UI_ENABLED": "true"}, clear=True) - def test_config_default_host(self): - config = GatewayConfig() - _apply_env_overrides(config) - assert config.platforms[Platform.WEB].extra["host"] == "127.0.0.1" - - -# =========================================================================== -# 20. 
Security: /remote-control token hiding in group chats -# =========================================================================== - - -class TestRemoteControlTokenHiding: - """Token should be hidden when /remote-control is used in group chats.""" - - def _make_runner(self, tmp_path): - from gateway.run import GatewayRunner - runner = object.__new__(GatewayRunner) - runner.adapters = {} - runner._voice_mode = {} - runner._VOICE_MODE_PATH = tmp_path / "voice.json" - runner._session_db = None - runner.session_store = MagicMock() - return runner - - def _make_event(self, chat_type="dm"): - from gateway.platforms.base import MessageEvent, SessionSource - source = SessionSource( - chat_id="test", - user_id="user1", - platform=Platform.WEB, - chat_type=chat_type, - ) - event = MessageEvent(text="/remote-control", source=source) - event.message_id = "msg1" - return event - - @pytest.mark.asyncio - async def test_token_visible_in_dm(self, tmp_path): - from gateway.platforms.web import WebAdapter - runner = self._make_runner(tmp_path) - # Simulate a running web adapter - config = PlatformConfig(enabled=True, extra={ - "port": 8765, "host": "127.0.0.1", "token": "mysecret", - }) - adapter = WebAdapter(config) - runner.adapters[Platform.WEB] = adapter - event = self._make_event(chat_type="dm") - result = await runner._handle_remote_control_command(event) - assert "mysecret" in result - - @pytest.mark.asyncio - async def test_token_hidden_in_group(self, tmp_path): - from gateway.platforms.web import WebAdapter - runner = self._make_runner(tmp_path) - config = PlatformConfig(enabled=True, extra={ - "port": 8765, "host": "127.0.0.1", "token": "mysecret", - }) - adapter = WebAdapter(config) - runner.adapters[Platform.WEB] = adapter - event = self._make_event(chat_type="group") - result = await runner._handle_remote_control_command(event) - assert "mysecret" not in result - assert "hidden" in result.lower() - - -# 
=========================================================================== -# 21. VPN / multi-interface IP detection edge cases -# =========================================================================== - -class TestVpnAndMultiInterfaceIp: - """IP detection must prefer LAN IPs over VPN and handle edge cases.""" - - def test_lan_preferred_over_vpn(self): - """192.168.x.x or 10.x.x.x should be chosen over 172.16.x.x VPN.""" - from gateway.platforms.web import WebAdapter - with unittest.mock.patch.object( - WebAdapter, "_get_local_ips", - return_value=["172.16.0.2", "192.168.1.106"], - ): - ip = WebAdapter._get_local_ip() - assert ip == "192.168.1.106" - - def test_ten_network_preferred_over_vpn(self): - """10.x.x.x corporate LAN should be preferred over 172.16.x.x VPN.""" - from gateway.platforms.web import WebAdapter - with unittest.mock.patch.object( - WebAdapter, "_get_local_ips", - return_value=["172.16.5.1", "10.0.0.50"], - ): - ip = WebAdapter._get_local_ip() - assert ip == "10.0.0.50" - - def test_only_vpn_ip_still_returned(self): - """If only VPN IP exists, return it rather than nothing.""" - from gateway.platforms.web import WebAdapter - with unittest.mock.patch.object( - WebAdapter, "_get_local_ips", - return_value=["172.16.0.2"], - ): - ip = WebAdapter._get_local_ip() - assert ip == "172.16.0.2" - - def test_no_interfaces_returns_localhost(self): - """If no IPs found at all, fall back to 127.0.0.1.""" - from gateway.platforms.web import WebAdapter - with unittest.mock.patch.object( - WebAdapter, "_get_local_ips", - return_value=[], - ): - ip = WebAdapter._get_local_ip() - assert ip == "127.0.0.1" - - def test_multiple_lan_ips_returns_first_match(self): - """Multiple LAN IPs: first 192.168/10.x match wins.""" - from gateway.platforms.web import WebAdapter - with unittest.mock.patch.object( - WebAdapter, "_get_local_ips", - return_value=["172.16.0.2", "192.168.1.50", "10.0.0.1"], - ): - ip = WebAdapter._get_local_ip() - assert ip == "192.168.1.50" - - 
def test_get_local_ips_excludes_loopback(self): - """_get_local_ips must not return 127.x.x.x addresses.""" - from gateway.platforms.web import WebAdapter - import inspect - source = inspect.getsource(WebAdapter._get_local_ips) - # Must filter out 127.x addresses - assert "127." in source, \ - "_get_local_ips must filter loopback addresses" - - def test_get_local_ips_netifaces_fallback(self): - """When netifaces is unavailable, ifconfig fallback must work.""" - from gateway.platforms.web import WebAdapter - import inspect - source = inspect.getsource(WebAdapter._get_local_ips) - assert "ifconfig" in source, \ - "_get_local_ips must have ifconfig fallback" - assert "ImportError" in source, \ - "_get_local_ips must catch netifaces ImportError" - - -# =========================================================================== -# 22. Startup message token exposure -# =========================================================================== - -class TestStartupTokenExposure: - """Configured tokens must not be printed in startup output.""" - - def test_auto_generated_flag_when_no_token(self): - """Token auto-generation flag must be set when no token provided.""" - from gateway.platforms.web import WebAdapter - config = PlatformConfig(enabled=True, extra={ - "port": 8765, "host": "127.0.0.1", "token": "", - }) - adapter = WebAdapter(config) - assert adapter._token_auto_generated is True - assert len(adapter._token) == 32 # secrets.token_hex(16) = 32 chars - - def test_configured_flag_when_token_set(self): - """Token auto-generation flag must be False when token is provided.""" - from gateway.platforms.web import WebAdapter - config = PlatformConfig(enabled=True, extra={ - "port": 8765, "host": "127.0.0.1", "token": "mytoken123", - }) - adapter = WebAdapter(config) - assert adapter._token_auto_generated is False - assert adapter._token == "mytoken123" - - def test_startup_log_hides_configured_token(self): - """connect() must not print the token value when set via env.""" 
- from gateway.platforms.web import WebAdapter - import inspect - source = inspect.getsource(WebAdapter.connect) - # Must check _token_auto_generated before printing - assert "_token_auto_generated" in source, \ - "connect() must check _token_auto_generated before printing token" - - def test_startup_log_shows_auto_token(self): - """connect() must print the token when auto-generated.""" - from gateway.platforms.web import WebAdapter - import inspect - source = inspect.getsource(WebAdapter.connect) - # Must have a branch that prints the actual token - assert "auto-generated" in source, \ - "connect() must indicate when token is auto-generated" diff --git a/tests/tools/test_transcription.py b/tests/tools/test_transcription.py index 433c9466f5..fe3b24a8d3 100644 --- a/tests/tools/test_transcription.py +++ b/tests/tools/test_transcription.py @@ -125,7 +125,7 @@ class TestTranscribeLocal: mock_model.transcribe.return_value = ([mock_segment], mock_info) with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \ - patch("tools.transcription_tools.WhisperModel", return_value=mock_model), \ + patch("faster_whisper.WhisperModel", return_value=mock_model), \ patch("tools.transcription_tools._local_model", None): from tools.transcription_tools import _transcribe_local result = _transcribe_local(str(audio_file), "base") @@ -164,7 +164,7 @@ class TestTranscribeOpenAI: mock_client.audio.transcriptions.create.return_value = "Hello from OpenAI" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai result = _transcribe_openai(str(audio_file), "whisper-1") diff --git a/tests/tools/test_transcription_tools.py b/tests/tools/test_transcription_tools.py index 384802b465..2f5b7cfbee 100644 --- a/tests/tools/test_transcription_tools.py +++ b/tests/tools/test_transcription_tools.py @@ -160,7 
+160,7 @@ class TestTranscribeGroq: mock_client.audio.transcriptions.create.return_value = "hello world" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq result = _transcribe_groq(sample_wav, "whisper-large-v3-turbo") @@ -175,7 +175,7 @@ class TestTranscribeGroq: mock_client.audio.transcriptions.create.return_value = " hello world \n" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq result = _transcribe_groq(sample_wav, "whisper-large-v3-turbo") @@ -188,7 +188,7 @@ class TestTranscribeGroq: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client) as mock_openai_cls: + patch("openai.OpenAI", return_value=mock_client) as mock_openai_cls: from tools.transcription_tools import _transcribe_groq, GROQ_BASE_URL _transcribe_groq(sample_wav, "whisper-large-v3-turbo") @@ -202,7 +202,7 @@ class TestTranscribeGroq: mock_client.audio.transcriptions.create.side_effect = Exception("API error") with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq result = _transcribe_groq(sample_wav, "whisper-large-v3-turbo") @@ -216,7 +216,7 @@ class TestTranscribeGroq: mock_client.audio.transcriptions.create.side_effect = PermissionError("denied") with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", 
return_value=mock_client): from tools.transcription_tools import _transcribe_groq result = _transcribe_groq(sample_wav, "whisper-large-v3-turbo") @@ -244,7 +244,7 @@ class TestTranscribeOpenAIExtended: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client) as mock_openai_cls: + patch("openai.OpenAI", return_value=mock_client) as mock_openai_cls: from tools.transcription_tools import _transcribe_openai, OPENAI_BASE_URL _transcribe_openai(sample_wav, "whisper-1") @@ -258,7 +258,7 @@ class TestTranscribeOpenAIExtended: mock_client.audio.transcriptions.create.return_value = " hello \n" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai result = _transcribe_openai(sample_wav, "whisper-1") @@ -271,7 +271,7 @@ class TestTranscribeOpenAIExtended: mock_client.audio.transcriptions.create.side_effect = PermissionError("denied") with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai result = _transcribe_openai(sample_wav, "whisper-1") @@ -300,7 +300,7 @@ class TestTranscribeLocalExtended: mock_whisper_cls = MagicMock(return_value=mock_model) with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \ - patch("tools.transcription_tools.WhisperModel", mock_whisper_cls), \ + patch("faster_whisper.WhisperModel", mock_whisper_cls), \ patch("tools.transcription_tools._local_model", None), \ patch("tools.transcription_tools._local_model_name", None): from tools.transcription_tools import _transcribe_local @@ -326,7 +326,7 @@ class TestTranscribeLocalExtended: 
mock_whisper_cls = MagicMock(return_value=mock_model) with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \ - patch("tools.transcription_tools.WhisperModel", mock_whisper_cls), \ + patch("faster_whisper.WhisperModel", mock_whisper_cls), \ patch("tools.transcription_tools._local_model", None), \ patch("tools.transcription_tools._local_model_name", None): from tools.transcription_tools import _transcribe_local @@ -342,7 +342,7 @@ class TestTranscribeLocalExtended: mock_whisper_cls = MagicMock(side_effect=RuntimeError("CUDA out of memory")) with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \ - patch("tools.transcription_tools.WhisperModel", mock_whisper_cls), \ + patch("faster_whisper.WhisperModel", mock_whisper_cls), \ patch("tools.transcription_tools._local_model", None): from tools.transcription_tools import _transcribe_local result = _transcribe_local(str(audio), "large-v3") @@ -366,7 +366,7 @@ class TestTranscribeLocalExtended: mock_model.transcribe.return_value = ([seg1, seg2], mock_info) with patch("tools.transcription_tools._HAS_FASTER_WHISPER", True), \ - patch("tools.transcription_tools.WhisperModel", return_value=mock_model), \ + patch("faster_whisper.WhisperModel", return_value=mock_model), \ patch("tools.transcription_tools._local_model", None): from tools.transcription_tools import _transcribe_local result = _transcribe_local(str(audio), "base") @@ -387,7 +387,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "hello world" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq, DEFAULT_GROQ_STT_MODEL _transcribe_groq(sample_wav, "whisper-1") @@ -401,7 +401,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "test" with 
patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq, DEFAULT_GROQ_STT_MODEL _transcribe_groq(sample_wav, "gpt-4o-transcribe") @@ -415,7 +415,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "hello world" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai, DEFAULT_STT_MODEL _transcribe_openai(sample_wav, "whisper-large-v3-turbo") @@ -429,7 +429,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai, DEFAULT_STT_MODEL _transcribe_openai(sample_wav, "distil-whisper-large-v3-en") @@ -443,7 +443,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq _transcribe_groq(sample_wav, "whisper-large-v3") @@ -457,7 +457,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai _transcribe_openai(sample_wav, "gpt-4o-mini-transcribe") @@ 
-472,7 +472,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_groq _transcribe_groq(sample_wav, "my-custom-model") @@ -486,7 +486,7 @@ class TestModelAutoCorrection: mock_client.audio.transcriptions.create.return_value = "test" with patch("tools.transcription_tools._HAS_OPENAI", True), \ - patch("tools.transcription_tools.OpenAI", return_value=mock_client): + patch("openai.OpenAI", return_value=mock_client): from tools.transcription_tools import _transcribe_openai _transcribe_openai(sample_wav, "my-custom-model") diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index 70424fee9a..cb86b881f9 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -345,6 +345,10 @@ class TestPlayAudioFile: np = pytest.importorskip("numpy") mock_sd_obj = MagicMock() + # Simulate stream completing immediately (get_stream().active = False) + mock_stream = MagicMock() + mock_stream.active = False + mock_sd_obj.get_stream.return_value = mock_stream def _fake_import(): return mock_sd_obj, np @@ -357,7 +361,7 @@ class TestPlayAudioFile: assert result is True mock_sd_obj.play.assert_called_once() - mock_sd_obj.wait.assert_called_once() + mock_sd_obj.stop.assert_called_once() def test_returns_false_when_no_player(self, monkeypatch, sample_wav): def _fail_import(): diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 09ffb6a7a6..a20ba41341 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -34,18 +34,9 @@ logger = logging.getLogger(__name__) # Optional imports — graceful degradation # --------------------------------------------------------------------------- -try: - from faster_whisper import WhisperModel - 
_HAS_FASTER_WHISPER = True -except ImportError: - _HAS_FASTER_WHISPER = False - WhisperModel = None # type: ignore[assignment,misc] - -try: - from openai import OpenAI, APIError, APIConnectionError, APITimeoutError - _HAS_OPENAI = True -except ImportError: - _HAS_OPENAI = False +import importlib.util as _ilu +_HAS_FASTER_WHISPER = _ilu.find_spec("faster_whisper") is not None +_HAS_OPENAI = _ilu.find_spec("openai") is not None # --------------------------------------------------------------------------- # Constants @@ -67,7 +58,7 @@ OPENAI_MODELS = {"whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"} GROQ_MODELS = {"whisper-large-v3", "whisper-large-v3-turbo", "distil-whisper-large-v3-en"} # Singleton for the local model — loaded once, reused across calls -_local_model: Optional["WhisperModel"] = None +_local_model: Optional[object] = None _local_model_name: Optional[str] = None # --------------------------------------------------------------------------- @@ -195,6 +186,7 @@ def _transcribe_local(file_path: str, model_name: str) -> Dict[str, Any]: return {"success": False, "transcript": "", "error": "faster-whisper not installed"} try: + from faster_whisper import WhisperModel # Lazy-load the model (downloads on first use, ~150 MB for 'base') if _local_model is None or _local_model_name != model_name: logger.info("Loading faster-whisper model '%s' (first load downloads the model)...", model_name) @@ -235,6 +227,7 @@ def _transcribe_groq(file_path: str, model_name: str) -> Dict[str, Any]: model_name = DEFAULT_GROQ_STT_MODEL try: + from openai import OpenAI, APIError, APIConnectionError, APITimeoutError client = OpenAI(api_key=api_key, base_url=GROQ_BASE_URL, timeout=30, max_retries=0) with open(file_path, "rb") as audio_file: @@ -282,6 +275,7 @@ def _transcribe_openai(file_path: str, model_name: str) -> Dict[str, Any]: model_name = DEFAULT_STT_MODEL try: + from openai import OpenAI, APIError, APIConnectionError, APITimeoutError client = 
OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL, timeout=30, max_retries=0) with open(file_path, "rb") as audio_file: diff --git a/tools/voice_mode.py b/tools/voice_mode.py index d5ae94f6a1..3afe533a5d 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -636,7 +636,13 @@ def play_audio_file(file_path: str) -> bool: sample_rate = wf.getframerate() sd.play(audio_data, samplerate=sample_rate) - sd.wait() + # sd.wait() calls Event.wait() without timeout — hangs forever if + # the audio device stalls. Poll with a ceiling and force-stop. + duration_secs = len(audio_data) / sample_rate + deadline = time.monotonic() + duration_secs + 2.0 + while sd.get_stream() and sd.get_stream().active and time.monotonic() < deadline: + time.sleep(0.01) + sd.stop() return True except (ImportError, OSError): pass # audio libs not available, fall through to system players diff --git a/toolsets.py b/toolsets.py index 1a80d42b5b..221ff2ca8c 100644 --- a/toolsets.py +++ b/toolsets.py @@ -292,16 +292,10 @@ TOOLSETS = { "includes": [] }, - "hermes-web": { - "description": "Web UI bot toolset - browser-based chat interface (full access)", - "tools": _HERMES_CORE_TOOLS, - "includes": [] - }, - "hermes-gateway": { "description": "Gateway toolset - union of all messaging platform tools", "tools": [], - "includes": ["hermes-telegram", "hermes-discord", "hermes-whatsapp", "hermes-slack", "hermes-signal", "hermes-homeassistant", "hermes-email", "hermes-web"] + "includes": ["hermes-telegram", "hermes-discord", "hermes-whatsapp", "hermes-slack", "hermes-signal", "hermes-homeassistant", "hermes-email"] } } diff --git a/website/docs/user-guide/features/voice-mode.md b/website/docs/user-guide/features/voice-mode.md index 65543273b6..ce151643a0 100644 --- a/website/docs/user-guide/features/voice-mode.md +++ b/website/docs/user-guide/features/voice-mode.md @@ -478,10 +478,6 @@ The bot requires an @mention by default in server channels. 
Make sure you: - Edge TTS (free, no key) is the default fallback - Check logs for TTS errors -### Web UI issues (firewall, mobile mic) - -See the [Web UI Troubleshooting](../messaging/web.md#troubleshooting) guide for firewall, HTTPS, and mobile microphone issues. - ### Whisper returns garbage text The hallucination filter catches most cases automatically. If you're still getting phantom transcripts: diff --git a/website/docs/user-guide/messaging/index.md b/website/docs/user-guide/messaging/index.md index b018cb9d92..debc841b8b 100644 --- a/website/docs/user-guide/messaging/index.md +++ b/website/docs/user-guide/messaging/index.md @@ -15,12 +15,12 @@ Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, Email, Home As │ Hermes Gateway │ ├───────────────────────────────────────────────────────────────────────────────┤ │ │ -│ ┌──────────┐ ┌─────────┐ ┌──────────┐ ┌───────┐ ┌───────┐ ┌───────┐ ┌────┐ ┌─────┐│ -│ │ Telegram │ │ Discord │ │ WhatsApp │ │ Slack │ │Signal │ │ Email │ │ HA │ │ Web ││ -│ │ Adapter │ │ Adapter │ │ Adapter │ │Adapter│ │Adapter│ │Adapter│ │Adpt│ │Adpt ││ -│ └────┬─────┘ └────┬────┘ └────┬─────┘ └──┬────┘ └──┬────┘ └──┬────┘ └─┬──┘ └──┬──┘│ -│ │ │ │ │ │ │ │ │ │ -│ └─────────────┴───────────┴───────────┴─────────┴─────────┴────────┴───────┘ │ +│ ┌──────────┐ ┌─────────┐ ┌──────────┐ ┌───────┐ ┌───────┐ ┌───────┐ ┌────┐ │ +│ │ Telegram │ │ Discord │ │ WhatsApp │ │ Slack │ │Signal │ │ Email │ │ HA │ │ +│ │ Adapter │ │ Adapter │ │ Adapter │ │Adapter│ │Adapter│ │Adapter│ │Adpt│ │ +│ └────┬─────┘ └────┬────┘ └────┬─────┘ └──┬────┘ └──┬────┘ └──┬────┘ └─┬──┘ │ +│ │ │ │ │ │ │ │ │ +│ └─────────────┴───────────┴───────────┴─────────┴─────────┴────────┘ │ │ │ │ │ ┌────────▼────────┐ │ │ │ Session Store │ │ @@ -81,7 +81,6 @@ hermes gateway status # Check service status | `/background ` | Run a prompt in a separate background session | | `/reload-mcp` | Reload MCP servers from config | | `/update` | Update Hermes Agent to the latest version | -| 
`/remote-control [port] [token]` | Start web UI for remote access | | `/help` | Show available commands | | `/` | Invoke any installed skill | @@ -221,4 +220,3 @@ Each platform has its own toolset: - [Signal Setup](signal.md) - [Email Setup](email.md) - [Home Assistant Integration](homeassistant.md) -- [Web UI Setup](web.md) diff --git a/website/docs/user-guide/messaging/web.md b/website/docs/user-guide/messaging/web.md deleted file mode 100644 index e16c3488fd..0000000000 --- a/website/docs/user-guide/messaging/web.md +++ /dev/null @@ -1,206 +0,0 @@ ---- -sidebar_position: 8 -title: "Web UI" -description: "Access Hermes from any browser on your network — phone, tablet, or desktop" ---- - -# Web UI Setup - -Access Hermes from any browser on your local network. Open the URL on your phone, tablet, or another computer — no app install, no third-party account needed. - -:::info No External Dependencies -The Web adapter uses `aiohttp`, which is already included in the `[messaging]` extra. No additional packages or external services are required. -::: - -## Overview - -| Component | Value | -|-----------|-------| -| **Library** | `aiohttp` (HTTP + WebSocket) | -| **Connection** | Local network (LAN) | -| **Auth** | Token-based (auto-generated or custom) | -| **Features** | Markdown, code highlighting, voice messages, images, mobile responsive | - ---- - -## Quick Start - -### Option 1: On-Demand via Command - -Start the gateway normally, then type from any connected platform (Telegram, Discord, etc.): - -``` -/remote-control -``` - -The bot replies with the URL and access token. Open the URL on your phone. 
- -You can also specify a custom port and token: - -``` -/remote-control 9000 mysecrettoken -``` - -### Option 2: Auto-Start with Gateway - -Add to `~/.hermes/.env`: - -```bash -WEB_UI_ENABLED=true -WEB_UI_PORT=8765 # default: 8765 -WEB_UI_TOKEN=mytoken # auto-generated if empty -``` - -Start the gateway: - -```bash -hermes gateway -``` - -The web UI starts automatically alongside your other platforms. - ---- - -## Features - -### Markdown & Code Highlighting - -Bot responses render full GitHub-flavored Markdown with syntax-highlighted code blocks powered by highlight.js. - -### Voice Conversation - -Click the microphone button to record a voice message. The audio is transcribed via Whisper STT (using OpenAI or Groq as fallback) and sent to the agent. The bot automatically replies with audio playback — voice first, then the text response appears. No extra configuration needed. - -STT uses `VOICE_TOOLS_OPENAI_KEY` (OpenAI Whisper) if set, otherwise falls back to `GROQ_API_KEY` (Groq Whisper, free tier). If you only need STT, setting `GROQ_API_KEY` is the simplest option. TTS uses Edge TTS (free, no key) by default, or ElevenLabs/OpenAI if configured in `~/.hermes/config.yaml`. - -### Images & Files - -- Images display inline in the chat -- Documents show as download links -- Generated images from the agent appear automatically - -### Mobile Responsive - -The UI adapts to phone screens — full chat experience with touch-friendly input and buttons. - -### Typing Indicator - -Shows an animated indicator while the agent is processing your message. - -### Auto-Reconnect - -If the connection drops (server restart, network change), the client automatically reconnects with exponential backoff. - ---- - -## Firewall & Network - -### macOS Firewall - -macOS may block incoming connections by default. If devices on your network can't connect: - -1. **System Settings** > **Network** > **Firewall** -2. 
Either disable the firewall temporarily, or add Python to the allowed apps - -### Localhost Only - -To restrict access to the local machine only: - -```bash -WEB_UI_HOST=127.0.0.1 -``` - -### Remote Access (Outside LAN) - -The Web UI is designed for local network access. For access from outside your network, use a tunnel: - -```bash -# Using ngrok -ngrok http 8765 - -# Using Cloudflare Tunnel -cloudflared tunnel --url http://localhost:8765 - -# Using Tailscale (recommended — encrypted, no port forwarding) -# Install Tailscale on both devices, then access via Tailscale IP -``` - ---- - -## Security - -- **Token authentication** — every WebSocket connection must authenticate with the correct token before sending messages -- **No data leaves your network** — the server runs locally, chat data stays on your machine -- **No HTTPS by default** — traffic is unencrypted on the LAN. Use a reverse proxy or tunnel for encryption -- **File uploads** require the auth token in the `Authorization` header -- **Media cleanup** — uploaded and generated files are automatically deleted after 24 hours - ---- - -## Environment Variables - -| Variable | Default | Description | -|----------|---------|-------------| -| `WEB_UI_ENABLED` | `false` | Enable the web gateway | -| `WEB_UI_PORT` | `8765` | HTTP server port | -| `WEB_UI_HOST` | `127.0.0.1` | Bind address (`0.0.0.0` = LAN, `127.0.0.1` = localhost) | -| `WEB_UI_TOKEN` | (auto) | Access token. Auto-generated if empty. 
| - ---- - -## Troubleshooting - -### "Server not found" on phone - -- Verify both devices are on the **same WiFi network** -- Check if macOS firewall is blocking (see Firewall section above) -- Try the IP address shown in console output, not `localhost` -- If using VPN, the console shows all available IPs — try each one - -### Port already in use - -Change the port in `.env`: - -```bash -WEB_UI_PORT=9000 -``` - -### Voice recording not working - -- Browser must support `MediaRecorder` API (Chrome, Firefox, Safari 14.5+) -- HTTPS is required for microphone access on non-localhost origins -- On localhost (`127.0.0.1`), HTTP works fine for microphone - -### Microphone not working on mobile - -Mobile browsers require **HTTPS** for microphone access (`navigator.mediaDevices` API). When accessing the Web UI over HTTP on a LAN IP (e.g. `http://192.168.1.x:8765`), the mic button will appear dimmed. - -**Android Chrome** — flag the LAN IP as secure: -1. Open `chrome://flags/#unsafely-treat-insecure-origin-as-secure` -2. Add your Web UI URL (e.g. `http://192.168.1.106:8765`) -3. Set to **Enabled** and relaunch Chrome - -**iOS Safari / Chrome** — no flag bypass available. Use one of these instead: - -1. **Self-signed HTTPS** with mkcert (recommended): - ```bash - brew install mkcert && mkcert -install - mkcert 192.168.1.106 - npx local-ssl-proxy --source 8443 --target 8765 \ - --cert 192.168.1.106.pem --key 192.168.1.106-key.pem - ``` - Then access `https://192.168.1.106:8443`. Trust the mkcert root CA on iOS: **Settings > General > About > Certificate Trust Settings**. - -2. **SSH tunnel from mobile** (if you have Termius or similar): - ```bash - ssh -L 8765:127.0.0.1:8765 user@your-mac-ip - ``` - Then access `http://localhost:8765` — localhost is exempt from the HTTPS requirement. - -:::tip -Text chat works on mobile over HTTP without any workaround — only the microphone feature requires HTTPS. 
-::: - -### CDN resources not loading - -The UI loads `marked.js` and `highlight.js` from CDN. If you're offline or behind a restrictive proxy, markdown rendering and code highlighting won't work but basic chat still functions. From cc0a4534760458495fe18d59f8995ed7870e43f6 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Sat, 14 Mar 2026 10:31:49 +0300 Subject: [PATCH 88/93] =?UTF-8?q?fix:=20address=20PR=20review=20round=205?= =?UTF-8?q?=20=E2=80=94=20streaming=20guard,=20VC=20auth,=20history=20pref?= =?UTF-8?q?ix,=20auto-TTS=20control?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Gate _streaming_api_call to chat_completions mode only — Anthropic and Codex fall back to _interruptible_api_call. Preserve Anthropic base_url across all client rebuild paths (interrupt, fallback, 401 refresh). 2. Discord VC synthetic events now use chat_type="channel" instead of defaulting to "dm" — prevents session bleed into DM context. Authorization runs before echoing transcript. Sanitize @everyone/@here in voice transcripts. 3. CLI voice prefix ("[Voice input...]") is now API-call-local only — stripped from returned history so it never persists to session DB or resumed sessions. 4. /voice off now disables base adapter auto-TTS via _auto_tts_disabled_chats set — voice input no longer triggers TTS when voice mode is off. --- cli.py | 21 ++++++++++------ gateway/platforms/base.py | 8 +++++- gateway/run.py | 38 ++++++++++++++++++++++------- run_agent.py | 12 +++++---- tests/gateway/test_voice_command.py | 2 ++ 5 files changed, 59 insertions(+), 22 deletions(-) diff --git a/cli.py b/cli.py index cc9f522aa6..507e2d6667 100755 --- a/cli.py +++ b/cli.py @@ -4213,20 +4213,20 @@ class HermesCLI: if text_queue is not None: text_queue.put(delta) - # When voice mode is active, prepend a brief instruction to the - # user message so the model responds concisely. 
This avoids - # modifying the system prompt (which would invalidate the prompt - # cache). The original message in conversation_history stays clean. - agent_message = message + # When voice mode is active, prepend a brief instruction so the + # model responds concisely. The prefix is API-call-local only — + # we strip it from the returned history so it never persists to + # session DB or resumed sessions. + _voice_prefix = "" if self._voice_mode and isinstance(message, str): - agent_message = ( + _voice_prefix = ( "[Voice input — respond concisely and conversationally, " "2-3 sentences max. No code blocks or markdown.] " - + message ) def run_agent(): nonlocal result + agent_message = _voice_prefix + message if _voice_prefix else message result = self.agent.run_conversation( user_message=agent_message, conversation_history=self.conversation_history[:-1], # Exclude the message we just added @@ -4298,6 +4298,13 @@ class HermesCLI: # Update history with full conversation self.conversation_history = result.get("messages", self.conversation_history) if result else self.conversation_history + # Strip voice prefix from history so it never persists + if _voice_prefix and self.conversation_history: + for msg in self.conversation_history: + if msg.get("role") == "user" and isinstance(msg.get("content"), str): + if msg["content"].startswith(_voice_prefix): + msg["content"] = msg["content"][len(_voice_prefix):] + # Get the final response response = result.get("final_response", "") if result else "" diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index c3abaa696a..df4166f415 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -351,6 +351,8 @@ class BasePlatformAdapter(ABC): # Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt) self._active_sessions: Dict[str, asyncio.Event] = {} self._pending_messages: Dict[str, MessageEvent] = {} + # Chats where auto-TTS on voice input is disabled (set by /voice off) + 
self._auto_tts_disabled_chats: set = set() @property def name(self) -> str: @@ -733,8 +735,12 @@ class BasePlatformAdapter(ABC): logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response)) # Auto-TTS: if voice message, generate audio FIRST (before sending text) + # Skipped when the chat has voice mode disabled (/voice off) _tts_path = None - if event.message_type == MessageType.VOICE and text_content and not media_files: + if (event.message_type == MessageType.VOICE + and text_content + and not media_files + and event.source.chat_id not in self._auto_tts_disabled_chats): try: from tools.tts_tool import text_to_speech_tool, check_tts_requirements if check_tts_requirements(): diff --git a/gateway/run.py b/gateway/run.py index 75449d6295..423a224db9 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2119,9 +2119,13 @@ class GatewayRunner: args = event.get_command_args().strip().lower() chat_id = event.source.chat_id + adapter = self.adapters.get(event.source.platform) + if args in ("on", "enable"): self._voice_mode[chat_id] = "voice_only" self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.discard(chat_id) return ( "Voice mode enabled.\n" "I'll reply with voice when you send voice messages.\n" @@ -2130,10 +2134,14 @@ class GatewayRunner: elif args in ("off", "disable"): self._voice_mode.pop(chat_id, None) self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.add(chat_id) return "Voice mode disabled. Text-only replies." elif args == "tts": self._voice_mode[chat_id] = "all" self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.discard(chat_id) return ( "Auto-TTS enabled.\n" "All replies will include a voice message." @@ -2171,10 +2179,14 @@ class GatewayRunner: if current == "off": self._voice_mode[chat_id] = "voice_only" self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.discard(chat_id) return "Voice mode enabled." 
else: self._voice_mode.pop(chat_id, None) self._save_voice_modes() + if adapter: + adapter._auto_tts_disabled_chats.add(chat_id) return "Voice mode disabled." async def _handle_voice_channel_join(self, event: MessageEvent) -> str: @@ -2211,6 +2223,7 @@ class GatewayRunner: adapter._voice_text_channels[guild_id] = int(event.source.chat_id) self._voice_mode[event.source.chat_id] = "all" self._save_voice_modes() + adapter._auto_tts_disabled_chats.discard(event.source.chat_id) return ( f"Joined voice channel **{voice_channel.name}**.\n" f"I'll speak my replies and listen to you. Use /voice leave to disconnect." @@ -2265,21 +2278,28 @@ class GatewayRunner: if not text_ch_id: return - # Show transcript in text channel - try: - channel = adapter._client.get_channel(text_ch_id) - if channel: - await channel.send(f"**[Voice]** <@{user_id}>: {transcript}") - except Exception: - pass - - # Build a synthetic MessageEvent and feed through the normal pipeline + # Check authorization before processing voice input source = SessionSource( platform=Platform.DISCORD, chat_id=str(text_ch_id), user_id=str(user_id), user_name=str(user_id), + chat_type="channel", ) + if not self._is_user_authorized(source): + logger.debug("Unauthorized voice input from user %d, ignoring", user_id) + return + + # Show transcript in text channel (after auth, with mention sanitization) + try: + channel = adapter._client.get_channel(text_ch_id) + if channel: + safe_text = transcript[:2000].replace("@everyone", "@\u200beveryone").replace("@here", "@\u200bhere") + await channel.send(f"**[Voice]** <@{user_id}>: {safe_text}") + except Exception: + pass + + # Build a synthetic MessageEvent and feed through the normal pipeline # Use SimpleNamespace as raw_message so _get_guild_id() can extract # guild_id and _send_voice_reply() plays audio in the voice channel. 
from types import SimpleNamespace diff --git a/run_agent.py b/run_agent.py index 283590fc84..66f5196a3f 100644 --- a/run_agent.py +++ b/run_agent.py @@ -508,6 +508,7 @@ class AIAgent: from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token effective_key = api_key or resolve_anthropic_token() or "" self._anthropic_api_key = effective_key + self._anthropic_base_url = base_url self._anthropic_client = build_anthropic_client(effective_key, base_url) # No OpenAI client needed for Anthropic mode self.client = None @@ -2625,7 +2626,7 @@ class AIAgent: try: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_client - self._anthropic_client = build_anthropic_client(self._anthropic_api_key) + self._anthropic_client = build_anthropic_client(self._anthropic_api_key, getattr(self, "_anthropic_base_url", None)) else: self.client = OpenAI(**self._client_kwargs) except Exception: @@ -2757,7 +2758,7 @@ class AIAgent: try: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_client - self._anthropic_client = build_anthropic_client(self._anthropic_api_key) + self._anthropic_client = build_anthropic_client(self._anthropic_api_key, getattr(self, "_anthropic_base_url", None)) else: self.client = OpenAI(**self._client_kwargs) except Exception: @@ -2823,7 +2824,8 @@ class AIAgent: from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token effective_key = fb_client.api_key or resolve_anthropic_token() or "" self._anthropic_api_key = effective_key - self._anthropic_client = build_anthropic_client(effective_key) + self._anthropic_base_url = getattr(fb_client, "base_url", None) + self._anthropic_client = build_anthropic_client(effective_key, self._anthropic_base_url) self.client = None self._client_kwargs = {} else: @@ -4436,7 +4438,7 @@ class AIAgent: self._dump_api_request_debug(api_kwargs, reason="preflight") cb = getattr(self, "_stream_callback", None) - 
if cb is not None: + if cb is not None and self.api_mode == "chat_completions": response = self._streaming_api_call(api_kwargs, cb) else: response = self._interruptible_api_call(api_kwargs) @@ -4770,7 +4772,7 @@ class AIAgent: new_token = resolve_anthropic_token() if new_token and new_token != self._anthropic_api_key: self._anthropic_api_key = new_token - self._anthropic_client = build_anthropic_client(new_token) + self._anthropic_client = build_anthropic_client(new_token, getattr(self, "_anthropic_base_url", None)) print(f"{self.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...") continue # Credential refresh didn't help — show diagnostic info diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py index 4925f28459..1914688c8a 100644 --- a/tests/gateway/test_voice_command.py +++ b/tests/gateway/test_voice_command.py @@ -38,6 +38,7 @@ def _make_runner(tmp_path): runner._VOICE_MODE_PATH = tmp_path / "gateway_voice_mode.json" runner._session_db = None runner.session_store = MagicMock() + runner._is_user_authorized = lambda source: True return runner @@ -731,6 +732,7 @@ class TestVoiceChannelCommands: assert event.text == "Hello from VC" assert event.message_type == MessageType.VOICE assert event.source.chat_id == "123" + assert event.source.chat_type == "channel" @pytest.mark.asyncio async def test_input_posts_transcript_in_text_channel(self, runner): From 7a241680800b6dfa171133bdf92d8131379542c6 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Sat, 14 Mar 2026 11:13:06 +0300 Subject: [PATCH 89/93] fix: add missing choices/Choice to discord mock in test_discord_free_response The mock's app_commands SimpleNamespace lacked choices and Choice attrs, causing xdist test ordering failures when this mock loaded before test_discord_slash_commands. 
--- tests/gateway/test_discord_free_response.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/gateway/test_discord_free_response.py b/tests/gateway/test_discord_free_response.py index ff15326dbb..3d41104c86 100644 --- a/tests/gateway/test_discord_free_response.py +++ b/tests/gateway/test_discord_free_response.py @@ -29,6 +29,8 @@ def _ensure_discord_mock(): discord_mod.Embed = MagicMock discord_mod.app_commands = SimpleNamespace( describe=lambda **kwargs: (lambda fn: fn), + choices=lambda **kwargs: (lambda fn: fn), + Choice=lambda **kwargs: SimpleNamespace(**kwargs), ) ext_mod = MagicMock() From eb34c0b09a471d2193bb2e2ac74bbe10396954c1 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Sat, 14 Mar 2026 13:06:49 +0300 Subject: [PATCH 90/93] =?UTF-8?q?fix:=20voice=20pipeline=20hardening=20?= =?UTF-8?q?=E2=80=94=207=20bug=20fixes=20with=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Anthropic + ElevenLabs TTS silence: forward full response to TTS callback for non-streaming providers (choices first, then native content blocks fallback). 2. Subprocess timeout kill: play_audio_file now kills the process on TimeoutExpired instead of leaving zombie processes. 3. Discord disconnect cleanup: leave all voice channels before closing the client to prevent leaked state. 4. Audio stream leak: close InputStream if stream.start() fails. 5. Race condition: read/write _on_silence_stop under lock in audio callback thread. 6. _vprint force=True: show API error, retry, and truncation messages even during streaming TTS. 7. _refresh_level lock: read _voice_recording under _voice_lock. 
--- cli.py | 6 +- gateway/platforms/discord.py | 9 +- run_agent.py | 34 +++++-- tests/gateway/test_voice_command.py | 35 +++++++ tests/test_run_agent.py | 119 ++++++++++++++++++++++ tests/tools/test_voice_cli_integration.py | 37 +++++++ tests/tools/test_voice_mode.py | 70 +++++++++++++ tools/voice_mode.py | 17 +++- 8 files changed, 317 insertions(+), 10 deletions(-) diff --git a/cli.py b/cli.py index 507e2d6667..2e7ffd51af 100755 --- a/cli.py +++ b/cli.py @@ -3611,7 +3611,11 @@ class HermesCLI: # Periodically refresh prompt to update audio level indicator def _refresh_level(): - while self._voice_recording: + while True: + with self._voice_lock: + still_recording = self._voice_recording + if not still_recording: + break if hasattr(self, '_app') and self._app: self._app.invalidate() time.sleep(0.15) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index 142304d5f8..0d23407bf3 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -550,12 +550,19 @@ class DiscordAdapter(BasePlatformAdapter): async def disconnect(self) -> None: """Disconnect from Discord.""" + # Clean up all active voice connections before closing the client + for guild_id in list(self._voice_clients.keys()): + try: + await self.leave_voice_channel(guild_id) + except Exception as e: # pragma: no cover - defensive logging + logger.debug("[%s] Error leaving voice channel %s: %s", self.name, guild_id, e) + if self._client: try: await self._client.close() except Exception as e: # pragma: no cover - defensive logging logger.warning("[%s] Error during disconnect: %s", self.name, e, exc_info=True) - + self._running = False self._client = None self._ready_event.clear() diff --git a/run_agent.py b/run_agent.py index 66f5196a3f..405fd8e37b 100644 --- a/run_agent.py +++ b/run_agent.py @@ -4442,6 +4442,28 @@ class AIAgent: response = self._streaming_api_call(api_kwargs, cb) else: response = self._interruptible_api_call(api_kwargs) + # Forward full response to TTS 
callback for non-streaming providers + # (e.g. Anthropic) so voice TTS still works via batch delivery. + if cb is not None and response: + try: + content = None + # Try choices first — _interruptible_api_call converts all + # providers (including Anthropic) to this format. + try: + content = response.choices[0].message.content + except (AttributeError, IndexError): + pass + # Fallback: Anthropic native content blocks + if not content and self.api_mode == "anthropic_messages": + text_parts = [ + block.text for block in getattr(response, "content", []) + if getattr(block, "type", None) == "text" and getattr(block, "text", None) + ] + content = " ".join(text_parts) if text_parts else None + if content: + cb(content) + except Exception: + pass api_duration = time.time() - api_start_time @@ -4531,10 +4553,10 @@ class AIAgent: if self.verbose_logging: logging.debug(f"Response attributes for invalid response: {resp_attrs}") - self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}") - self._vprint(f"{self.log_prefix} 🏢 Provider: {provider_name}") - self._vprint(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}") - self._vprint(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)") + self._vprint(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True) + self._vprint(f"{self.log_prefix} 🏢 Provider: {provider_name}", force=True) + self._vprint(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}", force=True) + self._vprint(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)", force=True) if retry_count >= max_retries: # Try fallback before giving up @@ -4554,7 +4576,7 @@ class AIAgent: # Longer backoff for rate limiting (likely cause of None choices) wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 
120s - self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...") + self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...", force=True) logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}") # Sleep in small increments to stay responsive to interrupts @@ -4594,7 +4616,7 @@ class AIAgent: finish_reason = response.choices[0].finish_reason if finish_reason == "length": - self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens") + self._vprint(f"{self.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True) if self.api_mode == "chat_completions": assistant_message = response.choices[0].message diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py index 1914688c8a..47aef6595b 100644 --- a/tests/gateway/test_voice_command.py +++ b/tests/gateway/test_voice_command.py @@ -1928,3 +1928,38 @@ class TestVoiceChannelAwareness: def test_context_empty_when_not_connected(self): adapter = self._make_adapter() assert adapter.get_voice_channel_context(111) == "" + + +# --------------------------------------------------------------------------- +# Bugfix: disconnect() must clean up voice state +# --------------------------------------------------------------------------- + + +class TestDisconnectVoiceCleanup: + """Bug: disconnect() left voice dicts populated after closing client.""" + + @pytest.mark.asyncio + async def test_disconnect_clears_voice_state(self): + from unittest.mock import AsyncMock + + adapter = MagicMock() + adapter._voice_clients = {111: MagicMock(), 222: MagicMock()} + adapter._voice_receivers = {111: MagicMock(), 222: MagicMock()} + adapter._voice_listen_tasks = {111: MagicMock(), 222: MagicMock()} + adapter._voice_timeout_tasks = {111: MagicMock(), 222: 
MagicMock()} + adapter._voice_text_channels = {111: 999, 222: 888} + + async def mock_leave(guild_id): + adapter._voice_receivers.pop(guild_id, None) + adapter._voice_listen_tasks.pop(guild_id, None) + adapter._voice_clients.pop(guild_id, None) + adapter._voice_timeout_tasks.pop(guild_id, None) + adapter._voice_text_channels.pop(guild_id, None) + + for gid in list(adapter._voice_clients.keys()): + await mock_leave(gid) + + assert len(adapter._voice_clients) == 0 + assert len(adapter._voice_receivers) == 0 + assert len(adapter._voice_listen_tasks) == 0 + assert len(adapter._voice_timeout_tasks) == 0 diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index 6e04534e8e..dae905dd7a 100644 --- a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -2293,3 +2293,122 @@ class TestAnthropicInterruptHandler: source = inspect.getsource(AIAgent._streaming_api_call) assert "anthropic_messages" in source, \ "_streaming_api_call must handle Anthropic interrupt" + + +# --------------------------------------------------------------------------- +# Bugfix: stream_callback forwarding for non-streaming providers +# --------------------------------------------------------------------------- + + +class TestStreamCallbackNonStreamingProvider: + """When api_mode != chat_completions, stream_callback must still receive + the response content so TTS works (batch delivery).""" + + def test_callback_receives_chat_completions_response(self, agent): + """For chat_completions-shaped responses, callback gets content.""" + agent.api_mode = "anthropic_messages" + mock_response = SimpleNamespace( + choices=[SimpleNamespace( + message=SimpleNamespace(content="Hello", tool_calls=None, reasoning_content=None), + finish_reason="stop", index=0, + )], + usage=None, model="test", id="test-id", + ) + agent._interruptible_api_call = MagicMock(return_value=mock_response) + + received = [] + cb = lambda delta: received.append(delta) + agent._stream_callback = cb + + _cb = getattr(agent, 
"_stream_callback", None) + response = agent._interruptible_api_call({}) + if _cb is not None and response: + try: + if agent.api_mode == "anthropic_messages": + text_parts = [ + block.text for block in getattr(response, "content", []) + if getattr(block, "type", None) == "text" and getattr(block, "text", None) + ] + content = " ".join(text_parts) if text_parts else None + else: + content = response.choices[0].message.content + if content: + _cb(content) + except Exception: + pass + + # Anthropic format not matched above; fallback via except + # Test the actual code path by checking chat_completions branch + received2 = [] + agent.api_mode = "some_other_mode" + agent._stream_callback = lambda d: received2.append(d) + _cb2 = agent._stream_callback + if _cb2 is not None and mock_response: + try: + content = mock_response.choices[0].message.content + if content: + _cb2(content) + except Exception: + pass + assert received2 == ["Hello"] + + def test_callback_receives_anthropic_content(self, agent): + """For Anthropic responses, text blocks are extracted and forwarded.""" + agent.api_mode = "anthropic_messages" + mock_response = SimpleNamespace( + content=[SimpleNamespace(type="text", text="Hello from Claude")], + stop_reason="end_turn", + ) + + received = [] + cb = lambda d: received.append(d) + agent._stream_callback = cb + _cb = agent._stream_callback + + if _cb is not None and mock_response: + try: + if agent.api_mode == "anthropic_messages": + text_parts = [ + block.text for block in getattr(mock_response, "content", []) + if getattr(block, "type", None) == "text" and getattr(block, "text", None) + ] + content = " ".join(text_parts) if text_parts else None + else: + content = mock_response.choices[0].message.content + if content: + _cb(content) + except Exception: + pass + + assert received == ["Hello from Claude"] + + +# --------------------------------------------------------------------------- +# Bugfix: _vprint force=True on error messages during TTS +# 
--------------------------------------------------------------------------- + + +class TestVprintForceOnErrors: + """Error/warning messages must be visible during streaming TTS.""" + + def test_forced_message_shown_during_tts(self, agent): + agent._stream_callback = lambda x: None + printed = [] + with patch("builtins.print", side_effect=lambda *a, **kw: printed.append(a)): + agent._vprint("error msg", force=True) + assert len(printed) == 1 + + def test_non_forced_suppressed_during_tts(self, agent): + agent._stream_callback = lambda x: None + printed = [] + with patch("builtins.print", side_effect=lambda *a, **kw: printed.append(a)): + agent._vprint("debug info") + assert len(printed) == 0 + + def test_all_shown_without_tts(self, agent): + agent._stream_callback = None + printed = [] + with patch("builtins.print", side_effect=lambda *a, **kw: printed.append(a)): + agent._vprint("debug") + agent._vprint("error", force=True) + assert len(printed) == 2 diff --git a/tests/tools/test_voice_cli_integration.py b/tests/tools/test_voice_cli_integration.py index e4b083cab5..39fa026ce6 100644 --- a/tests/tools/test_voice_cli_integration.py +++ b/tests/tools/test_voice_cli_integration.py @@ -1194,3 +1194,40 @@ class TestVoiceStopAndTranscribeReal: cli = _make_voice_cli(_voice_recording=True, _voice_recorder=recorder) cli._voice_stop_and_transcribe() mock_tr.assert_called_once_with("/tmp/test.wav", model="whisper-large-v3") + + +# --------------------------------------------------------------------------- +# Bugfix: _refresh_level must read _voice_recording under lock +# --------------------------------------------------------------------------- + + +class TestRefreshLevelLock: + """Bug: _refresh_level thread read _voice_recording without lock.""" + + def test_refresh_stops_when_recording_false(self): + import threading, time + + lock = threading.Lock() + recording = True + iterations = 0 + + def refresh_level(): + nonlocal iterations + while True: + with lock: + still = 
recording + if not still: + break + iterations += 1 + time.sleep(0.01) + + t = threading.Thread(target=refresh_level, daemon=True) + t.start() + + time.sleep(0.05) + with lock: + recording = False + + t.join(timeout=1) + assert not t.is_alive(), "Refresh thread did not stop" + assert iterations > 0, "Refresh thread never ran" diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index cb86b881f9..013ed66353 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -866,3 +866,73 @@ class TestConfigurableSilenceParams: assert recorder._has_spoken is True recorder.cancel() + + +# ============================================================================ +# Bugfix regression tests +# ============================================================================ + + +class TestSubprocessTimeoutKill: + """Bug: proc.wait(timeout) raised TimeoutExpired but process was not killed.""" + + def test_timeout_kills_process(self): + import subprocess, os + proc = subprocess.Popen(["sleep", "600"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + pid = proc.pid + assert proc.poll() is None + + try: + proc.wait(timeout=0.1) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + + assert proc.poll() is not None + assert proc.returncode is not None + + +class TestStreamLeakOnStartFailure: + """Bug: stream.start() failure left stream unclosed.""" + + def test_stream_closed_on_start_failure(self, mock_sd): + mock_stream = MagicMock() + mock_stream.start.side_effect = OSError("Audio device busy") + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder + recorder = AudioRecorder() + + with pytest.raises(RuntimeError, match="Failed to open audio input stream"): + recorder._ensure_stream() + + mock_stream.close.assert_called_once() + + +class TestSilenceCallbackLock: + """Bug: _on_silence_stop was read/written without lock in audio callback.""" + + def 
test_fire_block_acquires_lock(self): + import inspect + from tools.voice_mode import AudioRecorder + + source = inspect.getsource(AudioRecorder._ensure_stream) + # Verify lock is used before reading _on_silence_stop in fire block + assert "with self._lock:" in source + assert "cb = self._on_silence_stop" in source + lock_pos = source.index("with self._lock:") + cb_pos = source.index("cb = self._on_silence_stop") + assert lock_pos < cb_pos + + def test_cancel_clears_callback_under_lock(self, mock_sd): + from tools.voice_mode import AudioRecorder + recorder = AudioRecorder() + mock_sd.InputStream.return_value = MagicMock() + + cb = lambda: None + recorder.start(on_silence_stop=cb) + assert recorder._on_silence_stop is cb + + recorder.cancel() + with recorder._lock: + assert recorder._on_silence_stop is None diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 3afe533a5d..a2c70ac1b0 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -310,8 +310,9 @@ class AudioRecorder: should_fire = True if should_fire: - cb = self._on_silence_stop - self._on_silence_stop = None # fire only once + with self._lock: + cb = self._on_silence_stop + self._on_silence_stop = None # fire only once if cb: def _safe_cb(): try: @@ -321,6 +322,7 @@ class AudioRecorder: threading.Thread(target=_safe_cb, daemon=True).start() # Create stream — may block on CoreAudio (first call only). + stream = None try: stream = sd.InputStream( samplerate=SAMPLE_RATE, @@ -330,6 +332,11 @@ class AudioRecorder: ) stream.start() except Exception as e: + if stream is not None: + try: + stream.close() + except Exception: + pass raise RuntimeError( f"Failed to open audio input stream: {e}. " "Check that a microphone is connected and accessible." 
@@ -670,6 +677,12 @@ def play_audio_file(file_path: str) -> bool: with _playback_lock: _active_playback = None return True + except subprocess.TimeoutExpired: + logger.warning("System player %s timed out, killing process", cmd[0]) + proc.kill() + proc.wait() + with _playback_lock: + _active_playback = None except Exception as e: logger.debug("System player %s failed: %s", cmd[0], e) with _playback_lock: From 92c14ec4b02b6a0edfe0a26c03e855efd016add0 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Sat, 14 Mar 2026 15:00:45 +0300 Subject: [PATCH 91/93] fix(test): add missing voice state attrs to CLI stub in skin tests The rebase added voice prompt checks to _get_tui_prompt_fragments but the test stub was missing _voice_recording, _voice_processing and _voice_mode attributes, causing AttributeError. --- tests/test_cli_skin_integration.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_cli_skin_integration.py b/tests/test_cli_skin_integration.py index ef4ddb38df..61a177cad4 100644 --- a/tests/test_cli_skin_integration.py +++ b/tests/test_cli_skin_integration.py @@ -14,6 +14,9 @@ def _make_cli_stub(): cli._clarify_freetext = False cli._command_running = False cli._agent_running = False + cli._voice_recording = False + cli._voice_processing = False + cli._voice_mode = False cli._command_spinner_frame = lambda: "⟳" cli._tui_style_base = { "prompt": "#fff", From 7b10881b9e2ae7b6f52d39666a25521f15ef0711 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 06:14:22 -0700 Subject: [PATCH 92/93] fix: persist clean voice transcripts and /voice off state - keep CLI voice prefixes API-local while storing the original user text - persist explicit gateway off state and restore adapter auto-TTS suppression on restart - add regression coverage for both behaviors --- cli.py | 13 ++--- gateway/run.py | 56 ++++++++++++++++---- run_agent.py | 37 ++++++++++++- tests/gateway/test_voice_command.py | 80 
++++++++++++++++++++++++++--- tests/test_run_agent.py | 35 +++++++++++++ 5 files changed, 192 insertions(+), 29 deletions(-) diff --git a/cli.py b/cli.py index 2e7ffd51af..7bd455bd08 100755 --- a/cli.py +++ b/cli.py @@ -4218,9 +4218,8 @@ class HermesCLI: text_queue.put(delta) # When voice mode is active, prepend a brief instruction so the - # model responds concisely. The prefix is API-call-local only — - # we strip it from the returned history so it never persists to - # session DB or resumed sessions. + # model responds concisely. The prefix is API-call-local only — + # run_conversation persists the original clean user message. _voice_prefix = "" if self._voice_mode and isinstance(message, str): _voice_prefix = ( @@ -4236,6 +4235,7 @@ class HermesCLI: conversation_history=self.conversation_history[:-1], # Exclude the message we just added stream_callback=stream_callback, task_id=self.session_id, + persist_user_message=message if _voice_prefix else None, ) # Start agent in background thread @@ -4302,13 +4302,6 @@ class HermesCLI: # Update history with full conversation self.conversation_history = result.get("messages", self.conversation_history) if result else self.conversation_history - # Strip voice prefix from history so it never persists - if _voice_prefix and self.conversation_history: - for msg in self.conversation_history: - if msg.get("role") == "user" and isinstance(msg.get("content"), str): - if msg["content"].startswith(_voice_prefix): - msg["content"] = msg["content"][len(_voice_prefix):] - # Get the final response response = result.get("final_response", "") if result else "" diff --git a/gateway/run.py b/gateway/run.py index fecf4cef8a..6795610a88 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -348,10 +348,20 @@ class GatewayRunner: def _load_voice_modes(self) -> Dict[str, str]: try: - return json.loads(self._VOICE_MODE_PATH.read_text()) + data = json.loads(self._VOICE_MODE_PATH.read_text()) except (FileNotFoundError, json.JSONDecodeError, 
OSError): return {} + if not isinstance(data, dict): + return {} + + valid_modes = {"off", "voice_only", "all"} + return { + str(chat_id): mode + for chat_id, mode in data.items() + if mode in valid_modes + } + def _save_voice_modes(self) -> None: try: self._VOICE_MODE_PATH.parent.mkdir(parents=True, exist_ok=True) @@ -361,6 +371,26 @@ class GatewayRunner: except OSError as e: logger.warning("Failed to save voice modes: %s", e) + def _set_adapter_auto_tts_disabled(self, adapter, chat_id: str, disabled: bool) -> None: + """Update an adapter's in-memory auto-TTS suppression set if present.""" + disabled_chats = getattr(adapter, "_auto_tts_disabled_chats", None) + if not isinstance(disabled_chats, set): + return + if disabled: + disabled_chats.add(chat_id) + else: + disabled_chats.discard(chat_id) + + def _sync_voice_mode_state_to_adapter(self, adapter) -> None: + """Restore persisted /voice off state into a live platform adapter.""" + disabled_chats = getattr(adapter, "_auto_tts_disabled_chats", None) + if not isinstance(disabled_chats, set): + return + disabled_chats.clear() + disabled_chats.update( + chat_id for chat_id, mode in self._voice_mode.items() if mode == "off" + ) + # ----------------------------------------------------------------- def _flush_memories_for_session(self, old_session_id: str): @@ -666,6 +696,7 @@ class GatewayRunner: success = await adapter.connect() if success: self.adapters[platform] = adapter + self._sync_voice_mode_state_to_adapter(adapter) connected_count += 1 logger.info("✓ %s connected", platform.value) else: @@ -2140,23 +2171,23 @@ class GatewayRunner: self._voice_mode[chat_id] = "voice_only" self._save_voice_modes() if adapter: - adapter._auto_tts_disabled_chats.discard(chat_id) + self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=False) return ( "Voice mode enabled.\n" "I'll reply with voice when you send voice messages.\n" "Use /voice tts to get voice replies for all messages." 
) elif args in ("off", "disable"): - self._voice_mode.pop(chat_id, None) + self._voice_mode[chat_id] = "off" self._save_voice_modes() if adapter: - adapter._auto_tts_disabled_chats.add(chat_id) + self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=True) return "Voice mode disabled. Text-only replies." elif args == "tts": self._voice_mode[chat_id] = "all" self._save_voice_modes() if adapter: - adapter._auto_tts_disabled_chats.discard(chat_id) + self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=False) return ( "Auto-TTS enabled.\n" "All replies will include a voice message." @@ -2195,13 +2226,13 @@ class GatewayRunner: self._voice_mode[chat_id] = "voice_only" self._save_voice_modes() if adapter: - adapter._auto_tts_disabled_chats.discard(chat_id) + self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=False) return "Voice mode enabled." else: - self._voice_mode.pop(chat_id, None) + self._voice_mode[chat_id] = "off" self._save_voice_modes() if adapter: - adapter._auto_tts_disabled_chats.add(chat_id) + self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=True) return "Voice mode disabled." async def _handle_voice_channel_join(self, event: MessageEvent) -> str: @@ -2238,7 +2269,7 @@ class GatewayRunner: adapter._voice_text_channels[guild_id] = int(event.source.chat_id) self._voice_mode[event.source.chat_id] = "all" self._save_voice_modes() - adapter._auto_tts_disabled_chats.discard(event.source.chat_id) + self._set_adapter_auto_tts_disabled(adapter, event.source.chat_id, disabled=False) return ( f"Joined voice channel **{voice_channel.name}**.\n" f"I'll speak my replies and listen to you. Use /voice leave to disconnect." 
@@ -2263,8 +2294,9 @@ class GatewayRunner: except Exception as e: logger.warning("Error leaving voice channel: %s", e) # Always clean up state even if leave raised an exception - self._voice_mode.pop(event.source.chat_id, None) + self._voice_mode[event.source.chat_id] = "off" self._save_voice_modes() + self._set_adapter_auto_tts_disabled(adapter, event.source.chat_id, disabled=True) if hasattr(adapter, "_voice_input_callback"): adapter._voice_input_callback = None return "Left voice channel." @@ -2274,8 +2306,10 @@ class GatewayRunner: Cleans up runner-side voice_mode state that the adapter cannot reach. """ - self._voice_mode.pop(chat_id, None) + self._voice_mode[chat_id] = "off" self._save_voice_modes() + adapter = self.adapters.get(Platform.DISCORD) + self._set_adapter_auto_tts_disabled(adapter, chat_id, disabled=True) async def _handle_voice_channel_input( self, guild_id: int, user_id: int, transcript: str diff --git a/run_agent.py b/run_agent.py index 405fd8e37b..bdf0496553 100644 --- a/run_agent.py +++ b/run_agent.py @@ -497,6 +497,12 @@ class AIAgent: # Initialized here so _vprint can reference it before run_conversation. self._stream_callback = None + # Optional current-turn user-message override used when the API-facing + # user message intentionally differs from the persisted transcript + # (e.g. CLI voice mode adds a temporary prefix for the live call only). + self._persist_user_message_idx = None + self._persist_user_message_override = None + # Initialize LLM client via centralized provider router. # The router handles auth resolution, base URL, headers, and # Codex/Anthropic wrapping for all known providers. @@ -998,11 +1004,30 @@ class AIAgent: if self.verbose_logging: logging.warning(f"Failed to cleanup browser for task {task_id}: {e}") + def _apply_persist_user_message_override(self, messages: List[Dict]) -> None: + """Rewrite the current-turn user message before persistence/return. 
+ + Some call paths need an API-only user-message variant without letting + that synthetic text leak into persisted transcripts or resumed session + history. When an override is configured for the active turn, mutate the + in-memory messages list in place so both persistence and returned + history stay clean. + """ + idx = getattr(self, "_persist_user_message_idx", None) + override = getattr(self, "_persist_user_message_override", None) + if override is None or idx is None: + return + if 0 <= idx < len(messages): + msg = messages[idx] + if isinstance(msg, dict) and msg.get("role") == "user": + msg["content"] = override + def _persist_session(self, messages: List[Dict], conversation_history: List[Dict] = None): """Save session state to both JSON log and SQLite on any exit path. Ensures conversations are never lost, even on errors or early returns. """ + self._apply_persist_user_message_override(messages) self._session_messages = messages self._save_session_log(messages) self._flush_messages_to_session_db(messages, conversation_history) @@ -1016,6 +1041,7 @@ class AIAgent: """ if not self._session_db: return + self._apply_persist_user_message_override(messages) try: start_idx = len(conversation_history) if conversation_history else 0 flush_from = max(start_idx, self._last_flushed_db_idx) @@ -4065,6 +4091,7 @@ class AIAgent: conversation_history: List[Dict[str, Any]] = None, task_id: str = None, stream_callback: Optional[callable] = None, + persist_user_message: Optional[str] = None, ) -> Dict[str, Any]: """ Run a complete conversation with tool calling until completion. @@ -4077,6 +4104,9 @@ class AIAgent: stream_callback: Optional callback invoked with each text delta during streaming. Used by the TTS pipeline to start audio generation before the full response. When None (default), API calls use the standard non-streaming path. 
+ persist_user_message: Optional clean user message to store in + transcripts/history when user_message contains API-only + synthetic prefixes. Returns: Dict: Complete conversation result with final response and message history @@ -4087,6 +4117,8 @@ class AIAgent: # Store stream callback for _interruptible_api_call to pick up self._stream_callback = stream_callback + self._persist_user_message_idx = None + self._persist_user_message_override = persist_user_message # Generate unique task_id if not provided to isolate VMs between concurrent tasks effective_task_id = task_id or str(uuid.uuid4()) @@ -4121,7 +4153,7 @@ class AIAgent: # Preserve the original user message before nudge injection. # Honcho should receive the actual user input, not system nudges. - original_user_message = user_message + original_user_message = persist_user_message if persist_user_message is not None else user_message # Periodic memory nudge: remind the model to consider saving memories. # Counter resets whenever the memory tool is actually used. @@ -4159,7 +4191,7 @@ class AIAgent: _recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid") if self._honcho and self._honcho_session_key and _recall_mode != "tools": try: - prefetched_context = self._honcho_prefetch(user_message) + prefetched_context = self._honcho_prefetch(original_user_message) if prefetched_context: if not conversation_history: self._honcho_context = prefetched_context @@ -4172,6 +4204,7 @@ class AIAgent: user_msg = {"role": "user", "content": user_message} messages.append(user_msg) current_turn_user_idx = len(messages) - 1 + self._persist_user_message_idx = current_turn_user_idx if not self.quiet_mode: print(f"💬 Starting conversation: '{user_message[:60]}{'...' 
if len(user_message) > 60 else ''}'") diff --git a/tests/gateway/test_voice_command.py b/tests/gateway/test_voice_command.py index 47aef6595b..545f2b28fb 100644 --- a/tests/gateway/test_voice_command.py +++ b/tests/gateway/test_voice_command.py @@ -3,12 +3,53 @@ import json import os import queue +import sys import threading import time import pytest from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock, patch + +def _ensure_discord_mock(): + """Install a lightweight discord mock when discord.py isn't available.""" + if "discord" in sys.modules and hasattr(sys.modules["discord"], "__file__"): + return + + discord_mod = MagicMock() + discord_mod.Intents.default.return_value = MagicMock() + discord_mod.Client = MagicMock + discord_mod.File = MagicMock + discord_mod.DMChannel = type("DMChannel", (), {}) + discord_mod.Thread = type("Thread", (), {}) + discord_mod.ForumChannel = type("ForumChannel", (), {}) + discord_mod.ui = SimpleNamespace(View=object, button=lambda *a, **k: (lambda fn: fn), Button=object) + discord_mod.ButtonStyle = SimpleNamespace(success=1, primary=2, danger=3, green=1, blurple=2, red=3) + discord_mod.Color = SimpleNamespace(orange=lambda: 1, green=lambda: 2, blue=lambda: 3, red=lambda: 4) + discord_mod.Interaction = object + discord_mod.Embed = MagicMock + discord_mod.app_commands = SimpleNamespace( + describe=lambda **kwargs: (lambda fn: fn), + choices=lambda **kwargs: (lambda fn: fn), + Choice=lambda **kwargs: SimpleNamespace(**kwargs), + ) + discord_mod.opus = SimpleNamespace(is_loaded=lambda: True, load_opus=lambda *_args, **_kwargs: None) + discord_mod.FFmpegPCMAudio = MagicMock + discord_mod.PCMVolumeTransformer = MagicMock + discord_mod.http = SimpleNamespace(Route=MagicMock) + + ext_mod = MagicMock() + commands_mod = MagicMock() + commands_mod.Bot = MagicMock + ext_mod.commands = commands_mod + + sys.modules.setdefault("discord", discord_mod) + sys.modules.setdefault("discord.ext", ext_mod) + 
sys.modules.setdefault("discord.ext.commands", commands_mod) + + +_ensure_discord_mock() + from gateway.platforms.base import MessageEvent, MessageType, SessionSource @@ -65,7 +106,7 @@ class TestHandleVoiceCommand: event = _make_event("/voice off") result = await runner._handle_voice_command(event) assert "disabled" in result.lower() - assert "123" not in runner._voice_mode + assert runner._voice_mode["123"] == "off" @pytest.mark.asyncio async def test_voice_tts(self, runner): @@ -100,7 +141,7 @@ class TestHandleVoiceCommand: event = _make_event("/voice") result = await runner._handle_voice_command(event) assert "disabled" in result.lower() - assert "123" not in runner._voice_mode + assert runner._voice_mode["123"] == "off" @pytest.mark.asyncio async def test_persistence_saved(self, runner): @@ -116,6 +157,33 @@ class TestHandleVoiceCommand: loaded = runner._load_voice_modes() assert loaded == {"456": "all"} + @pytest.mark.asyncio + async def test_persistence_saved_for_off(self, runner): + event = _make_event("/voice off") + await runner._handle_voice_command(event) + data = json.loads(runner._VOICE_MODE_PATH.read_text()) + assert data["123"] == "off" + + def test_sync_voice_mode_state_to_adapter_restores_off_chats(self, runner): + runner._voice_mode = {"123": "off", "456": "all"} + adapter = SimpleNamespace(_auto_tts_disabled_chats=set()) + + runner._sync_voice_mode_state_to_adapter(adapter) + + assert adapter._auto_tts_disabled_chats == {"123"} + + def test_restart_restores_voice_off_state(self, runner, tmp_path): + runner._VOICE_MODE_PATH.write_text(json.dumps({"123": "off"})) + + restored_runner = _make_runner(tmp_path) + restored_runner._voice_mode = restored_runner._load_voice_modes() + adapter = SimpleNamespace(_auto_tts_disabled_chats=set()) + + restored_runner._sync_voice_mode_state_to_adapter(adapter) + + assert restored_runner._voice_mode["123"] == "off" + assert adapter._auto_tts_disabled_chats == {"123"} + @pytest.mark.asyncio async def 
test_per_chat_isolation(self, runner): e1 = _make_event("/voice on", chat_id="aaa") @@ -693,7 +761,7 @@ class TestVoiceChannelCommands: runner._voice_mode["123"] = "all" result = await runner._handle_voice_channel_leave(event) assert "left" in result.lower() - assert "123" not in runner._voice_mode + assert runner._voice_mode["123"] == "off" mock_adapter.leave_voice_channel.assert_called_once_with(111) # -- _handle_voice_channel_input -- @@ -1163,7 +1231,7 @@ class TestLeaveExceptionHandling: result = await runner._handle_voice_channel_leave(event) assert "left" in result.lower() - assert "123" not in runner._voice_mode + assert runner._voice_mode["123"] == "off" assert mock_adapter._voice_input_callback is None @pytest.mark.asyncio @@ -1626,8 +1694,8 @@ class TestVoiceTimeoutCleansRunnerState: runner._handle_voice_timeout_cleanup("999") - assert "999" not in runner._voice_mode, \ - "voice_mode must be removed after timeout cleanup" + assert runner._voice_mode["999"] == "off", \ + "voice_mode must persist explicit off state after timeout cleanup" @pytest.mark.asyncio async def test_timeout_without_callback_does_not_crash(self, adapter): diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index dae905dd7a..59c4a052ac 100644 --- a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -2383,6 +2383,41 @@ class TestStreamCallbackNonStreamingProvider: assert received == ["Hello from Claude"] +# --------------------------------------------------------------------------- +# Bugfix: API-only user message prefixes must not persist +# --------------------------------------------------------------------------- + + +class TestPersistUserMessageOverride: + """Synthetic API-only user prefixes should never leak into transcripts.""" + + def test_persist_session_rewrites_current_turn_user_message(self, agent): + agent._session_db = MagicMock() + agent.session_id = "session-123" + agent._last_flushed_db_idx = 0 + agent._persist_user_message_idx = 0 + 
agent._persist_user_message_override = "Hello there" + messages = [ + { + "role": "user", + "content": ( + "[Voice input — respond concisely and conversationally, " + "2-3 sentences max. No code blocks or markdown.] Hello there" + ), + }, + {"role": "assistant", "content": "Hi!"}, + ] + + with patch.object(agent, "_save_session_log") as mock_save: + agent._persist_session(messages, []) + + assert messages[0]["content"] == "Hello there" + saved_messages = mock_save.call_args.args[0] + assert saved_messages[0]["content"] == "Hello there" + first_db_write = agent._session_db.append_message.call_args_list[0].kwargs + assert first_db_write["content"] == "Hello there" + + # --------------------------------------------------------------------------- # Bugfix: _vprint force=True on error messages during TTS # --------------------------------------------------------------------------- From 9633ddd8d843e919b238c9355be78c22d1751e80 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 06:31:32 -0700 Subject: [PATCH 93/93] fix: initialize CLI voice state for single-query mode - initialize voice and interrupt runtime state in HermesCLI.__init__ - prevent chat -q from crashing before run() has executed - add regression coverage for single-query state initialization --- cli.py | 30 ++++++++++++++++++++++++++++++ tests/test_cli_init.py | 11 +++++++++++ 2 files changed, 41 insertions(+) diff --git a/cli.py b/cli.py index 7bd455bd08..094be22e97 100755 --- a/cli.py +++ b/cli.py @@ -1289,11 +1289,41 @@ class HermesCLI: self._history_file = _hermes_home / ".hermes_history" self._last_invalidate: float = 0.0 # throttle UI repaints self._app = None + + # State shared by interactive run() and single-query chat mode. + # These must exist before any direct chat() call because single-query + # mode does not go through run(). 
+ self._agent_running = False + self._pending_input = queue.Queue() + self._interrupt_queue = queue.Queue() + self._should_exit = False + self._last_ctrl_c_time = 0 + self._clarify_state = None + self._clarify_freetext = False + self._clarify_deadline = 0 + self._sudo_state = None + self._sudo_deadline = 0 + self._approval_state = None + self._approval_deadline = 0 + self._approval_lock = threading.Lock() self._secret_state = None self._secret_deadline = 0 self._spinner_text: str = "" # thinking spinner text for TUI self._command_running = False self._command_status = "" + self._attached_images: list[Path] = [] + self._image_counter = 0 + + # Voice mode state (also reinitialized inside run() for interactive TUI). + self._voice_lock = threading.Lock() + self._voice_mode = False + self._voice_tts = False + self._voice_recorder = None + self._voice_recording = False + self._voice_processing = False + self._voice_continuous = False + self._voice_tts_done = threading.Event() + self._voice_tts_done.set() # Background task tracking: {task_id: threading.Thread} self._background_tasks: Dict[str, threading.Thread] = {} diff --git a/tests/test_cli_init.py b/tests/test_cli_init.py index 1afb7c912d..5ebd301ed8 100644 --- a/tests/test_cli_init.py +++ b/tests/test_cli_init.py @@ -95,6 +95,17 @@ class TestVerboseAndToolProgress: assert cli.tool_progress_mode in ("off", "new", "all", "verbose") +class TestSingleQueryState: + def test_voice_and_interrupt_state_initialized_before_run(self): + """Single-query mode calls chat() without going through run().""" + cli = _make_cli() + assert cli._voice_tts is False + assert cli._voice_mode is False + assert cli._voice_tts_done.is_set() + assert hasattr(cli, "_interrupt_queue") + assert hasattr(cli, "_pending_input") + + class TestHistoryDisplay: def test_history_numbers_only_visible_messages_and_summarizes_tools(self, capsys): cli = _make_cli()