diff --git a/cli.py b/cli.py
index 9fb613c851..e35fdafb9d 100755
--- a/cli.py
+++ b/cli.py
@@ -1550,6 +1550,7 @@ class HermesCLI:
                 checkpoints_enabled=self.checkpoints_enabled,
                 checkpoint_max_snapshots=self.checkpoint_max_snapshots,
                 pass_session_id=self.pass_session_id,
+                tool_progress_callback=self._on_tool_progress,
             )
             # Apply any pending title now that the session exists in the DB
             if self._pending_title and self._session_db:
@@ -3515,6 +3516,28 @@ class HermesCLI:
         except Exception as e:
             print(f" ❌ MCP reload failed: {e}")
 
+    # ====================================================================
+    # Tool progress callback (audio cues for voice mode)
+    # ====================================================================
+
+    def _on_tool_progress(self, function_name: str, preview: str, function_args: dict):
+        """Called when a tool starts executing. Plays audio cue in voice mode."""
+        if not self._voice_mode:
+            return
+        # Skip internal/thinking tools
+        if function_name.startswith("_"):
+            return
+        try:
+            from tools.voice_mode import play_beep
+            # Short, subtle tick sound (higher pitch, very brief)
+            threading.Thread(
+                target=play_beep,
+                kwargs={"frequency": 1200, "duration": 0.06, "count": 1},
+                daemon=True,
+            ).start()
+        except Exception:
+            pass
+
     # ====================================================================
     # Voice mode methods
     # ====================================================================
@@ -3536,9 +3559,21 @@ class HermesCLI:
                 "Get one at: https://platform.openai.com/api-keys"
             )
 
+        # Load silence detection params from config (a null "voice" entry -> {})
+        voice_cfg = {}
+        try:
+            from hermes_cli.config import load_config
+            voice_cfg = load_config().get("voice") or {}
+        except Exception:
+            pass
+
         if self._voice_recorder is None:
             self._voice_recorder = AudioRecorder()
 
+        # Apply config-driven silence params
+        self._voice_recorder._silence_threshold = voice_cfg.get("silence_threshold", 200)
+        self._voice_recorder._silence_duration = voice_cfg.get("silence_duration", 3.0)
+
         def _on_silence():
             """Called by AudioRecorder when silence is detected after speech."""
             with self._voice_lock:
@@ -3549,18 +3584,26 @@ class HermesCLI:
                 self._app.invalidate()
             self._voice_stop_and_transcribe()
 
+        # Audio cue: single beep BEFORE starting stream (avoid CoreAudio conflict)
+        try:
+            from tools.voice_mode import play_beep
+            play_beep(frequency=880, count=1)
+        except Exception:
+            pass
+
         self._voice_recorder.start(on_silence_stop=_on_silence)
         with self._voice_lock:
             self._voice_recording = True
-
-        # Audio cue: single beep on recording start
-        try:
-            from tools.voice_mode import play_beep
-            threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start()
-        except Exception:
-            pass
         _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
 
+        # Periodically refresh prompt to update audio level indicator
+        def _refresh_level():
+            while self._voice_recording:
+                if hasattr(self, '_app') and self._app:
+                    self._app.invalidate()
+                time.sleep(0.15)
+        threading.Thread(target=_refresh_level, daemon=True).start()
+
     def _voice_stop_and_transcribe(self):
         """Stop recording, transcribe via STT, and queue the transcript as input."""
         try:
@@ -3571,15 +3614,15 @@ class HermesCLI:
             with self._voice_lock:
                 self._voice_recording = False
 
-            # Audio cue: double beep on recording stop
+            # Audio cue: double beep after stream stopped (no CoreAudio conflict)
             try:
                 from tools.voice_mode import play_beep
-                threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start()
+                play_beep(frequency=660, count=2)
             except Exception:
                 pass
 
             if wav_path is None:
-                _cprint(f"{_DIM}No speech detected (recording too short).{_RST}")
+                _cprint(f"{_DIM}No speech detected.{_RST}")
                 return
 
             with self._voice_lock:
@@ -3614,6 +3657,7 @@ class HermesCLI:
         finally:
             with self._voice_lock:
                 self._voice_processing = False
+            submitted = self._pending_input.qsize() > 0
             if hasattr(self, '_app') and self._app:
                 self._app.invalidate()
             # Clean up temp file
@@ -3623,6 +3667,18 @@ class HermesCLI:
             except Exception:
                 pass
 
+            # If no transcript was submitted but continuous mode is active,
+            # restart recording so the user can keep talking.
+            # (When transcript IS submitted, process_loop handles restart
+            # after chat() completes.)
+            if self._voice_continuous and not submitted and not self._voice_recording:
+                try:
+                    self._voice_start_recording()
+                    if hasattr(self, '_app') and self._app:
+                        self._app.invalidate()
+                except Exception:
+                    pass
+
     def _voice_speak_response(self, text: str):
         """Speak the agent's response aloud using TTS (runs in background thread)."""
         if not self._voice_tts:
@@ -3727,6 +3783,19 @@ class HermesCLI:
         except Exception:
             pass
 
+        # Append voice-mode system prompt for concise, conversational responses.
+        # Snapshot and append only once: a repeated enable would otherwise save
+        # the already-augmented prompt and stack the instruction a second time.
+        if not hasattr(self, '_voice_original_prompt'):
+            self._voice_original_prompt = self.system_prompt
+            voice_instruction = (
+                "\n\n[Voice mode active] The user is speaking via voice input. "
+                "Keep responses concise and conversational — 2-3 sentences max unless "
+                "the user asks for detail. Avoid code blocks, markdown formatting, "
+                "and long lists. Respond naturally as in a spoken conversation."
+            )
+            self.system_prompt = (self.system_prompt or "") + voice_instruction
+
         tts_status = " (TTS enabled)" if self._voice_tts else ""
         _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
         _cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}")
@@ -3742,6 +3811,12 @@ class HermesCLI:
         self._voice_mode = False
         self._voice_tts = False
         self._voice_continuous = False
+
+        # Restore original system prompt, then drop the snapshot so a later
+        # disable cannot re-apply a stale prompt.
+        if hasattr(self, '_voice_original_prompt'):
+            self.system_prompt = self._voice_original_prompt
+            del self._voice_original_prompt
         _cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
 
     def _toggle_voice_tts(self):
@@ -4237,11 +4312,24 @@ class HermesCLI:
         # Icon-only custom prompts should still remain visible in special states.
         return symbol, symbol
 
+    def _audio_level_bar(self) -> str:
+        """Return a visual audio level indicator based on current RMS."""
+        _LEVEL_BARS = " ▁▂▃▄▅▆▇"
+        rec = getattr(self, "_voice_recorder", None)
+        if rec is None:
+            return ""
+        rms = rec.current_rms
+        # Linear map of RMS onto bar index 0-7 (no log scaling is applied).
+        # Typical speech RMS is 500-5000, we cap display at ~8000
+        level = min(rms, 8000) * 7 // 8000
+        return _LEVEL_BARS[level]
+
     def _get_tui_prompt_fragments(self):
         """Return the prompt_toolkit fragments for the current interactive state."""
         symbol, state_suffix = self._get_tui_prompt_symbols()
         if self._voice_recording:
-            return [("class:voice-recording", f"● {state_suffix}")]
+            bar = self._audio_level_bar()
+            return [("class:voice-recording", f"● {bar} {state_suffix}")]
         if self._voice_processing:
             return [("class:voice-processing", f"◉ {state_suffix}")]
         if self._sudo_state:
@@ -4692,6 +4780,14 @@ class HermesCLI:
                 ).start()
             else:
                 try:
+                    # Interrupt TTS if playing, so user can start talking
+                    if not cli_ref._voice_tts_done.is_set():
+                        try:
+                            from tools.voice_mode import stop_playback
+                            stop_playback()
+                            cli_ref._voice_tts_done.set()
+                        except Exception:
+                            pass
                     with cli_ref._voice_lock:
                         cli_ref._voice_continuous = True
                     cli_ref._voice_start_recording()
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 174e4326e4..8dc2076404 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -207,6 +207,8 @@ DEFAULT_CONFIG = {
         "record_key": "ctrl+r",
         "max_recording_seconds": 120,
         "auto_tts": False,
+        "silence_threshold": 200,  # RMS below this = silence (0-32767)
+        "silence_duration": 3.0,  # Seconds of silence before auto-stop
     },
 
     "human_delay": {
diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py
index 0d40932e2f..e6a46def7c 100644
--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@@ -157,6 +157,7 @@ class TestAudioRecorderStop:
         # Simulate captured audio frames (1 second of loud audio above RMS threshold)
         frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
         recorder._frames = [frame]
+        recorder._peak_rms = 1000  # Peak RMS above threshold
 
         wav_path = recorder.stop()
 
@@ -203,6 +204,7 @@ class TestAudioRecorderStop:
         # 1 second of near-silence (RMS well below threshold)
         frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16")
         recorder._frames = [frame]
+        recorder._peak_rms = 10  # Peak RMS also below threshold
 
         wav_path = recorder.stop()
         assert wav_path is None
@@ -475,8 +477,9 @@ class TestSilenceDetection:
         from tools.voice_mode import AudioRecorder, SAMPLE_RATE
 
         recorder = AudioRecorder()
-        # Use very short silence duration for testing
+        # Use very short durations for testing
         recorder._silence_duration = 0.05
+        recorder._min_speech_duration = 0.05
 
         fired = threading.Event()
 
@@ -490,9 +493,11 @@ class TestSilenceDetection:
         if callback is None:
             callback = mock_sd.InputStream.call_args[1]["callback"]
 
-        # Simulate loud audio (speech) -- RMS well above threshold
+        # Simulate sustained speech (multiple loud chunks to exceed min_speech_duration)
         loud_frame = np.full((1600, 1), 5000, dtype="int16")
         callback(loud_frame, 1600, None, None)
+        time.sleep(0.06)
+        callback(loud_frame, 1600, None, None)
         assert recorder._has_spoken is True
 
         # Simulate silence
@@ -537,6 +542,47 @@ class TestSilenceDetection:
 
         recorder.cancel()
 
+    def test_micro_pause_tolerance_during_speech(self, mock_sd):
+        """Brief dips below threshold during speech should NOT reset speech tracking."""
+        np = pytest.importorskip("numpy")
+        import threading
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder._silence_duration = 0.05
+        recorder._min_speech_duration = 0.15
+        recorder._max_dip_tolerance = 0.1
+
+        fired = threading.Event()
+        recorder.start(on_silence_stop=lambda: fired.set())
+
+        callback = mock_sd.InputStream.call_args.kwargs.get("callback")
+        if callback is None:
+            callback = mock_sd.InputStream.call_args[1]["callback"]
+
+        loud_frame = np.full((1600, 1), 5000, dtype="int16")
+        quiet_frame = np.full((1600, 1), 50, dtype="int16")
+
+        # Speech chunk 1
+        callback(loud_frame, 1600, None, None)
+        time.sleep(0.05)
+        # Brief micro-pause (dip < max_dip_tolerance)
+        callback(quiet_frame, 1600, None, None)
+        time.sleep(0.05)
+        # Speech resumes -- speech_start should NOT have been reset
+        callback(loud_frame, 1600, None, None)
+        assert recorder._speech_start > 0, "Speech start should be preserved across brief dips"
+        time.sleep(0.06)
+        # Another speech chunk to exceed min_speech_duration
+        callback(loud_frame, 1600, None, None)
+        assert recorder._has_spoken is True, "Speech should be confirmed after tolerating micro-pause"
+
+        recorder.cancel()
+
     def test_no_callback_means_no_silence_detection(self, mock_sd):
         np = pytest.importorskip("numpy")
 
diff --git a/tools/voice_mode.py b/tools/voice_mode.py
index cdffa99086..d4fd00f19b 100644
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@@ -117,10 +117,18 @@ class AudioRecorder:
         self._start_time: float = 0.0
         # Silence detection state
         self._has_spoken = False
+        self._speech_start: float = 0.0  # When speech attempt began
+        self._dip_start: float = 0.0  # When current below-threshold dip began
+        self._min_speech_duration: float = 0.3  # Seconds of speech needed to confirm
+        self._max_dip_tolerance: float = 0.3  # Max dip duration before resetting speech
         self._silence_start: float = 0.0
         self._on_silence_stop = None
         self._silence_threshold: int = SILENCE_RMS_THRESHOLD
         self._silence_duration: float = SILENCE_DURATION_SECONDS
+        # Peak RMS seen during recording (for speech presence check in stop())
+        self._peak_rms: int = 0
+        # Live audio level (read by UI for visual feedback)
+        self._current_rms: int = 0
 
     # -- public properties ---------------------------------------------------
 
@@ -134,6 +142,11 @@ class AudioRecorder:
             return 0.0
         return time.monotonic() - self._start_time
 
+    @property
+    def current_rms(self) -> int:
+        """Current audio input RMS level (0-32767). Updated each audio chunk."""
+        return self._current_rms
+
     # -- public methods ------------------------------------------------------
 
     def start(self, on_silence_stop=None) -> None:
@@ -161,7 +174,10 @@ class AudioRecorder:
         self._frames = []
         self._start_time = time.monotonic()
         self._has_spoken = False
+        self._speech_start = 0.0
+        self._dip_start = 0.0
         self._silence_start = 0.0
+        self._peak_rms = self._current_rms = 0  # also clear the stale UI level
         self._on_silence_stop = on_silence_stop
 
         def _callback(indata, frames, time_info, status):  # noqa: ARG001
@@ -169,15 +185,44 @@ class AudioRecorder:
                 logger.debug("sounddevice status: %s", status)
             self._frames.append(indata.copy())
 
-            # Silence detection: compute RMS of this chunk
+            # Compute RMS for level display and silence detection
+            rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
+            self._current_rms = rms
+            if rms > self._peak_rms:
+                self._peak_rms = rms
+
+            # Silence detection
             if self._on_silence_stop is not None and self._recording:
-                rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
                 now = time.monotonic()
 
                 if rms > self._silence_threshold:
-                    self._has_spoken = True
+                    # Audio is above threshold -- this is speech (or noise).
+                    self._dip_start = 0.0  # Reset dip tracker
+                    if self._speech_start == 0.0:
+                        self._speech_start = now
+                    elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
+                        self._has_spoken = True
+                        logger.debug("Speech confirmed (%.2fs above threshold)",
+                                     now - self._speech_start)
                     self._silence_start = 0.0
                 elif self._has_spoken:
+                    # Speech already confirmed, let silence timer run below
+                    pass
+                elif self._speech_start > 0:
+                    # We were in a speech attempt but RMS dipped.
+                    # Tolerate brief dips (micro-pauses between syllables).
+                    if self._dip_start == 0.0:
+                        self._dip_start = now
+                    elif now - self._dip_start >= self._max_dip_tolerance:
+                        # Dip lasted too long -- genuine silence, reset
+                        logger.debug("Speech attempt reset (dip lasted %.2fs)",
+                                     now - self._dip_start)
+                        self._speech_start = 0.0
+                        self._dip_start = 0.0
+                    # else: brief dip, keep tolerating
+                # else: no speech attempt, just silence -- nothing to do
+
+                if self._has_spoken and rms <= self._silence_threshold:
                     # User was speaking and now is silent
                     if self._silence_start == 0.0:
                         self._silence_start = now
@@ -235,10 +280,11 @@ class AudioRecorder:
             logger.debug("Recording too short (%d samples), discarding", len(audio_data))
             return None
 
-        # Skip silent recordings (RMS below threshold = no real speech)
-        rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
-        if rms < SILENCE_RMS_THRESHOLD:
-            logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
+        # Skip silent recordings: peak RMS (an average is diluted by trailing
+        # silence) vs. the instance threshold the live detector also uses.
+        if self._peak_rms < self._silence_threshold:
+            logger.info("Recording too quiet (peak RMS=%d < %d), discarding",
+                        self._peak_rms, self._silence_threshold)
             return None
 
         return self._write_wav(audio_data)
@@ -341,8 +387,34 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
 
 
 # ============================================================================
-# Audio playback
+# Audio playback (interruptable)
 # ============================================================================
+
+# Global reference to the active playback process so it can be interrupted.
+_active_playback: Optional[subprocess.Popen] = None
+_playback_lock = threading.Lock()
+
+
+def stop_playback() -> None:
+    """Interrupt the currently playing audio (if any)."""
+    global _active_playback
+    with _playback_lock:
+        proc = _active_playback
+        _active_playback = None
+    if proc and proc.poll() is None:
+        try:
+            proc.terminate()
+            logger.info("Audio playback interrupted")
+        except Exception:
+            pass
+    # Also stop sounddevice playback if active
+    if _HAS_AUDIO:
+        try:
+            sd.stop()
+        except Exception:
+            pass
+
+
 def play_audio_file(file_path: str) -> bool:
     """Play an audio file through the default output device.
 
@@ -351,9 +423,13 @@ def play_audio_file(file_path: str) -> bool:
     2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
        ``aplay`` (Linux ALSA).
 
+    Playback can be interrupted by calling ``stop_playback()``.
+
     Returns:
         ``True`` if playback succeeded, ``False`` otherwise.
     """
+    global _active_playback
+
     if not os.path.isfile(file_path):
         logger.warning("Audio file not found: %s", file_path)
         return False
@@ -372,7 +448,7 @@ def play_audio_file(file_path: str) -> bool:
         except Exception as e:
             logger.debug("sounddevice playback failed: %s", e)
 
-    # Fall back to system audio players
+    # Fall back to system audio players (using Popen for interruptability)
     system = platform.system()
     players = []
 
@@ -386,10 +462,26 @@ def play_audio_file(file_path: str) -> bool:
         exe = shutil.which(cmd[0])
         if exe:
             try:
-                subprocess.run(cmd, capture_output=True, timeout=300)
+                proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                with _playback_lock:
+                    _active_playback = proc
+                try:
+                    proc.wait(timeout=300)
+                except subprocess.TimeoutExpired:
+                    # Unlike subprocess.run(), Popen.wait() leaves the child
+                    # running on timeout -- kill and reap it before moving on.
+                    proc.kill()
+                    proc.wait()
+                    raise
+                finally:
+                    with _playback_lock:
+                        # Clear only our own entry; stop_playback() or a newer
+                        # playback may already have replaced it.
+                        if _active_playback is proc:
+                            _active_playback = None
                 return True
             except Exception as e:
                 logger.debug("System player %s failed: %s", cmd[0], e)
 
     logger.warning("No audio player available for %s", file_path)
     return False