diff --git a/cli.py b/cli.py
index 9fb613c851..e35fdafb9d 100755
--- a/cli.py
+++ b/cli.py
@@ -1550,6 +1550,7 @@ class HermesCLI:
                 checkpoints_enabled=self.checkpoints_enabled,
                 checkpoint_max_snapshots=self.checkpoint_max_snapshots,
                 pass_session_id=self.pass_session_id,
+                tool_progress_callback=self._on_tool_progress,
             )
             # Apply any pending title now that the session exists in the DB
             if self._pending_title and self._session_db:
@@ -3515,6 +3516,28 @@ class HermesCLI:
         except Exception as e:
             print(f"  ❌ MCP reload failed: {e}")
 
+    # ====================================================================
+    # Tool progress callback (audio cues for voice mode)
+    # ====================================================================
+
+    def _on_tool_progress(self, function_name: str, preview: str, function_args: dict):
+        """Tool-progress callback: play a short tick via play_beep when voice mode is on."""
+        if not self._voice_mode:
+            return
+        # Skip tools whose name starts with "_" (treated as internal/thinking tools)
+        if function_name.startswith("_"):
+            return
+        try:
+            from tools.voice_mode import play_beep
+            # Short, subtle tick; runs on a daemon thread so this callback does not wait for playback
+            threading.Thread(
+                target=play_beep,
+                kwargs={"frequency": 1200, "duration": 0.06, "count": 1},
+                daemon=True,
+            ).start()
+        except Exception:
+            pass  # best-effort cue: import/thread-start failures are ignored
+
     # ====================================================================
     # Voice mode methods
     # ====================================================================
@@ -3536,9 +3559,21 @@ class HermesCLI:
                 "Get one at: https://platform.openai.com/api-keys"
             )
 
+        # Load silence detection params from config (best-effort; fall back to defaults)
+        voice_cfg = {}
+        try:
+            from hermes_cli.config import load_config
+            voice_cfg = load_config().get("voice") or {}  # a null "voice" entry yields None, not {}
+        except Exception:
+            pass
+
         if self._voice_recorder is None:
             self._voice_recorder = AudioRecorder()
 
+        # Apply config-driven silence params
+        self._voice_recorder._silence_threshold = voice_cfg.get("silence_threshold", 200)
+        self._voice_recorder._silence_duration = voice_cfg.get("silence_duration", 3.0)
+
         def _on_silence():
             """Called by AudioRecorder when silence is detected after speech."""
             with self._voice_lock:
@@ -3549,18 +3584,26 @@ class HermesCLI:
                 self._app.invalidate()
             self._voice_stop_and_transcribe()
 
+        # Audio cue: single beep BEFORE starting stream (avoid CoreAudio conflict)
+        try:
+            from tools.voice_mode import play_beep
+            play_beep(frequency=880, count=1)
+        except Exception:
+            pass
+
         self._voice_recorder.start(on_silence_stop=_on_silence)
         with self._voice_lock:
             self._voice_recording = True
-
-        # Audio cue: single beep on recording start
-        try:
-            from tools.voice_mode import play_beep
-            threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start()
-        except Exception:
-            pass
         _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
 
+        # Periodically refresh prompt to update audio level indicator
+        def _refresh_level():
+            while self._voice_recording:
+                if hasattr(self, '_app') and self._app:
+                    self._app.invalidate()
+                time.sleep(0.15)
+        threading.Thread(target=_refresh_level, daemon=True).start()
+
     def _voice_stop_and_transcribe(self):
         """Stop recording, transcribe via STT, and queue the transcript as input."""
         try:
@@ -3571,15 +3614,15 @@ class HermesCLI:
             with self._voice_lock:
                 self._voice_recording = False
 
-            # Audio cue: double beep on recording stop
+            # Audio cue: double beep after stream stopped (no CoreAudio conflict)
             try:
                 from tools.voice_mode import play_beep
-                threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start()
+                play_beep(frequency=660, count=2)
             except Exception:
                 pass
 
             if wav_path is None:
-                _cprint(f"{_DIM}No speech detected (recording too short).{_RST}")
+                _cprint(f"{_DIM}No speech detected.{_RST}")
                 return
 
             with self._voice_lock:
@@ -3614,6 +3657,7 @@ class HermesCLI:
         finally:
             with self._voice_lock:
                 self._voice_processing = False
+                submitted = self._pending_input.qsize() > 0
             if hasattr(self, '_app') and self._app:
                 self._app.invalidate()
             # Clean up temp file
@@ -3623,6 +3667,18 @@ class HermesCLI:
             except Exception:
                 pass
 
+            # If no transcript was submitted but continuous mode is active,
+            # restart recording so the user can keep talking.
+            # (When transcript IS submitted, process_loop handles restart
+            # after chat() completes.)
+            if self._voice_continuous and not submitted and not self._voice_recording:
+                try:
+                    self._voice_start_recording()
+                    if hasattr(self, '_app') and self._app:
+                        self._app.invalidate()
+                except Exception:
+                    pass
+
     def _voice_speak_response(self, text: str):
         """Speak the agent's response aloud using TTS (runs in background thread)."""
         if not self._voice_tts:
@@ -3727,6 +3783,16 @@ class HermesCLI:
         except Exception:
             pass
 
+        # Append voice-mode system prompt for concise, conversational responses
+        self._voice_original_prompt = self.system_prompt
+        voice_instruction = (
+            "\n\n[Voice mode active] The user is speaking via voice input. "
+            "Keep responses concise and conversational — 2-3 sentences max unless "
+            "the user asks for detail. Avoid code blocks, markdown formatting, "
+            "and long lists. Respond naturally as in a spoken conversation."
+        )
+        self.system_prompt = (self.system_prompt or "") + voice_instruction
+
         tts_status = " (TTS enabled)" if self._voice_tts else ""
         _cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
         _cprint(f"  {_DIM}Ctrl+R to start/stop recording{_RST}")
@@ -3742,6 +3808,10 @@ class HermesCLI:
             self._voice_mode = False
             self._voice_tts = False
             self._voice_continuous = False
+
+        # Restore original system prompt
+        if hasattr(self, '_voice_original_prompt'):
+            self.system_prompt = self._voice_original_prompt
         _cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
 
     def _toggle_voice_tts(self):
@@ -4237,11 +4307,24 @@ class HermesCLI:
         # Icon-only custom prompts should still remain visible in special states.
         return symbol, symbol
 
+    def _audio_level_bar(self) -> str:
+        """Return one block character (" " to "▇") for the recorder's current RMS, or "" if no recorder."""
+        _LEVEL_BARS = " ▁▂▃▄▅▆▇"
+        rec = getattr(self, "_voice_recorder", None)
+        if rec is None:
+            return ""
+        rms = rec.current_rms
+        # Map RMS linearly onto index 0-7 of _LEVEL_BARS (integer math, no log scaling)
+        # RMS is clamped at 8000, so anything at or above that shows the tallest bar
+        level = min(rms, 8000) * 7 // 8000
+        return _LEVEL_BARS[level]
+
     def _get_tui_prompt_fragments(self):
         """Return the prompt_toolkit fragments for the current interactive state."""
         symbol, state_suffix = self._get_tui_prompt_symbols()
         if self._voice_recording:
-            return [("class:voice-recording", f"● {state_suffix}")]
+            bar = self._audio_level_bar()
+            return [("class:voice-recording", f"● {bar} {state_suffix}")]
         if self._voice_processing:
             return [("class:voice-processing", f"◉ {state_suffix}")]
         if self._sudo_state:
@@ -4692,6 +4775,14 @@ class HermesCLI:
                 ).start()
             else:
                 try:
+                    # Interrupt TTS if playing, so user can start talking
+                    if not cli_ref._voice_tts_done.is_set():
+                        try:
+                            from tools.voice_mode import stop_playback
+                            stop_playback()
+                            cli_ref._voice_tts_done.set()
+                        except Exception:
+                            pass
                     with cli_ref._voice_lock:
                         cli_ref._voice_continuous = True
                     cli_ref._voice_start_recording()
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 174e4326e4..8dc2076404 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -207,6 +207,8 @@ DEFAULT_CONFIG = {
         "record_key": "ctrl+r",
         "max_recording_seconds": 120,
         "auto_tts": False,
+        "silence_threshold": 200,     # RMS below this = silence (0-32767)
+        "silence_duration": 3.0,      # Seconds of silence before auto-stop
     },
     
     "human_delay": {
diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py
index 0d40932e2f..e6a46def7c 100644
--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@@ -157,6 +157,7 @@ class TestAudioRecorderStop:
         # Simulate captured audio frames (1 second of loud audio above RMS threshold)
         frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
         recorder._frames = [frame]
+        recorder._peak_rms = 1000  # Peak RMS above threshold
 
         wav_path = recorder.stop()
 
@@ -203,6 +204,7 @@ class TestAudioRecorderStop:
         # 1 second of near-silence (RMS well below threshold)
         frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16")
         recorder._frames = [frame]
+        recorder._peak_rms = 10  # Peak RMS also below threshold
 
         wav_path = recorder.stop()
         assert wav_path is None
@@ -475,8 +477,9 @@ class TestSilenceDetection:
         from tools.voice_mode import AudioRecorder, SAMPLE_RATE
 
         recorder = AudioRecorder()
-        # Use very short silence duration for testing
+        # Use very short durations for testing
         recorder._silence_duration = 0.05
+        recorder._min_speech_duration = 0.05
 
         fired = threading.Event()
 
@@ -490,9 +493,11 @@ class TestSilenceDetection:
         if callback is None:
             callback = mock_sd.InputStream.call_args[1]["callback"]
 
-        # Simulate loud audio (speech) -- RMS well above threshold
+        # Simulate sustained speech (multiple loud chunks to exceed min_speech_duration)
         loud_frame = np.full((1600, 1), 5000, dtype="int16")
         callback(loud_frame, 1600, None, None)
+        time.sleep(0.06)
+        callback(loud_frame, 1600, None, None)
         assert recorder._has_spoken is True
 
         # Simulate silence
@@ -537,6 +542,47 @@ class TestSilenceDetection:
 
         recorder.cancel()
 
+    def test_micro_pause_tolerance_during_speech(self, mock_sd):
+        """Brief dips below threshold during speech should NOT reset speech tracking."""
+        np = pytest.importorskip("numpy")
+        import threading
+
+        mock_stream = MagicMock()
+        mock_sd.InputStream.return_value = mock_stream
+
+        from tools.voice_mode import AudioRecorder
+
+        recorder = AudioRecorder()
+        recorder._silence_duration = 0.05
+        recorder._min_speech_duration = 0.15
+        recorder._max_dip_tolerance = 0.1
+
+        fired = threading.Event()
+        recorder.start(on_silence_stop=lambda: fired.set())
+
+        callback = mock_sd.InputStream.call_args.kwargs.get("callback")
+        if callback is None:
+            callback = mock_sd.InputStream.call_args[1]["callback"]
+
+        loud_frame = np.full((1600, 1), 5000, dtype="int16")
+        quiet_frame = np.full((1600, 1), 50, dtype="int16")
+
+        callback(loud_frame, 1600, None, None)  # speech chunk 1 starts the speech attempt
+        speech_start = recorder._speech_start  # remembered to detect a reset + re-set
+        time.sleep(0.05)
+        # Brief micro-pause (dip < max_dip_tolerance)
+        callback(quiet_frame, 1600, None, None)
+        time.sleep(0.05)
+        # Speech resumes -- compare with the saved value; "> 0" would also pass after a reset
+        callback(loud_frame, 1600, None, None)
+        assert recorder._speech_start == speech_start, "Speech start should be preserved across brief dips"
+        time.sleep(0.06)
+        # Another speech chunk to exceed min_speech_duration
+        callback(loud_frame, 1600, None, None)
+        assert recorder._has_spoken is True, "Speech should be confirmed after tolerating micro-pause"
+
+        recorder.cancel()
+
     def test_no_callback_means_no_silence_detection(self, mock_sd):
         np = pytest.importorskip("numpy")
 
diff --git a/tools/voice_mode.py b/tools/voice_mode.py
index cdffa99086..d4fd00f19b 100644
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@@ -117,10 +117,18 @@ class AudioRecorder:
         self._start_time: float = 0.0
         # Silence detection state
         self._has_spoken = False
+        self._speech_start: float = 0.0  # When speech attempt began
+        self._dip_start: float = 0.0  # When current below-threshold dip began
+        self._min_speech_duration: float = 0.3  # Seconds of speech needed to confirm
+        self._max_dip_tolerance: float = 0.3  # Max dip duration before resetting speech
         self._silence_start: float = 0.0
         self._on_silence_stop = None
         self._silence_threshold: int = SILENCE_RMS_THRESHOLD
         self._silence_duration: float = SILENCE_DURATION_SECONDS
+        # Peak RMS seen during recording (for speech presence check in stop())
+        self._peak_rms: int = 0
+        # Live audio level (read by UI for visual feedback)
+        self._current_rms: int = 0
 
     # -- public properties ---------------------------------------------------
 
@@ -134,6 +142,11 @@ class AudioRecorder:
             return 0.0
         return time.monotonic() - self._start_time
 
+    @property
+    def current_rms(self) -> int:
+        """RMS of the latest audio chunk handed to the stream callback; 0 on a new recorder."""
+        return self._current_rms
+
     # -- public methods ------------------------------------------------------
 
     def start(self, on_silence_stop=None) -> None:
@@ -161,7 +174,10 @@ class AudioRecorder:
             self._frames = []
             self._start_time = time.monotonic()
             self._has_spoken = False
+            self._speech_start = 0.0
+            self._dip_start = 0.0
             self._silence_start = 0.0
+            self._peak_rms = 0
             self._on_silence_stop = on_silence_stop
 
             def _callback(indata, frames, time_info, status):  # noqa: ARG001
@@ -169,15 +185,44 @@ class AudioRecorder:
                     logger.debug("sounddevice status: %s", status)
                 self._frames.append(indata.copy())
 
-                # Silence detection: compute RMS of this chunk
+                # Compute RMS for level display and silence detection
+                rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
+                self._current_rms = rms
+                if rms > self._peak_rms:
+                    self._peak_rms = rms
+
+                # Silence detection
                 if self._on_silence_stop is not None and self._recording:
-                    rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
                     now = time.monotonic()
 
                     if rms > self._silence_threshold:
-                        self._has_spoken = True
+                        # Audio is above threshold -- this is speech (or noise).
+                        self._dip_start = 0.0  # Reset dip tracker
+                        if self._speech_start == 0.0:
+                            self._speech_start = now
+                        elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
+                            self._has_spoken = True
+                            logger.debug("Speech confirmed (%.2fs above threshold)",
+                                         now - self._speech_start)
                         self._silence_start = 0.0
                     elif self._has_spoken:
+                        # Speech already confirmed, let silence timer run below
+                        pass
+                    elif self._speech_start > 0:
+                        # We were in a speech attempt but RMS dipped.
+                        # Tolerate brief dips (micro-pauses between syllables).
+                        if self._dip_start == 0.0:
+                            self._dip_start = now
+                        elif now - self._dip_start >= self._max_dip_tolerance:
+                            # Dip lasted too long -- genuine silence, reset
+                            logger.debug("Speech attempt reset (dip lasted %.2fs)",
+                                         now - self._dip_start)
+                            self._speech_start = 0.0
+                            self._dip_start = 0.0
+                        # else: brief dip, keep tolerating
+                    # else: no speech attempt, just silence -- nothing to do
+
+                    if self._has_spoken and rms <= self._silence_threshold:
                         # User was speaking and now is silent
                         if self._silence_start == 0.0:
                             self._silence_start = now
@@ -235,10 +280,11 @@ class AudioRecorder:
                 logger.debug("Recording too short (%d samples), discarding", len(audio_data))
                 return None
 
-            # Skip silent recordings (RMS below threshold = no real speech)
-            rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
-            if rms < SILENCE_RMS_THRESHOLD:
-                logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
+            # Skip silent recordings using peak RMS (not the overall average, which is
+            # diluted by trailing silence); use the same instance threshold as the callback.
+            if self._peak_rms < self._silence_threshold:
+                logger.info("Recording too quiet (peak RMS=%d < %d), discarding",
+                            self._peak_rms, self._silence_threshold)
                 return None
 
             return self._write_wav(audio_data)
@@ -341,8 +387,34 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
 
 
 # ============================================================================
-# Audio playback
+# Audio playback (interruptable)
 # ============================================================================
+
+# Global reference to the active playback process so it can be interrupted.
+_active_playback: Optional[subprocess.Popen] = None
+_playback_lock = threading.Lock()
+
+
+def stop_playback() -> None:
+    """Terminate the tracked player subprocess if it is still running, then stop sounddevice output."""
+    global _active_playback
+    with _playback_lock:
+        proc = _active_playback  # take the reference under the lock; terminate outside it
+        _active_playback = None
+    if proc and proc.poll() is None:
+        try:
+            proc.terminate()
+            logger.info("Audio playback interrupted")
+        except Exception:
+            pass  # best-effort: termination errors are ignored
+    # Also stop sounddevice playback; only attempted when _HAS_AUDIO is truthy
+    if _HAS_AUDIO:
+        try:
+            sd.stop()
+        except Exception:
+            pass
+
+
 def play_audio_file(file_path: str) -> bool:
     """Play an audio file through the default output device.
 
@@ -351,9 +423,13 @@ def play_audio_file(file_path: str) -> bool:
     2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
        ``aplay`` (Linux ALSA).
 
+    Playback can be interrupted by calling ``stop_playback()``.
+
     Returns:
         ``True`` if playback succeeded, ``False`` otherwise.
     """
+    global _active_playback
+
     if not os.path.isfile(file_path):
         logger.warning("Audio file not found: %s", file_path)
         return False
@@ -372,7 +448,7 @@ def play_audio_file(file_path: str) -> bool:
         except Exception as e:
             logger.debug("sounddevice playback failed: %s", e)
 
-    # Fall back to system audio players
+    # Fall back to system audio players (using Popen for interruptability)
     system = platform.system()
     players = []
 
@@ -386,10 +462,24 @@ def play_audio_file(file_path: str) -> bool:
         exe = shutil.which(cmd[0])
         if exe:
             try:
-                subprocess.run(cmd, capture_output=True, timeout=300)
+                proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+                with _playback_lock:
+                    _active_playback = proc
+                try:
+                    proc.wait(timeout=300)
+                except subprocess.TimeoutExpired:
+                    # Popen.wait() leaves the child running on timeout (subprocess.run
+                    # killed it), so stop and reap it before trying the next player.
+                    proc.kill()
+                    proc.wait()
+                    raise
+                finally:
+                    with _playback_lock:
+                        if _active_playback is proc:
+                            _active_playback = None
                 return True
             except Exception as e:
                 logger.debug("System player %s failed: %s", cmd[0], e)
 
     logger.warning("No audio player available for %s", file_path)
     return False