From 8d1706ae5cb2e0bcffd6496d45e3c7f10e4b0cc1 Mon Sep 17 00:00:00 2001
From: Brooklyn Nicholson <brooklyn.bb.nicholson@gmail.com>
Date: Wed, 24 Jun 2026 18:26:14 -0500
Subject: [PATCH] fix(desktop): wire Ctrl+B voice, declutter voice settings,
 stop endless TTS hang
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three voice-mode papercuts in the desktop app:

1. Ctrl+B did nothing. The docs + `voice.record_key` advertise Ctrl+B to
   talk, but the desktop never bound it (only ⌘B = sidebar existed). Add a
   rebindable `composer.voice` action that toggles the voice conversation,
   defaulting to ⌃B on macOS (distinct from ⌘B). Off macOS, `ctrl` folds to
   `mod`, which is already the Ctrl+B sidebar chord, so the action ships
   unbound there rather than stealing it. The global keybind reaches the
   composer through a new focus-bus event.

2. The Voice settings page rendered every provider's options at once (~30
   fields). Filter to the *selected* TTS/STT provider's sub-fields; STT
   provider fields hide when STT is off. Picking "edge" now shows just the
   Edge voice, making it obvious voice chat also needs STT enabled.

3. Voice mode could hang "speaking" forever. Free Edge TTS sometimes returns
   audio that never fires `playing`, `ended`, or `error`, so the playback
   promise never settled. Add a 15 s stall watchdog (rearmed on every
   `timeupdate`, so long speech is never cut off) that rejects a stuck
   stream, letting the loop recover with a clear error.
---
 apps/desktop/src/app/chat/composer/focus.ts   |  8 ++++
 apps/desktop/src/app/chat/composer/index.tsx  | 21 +++++++-
 apps/desktop/src/app/hooks/use-keybinds.ts    |  3 +-
 .../src/app/settings/config-settings.tsx      | 27 ++++++++++-
 .../app/settings/voice-field-visible.test.ts  | 48 +++++++++++++++++++
 apps/desktop/src/i18n/en.ts                   |  1 +
 apps/desktop/src/i18n/zh.ts                   |  1 +
 apps/desktop/src/lib/keybinds/actions.ts      |  8 ++++
 apps/desktop/src/lib/voice-playback.ts        | 29 ++++++++++-
 9 files changed, 141 insertions(+), 5 deletions(-)
 create mode 100644 apps/desktop/src/app/settings/voice-field-visible.test.ts
diff --git a/apps/desktop/src/app/chat/composer/focus.ts b/apps/desktop/src/app/chat/composer/focus.ts
index 916436de0b2..3de3f5c9800 100644
--- a/apps/desktop/src/app/chat/composer/focus.ts
+++ b/apps/desktop/src/app/chat/composer/focus.ts
@@ -34,6 +34,7 @@ interface InsertRefsDetail {
 const FOCUS_EVENT = 'hermes:composer-focus'
 const INSERT_EVENT = 'hermes:composer-insert'
 const INSERT_REFS_EVENT = 'hermes:composer-insert-refs'
+const VOICE_TOGGLE_EVENT = 'hermes:composer-voice-toggle'
 
 let activeTarget: ComposerTarget = 'main'
 
@@ -105,6 +106,13 @@ export const requestComposerInsertRefs = (
 export const onComposerInsertRefsRequest = (handler: (detail: InsertRefsDetail) => void) =>
   subscribe<InsertRefsDetail>(INSERT_REFS_EVENT, handler)
 
+/** Toggle the active composer's voice conversation — the `composer.voice`
+ *  hotkey (Ctrl+B) reaching into the composer that owns the voice state. */
+export const requestVoiceToggle = () => dispatch<{ at: number }>(VOICE_TOGGLE_EVENT, { at: Date.now() })
+
+export const onComposerVoiceToggleRequest = (handler: () => void) =>
+  subscribe<{ at: number }>(VOICE_TOGGLE_EVENT, () => handler())
+
 /**
  * Focus a composer input across React commit + browser focus restore.
  *
diff --git a/apps/desktop/src/app/chat/composer/index.tsx b/apps/desktop/src/app/chat/composer/index.tsx
index 5b38cae3ff0..d4ec0a36a1d 100644
--- a/apps/desktop/src/app/chat/composer/index.tsx
+++ b/apps/desktop/src/app/chat/composer/index.tsx
@@ -79,7 +79,8 @@ import {
   markActiveComposer,
   onComposerFocusRequest,
   onComposerInsertRefsRequest,
-  onComposerInsertRequest
+  onComposerInsertRequest,
+  onComposerVoiceToggleRequest
 } from './focus'
 import { HelpHint } from './help-hint'
 import { useAtCompletions } from './hooks/use-at-completions'
@@ -1844,6 +1845,24 @@ export function ChatBar({
     pendingResponse
   })
 
+  // The `composer.voice` hotkey (Ctrl+B) toggles the conversation. Starting
+  // with STT unconfigured lets the conversation surface its own "configure
+  // speech-to-text" notice rather than silently no-opping.
+  const toggleVoiceConversation = useCallback(() => {
+    if (disabled) {
+      return
+    }
+
+    if (voiceConversationActive) {
+      setVoiceConversationActive(false)
+      void conversation.end()
+    } else {
+      setVoiceConversationActive(true)
+    }
+  }, [conversation, disabled, voiceConversationActive])
+
+  useEffect(() => onComposerVoiceToggleRequest(toggleVoiceConversation), [toggleVoiceConversation])
+
   const contextMenu = (
     <ContextMenu
       onInsertText={insertText}
diff --git a/apps/desktop/src/app/hooks/use-keybinds.ts b/apps/desktop/src/app/hooks/use-keybinds.ts
index 891c834c520..817da734338 100644
--- a/apps/desktop/src/app/hooks/use-keybinds.ts
+++ b/apps/desktop/src/app/hooks/use-keybinds.ts
@@ -40,7 +40,7 @@ import {
 import { openNewSessionInNewWindow } from '@/store/windows'
 import { useTheme } from '@/themes/context'
 
-import { requestComposerFocus } from '../chat/composer/focus'
+import { requestComposerFocus, requestVoiceToggle } from '../chat/composer/focus'
 import { SIDEBAR_COLLAPSE_MEDIA_QUERY } from '../layout-constants'
 import {
   AGENTS_ROUTE,
@@ -114,6 +114,7 @@ export function useKeybinds(deps: KeybindRuntimeDeps): void {
 
     'composer.focus': () => requestComposerFocus('main'),
     'composer.modelPicker': () => setModelPickerOpen(true),
+    'composer.voice': requestVoiceToggle,
 
     'nav.commandPalette': toggleCommandPalette,
     'nav.commandCenter': deps.toggleCommandCenter,
diff --git a/apps/desktop/src/app/settings/config-settings.tsx b/apps/desktop/src/app/settings/config-settings.tsx
index 3f570f7adfb..dbd2280c4a0 100644
--- a/apps/desktop/src/app/settings/config-settings.tsx
+++ b/apps/desktop/src/app/settings/config-settings.tsx
@@ -26,6 +26,26 @@ import { ModelSettings } from './model-settings'
 import { EmptyState, ListRow, LoadingState, SettingsContent } from './primitives'
 import { ProviderConfigPanel } from './provider-config-panel'
 
+// On the Voice page, only surface the sub-fields of the *selected* TTS/STT
+// provider — otherwise every provider's options render at once (a wall of
+// ~30 fields). Top-level keys (tts.provider, stt.enabled, voice.*) always
+// show; STT provider sub-fields are hidden entirely while STT is disabled.
+export function voiceFieldVisible(key: string, config: HermesConfigRecord): boolean {
+  const match = /^(tts|stt)\.([^.]+)\./.exec(key)
+
+  if (!match) {
+    return true
+  }
+
+  const [, domain, provider] = match
+
+  if (domain === 'stt' && !getNested(config, 'stt.enabled')) {
+    return false
+  }
+
+  return provider === String(getNested(config, `${domain}.provider`) ?? '')
+}
+
 function ConfigField({
   schemaKey,
   schema,
@@ -356,6 +376,9 @@ export function ConfigSettings({
     return <LoadingState label={c.loading} />
   }
 
+  const visibleFields =
+    activeSectionId === 'voice' ? fields.filter(([key]) => voiceFieldVisible(key, config)) : fields
+
   return (
     <SettingsContent>
       {activeSectionId === 'model' && (
@@ -363,11 +386,11 @@ export function ConfigSettings({
           <ModelSettings onMainModelChanged={onMainModelChanged} />
         </div>
       )}
-      {fields.length === 0 ? (
+      {visibleFields.length === 0 ? (
         <EmptyState description={c.emptyDesc} title={c.emptyTitle} />
       ) : (
         <div className="grid gap-1">
-          {fields.map(([key, field]) => (
+          {visibleFields.map(([key, field]) => (
             <div className="scroll-mt-6 rounded-lg" id={`setting-field-${key}`} key={key}>
               <ConfigField
                 descriptionExtra={
diff --git a/apps/desktop/src/app/settings/voice-field-visible.test.ts b/apps/desktop/src/app/settings/voice-field-visible.test.ts
new file mode 100644
index 00000000000..61228e7c7db
--- /dev/null
+++ b/apps/desktop/src/app/settings/voice-field-visible.test.ts
@@ -0,0 +1,48 @@
+import { describe, expect, it } from 'vitest'
+
+import type { HermesConfigRecord } from '@/types/hermes'
+
+import { voiceFieldVisible } from './config-settings'
+
+const cfg = (over: Record<string, unknown> = {}): HermesConfigRecord =>
+  ({
+    tts: { provider: 'edge', edge: {}, openai: {} },
+    stt: { enabled: true, provider: 'local', local: {}, groq: {} },
+    ...over
+  }) as unknown as HermesConfigRecord
+
+describe('voiceFieldVisible', () => {
+  it('always shows top-level + non-provider keys', () => {
+    const config = cfg()
+
+    for (const key of ['tts.provider', 'stt.enabled', 'stt.provider', 'voice.auto_tts', 'voice.record_key']) {
+      expect(voiceFieldVisible(key, config)).toBe(true)
+    }
+  })
+
+  it('shows only the selected TTS provider sub-fields', () => {
+    const config = cfg()
+    expect(voiceFieldVisible('tts.edge.voice', config)).toBe(true)
+    expect(voiceFieldVisible('tts.openai.voice', config)).toBe(false)
+    expect(voiceFieldVisible('tts.elevenlabs.voice_id', config)).toBe(false)
+  })
+
+  it('shows only the selected STT provider sub-fields', () => {
+    const config = cfg()
+    expect(voiceFieldVisible('stt.local.model', config)).toBe(true)
+    expect(voiceFieldVisible('stt.groq.model', config)).toBe(false)
+  })
+
+  it('hides every STT provider sub-field when STT is disabled', () => {
+    const config = cfg({ stt: { enabled: false, provider: 'local', local: {} } })
+    expect(voiceFieldVisible('stt.local.model', config)).toBe(false)
+    // ...but the enable/provider toggles themselves stay visible.
+    expect(voiceFieldVisible('stt.enabled', config)).toBe(true)
+    expect(voiceFieldVisible('stt.provider', config)).toBe(true)
+  })
+
+  it('tracks a provider switch', () => {
+    expect(voiceFieldVisible('tts.openai.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(true)
+    expect(voiceFieldVisible('tts.edge.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(false)
+  })
+})
diff --git a/apps/desktop/src/i18n/en.ts b/apps/desktop/src/i18n/en.ts
index 8a1a295ce92..e20b118f262 100644
--- a/apps/desktop/src/i18n/en.ts
+++ b/apps/desktop/src/i18n/en.ts
@@ -211,6 +211,7 @@ export const en: Translations = {
       'session.togglePin': 'Pin / unpin current session',
       'composer.focus': 'Focus composer',
       'composer.modelPicker': 'Open model picker',
+      'composer.voice': 'Start / stop voice conversation',
       'view.toggleSidebar': 'Toggle sessions sidebar',
       'view.toggleRightSidebar': 'Toggle file browser',
       'view.showFiles': 'Show file browser',
diff --git a/apps/desktop/src/i18n/zh.ts b/apps/desktop/src/i18n/zh.ts
index 6423e1749a9..dec1959362f 100644
--- a/apps/desktop/src/i18n/zh.ts
+++ b/apps/desktop/src/i18n/zh.ts
@@ -206,6 +206,7 @@ export const zh: Translations = {
       'session.togglePin': '固定/取消固定当前会话',
       'composer.focus': '聚焦输入框',
       'composer.modelPicker': '打开模型选择器',
+      'composer.voice': '开始 / 停止语音对话',
       'view.toggleSidebar': '切换会话侧边栏',
       'view.toggleRightSidebar': '切换文件浏览器',
       'view.showFiles': '显示文件浏览器',
diff --git a/apps/desktop/src/lib/keybinds/actions.ts b/apps/desktop/src/lib/keybinds/actions.ts
index 38eab936f09..361906b213f 100644
--- a/apps/desktop/src/lib/keybinds/actions.ts
+++ b/apps/desktop/src/lib/keybinds/actions.ts
@@ -5,6 +5,8 @@
 // like navigate / theme); labels come from i18n (`t.keybinds.actions[id]`). To
 // add a hotkey, add a row here and a handler there — nothing else.
 
+import { IS_MAC } from './combo'
+
 export type KeybindCategory = 'composer' | 'profiles' | 'session' | 'navigation' | 'view'
 
 // The self-referential opener — bound + dispatched like any action, but shown in
@@ -55,6 +57,12 @@ export const KEYBIND_ACTIONS: readonly KeybindActionMeta[] = [
   // ── Composer ─────────────────────────────────────────────────────────────
   { id: 'composer.focus', category: 'composer', defaults: [] },
   { id: 'composer.modelPicker', category: 'composer', defaults: [] },
+  // Voice conversation toggle. Matches the documented `voice.record_key`
+  // (Ctrl+B). On macOS that's literally ⌃B — distinct from the ⌘B sidebar
+  // toggle. Off macOS `ctrl` folds to `mod`, which IS the ⌘B/Ctrl+B sidebar
+  // chord, so ship it unbound there (rebindable in the panel) rather than
+  // stealing the long-standing sidebar binding.
+  { id: 'composer.voice', category: 'composer', defaults: IS_MAC ? ['ctrl+b'] : [] },
 
   // ── Profiles ─────────────────────────────────────────────────────────────
   { id: 'profile.default', category: 'profiles', defaults: ['mod+d'] },
diff --git a/apps/desktop/src/lib/voice-playback.ts b/apps/desktop/src/lib/voice-playback.ts
index 1554ed8a315..eea1b5b6e0a 100644
--- a/apps/desktop/src/lib/voice-playback.ts
+++ b/apps/desktop/src/lib/voice-playback.ts
@@ -8,6 +8,12 @@ import {
 
 import { sanitizeTextForSpeech } from './speech-text'
 
+// Free Edge TTS occasionally hands back audio that never fires `playing`,
+// `ended`, or `error`, leaving voice mode stuck "speaking" forever. Reject if
+// playback fails to start or stalls mid-stream for this long (the timer is
+// rearmed on every `timeupdate`, so legitimately long speech is never cut off).
+const PLAYBACK_STALL_MS = 15_000
+
 let currentAudio: HTMLAudioElement | null = null
 let currentStop: (() => void) | null = null
 let sequence = 0
@@ -78,12 +84,31 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
     setVoicePlaybackState(currentState('speaking', options, audio))
 
     await new Promise<void>((resolve, reject) => {
+      let stall: number | null = null
+
       const cleanup = () => {
+        if (stall !== null) {
+          window.clearTimeout(stall)
+          stall = null
+        }
+
         audio.removeEventListener('ended', onEnded)
         audio.removeEventListener('error', onError)
+        audio.removeEventListener('timeupdate', armStall)
         currentStop = null
       }
 
+      const armStall = () => {
+        if (stall !== null) {
+          window.clearTimeout(stall)
+        }
+
+        stall = window.setTimeout(() => {
+          cleanup()
+          reject(new Error('Playback stalled'))
+        }, PLAYBACK_STALL_MS)
+      }
+
       const onEnded = () => {
         cleanup()
         resolve()
@@ -101,7 +126,9 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
 
       audio.addEventListener('ended', onEnded, { once: true })
       audio.addEventListener('error', onError, { once: true })
-      void audio.play().catch(reject)
+      audio.addEventListener('timeupdate', armStall)
+      armStall()
+      void audio.play().catch(onError)
     })
 
     if (!isCurrent()) {