fix(desktop): wire Ctrl+B voice, declutter voice settings, stop endless TTS hang

Three voice-mode papercuts in the desktop app: 1. Ctrl+B did nothing. The docs + `voice.record_key` advertise Ctrl+B to talk, but the desktop never bound it (only ⌘B = sidebar existed). Add a rebindable `composer.voice` action that toggles the voice conversation, defaulting to ⌃B on macOS (distinct from ⌘B; off-macOS `ctrl` folds to the sidebar chord, so it ships unbound there to avoid stealing it). The global keybind reaches the composer through a new focus-bus event. 2. The Voice settings page rendered every provider's options at once (~30 fields). Filter to the *selected* TTS/STT provider's sub-fields; STT provider fields hide when STT is off. Picking "edge" now shows just the Edge voice, making it obvious voice chat also needs STT enabled. 3. Voice mode could hang "speaking" forever. Free Edge TTS sometimes returns audio that never fires `playing`/`ended`/`error`, so the playback promise never settled. Add a stall watchdog (rearmed on each progress tick, so long speech is never cut off) that rejects a stuck stream, letting the loop recover with a clear error.
2026-06-27 11:22:03 +00:00 · 2026-06-24 18:26:14 -05:00 · 2026-06-24 18:26:14 -05:00 · 8d1706ae5c
commit 8d1706ae5c
parent 404b06ac4f
9 changed files with 141 additions and 5 deletions
--- a/apps/desktop/src/app/chat/composer/focus.ts
+++ b/apps/desktop/src/app/chat/composer/focus.ts
@ -34,6 +34,7 @@ interface InsertRefsDetail {
 const FOCUS_EVENT = 'hermes:composer-focus'
 const INSERT_EVENT = 'hermes:composer-insert'
 const INSERT_REFS_EVENT = 'hermes:composer-insert-refs'
+const VOICE_TOGGLE_EVENT = 'hermes:composer-voice-toggle'

 let activeTarget: ComposerTarget = 'main'

@ -105,6 +106,13 @@ export const requestComposerInsertRefs = (
 export const onComposerInsertRefsRequest = (handler: (detail: InsertRefsDetail) => void) =>
  subscribe<InsertRefsDetail>(INSERT_REFS_EVENT, handler)

+/** Fire the voice-toggle event for the `composer.voice` hotkey; listeners attach
+ *  via `onComposerVoiceToggleRequest`. Payload is a timestamp only, no target. */
+export const requestVoiceToggle = () => dispatch<{ at: number }>(VOICE_TOGGLE_EVENT, { at: Date.now() })
+
+export const onComposerVoiceToggleRequest = (handler: () => void) =>
+  subscribe<{ at: number }>(VOICE_TOGGLE_EVENT, () => handler()) // payload is dropped before calling handler
+
 /**
 * Focus a composer input across React commit + browser focus restore.
 *
--- a/apps/desktop/src/app/chat/composer/index.tsx
+++ b/apps/desktop/src/app/chat/composer/index.tsx
@ -79,7 +79,8 @@ import {
  markActiveComposer,
  onComposerFocusRequest,
  onComposerInsertRefsRequest,
-  onComposerInsertRequest
+  onComposerInsertRequest,
+  onComposerVoiceToggleRequest
 } from './focus'
 import { HelpHint } from './help-hint'
 import { useAtCompletions } from './hooks/use-at-completions'
@ -1844,6 +1845,24 @@ export function ChatBar({
    pendingResponse
  })

+  // The `composer.voice` hotkey (Ctrl+B) toggles the conversation. Starting
+  // with STT unconfigured lets the conversation surface its own "configure
+  // speech-to-text" notice rather than silently no-opping.
+  const toggleVoiceConversation = useCallback(() => {
+    if (disabled) {
+      return
+    }
+
+    if (voiceConversationActive) {
+      setVoiceConversationActive(false)
+      void conversation.end()
+    } else {
+      setVoiceConversationActive(true)
+    }
+  }, [conversation, disabled, voiceConversationActive])
+
+  useEffect(() => onComposerVoiceToggleRequest(toggleVoiceConversation), [toggleVoiceConversation])
+
  const contextMenu = (
    <ContextMenu
      onInsertText={insertText}
--- a/apps/desktop/src/app/hooks/use-keybinds.ts
+++ b/apps/desktop/src/app/hooks/use-keybinds.ts
@ -40,7 +40,7 @@ import {
 import { openNewSessionInNewWindow } from '@/store/windows'
 import { useTheme } from '@/themes/context'

-import { requestComposerFocus } from '../chat/composer/focus'
+import { requestComposerFocus, requestVoiceToggle } from '../chat/composer/focus'
 import { SIDEBAR_COLLAPSE_MEDIA_QUERY } from '../layout-constants'
 import {
  AGENTS_ROUTE,
@ -114,6 +114,7 @@ export function useKeybinds(deps: KeybindRuntimeDeps): void {

    'composer.focus': () => requestComposerFocus('main'),
    'composer.modelPicker': () => setModelPickerOpen(true),
+    'composer.voice': requestVoiceToggle,

    'nav.commandPalette': toggleCommandPalette,
    'nav.commandCenter': deps.toggleCommandCenter,
--- a/apps/desktop/src/app/settings/config-settings.tsx
+++ b/apps/desktop/src/app/settings/config-settings.tsx
@ -26,6 +26,26 @@ import { ModelSettings } from './model-settings'
 import { EmptyState, ListRow, LoadingState, SettingsContent } from './primitives'
 import { ProviderConfigPanel } from './provider-config-panel'

+// Voice page filter — without it every provider's options render at once (~30
+// fields). `tts.<provider>.*` / `stt.<provider>.*` keys show only when <provider>
+// equals the configured `<domain>.provider`; all `stt.<provider>.*` keys hide
+// while `stt.enabled` is falsy. Any other key (tts.provider, voice.*) shows.
+export function voiceFieldVisible(key: string, config: HermesConfigRecord): boolean {
+  const match = /^(tts|stt)\.([^.]+)\./.exec(key) // requires a third segment, so `tts.provider` does not match
+
+  if (!match) {
+    return true // not a per-provider sub-field
+  }
+
+  const [, domain, provider] = match
+
+  if (domain === 'stt' && !getNested(config, 'stt.enabled')) { // STT off hides every STT provider sub-field
+    return false
+  }
+
+  return provider === String(getNested(config, `${domain}.provider`) ?? '') // a null/undefined provider compares as ''
+}
+
 function ConfigField({
  schemaKey,
  schema,
@ -356,6 +376,9 @@ export function ConfigSettings({
    return <LoadingState label={c.loading} />
  }

+  const visibleFields =
+    activeSectionId === 'voice' ? fields.filter(([key]) => voiceFieldVisible(key, config)) : fields
+
  return (
    <SettingsContent>
      {activeSectionId === 'model' && (
@ -363,11 +386,11 @@ export function ConfigSettings({
          <ModelSettings onMainModelChanged={onMainModelChanged} />
        </div>
      )}
-      {fields.length === 0 ? (
+      {visibleFields.length === 0 ? (
        <EmptyState description={c.emptyDesc} title={c.emptyTitle} />
      ) : (
        <div className="grid gap-1">
-          {fields.map(([key, field]) => (
+          {visibleFields.map(([key, field]) => (
            <div className="scroll-mt-6 rounded-lg" id={`setting-field-${key}`} key={key}>
              <ConfigField
                descriptionExtra={
--- a/apps/desktop/src/app/settings/voice-field-visible.test.ts
+++ b/apps/desktop/src/app/settings/voice-field-visible.test.ts
@ -0,0 +1,48 @@
+import { describe, expect, it } from 'vitest'
+
+import type { HermesConfigRecord } from '@/types/hermes'
+
+import { voiceFieldVisible } from './config-settings'
+
+const cfg = (over: Record<string, unknown> = {}): HermesConfigRecord => // baseline voice config fixture
+  ({
+    tts: { provider: 'edge', edge: {}, openai: {} },
+    stt: { enabled: true, provider: 'local', local: {}, groq: {} },
+    ...over // shallow spread: an override replaces the whole `tts` / `stt` section
+  }) as unknown as HermesConfigRecord
+
+describe('voiceFieldVisible', () => {
+  it('always shows top-level + non-provider keys', () => {
+    const config = cfg()
+
+    for (const key of ['tts.provider', 'stt.enabled', 'stt.provider', 'voice.auto_tts', 'voice.record_key']) {
+      expect(voiceFieldVisible(key, config)).toBe(true)
+    }
+  })
+
+  it('shows only the selected TTS provider sub-fields', () => {
+    const config = cfg()
+    expect(voiceFieldVisible('tts.edge.voice', config)).toBe(true)
+    expect(voiceFieldVisible('tts.openai.voice', config)).toBe(false)
+    expect(voiceFieldVisible('tts.elevenlabs.voice_id', config)).toBe(false) // provider with no section in the fixture at all
+  })
+
+  it('shows only the selected STT provider sub-fields', () => {
+    const config = cfg()
+    expect(voiceFieldVisible('stt.local.model', config)).toBe(true)
+    expect(voiceFieldVisible('stt.groq.model', config)).toBe(false)
+  })
+
+  it('hides every STT provider sub-field when STT is disabled', () => {
+    const config = cfg({ stt: { enabled: false, provider: 'local', local: {} } })
+    expect(voiceFieldVisible('stt.local.model', config)).toBe(false) // hidden even though `local` is the selected provider
+    // ...but the enable/provider toggles themselves stay visible.
+    expect(voiceFieldVisible('stt.enabled', config)).toBe(true)
+    expect(voiceFieldVisible('stt.provider', config)).toBe(true)
+  })
+
+  it('tracks a provider switch', () => {
+    expect(voiceFieldVisible('tts.openai.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(true)
+    expect(voiceFieldVisible('tts.edge.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(false)
+  })
+})
--- a/apps/desktop/src/i18n/en.ts
+++ b/apps/desktop/src/i18n/en.ts
@ -211,6 +211,7 @@ export const en: Translations = {
      'session.togglePin': 'Pin / unpin current session',
      'composer.focus': 'Focus composer',
      'composer.modelPicker': 'Open model picker',
+      'composer.voice': 'Start / stop voice conversation',
      'view.toggleSidebar': 'Toggle sessions sidebar',
      'view.toggleRightSidebar': 'Toggle file browser',
      'view.showFiles': 'Show file browser',
--- a/apps/desktop/src/i18n/zh.ts
+++ b/apps/desktop/src/i18n/zh.ts
@ -206,6 +206,7 @@ export const zh: Translations = {
      'session.togglePin': '固定/取消固定当前会话',
      'composer.focus': '聚焦输入框',
      'composer.modelPicker': '打开模型选择器',
+      'composer.voice': '开始 / 停止语音对话',
      'view.toggleSidebar': '切换会话侧边栏',
      'view.toggleRightSidebar': '切换文件浏览器',
      'view.showFiles': '显示文件浏览器',
--- a/apps/desktop/src/lib/keybinds/actions.ts
+++ b/apps/desktop/src/lib/keybinds/actions.ts
@ -5,6 +5,8 @@
 // like navigate / theme); labels come from i18n (`t.keybinds.actions[id]`). To
 // add a hotkey, add a row here and a handler there — nothing else.

+import { IS_MAC } from './combo'
+
 export type KeybindCategory = 'composer' | 'profiles' | 'session' | 'navigation' | 'view'

 // The self-referential opener — bound + dispatched like any action, but shown in
@ -55,6 +57,12 @@ export const KEYBIND_ACTIONS: readonly KeybindActionMeta[] = [
  // ── Composer ─────────────────────────────────────────────────────────────
  { id: 'composer.focus', category: 'composer', defaults: [] },
  { id: 'composer.modelPicker', category: 'composer', defaults: [] },
+  // Voice conversation toggle. Matches the documented `voice.record_key`
+  // (Ctrl+B). On macOS that's literally ⌃B — distinct from the ⌘B sidebar
+  // toggle. Off macOS `ctrl` folds to `mod`, which IS the ⌘B/Ctrl+B sidebar
+  // chord, so ship it unbound there (rebindable in the panel) rather than
+  // stealing the long-standing sidebar binding.
+  { id: 'composer.voice', category: 'composer', defaults: IS_MAC ? ['ctrl+b'] : [] },

  // ── Profiles ─────────────────────────────────────────────────────────────
  { id: 'profile.default', category: 'profiles', defaults: ['mod+d'] },
--- a/apps/desktop/src/lib/voice-playback.ts
+++ b/apps/desktop/src/lib/voice-playback.ts
@ -8,6 +8,12 @@ import {

 import { sanitizeTextForSpeech } from './speech-text'

+// Free Edge TTS occasionally hands back audio that fires none of `playing`,
+// `ended`, or `error` — leaving voice mode stuck "speaking" forever. Reject if
+// playback fails to start or stalls mid-stream for this long (rearmed on every
+// `timeupdate`, so legitimately long speech is never cut off).
+const PLAYBACK_STALL_MS = 15_000 // milliseconds
+
 let currentAudio: HTMLAudioElement | null = null
 let currentStop: (() => void) | null = null
 let sequence = 0
@ -78,12 +84,31 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
    setVoicePlaybackState(currentState('speaking', options, audio))

    await new Promise<void>((resolve, reject) => {
+      let stall: number | null = null
+
      const cleanup = () => {
+        if (stall !== null) {
+          window.clearTimeout(stall)
+          stall = null
+        }
+
        audio.removeEventListener('ended', onEnded)
        audio.removeEventListener('error', onError)
+        audio.removeEventListener('timeupdate', armStall)
        currentStop = null
      }

+      const armStall = () => {
+        if (stall !== null) {
+          window.clearTimeout(stall)
+        }
+
+        stall = window.setTimeout(() => {
+          cleanup()
+          reject(new Error('Playback stalled'))
+        }, PLAYBACK_STALL_MS)
+      }
+
      const onEnded = () => {
        cleanup()
        resolve()
@ -101,7 +126,9 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions

      audio.addEventListener('ended', onEnded, { once: true })
      audio.addEventListener('error', onError, { once: true })
-      void audio.play().catch(reject)
+      audio.addEventListener('timeupdate', armStall)
+      armStall()
+      void audio.play().catch(onError)
    })

    if (!isCurrent()) {