Merge pull request #52187 from NousResearch/bb/desktop-voice

fix(desktop): wire Ctrl+B voice, declutter voice settings, stop endless TTS hang
2026-06-27 11:22:03 +00:00 · 2026-06-24 19:03:25 -05:00 · 2026-06-24 19:03:25 -05:00 · 70650e82a3
commit 70650e82a3
parent 9a94865552 8d1706ae5c
9 changed files with 141 additions and 5 deletions
--- a/apps/desktop/src/app/chat/composer/focus.ts
+++ b/apps/desktop/src/app/chat/composer/focus.ts
@ -34,6 +34,7 @@ interface InsertRefsDetail {
 const FOCUS_EVENT = 'hermes:composer-focus'
 const INSERT_EVENT = 'hermes:composer-insert'
 const INSERT_REFS_EVENT = 'hermes:composer-insert-refs'
+const VOICE_TOGGLE_EVENT = 'hermes:composer-voice-toggle'

 let activeTarget: ComposerTarget = 'main'

@ -105,6 +106,13 @@ export const requestComposerInsertRefs = (
 export const onComposerInsertRefsRequest = (handler: (detail: InsertRefsDetail) => void) =>
  subscribe<InsertRefsDetail>(INSERT_REFS_EVENT, handler)

+/** Ask the composer that owns the voice state to toggle its voice conversation.
+ *  Fired by the `composer.voice` hotkey (default ⌃B on macOS, unbound elsewhere). */
+export const requestVoiceToggle = () => dispatch<{ at: number }>(VOICE_TOGGLE_EVENT, { at: Date.now() })
+
+export const onComposerVoiceToggleRequest = (handler: () => void) =>
+  subscribe<{ at: number }>(VOICE_TOGGLE_EVENT, () => handler()) // the `{ at }` payload is dropped
+
 /**
 * Focus a composer input across React commit + browser focus restore.
 *
--- a/apps/desktop/src/app/chat/composer/index.tsx
+++ b/apps/desktop/src/app/chat/composer/index.tsx
@ -79,7 +79,8 @@ import {
  markActiveComposer,
  onComposerFocusRequest,
  onComposerInsertRefsRequest,
-  onComposerInsertRequest
+  onComposerInsertRequest,
+  onComposerVoiceToggleRequest
 } from './focus'
 import { HelpHint } from './help-hint'
 import { useAtCompletions } from './hooks/use-at-completions'
@ -1844,6 +1845,24 @@ export function ChatBar({
    pendingResponse
  })

+  // Handler for the `composer.voice` hotkey: flips the voice conversation on or
+  // off, and does nothing while `disabled`. STT is deliberately not checked, so the
+  // conversation can surface its own "configure speech-to-text" notice instead.
+  const toggleVoiceConversation = useCallback(() => {
+    if (disabled) {
+      return
+    }
+
+    if (voiceConversationActive) {
+      setVoiceConversationActive(false)
+      void conversation.end() // fire-and-forget: the result is not awaited
+    } else {
+      setVoiceConversationActive(true)
+    }
+  }, [conversation, disabled, voiceConversationActive])
+
+  useEffect(() => onComposerVoiceToggleRequest(toggleVoiceConversation), [toggleVoiceConversation])
+
  const contextMenu = (
    <ContextMenu
      onInsertText={insertText}
--- a/apps/desktop/src/app/hooks/use-keybinds.ts
+++ b/apps/desktop/src/app/hooks/use-keybinds.ts
@ -40,7 +40,7 @@ import {
 import { openNewSessionInNewWindow } from '@/store/windows'
 import { useTheme } from '@/themes/context'

-import { requestComposerFocus } from '../chat/composer/focus'
+import { requestComposerFocus, requestVoiceToggle } from '../chat/composer/focus'
 import { SIDEBAR_COLLAPSE_MEDIA_QUERY } from '../layout-constants'
 import {
  AGENTS_ROUTE,
@ -114,6 +114,7 @@ export function useKeybinds(deps: KeybindRuntimeDeps): void {

    'composer.focus': () => requestComposerFocus('main'),
    'composer.modelPicker': () => setModelPickerOpen(true),
+    'composer.voice': requestVoiceToggle,

    'nav.commandPalette': toggleCommandPalette,
    'nav.commandCenter': deps.toggleCommandCenter,
--- a/apps/desktop/src/app/settings/config-settings.tsx
+++ b/apps/desktop/src/app/settings/config-settings.tsx
@ -26,6 +26,26 @@ import { ModelSettings } from './model-settings'
 import { EmptyState, ListRow, LoadingState, SettingsContent } from './primitives'
 import { ProviderConfigPanel } from './provider-config-panel'

+// Voice-page filter, so that not every provider's options render at once. A key
+// shaped `tts.<provider>.<field>` or `stt.<provider>.<field>` is visible only when
+// <provider> equals the configured `<domain>.provider`; all such `stt.*` keys hide
+// while `stt.enabled` is falsy. Other keys (tts.provider, stt.enabled, voice.*) always show.
+export function voiceFieldVisible(key: string, config: HermesConfigRecord): boolean {
+  const match = /^(tts|stt)\.([^.]+)\./.exec(key)
+
+  if (!match) {
+    return true
+  }
+
+  const [, domain, provider] = match
+
+  if (domain === 'stt' && !getNested(config, 'stt.enabled')) {
+    return false
+  }
+
+  return provider === String(getNested(config, `${domain}.provider`) ?? '')
+}
+
 function ConfigField({
  schemaKey,
  schema,
@ -356,6 +376,9 @@ export function ConfigSettings({
    return <LoadingState label={c.loading} />
  }

+  const visibleFields =
+    activeSectionId === 'voice' ? fields.filter(([key]) => voiceFieldVisible(key, config)) : fields
+
  return (
    <SettingsContent>
      {activeSectionId === 'model' && (
@ -363,11 +386,11 @@ export function ConfigSettings({
          <ModelSettings onMainModelChanged={onMainModelChanged} />
        </div>
      )}
-      {fields.length === 0 ? (
+      {visibleFields.length === 0 ? (
        <EmptyState description={c.emptyDesc} title={c.emptyTitle} />
      ) : (
        <div className="grid gap-1">
-          {fields.map(([key, field]) => (
+          {visibleFields.map(([key, field]) => (
            <div className="scroll-mt-6 rounded-lg" id={`setting-field-${key}`} key={key}>
              <ConfigField
                descriptionExtra={
--- a/apps/desktop/src/app/settings/voice-field-visible.test.ts
+++ b/apps/desktop/src/app/settings/voice-field-visible.test.ts
@ -0,0 +1,48 @@
+import { describe, expect, it } from 'vitest'
+
+import type { HermesConfigRecord } from '@/types/hermes'
+
+import { voiceFieldVisible } from './config-settings'
+
+const cfg = (over: Record<string, unknown> = {}): HermesConfigRecord =>
+  ({
+    tts: { provider: 'edge', edge: {}, openai: {} },
+    stt: { enabled: true, provider: 'local', local: {}, groq: {} },
+    ...over
+  }) as unknown as HermesConfigRecord
+
+describe('voiceFieldVisible', () => {
+  it('always shows top-level + non-provider keys', () => {
+    const config = cfg()
+
+    for (const key of ['tts.provider', 'stt.enabled', 'stt.provider', 'voice.auto_tts', 'voice.record_key']) {
+      expect(voiceFieldVisible(key, config)).toBe(true)
+    }
+  })
+
+  it('shows only the selected TTS provider sub-fields', () => {
+    const config = cfg()
+    expect(voiceFieldVisible('tts.edge.voice', config)).toBe(true)
+    expect(voiceFieldVisible('tts.openai.voice', config)).toBe(false)
+    expect(voiceFieldVisible('tts.elevenlabs.voice_id', config)).toBe(false)
+  })
+
+  it('shows only the selected STT provider sub-fields', () => {
+    const config = cfg()
+    expect(voiceFieldVisible('stt.local.model', config)).toBe(true)
+    expect(voiceFieldVisible('stt.groq.model', config)).toBe(false)
+  })
+
+  it('hides every STT provider sub-field when STT is disabled', () => {
+    const config = cfg({ stt: { enabled: false, provider: 'local', local: {} } })
+    expect(voiceFieldVisible('stt.local.model', config)).toBe(false)
+    // ...but the top-level `stt.enabled` / `stt.provider` keys themselves stay visible.
+    expect(voiceFieldVisible('stt.enabled', config)).toBe(true)
+    expect(voiceFieldVisible('stt.provider', config)).toBe(true)
+  })
+
+  it('tracks a provider switch', () => {
+    expect(voiceFieldVisible('tts.openai.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(true)
+    expect(voiceFieldVisible('tts.edge.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(false)
+  })
+})
--- a/apps/desktop/src/i18n/en.ts
+++ b/apps/desktop/src/i18n/en.ts
@ -211,6 +211,7 @@ export const en: Translations = {
      'session.togglePin': 'Pin / unpin current session',
      'composer.focus': 'Focus composer',
      'composer.modelPicker': 'Open model picker',
+      'composer.voice': 'Start / stop voice conversation',
      'view.toggleSidebar': 'Toggle sessions sidebar',
      'view.toggleRightSidebar': 'Toggle file browser',
      'view.showFiles': 'Show file browser',
--- a/apps/desktop/src/i18n/zh.ts
+++ b/apps/desktop/src/i18n/zh.ts
@ -206,6 +206,7 @@ export const zh: Translations = {
      'session.togglePin': '固定/取消固定当前会话',
      'composer.focus': '聚焦输入框',
      'composer.modelPicker': '打开模型选择器',
+      'composer.voice': '开始 / 停止语音对话',
      'view.toggleSidebar': '切换会话侧边栏',
      'view.toggleRightSidebar': '切换文件浏览器',
      'view.showFiles': '显示文件浏览器',
--- a/apps/desktop/src/lib/keybinds/actions.ts
+++ b/apps/desktop/src/lib/keybinds/actions.ts
@ -5,6 +5,8 @@
 // like navigate / theme); labels come from i18n (`t.keybinds.actions[id]`). To
 // add a hotkey, add a row here and a handler there — nothing else.

+import { IS_MAC } from './combo'
+
 export type KeybindCategory = 'composer' | 'profiles' | 'session' | 'navigation' | 'view'

 // The self-referential opener — bound + dispatched like any action, but shown in
@ -55,6 +57,12 @@ export const KEYBIND_ACTIONS: readonly KeybindActionMeta[] = [
  // ── Composer ─────────────────────────────────────────────────────────────
  { id: 'composer.focus', category: 'composer', defaults: [] },
  { id: 'composer.modelPicker', category: 'composer', defaults: [] },
+  // Voice conversation toggle. The default mirrors the documented
+  // `voice.record_key` (Ctrl+B). On macOS that is literally ⌃B, distinct from the
+  // ⌘B sidebar toggle. On other platforms `ctrl` folds to `mod`, which IS the
+  // ⌘B/Ctrl+B sidebar chord, so ship it unbound there (rebindable in the panel)
+  // rather than stealing the long-standing sidebar binding.
+  { id: 'composer.voice', category: 'composer', defaults: IS_MAC ? ['ctrl+b'] : [] },

  // ── Profiles ─────────────────────────────────────────────────────────────
  { id: 'profile.default', category: 'profiles', defaults: ['mod+d'] },
--- a/apps/desktop/src/lib/voice-playback.ts
+++ b/apps/desktop/src/lib/voice-playback.ts
@ -8,6 +8,12 @@ import {

 import { sanitizeTextForSpeech } from './speech-text'

+// Free Edge TTS occasionally hands back audio that fires neither `playing`/`ended`
+// nor `error`, leaving voice mode stuck "speaking" forever. Playback is rejected
+// if it fails to start, or stalls mid-stream, for this many milliseconds; the timer
+// is rearmed on every `timeupdate`, so legitimately long speech is never cut off.
+const PLAYBACK_STALL_MS = 15_000
+
 let currentAudio: HTMLAudioElement | null = null
 let currentStop: (() => void) | null = null
 let sequence = 0
@ -78,12 +84,31 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
    setVoicePlaybackState(currentState('speaking', options, audio))

    await new Promise<void>((resolve, reject) => {
+      let stall: number | null = null
+
      const cleanup = () => {
+        if (stall !== null) {
+          window.clearTimeout(stall)
+          stall = null
+        }
+
        audio.removeEventListener('ended', onEnded)
        audio.removeEventListener('error', onError)
+        audio.removeEventListener('timeupdate', armStall)
        currentStop = null
      }

+      const armStall = () => {
+        if (stall !== null) {
+          window.clearTimeout(stall)
+        }
+
+        stall = window.setTimeout(() => {
+          cleanup()
+          reject(new Error('Playback stalled'))
+        }, PLAYBACK_STALL_MS)
+      }
+
      const onEnded = () => {
        cleanup()
        resolve()
@ -101,7 +126,9 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions

      audio.addEventListener('ended', onEnded, { once: true })
      audio.addEventListener('error', onError, { once: true })
-      void audio.play().catch(reject)
+      audio.addEventListener('timeupdate', armStall)
+      armStall()
+      void audio.play().catch(onError)
    })

    if (!isCurrent()) {