fix(desktop): wire Ctrl+B voice, declutter voice settings, stop endless TTS hang

Three voice-mode papercuts in the desktop app:

1. Ctrl+B did nothing. The docs + `voice.record_key` advertise Ctrl+B to
   talk, but the desktop never bound it (only ⌘B = sidebar existed). Add a
   rebindable `composer.voice` action that toggles the voice conversation,
   defaulting to ⌃B on macOS (distinct from ⌘B; off-macOS `ctrl` folds to
   the sidebar chord, so it ships unbound there to avoid stealing it). The
   global keybind reaches the composer through a new focus-bus event.

2. The Voice settings page rendered every provider's options at once (~30
   fields). Filter to the *selected* TTS/STT provider's sub-fields; STT
   provider fields hide when STT is off. Picking "edge" now shows just the
   Edge voice, making it obvious voice chat also needs STT enabled.

3. Voice mode could hang "speaking" forever. Free Edge TTS sometimes returns
   audio that never fires `playing`/`ended`/`error`, so the playback promise
   never settled. Add a stall watchdog (rearmed on each progress tick, so
   long speech is never cut off) that rejects a stuck stream, letting the
   loop recover with a clear error.
This commit is contained in:
Brooklyn Nicholson 2026-06-24 18:26:14 -05:00
parent 404b06ac4f
commit 8d1706ae5c
9 changed files with 141 additions and 5 deletions

View file

@ -34,6 +34,7 @@ interface InsertRefsDetail {
const FOCUS_EVENT = 'hermes:composer-focus'
const INSERT_EVENT = 'hermes:composer-insert'
const INSERT_REFS_EVENT = 'hermes:composer-insert-refs'
const VOICE_TOGGLE_EVENT = 'hermes:composer-voice-toggle'
let activeTarget: ComposerTarget = 'main'
@ -105,6 +106,13 @@ export const requestComposerInsertRefs = (
export const onComposerInsertRefsRequest = (handler: (detail: InsertRefsDetail) => void) =>
subscribe<InsertRefsDetail>(INSERT_REFS_EVENT, handler)
/**
 * Ask the composer to toggle its voice conversation. This is how the
 * `composer.voice` hotkey (Ctrl+B) reaches into the composer that owns the
 * voice state: it dispatches `VOICE_TOGGLE_EVENT` stamped with the current time.
 */
export const requestVoiceToggle = () => dispatch<{ at: number }>(VOICE_TOGGLE_EVENT, { at: Date.now() })
/**
 * Subscribe `handler` to voice-toggle requests. The event's `at` timestamp is
 * dropped, so `handler` is always invoked with no arguments. Returns whatever
 * `subscribe` returns — presumably an unsubscribe function; confirm against
 * `subscribe`.
 */
export const onComposerVoiceToggleRequest = (handler: () => void) =>
subscribe<{ at: number }>(VOICE_TOGGLE_EVENT, () => handler())
/**
* Focus a composer input across React commit + browser focus restore.
*

View file

@ -79,7 +79,8 @@ import {
markActiveComposer,
onComposerFocusRequest,
onComposerInsertRefsRequest,
onComposerInsertRequest
onComposerInsertRequest,
onComposerVoiceToggleRequest
} from './focus'
import { HelpHint } from './help-hint'
import { useAtCompletions } from './hooks/use-at-completions'
@ -1844,6 +1845,24 @@ export function ChatBar({
pendingResponse
})
// Handler for the `composer.voice` hotkey (Ctrl+B): flips the voice
// conversation on or off. It does not check whether STT is configured before
// starting, so the conversation can surface its own "configure
// speech-to-text" notice instead of the key press silently doing nothing.
const toggleVoiceConversation = useCallback(() => {
  // A disabled composer ignores the hotkey entirely.
  if (disabled) {
    return
  }

  if (!voiceConversationActive) {
    setVoiceConversationActive(true)

    return
  }

  // Already active: flip the flag off first, then end the conversation
  // without awaiting it.
  setVoiceConversationActive(false)
  void conversation.end()
}, [conversation, disabled, voiceConversationActive])

// Listen for toggle requests; the value returned by the subscription is
// handed to React as the effect cleanup.
useEffect(() => onComposerVoiceToggleRequest(toggleVoiceConversation), [toggleVoiceConversation])
const contextMenu = (
<ContextMenu
onInsertText={insertText}

View file

@ -40,7 +40,7 @@ import {
import { openNewSessionInNewWindow } from '@/store/windows'
import { useTheme } from '@/themes/context'
import { requestComposerFocus } from '../chat/composer/focus'
import { requestComposerFocus, requestVoiceToggle } from '../chat/composer/focus'
import { SIDEBAR_COLLAPSE_MEDIA_QUERY } from '../layout-constants'
import {
AGENTS_ROUTE,
@ -114,6 +114,7 @@ export function useKeybinds(deps: KeybindRuntimeDeps): void {
'composer.focus': () => requestComposerFocus('main'),
'composer.modelPicker': () => setModelPickerOpen(true),
'composer.voice': requestVoiceToggle,
'nav.commandPalette': toggleCommandPalette,
'nav.commandCenter': deps.toggleCommandCenter,

View file

@ -26,6 +26,26 @@ import { ModelSettings } from './model-settings'
import { EmptyState, ListRow, LoadingState, SettingsContent } from './primitives'
import { ProviderConfigPanel } from './provider-config-panel'
/**
 * Decide whether a config field belongs on the Voice settings page.
 *
 * Only keys shaped like `tts.<provider>.…` or `stt.<provider>.…` are filtered;
 * every other key (`tts.provider`, `stt.enabled`, `voice.*`, …) is always
 * shown. A provider sub-field is shown only when its provider segment equals
 * the configured `<domain>.provider`, and STT provider sub-fields are hidden
 * outright while `stt.enabled` is falsy. This keeps the page from rendering
 * every provider's options at once.
 *
 * @param key Dotted schema key of the field.
 * @param config Current config record to read the selection from.
 * @returns `true` when the field should be rendered.
 */
export function voiceFieldVisible(key: string, config: HermesConfigRecord): boolean {
  const hit = /^(tts|stt)\.([^.]+)\./.exec(key)

  // Not a provider sub-field: never filtered.
  if (hit === null) {
    return true
  }

  const [, section, candidate] = hit
  const sttSwitchedOff = section === 'stt' && !getNested(config, 'stt.enabled')

  // Short-circuit keeps the provider lookup from running when STT is off.
  return !sttSwitchedOff && candidate === String(getNested(config, `${section}.provider`) ?? '')
}
function ConfigField({
schemaKey,
schema,
@ -356,6 +376,9 @@ export function ConfigSettings({
return <LoadingState label={c.loading} />
}
const visibleFields =
activeSectionId === 'voice' ? fields.filter(([key]) => voiceFieldVisible(key, config)) : fields
return (
<SettingsContent>
{activeSectionId === 'model' && (
@ -363,11 +386,11 @@ export function ConfigSettings({
<ModelSettings onMainModelChanged={onMainModelChanged} />
</div>
)}
{fields.length === 0 ? (
{visibleFields.length === 0 ? (
<EmptyState description={c.emptyDesc} title={c.emptyTitle} />
) : (
<div className="grid gap-1">
{fields.map(([key, field]) => (
{visibleFields.map(([key, field]) => (
<div className="scroll-mt-6 rounded-lg" id={`setting-field-${key}`} key={key}>
<ConfigField
descriptionExtra={

View file

@ -0,0 +1,48 @@
import { describe, expect, it } from 'vitest'
import type { HermesConfigRecord } from '@/types/hermes'
import { voiceFieldVisible } from './config-settings'
/**
 * Build a config fixture: Edge TTS selected, STT enabled with the local
 * provider. Top-level keys in `over` replace the defaults wholesale (shallow
 * merge), so overriding `tts` or `stt` swaps out the entire sub-object.
 */
const cfg = (over: Record<string, unknown> = {}): HermesConfigRecord => {
  const defaults = {
    tts: { provider: 'edge', edge: {}, openai: {} },
    stt: { enabled: true, provider: 'local', local: {}, groq: {} }
  }

  return Object.assign({}, defaults, over) as unknown as HermesConfigRecord
}
describe('voiceFieldVisible', () => {
  // Asserts, in the order listed, that each key's visibility under `config`
  // matches the expected flag.
  const expectVisibility = (config: HermesConfigRecord, expected: ReadonlyArray<readonly [string, boolean]>) => {
    expected.forEach(([key, visible]) => {
      expect(voiceFieldVisible(key, config)).toBe(visible)
    })
  }

  it('always shows top-level + non-provider keys', () => {
    const alwaysShown = ['tts.provider', 'stt.enabled', 'stt.provider', 'voice.auto_tts', 'voice.record_key']

    expectVisibility(
      cfg(),
      alwaysShown.map(key => [key, true] as const)
    )
  })

  it('shows only the selected TTS provider sub-fields', () => {
    expectVisibility(cfg(), [
      ['tts.edge.voice', true],
      ['tts.openai.voice', false],
      ['tts.elevenlabs.voice_id', false]
    ])
  })

  it('shows only the selected STT provider sub-fields', () => {
    expectVisibility(cfg(), [
      ['stt.local.model', true],
      ['stt.groq.model', false]
    ])
  })

  it('hides every STT provider sub-field when STT is disabled', () => {
    const sttOff = cfg({ stt: { enabled: false, provider: 'local', local: {} } })

    expectVisibility(sttOff, [
      ['stt.local.model', false],
      // The enable/provider toggles themselves must stay visible.
      ['stt.enabled', true],
      ['stt.provider', true]
    ])
  })

  it('tracks a provider switch', () => {
    const switchedToOpenai = cfg({ tts: { provider: 'openai', openai: {} } })

    expectVisibility(switchedToOpenai, [
      ['tts.openai.voice', true],
      ['tts.edge.voice', false]
    ])
  })
})

View file

@ -211,6 +211,7 @@ export const en: Translations = {
'session.togglePin': 'Pin / unpin current session',
'composer.focus': 'Focus composer',
'composer.modelPicker': 'Open model picker',
'composer.voice': 'Start / stop voice conversation',
'view.toggleSidebar': 'Toggle sessions sidebar',
'view.toggleRightSidebar': 'Toggle file browser',
'view.showFiles': 'Show file browser',

View file

@ -206,6 +206,7 @@ export const zh: Translations = {
'session.togglePin': '固定/取消固定当前会话',
'composer.focus': '聚焦输入框',
'composer.modelPicker': '打开模型选择器',
'composer.voice': '开始 / 停止语音对话',
'view.toggleSidebar': '切换会话侧边栏',
'view.toggleRightSidebar': '切换文件浏览器',
'view.showFiles': '显示文件浏览器',

View file

@ -5,6 +5,8 @@
// like navigate / theme); labels come from i18n (`t.keybinds.actions[id]`). To
// add a hotkey, add a row here and a handler there — nothing else.
import { IS_MAC } from './combo'
export type KeybindCategory = 'composer' | 'profiles' | 'session' | 'navigation' | 'view'
// The self-referential opener — bound + dispatched like any action, but shown in
@ -55,6 +57,12 @@ export const KEYBIND_ACTIONS: readonly KeybindActionMeta[] = [
// ── Composer ─────────────────────────────────────────────────────────────
{ id: 'composer.focus', category: 'composer', defaults: [] },
{ id: 'composer.modelPicker', category: 'composer', defaults: [] },
// Voice conversation toggle. Matches the documented `voice.record_key`
// (Ctrl+B). On macOS that's literally ⌃B — distinct from the ⌘B sidebar
// toggle. Off macOS `ctrl` folds to `mod`, which IS the ⌘B/Ctrl+B sidebar
// chord, so ship it unbound there (rebindable in the panel) rather than
// stealing the long-standing sidebar binding.
{ id: 'composer.voice', category: 'composer', defaults: IS_MAC ? ['ctrl+b'] : [] },
// ── Profiles ─────────────────────────────────────────────────────────────
{ id: 'profile.default', category: 'profiles', defaults: ['mod+d'] },

View file

@ -8,6 +8,12 @@ import {
import { sanitizeTextForSpeech } from './speech-text'
// Free Edge TTS occasionally hands back audio that fires none of `playing`,
// `ended`, or `error`, leaving voice mode stuck "speaking" forever. This is the
// longest playback may go without starting or making progress, in
// milliseconds, before it is rejected as stalled. The timer is rearmed on each
// progress tick, so legitimately long speech is never cut off.
const PLAYBACK_STALL_MS = 15_000
let currentAudio: HTMLAudioElement | null = null
let currentStop: (() => void) | null = null
let sequence = 0
@ -78,12 +84,31 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
setVoicePlaybackState(currentState('speaking', options, audio))
await new Promise<void>((resolve, reject) => {
// Handle of the pending stall timer, or null when none is armed.
let stall: number | null = null
// Tear down everything this playback attempt set up: cancel the stall timer,
// detach the `ended` / `error` / `timeupdate` listeners, and clear the
// module-level stop hook.
const cleanup = () => {
if (stall !== null) {
window.clearTimeout(stall)
stall = null
}
audio.removeEventListener('ended', onEnded)
audio.removeEventListener('error', onError)
audio.removeEventListener('timeupdate', armStall)
currentStop = null
}
// (Re)start the stall watchdog. If it is not rearmed within
// PLAYBACK_STALL_MS, playback is treated as stuck: everything is cleaned up
// and the surrounding promise rejects with 'Playback stalled'.
const armStall = () => {
// Drop any previously armed timer so at most one is pending.
if (stall !== null) {
window.clearTimeout(stall)
}
stall = window.setTimeout(() => {
cleanup()
reject(new Error('Playback stalled'))
}, PLAYBACK_STALL_MS)
}
const onEnded = () => {
cleanup()
resolve()
@ -101,7 +126,9 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
audio.addEventListener('ended', onEnded, { once: true })
audio.addEventListener('error', onError, { once: true })
void audio.play().catch(reject)
audio.addEventListener('timeupdate', armStall)
armStall()
void audio.play().catch(onError)
})
if (!isCurrent()) {