diff --git a/apps/desktop/src/app/chat/composer/focus.ts b/apps/desktop/src/app/chat/composer/focus.ts index 916436de0b2..3de3f5c9800 100644 --- a/apps/desktop/src/app/chat/composer/focus.ts +++ b/apps/desktop/src/app/chat/composer/focus.ts @@ -34,6 +34,7 @@ interface InsertRefsDetail { const FOCUS_EVENT = 'hermes:composer-focus' const INSERT_EVENT = 'hermes:composer-insert' const INSERT_REFS_EVENT = 'hermes:composer-insert-refs' +const VOICE_TOGGLE_EVENT = 'hermes:composer-voice-toggle' let activeTarget: ComposerTarget = 'main' @@ -105,6 +106,13 @@ export const requestComposerInsertRefs = ( export const onComposerInsertRefsRequest = (handler: (detail: InsertRefsDetail) => void) => subscribe(INSERT_REFS_EVENT, handler) +/** Toggle the active composer's voice conversation — the `composer.voice` + * hotkey (⌃B by default on macOS, unbound elsewhere) reaching into the composer that owns the voice state. */ +export const requestVoiceToggle = () => dispatch<{ at: number }>(VOICE_TOGGLE_EVENT, { at: Date.now() }) + +export const onComposerVoiceToggleRequest = (handler: () => void) => + subscribe<{ at: number }>(VOICE_TOGGLE_EVENT, () => handler()) + /** * Focus a composer input across React commit + browser focus restore. * diff --git a/apps/desktop/src/app/chat/composer/index.tsx b/apps/desktop/src/app/chat/composer/index.tsx index 5b38cae3ff0..d4ec0a36a1d 100644 --- a/apps/desktop/src/app/chat/composer/index.tsx +++ b/apps/desktop/src/app/chat/composer/index.tsx @@ -79,7 +79,8 @@ import { markActiveComposer, onComposerFocusRequest, onComposerInsertRefsRequest, - onComposerInsertRequest + onComposerInsertRequest, + onComposerVoiceToggleRequest } from './focus' import { HelpHint } from './help-hint' import { useAtCompletions } from './hooks/use-at-completions' @@ -1844,6 +1845,24 @@ export function ChatBar({ pendingResponse }) + // The `composer.voice` hotkey (⌃B by default on macOS) toggles the conversation. 
Starting + // with STT unconfigured lets the conversation surface its own "configure + // speech-to-text" notice rather than silently no-opping. + const toggleVoiceConversation = useCallback(() => { + if (disabled) { + return + } + + if (voiceConversationActive) { + setVoiceConversationActive(false) + void conversation.end() + } else { + setVoiceConversationActive(true) + } + }, [conversation, disabled, voiceConversationActive]) + + useEffect(() => onComposerVoiceToggleRequest(toggleVoiceConversation), [toggleVoiceConversation]) + const contextMenu = ( requestComposerFocus('main'), 'composer.modelPicker': () => setModelPickerOpen(true), + 'composer.voice': requestVoiceToggle, 'nav.commandPalette': toggleCommandPalette, 'nav.commandCenter': deps.toggleCommandCenter, diff --git a/apps/desktop/src/app/settings/config-settings.tsx b/apps/desktop/src/app/settings/config-settings.tsx index 3f570f7adfb..dbd2280c4a0 100644 --- a/apps/desktop/src/app/settings/config-settings.tsx +++ b/apps/desktop/src/app/settings/config-settings.tsx @@ -26,6 +26,26 @@ import { ModelSettings } from './model-settings' import { EmptyState, ListRow, LoadingState, SettingsContent } from './primitives' import { ProviderConfigPanel } from './provider-config-panel' +// On the Voice page, only surface the sub-fields of the *selected* TTS/STT +// provider — otherwise every provider's options render at once (the "totally +// crazy" wall of ~30 fields). Top-level keys (tts.provider, stt.enabled, +// voice.*) always show; STT provider fields hide entirely when STT is off. +export function voiceFieldVisible(key: string, config: HermesConfigRecord): boolean { + const match = /^(tts|stt)\.([^.]+)\./.exec(key) + + if (!match) { + return true + } + + const [, domain, provider] = match + + if (domain === 'stt' && !getNested(config, 'stt.enabled')) { + return false + } + + return provider === String(getNested(config, `${domain}.provider`) ?? 
'') +} + function ConfigField({ schemaKey, schema, @@ -356,6 +376,9 @@ export function ConfigSettings({ return } + const visibleFields = + activeSectionId === 'voice' ? fields.filter(([key]) => voiceFieldVisible(key, config)) : fields + return ( {activeSectionId === 'model' && ( @@ -363,11 +386,11 @@ export function ConfigSettings({ )} - {fields.length === 0 ? ( + {visibleFields.length === 0 ? ( ) : (
- {fields.map(([key, field]) => ( + {visibleFields.map(([key, field]) => (
= {}): HermesConfigRecord => + ({ + tts: { provider: 'edge', edge: {}, openai: {} }, + stt: { enabled: true, provider: 'local', local: {}, groq: {} }, + ...over + }) as unknown as HermesConfigRecord + +describe('voiceFieldVisible', () => { + it('always shows top-level + non-provider keys', () => { + const config = cfg() + + for (const key of ['tts.provider', 'stt.enabled', 'stt.provider', 'voice.auto_tts', 'voice.record_key']) { + expect(voiceFieldVisible(key, config)).toBe(true) + } + }) + + it('shows only the selected TTS provider sub-fields', () => { + const config = cfg() + expect(voiceFieldVisible('tts.edge.voice', config)).toBe(true) + expect(voiceFieldVisible('tts.openai.voice', config)).toBe(false) + expect(voiceFieldVisible('tts.elevenlabs.voice_id', config)).toBe(false) + }) + + it('shows only the selected STT provider sub-fields', () => { + const config = cfg() + expect(voiceFieldVisible('stt.local.model', config)).toBe(true) + expect(voiceFieldVisible('stt.groq.model', config)).toBe(false) + }) + + it('hides every STT provider sub-field when STT is disabled', () => { + const config = cfg({ stt: { enabled: false, provider: 'local', local: {} } }) + expect(voiceFieldVisible('stt.local.model', config)).toBe(false) + // ...but the enable/provider toggles themselves stay visible. 
+ expect(voiceFieldVisible('stt.enabled', config)).toBe(true) + expect(voiceFieldVisible('stt.provider', config)).toBe(true) + }) + + it('tracks a provider switch', () => { + expect(voiceFieldVisible('tts.openai.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(true) + expect(voiceFieldVisible('tts.edge.voice', cfg({ tts: { provider: 'openai', openai: {} } }))).toBe(false) + }) +}) diff --git a/apps/desktop/src/i18n/en.ts b/apps/desktop/src/i18n/en.ts index 8a1a295ce92..e20b118f262 100644 --- a/apps/desktop/src/i18n/en.ts +++ b/apps/desktop/src/i18n/en.ts @@ -211,6 +211,7 @@ export const en: Translations = { 'session.togglePin': 'Pin / unpin current session', 'composer.focus': 'Focus composer', 'composer.modelPicker': 'Open model picker', + 'composer.voice': 'Start / stop voice conversation', 'view.toggleSidebar': 'Toggle sessions sidebar', 'view.toggleRightSidebar': 'Toggle file browser', 'view.showFiles': 'Show file browser', diff --git a/apps/desktop/src/i18n/zh.ts b/apps/desktop/src/i18n/zh.ts index 6423e1749a9..dec1959362f 100644 --- a/apps/desktop/src/i18n/zh.ts +++ b/apps/desktop/src/i18n/zh.ts @@ -206,6 +206,7 @@ export const zh: Translations = { 'session.togglePin': '固定/取消固定当前会话', 'composer.focus': '聚焦输入框', 'composer.modelPicker': '打开模型选择器', + 'composer.voice': '开始 / 停止语音对话', 'view.toggleSidebar': '切换会话侧边栏', 'view.toggleRightSidebar': '切换文件浏览器', 'view.showFiles': '显示文件浏览器', diff --git a/apps/desktop/src/lib/keybinds/actions.ts b/apps/desktop/src/lib/keybinds/actions.ts index 38eab936f09..361906b213f 100644 --- a/apps/desktop/src/lib/keybinds/actions.ts +++ b/apps/desktop/src/lib/keybinds/actions.ts @@ -5,6 +5,8 @@ // like navigate / theme); labels come from i18n (`t.keybinds.actions[id]`). To // add a hotkey, add a row here and a handler there — nothing else. 
+import { IS_MAC } from './combo' + export type KeybindCategory = 'composer' | 'profiles' | 'session' | 'navigation' | 'view' // The self-referential opener — bound + dispatched like any action, but shown in @@ -55,6 +57,12 @@ export const KEYBIND_ACTIONS: readonly KeybindActionMeta[] = [ // ── Composer ───────────────────────────────────────────────────────────── { id: 'composer.focus', category: 'composer', defaults: [] }, { id: 'composer.modelPicker', category: 'composer', defaults: [] }, + // Voice conversation toggle. Matches the documented `voice.record_key` + // (Ctrl+B). On macOS that's literally ⌃B — distinct from the ⌘B sidebar + // toggle. Off macOS `ctrl` folds to `mod`, which IS the ⌘B/Ctrl+B sidebar + // chord, so ship it unbound there (rebindable in the panel) rather than + // stealing the long-standing sidebar binding. + { id: 'composer.voice', category: 'composer', defaults: IS_MAC ? ['ctrl+b'] : [] }, // ── Profiles ───────────────────────────────────────────────────────────── { id: 'profile.default', category: 'profiles', defaults: ['mod+d'] }, diff --git a/apps/desktop/src/lib/voice-playback.ts b/apps/desktop/src/lib/voice-playback.ts index 1554ed8a315..eea1b5b6e0a 100644 --- a/apps/desktop/src/lib/voice-playback.ts +++ b/apps/desktop/src/lib/voice-playback.ts @@ -8,6 +8,12 @@ import { import { sanitizeTextForSpeech } from './speech-text' +// Free Edge TTS occasionally hands back audio that never fires `playing`/`ended` +// nor `error` — leaving voice mode stuck "speaking" forever. Reject if playback +// fails to start or stalls mid-stream for this long (rearmed on each progress +// tick, so legitimately long speech is never cut off). 
+const PLAYBACK_STALL_MS = 15_000 + let currentAudio: HTMLAudioElement | null = null let currentStop: (() => void) | null = null let sequence = 0 @@ -78,12 +84,31 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions setVoicePlaybackState(currentState('speaking', options, audio)) await new Promise((resolve, reject) => { + let stall: number | null = null + const cleanup = () => { + if (stall !== null) { + window.clearTimeout(stall) + stall = null + } + audio.removeEventListener('ended', onEnded) audio.removeEventListener('error', onError) + audio.removeEventListener('timeupdate', armStall) currentStop = null } + const armStall = () => { + if (stall !== null) { + window.clearTimeout(stall) + } + + stall = window.setTimeout(() => { + cleanup() + reject(new Error('Playback stalled')) + }, PLAYBACK_STALL_MS) + } + const onEnded = () => { cleanup() resolve() @@ -101,7 +126,9 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions audio.addEventListener('ended', onEnded, { once: true }) audio.addEventListener('error', onError, { once: true }) - void audio.play().catch(reject) + audio.addEventListener('timeupdate', armStall) + armStall() + void audio.play().catch(onError) }) if (!isCurrent()) {