Merge pull request #52187 from NousResearch/bb/desktop-voice

fix(desktop): wire Ctrl+B voice, declutter voice settings, stop endless TTS hang
This commit is contained in:
brooklyn! 2026-06-24 19:03:25 -05:00 committed by GitHub
commit 70650e82a3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 141 additions and 5 deletions

View file

@ -34,6 +34,7 @@ interface InsertRefsDetail {
// Event names handed to the `dispatch` / `subscribe` helpers in this module.
// NOTE(review): the focus and insert events are presumably used by the
// focus/insert request helpers — their call sites are not shown here; confirm.
const FOCUS_EVENT = 'hermes:composer-focus'
const INSERT_EVENT = 'hermes:composer-insert'
// Subscribed to by onComposerInsertRefsRequest.
const INSERT_REFS_EVENT = 'hermes:composer-insert-refs'
// Dispatched by requestVoiceToggle, subscribed to by onComposerVoiceToggleRequest.
const VOICE_TOGGLE_EVENT = 'hermes:composer-voice-toggle'
// Module-level composer target; initialised to 'main'.
let activeTarget: ComposerTarget = 'main'
@ -105,6 +106,13 @@ export const requestComposerInsertRefs = (
/**
 * Register `handler` for insert-refs requests. The value produced by
 * `subscribe` is handed back to the caller unchanged.
 */
export const onComposerInsertRefsRequest = (handler: (detail: InsertRefsDetail) => void) => {
  return subscribe<InsertRefsDetail>(INSERT_REFS_EVENT, handler)
}
/**
 * Ask the active composer to toggle its voice conversation. This is what the
 * `composer.voice` hotkey (Ctrl+B) calls, so the keybind layer can reach into
 * the composer that owns the voice state.
 */
export const requestVoiceToggle = () => {
  // The payload only carries the time of the request.
  const detail = { at: Date.now() }

  return dispatch<{ at: number }>(VOICE_TOGGLE_EVENT, detail)
}
/**
 * Register `handler` for voice-toggle requests. The event payload is dropped:
 * `handler` is always invoked with no arguments.
 */
export const onComposerVoiceToggleRequest = (handler: () => void) => {
  const forward = () => handler()

  return subscribe<{ at: number }>(VOICE_TOGGLE_EVENT, forward)
}
/**
* Focus a composer input across React commit + browser focus restore.
*

View file

@ -79,7 +79,8 @@ import {
markActiveComposer,
onComposerFocusRequest,
onComposerInsertRefsRequest,
onComposerInsertRequest
onComposerInsertRequest,
onComposerVoiceToggleRequest
} from './focus'
import { HelpHint } from './help-hint'
import { useAtCompletions } from './hooks/use-at-completions'
@ -1844,6 +1845,24 @@ export function ChatBar({
pendingResponse
})
// Handler for the `composer.voice` hotkey (Ctrl+B): flips the voice
// conversation on or off. Turning it on is allowed even when STT is not
// configured, so the conversation can surface its own "configure
// speech-to-text" notice rather than silently no-opping.
const toggleVoiceConversation = useCallback(() => {
  if (disabled) {
    return
  }

  const nextActive = !voiceConversationActive

  setVoiceConversationActive(nextActive)

  // Only the "turn off" direction has extra work: end the running conversation.
  if (!nextActive) {
    void conversation.end()
  }
}, [conversation, disabled, voiceConversationActive])

// Keep the subscription pointed at the latest callback; whatever the
// subscribe helper returns is used as the effect's cleanup.
useEffect(() => {
  const unsubscribe = onComposerVoiceToggleRequest(toggleVoiceConversation)

  return unsubscribe
}, [toggleVoiceConversation])
const contextMenu = (
<ContextMenu
onInsertText={insertText}

View file

@ -40,7 +40,7 @@ import {
import { openNewSessionInNewWindow } from '@/store/windows'
import { useTheme } from '@/themes/context'
import { requestComposerFocus } from '../chat/composer/focus'
import { requestComposerFocus, requestVoiceToggle } from '../chat/composer/focus'
import { SIDEBAR_COLLAPSE_MEDIA_QUERY } from '../layout-constants'
import {
AGENTS_ROUTE,
@ -114,6 +114,7 @@ export function useKeybinds(deps: KeybindRuntimeDeps): void {
'composer.focus': () => requestComposerFocus('main'),
'composer.modelPicker': () => setModelPickerOpen(true),
'composer.voice': requestVoiceToggle,
'nav.commandPalette': toggleCommandPalette,
'nav.commandCenter': deps.toggleCommandCenter,

View file

@ -26,6 +26,26 @@ import { ModelSettings } from './model-settings'
import { EmptyState, ListRow, LoadingState, SettingsContent } from './primitives'
import { ProviderConfigPanel } from './provider-config-panel'
/**
 * Decide whether a Voice-page settings field should be rendered.
 *
 * Keys shaped like `tts.<provider>.<field>` or `stt.<provider>.<field>` are
 * provider sub-fields: they show only when `<provider>` is the one currently
 * selected under `<domain>.provider`, so the page does not list every
 * provider's options at once. STT provider sub-fields are hidden entirely
 * while `stt.enabled` is falsy. Every other key (tts.provider, stt.enabled,
 * voice.*, ...) is always visible.
 */
export function voiceFieldVisible(key: string, config: HermesConfigRecord): boolean {
  const [domain, provider, ...fieldPath] = key.split('.')

  // A provider sub-field needs a tts/stt prefix, a non-empty provider segment,
  // and at least one more dot-separated segment after it.
  const isProviderField = (domain === 'tts' || domain === 'stt') && Boolean(provider) && fieldPath.length > 0

  if (!isProviderField) {
    return true
  }

  const sttSwitchedOff = domain === 'stt' && !getNested(config, 'stt.enabled')

  if (sttSwitchedOff) {
    return false
  }

  const selectedProvider = String(getNested(config, `${domain}.provider`) ?? '')

  return selectedProvider === provider
}
function ConfigField({
schemaKey,
schema,
@ -356,6 +376,9 @@ export function ConfigSettings({
return <LoadingState label={c.loading} />
}
// Only the Voice section is filtered (through voiceFieldVisible against the
// current config); every other section lists its fields as-is.
const visibleFields =
  activeSectionId !== 'voice' ? fields : fields.filter(([key]) => voiceFieldVisible(key, config))
return (
<SettingsContent>
{activeSectionId === 'model' && (
@ -363,11 +386,11 @@ export function ConfigSettings({
<ModelSettings onMainModelChanged={onMainModelChanged} />
</div>
)}
{fields.length === 0 ? (
{visibleFields.length === 0 ? (
<EmptyState description={c.emptyDesc} title={c.emptyTitle} />
) : (
<div className="grid gap-1">
{fields.map(([key, field]) => (
{visibleFields.map(([key, field]) => (
<div className="scroll-mt-6 rounded-lg" id={`setting-field-${key}`} key={key}>
<ConfigField
descriptionExtra={

View file

@ -0,0 +1,48 @@
import { describe, expect, it } from 'vitest'
import type { HermesConfigRecord } from '@/types/hermes'
import { voiceFieldVisible } from './config-settings'
/**
 * Build a config fixture: Edge selected for TTS, STT enabled with the local
 * provider. Top-level keys in `over` replace the defaults wholesale.
 */
function cfg(over: Record<string, unknown> = {}): HermesConfigRecord {
  const defaults = {
    tts: { provider: 'edge', edge: {}, openai: {} },
    stt: { enabled: true, provider: 'local', local: {}, groq: {} }
  }

  return { ...defaults, ...over } as unknown as HermesConfigRecord
}

describe('voiceFieldVisible', () => {
  it('always shows top-level + non-provider keys', () => {
    const config = cfg()
    const alwaysVisible = ['tts.provider', 'stt.enabled', 'stt.provider', 'voice.auto_tts', 'voice.record_key']

    alwaysVisible.forEach(key => {
      expect(voiceFieldVisible(key, config)).toBe(true)
    })
  })

  it('shows only the selected TTS provider sub-fields', () => {
    const config = cfg()

    // Edge is the selected provider in the fixture; the others must be hidden.
    expect(voiceFieldVisible('tts.edge.voice', config)).toBe(true)
    expect(voiceFieldVisible('tts.openai.voice', config)).toBe(false)
    expect(voiceFieldVisible('tts.elevenlabs.voice_id', config)).toBe(false)
  })

  it('shows only the selected STT provider sub-fields', () => {
    const config = cfg()

    expect(voiceFieldVisible('stt.local.model', config)).toBe(true)
    expect(voiceFieldVisible('stt.groq.model', config)).toBe(false)
  })

  it('hides every STT provider sub-field when STT is disabled', () => {
    const sttDisabled = cfg({ stt: { enabled: false, provider: 'local', local: {} } })

    expect(voiceFieldVisible('stt.local.model', sttDisabled)).toBe(false)

    // The enable/provider toggles themselves must remain visible.
    expect(voiceFieldVisible('stt.enabled', sttDisabled)).toBe(true)
    expect(voiceFieldVisible('stt.provider', sttDisabled)).toBe(true)
  })

  it('tracks a provider switch', () => {
    const openAiSelected = cfg({ tts: { provider: 'openai', openai: {} } })

    expect(voiceFieldVisible('tts.openai.voice', openAiSelected)).toBe(true)
    expect(voiceFieldVisible('tts.edge.voice', openAiSelected)).toBe(false)
  })
})

View file

@ -211,6 +211,7 @@ export const en: Translations = {
'session.togglePin': 'Pin / unpin current session',
'composer.focus': 'Focus composer',
'composer.modelPicker': 'Open model picker',
'composer.voice': 'Start / stop voice conversation',
'view.toggleSidebar': 'Toggle sessions sidebar',
'view.toggleRightSidebar': 'Toggle file browser',
'view.showFiles': 'Show file browser',

View file

@ -206,6 +206,7 @@ export const zh: Translations = {
'session.togglePin': '固定/取消固定当前会话',
'composer.focus': '聚焦输入框',
'composer.modelPicker': '打开模型选择器',
'composer.voice': '开始 / 停止语音对话',
'view.toggleSidebar': '切换会话侧边栏',
'view.toggleRightSidebar': '切换文件浏览器',
'view.showFiles': '显示文件浏览器',

View file

@ -5,6 +5,8 @@
// like navigate / theme); labels come from i18n (`t.keybinds.actions[id]`). To
// add a hotkey, add a row here and a handler there — nothing else.
import { IS_MAC } from './combo'
/** Category a keybind action is filed under; each `KEYBIND_ACTIONS` row names one in its `category` field. */
export type KeybindCategory = 'composer' | 'profiles' | 'session' | 'navigation' | 'view'
// The self-referential opener — bound + dispatched like any action, but shown in
@ -55,6 +57,12 @@ export const KEYBIND_ACTIONS: readonly KeybindActionMeta[] = [
// ── Composer ─────────────────────────────────────────────────────────────
{ id: 'composer.focus', category: 'composer', defaults: [] },
{ id: 'composer.modelPicker', category: 'composer', defaults: [] },
// Voice conversation toggle. Matches the documented `voice.record_key`
// (Ctrl+B). On macOS that's literally ⌃B — distinct from the ⌘B sidebar
// toggle. Off macOS `ctrl` folds to `mod`, which IS the ⌘B/Ctrl+B sidebar
// chord, so ship it unbound there (rebindable in the panel) rather than
// stealing the long-standing sidebar binding.
{ id: 'composer.voice', category: 'composer', defaults: IS_MAC ? ['ctrl+b'] : [] },
// ── Profiles ─────────────────────────────────────────────────────────────
{ id: 'profile.default', category: 'profiles', defaults: ['mod+d'] },

View file

@ -8,6 +8,12 @@ import {
import { sanitizeTextForSpeech } from './speech-text'
// Free Edge TTS occasionally hands back audio that never fires `playing`/`ended`
// nor `error` — leaving voice mode stuck "speaking" forever. Reject if playback
// fails to start or stalls mid-stream for this long (rearmed on each progress
// tick, so legitimately long speech is never cut off).
const PLAYBACK_STALL_MS = 15_000
let currentAudio: HTMLAudioElement | null = null
let currentStop: (() => void) | null = null
let sequence = 0
@ -78,12 +84,31 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
setVoicePlaybackState(currentState('speaking', options, audio))
await new Promise<void>((resolve, reject) => {
let stall: number | null = null
// Undo everything this playback attempt registered: the audio listeners, the
// module-level stop hook, and any pending stall timer.
const cleanup = () => {
  audio.removeEventListener('timeupdate', armStall)
  audio.removeEventListener('error', onError)
  audio.removeEventListener('ended', onEnded)
  currentStop = null

  if (stall === null) {
    return
  }

  window.clearTimeout(stall)
  stall = null
}
// (Re)start the stall watchdog. Each call replaces any pending timer; if
// PLAYBACK_STALL_MS then passes without another call, the attempt is torn
// down and the promise rejects. Also used as the `timeupdate` listener, so
// playback progress keeps pushing the deadline out.
const armStall = () => {
  if (stall !== null) {
    window.clearTimeout(stall)
  }

  const onStallTimeout = () => {
    cleanup()
    reject(new Error('Playback stalled'))
  }

  stall = window.setTimeout(onStallTimeout, PLAYBACK_STALL_MS)
}
const onEnded = () => {
cleanup()
resolve()
@ -101,7 +126,9 @@ export async function playSpeechText(text: string, options: VoicePlaybackOptions
audio.addEventListener('ended', onEnded, { once: true })
audio.addEventListener('error', onError, { once: true })
void audio.play().catch(reject)
audio.addEventListener('timeupdate', armStall)
armStall()
void audio.play().catch(onError)
})
if (!isCurrent()) {