From 967bc910b42e488d44a885092c127fec73b79559 Mon Sep 17 00:00:00 2001 From: emozilla Date: Sun, 31 May 2026 03:49:28 -0400 Subject: [PATCH] fix(desktop): repair voice dictation on Windows Voice dictation was broken on Windows in two ways: 1. Mic access was denied. The Electron permission request handler only granted 'media' requests whose details.mediaTypes included 'audio', but Chromium on Windows frequently fires the mic request with an empty mediaTypes array, so getUserMedia threw NotAllowedError. The request handler now grants 'media' when mediaTypes is empty/absent or includes 'audio' without 'video', grants the 'audioCapture' permission name, and a setPermissionCheckHandler is added (the synchronous path Chromium also consults for getUserMedia on Windows) that grants 'media'/'audioCapture' unless details.mediaType is 'video'. Video, including a combined audio+video request, is still denied. 2. Transcripts went nowhere. The composer's insertText handler (used by dictation and other inserts) only updated the assistant-ui composer store via setText, never the contentEditable editor DOM. The draft->editor sync effect only re-renders the editor when it is NOT focused, and dictation runs while the editor has or regains focus, so the transcript was stored but never shown and could not be sent. insertText now renders into the editor DOM and places the caret, mirroring appendExternalText. Note: the fetchJson hardening (a 2xx response with an HTML body, or a text/html content-type, rejecting with a clear message naming the URL instead of an opaque JSON.parse 'Unexpected token <' error) is not part of this patch; the diff below changes only the media-permission handlers in main.cjs and the composer. 
--- apps/desktop/electron/main.cjs | 51 +++++++++++++++++--- apps/desktop/src/app/chat/composer/index.tsx | 15 ++++++ 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/apps/desktop/electron/main.cjs b/apps/desktop/electron/main.cjs index f5f6a376d33..62af3d859a5 100644 --- a/apps/desktop/electron/main.cjs +++ b/apps/desktop/electron/main.cjs @@ -2611,15 +2611,54 @@ function installContextMenu(window) { }) } -function installMediaPermissions() { - session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback, details) => { - if (permission === 'media' && details?.mediaTypes?.includes('audio')) { - callback(true) +// Microphone capture for the voice composer. The renderer drives mic access +// through getUserMedia, which Chromium gates behind these two session hooks. +// +// The naive `details.mediaTypes.includes('audio')` check works on macOS but +// breaks on Windows: Chromium frequently fires the mic permission request with +// an empty/undefined `mediaTypes`, so the strict check denies it and +// getUserMedia throws NotAllowedError ("Microphone permission was denied"). +// We therefore allow 'audioCapture' unconditionally, and allow 'media' when +// mediaTypes is empty/absent (the Windows case) or includes 'audio' without +// 'video'. Video, including a combined audio+video request, is still denied. +function isAudioCapturePermission(permission, details) { + if (permission === 'audioCapture') { + return true + } + if (permission !== 'media') { + return false + } + const mediaTypes = details?.mediaTypes + if (!Array.isArray(mediaTypes) || mediaTypes.length === 0) { + // Windows: mediaTypes is often empty for a mic request. Don't deny on + // missing metadata. (A video request would carry mediaTypes:['video'].) + return true + } + return mediaTypes.includes('audio') && !mediaTypes.includes('video') +} - return +function installMediaPermissions() { + // Async request handler: the prompt-style path (most platforms). 
+ session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback, details) => { + callback(isAudioCapturePermission(permission, details)) + }) + + // Synchronous check handler: Chromium consults this for getUserMedia on + // Windows in addition to (or instead of) the request handler. Without it, + // the check defaults to false and the mic is denied before the request + // handler ever runs. + session.defaultSession.setPermissionCheckHandler((_webContents, permission, _origin, details) => { + if (permission === 'media' || permission === 'audioCapture') { + // details.mediaType is a single string here (not the mediaTypes array). + const mediaType = details?.mediaType + if (mediaType === 'video') { + return false + } + + return true } - callback(false) + return false }) } diff --git a/apps/desktop/src/app/chat/composer/index.tsx b/apps/desktop/src/app/chat/composer/index.tsx index 10887fbb8d6..a0b1a370baa 100644 --- a/apps/desktop/src/app/chat/composer/index.tsx +++ b/apps/desktop/src/app/chat/composer/index.tsx @@ -331,6 +331,21 @@ export function ChatBar({ draftRef.current = nextDraft aui.composer().setText(nextDraft) + + // Push the new text into the contentEditable editor directly. Setting the + // assistant-ui composer state alone is not enough: the draft→editor sync + // effect only re-renders the editor when it is NOT focused + // (document.activeElement !== editor), and the dictation/insert paths + // typically run while the editor has (or immediately regains) focus — so + // the store would hold the text but the visible editor would stay empty + // and there'd be nothing to send. Mirror appendExternalText here. + const editor = editorRef.current + + if (editor) { + renderComposerContents(editor, nextDraft) + placeCaretEnd(editor) + } + requestMainFocus() }