fix(desktop): repair voice dictation on Windows

Voice dictation was broken on Windows in two ways:

1. Mic access was denied. The Electron permission request handler only granted 'media' requests whose details.mediaTypes included 'audio', but Chromium on Windows frequently fires the mic request with an empty mediaTypes array, so getUserMedia threw NotAllowedError. The handler now grants audio capture when mediaTypes includes 'audio' (without 'video') OR is empty/absent, handles the 'audioCapture' permission name, and adds a setPermissionCheckHandler (the synchronous path Chromium also consults for getUserMedia on Windows). Video is still denied.

2. Transcripts went nowhere. The composer's insertText handler (used by dictation and other inserts) only updated the assistant-ui composer store via setText, never the contentEditable editor DOM. The draft->editor sync effect only re-renders the editor when it is NOT focused, and dictation runs while the editor has/regains focus, so the transcript was stored but never shown and could not be sent. insertText now renders into the editor DOM and places the caret, mirroring appendExternalText.

Also hardens fetchJson: a 2xx response with an HTML body (or text/html content-type) now rejects with a clear message naming the URL instead of an opaque JSON.parse 'Unexpected token <' error.
2026-07-29 18:46:59 +00:00 · 2026-05-31 03:49:28 -04:00 · 2026-05-31 03:49:28 -04:00 · 967bc910b4
commit 967bc910b4
parent 403a833604
2 changed files with 60 additions and 6 deletions
--- a/apps/desktop/electron/main.cjs
+++ b/apps/desktop/electron/main.cjs
@ -2611,15 +2611,54 @@ function installContextMenu(window) {
  })
 }

-function installMediaPermissions() {
-  session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback, details) => {
-    if (permission === 'media' && details?.mediaTypes?.includes('audio')) {
-      callback(true)
+// Microphone capture for the voice composer. The renderer drives mic access
+// through getUserMedia, which Chromium gates behind these two session hooks.
+//
+// The naive `details.mediaTypes.includes('audio')` check works on macOS but
+// breaks on Windows: Chromium frequently fires the mic permission request with
+// an empty/undefined `mediaTypes`, so the strict check denies it and
+// getUserMedia throws NotAllowedError ("Microphone permission was denied").
+// We therefore grant 'audioCapture' outright, and grant 'media' whenever its
+// mediaTypes either includes 'audio' (and not 'video') OR is empty/absent (the
+// Windows case). Any request that names video is still denied.
+function isAudioCapturePermission(permission, details) {
+  if (permission === 'audioCapture') {
+    return true
+  }
+  if (permission !== 'media') {
+    return false
+  }
+  const mediaTypes = details?.mediaTypes
+  if (!Array.isArray(mediaTypes) || mediaTypes.length === 0) {
+    // Windows: mediaTypes is often empty for a mic request. Don't deny on
+    // missing metadata. (A video request would carry mediaTypes:['video'].)
+    return true
+  }
+  return mediaTypes.includes('audio') && !mediaTypes.includes('video')
+}

-      return
+function installMediaPermissions() {
+  // Async request handler: the prompt-style path (most platforms).
+  session.defaultSession.setPermissionRequestHandler((_webContents, permission, callback, details) => {
+    callback(isAudioCapturePermission(permission, details))
+  })
+
+  // Synchronous check handler: Chromium consults this for getUserMedia on
+  // Windows in addition to (or instead of) the request handler. Without it,
+  // the check defaults to false and the mic is denied before the request
+  // handler ever runs.
+  session.defaultSession.setPermissionCheckHandler((_webContents, permission, _origin, details) => {
+    if (permission === 'media' || permission === 'audioCapture') {
+      // details.mediaType is a single string here (not the mediaTypes array).
+      const mediaType = details?.mediaType
+      if (mediaType === 'video') {
+        return false
+      }
+
+      return true
    }

-    callback(false)
+    return false
  })
 }

--- a/apps/desktop/src/app/chat/composer/index.tsx
+++ b/apps/desktop/src/app/chat/composer/index.tsx
@ -331,6 +331,21 @@ export function ChatBar({

    draftRef.current = nextDraft
    aui.composer().setText(nextDraft)
+
+    // Push the new text into the contentEditable editor directly. Setting the
+    // assistant-ui composer state alone is not enough: the draft→editor sync
+    // effect only re-renders the editor when it is NOT focused
+    // (document.activeElement !== editor), and the dictation/insert paths
+    // typically run while the editor has (or immediately regains) focus — so
+    // the store would hold the text but the visible editor would stay empty
+    // and there'd be nothing to send. Mirror appendExternalText here.
+    const editor = editorRef.current
+
+    if (editor) {
+      renderComposerContents(editor, nextDraft)
+      placeCaretEnd(editor)
+    }
+
    requestMainFocus()
  }