feat: lots of speech stuff (chunked voice playback, stop-turn control, ElevenLabs STT settings, @-reference popover cleanup)

2026-06-14 09:11:54 +00:00 · 2026-05-01 19:28:02 -05:00 · 2026-05-01 19:28:02 -05:00 · d5d7b5c6dc
commit d5d7b5c6dc
parent 9f3d393a4d
41 changed files with 1405 additions and 361 deletions
--- a/.env.example
+++ b/.env.example
@ -384,9 +384,9 @@ IMAGE_TOOLS_DEBUG=false
 # Default STT provider is "local" (faster-whisper) — runs on your machine, no API key needed.
 # Install with: pip install faster-whisper
 # Model downloads automatically on first use (~150 MB for "base").
-# To use cloud providers instead, set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY above.
-# Provider priority: local > groq > openai
-# Configure in config.yaml: stt.provider: local | groq | openai
+# To use a cloud provider instead, set its API key above (e.g. GROQ_API_KEY, VOICE_TOOLS_OPENAI_KEY, or ELEVENLABS_API_KEY).
+# Provider priority: local > groq > openai > mistral > xai > elevenlabs
+# Configure in config.yaml: stt.provider: local | groq | openai | mistral | xai | elevenlabs

 # =============================================================================
 # STT ADVANCED OVERRIDES (optional)
@ -394,10 +394,12 @@ IMAGE_TOOLS_DEBUG=false
 # Override default STT models per provider (normally set via stt.model in config.yaml)
 # STT_GROQ_MODEL=whisper-large-v3-turbo
 # STT_OPENAI_MODEL=whisper-1
+# STT_ELEVENLABS_MODEL=scribe_v2

 # Override STT provider endpoints (for proxies or self-hosted instances)
 # GROQ_BASE_URL=https://api.groq.com/openai/v1
 # STT_OPENAI_BASE_URL=https://api.openai.com/v1
+# ELEVENLABS_STT_BASE_URL=https://api.elevenlabs.io/v1

 # =============================================================================
 # MICROSOFT TEAMS INTEGRATION
--- a/apps/desktop/package-lock.json
+++ b/apps/desktop/package-lock.json
@ -10,6 +10,7 @@
      "dependencies": {
        "@assistant-ui/react": "^0.12.28",
        "@assistant-ui/react-streamdown": "^0.1.11",
+        "@audiowave/react": "^0.6.2",
        "@chenglou/pretext": "^0.0.6",
        "@nanostores/react": "^1.1.0",
        "@radix-ui/react-slot": "^1.2.4",
@ -305,6 +306,25 @@
        }
      }
    },
+    "node_modules/@audiowave/core": {
+      "version": "0.3.1",
+      "resolved": "https://registry.npmjs.org/@audiowave/core/-/core-0.3.1.tgz",
+      "integrity": "sha512-KtC2MTWKp6Orkedty3I8IklVBVQ2IFaFWDJ1cz+UsACpX2x1gINwZGTRZT7bw/dx8KazNSMuVK5lm1jL67KQkQ==",
+      "license": "MIT"
+    },
+    "node_modules/@audiowave/react": {
+      "version": "0.6.2",
+      "resolved": "https://registry.npmjs.org/@audiowave/react/-/react-0.6.2.tgz",
+      "integrity": "sha512-hajG2Iv3mVxived9wXad8L0ZQF+HmYnB3IrfOkIdkTv4RxOJDXwFWMAd0zb7ZU1Qz0IEYZXCbASFWyuxEQ7PAw==",
+      "license": "MIT",
+      "dependencies": {
+        "@audiowave/core": "0.3.1"
+      },
+      "peerDependencies": {
+        "react": ">=16.8.0",
+        "react-dom": ">=16.8.0"
+      }
+    },
    "node_modules/@babel/code-frame": {
      "version": "7.29.0",
      "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz",
--- a/apps/desktop/package.json
+++ b/apps/desktop/package.json
@ -23,6 +23,7 @@
  "dependencies": {
    "@assistant-ui/react": "^0.12.28",
    "@assistant-ui/react-streamdown": "^0.1.11",
+    "@audiowave/react": "^0.6.2",
    "@chenglou/pretext": "^0.0.6",
    "@nanostores/react": "^1.1.0",
    "@radix-ui/react-slot": "^1.2.4",
--- a/apps/desktop/src/app/chat/composer/constants.ts
+++ b/apps/desktop/src/app/chat/composer/constants.ts
@ -1,4 +1,3 @@
-import type { Unstable_TriggerItem } from '@assistant-ui/core'
 import type { Unstable_IconComponent } from '@assistant-ui/react'
 import { FileText, FolderOpen, ImageIcon, Link, type LucideIcon } from 'lucide-react'
 import type { CSSProperties } from 'react'
@ -37,7 +36,7 @@ export const DIRECTIVE_ICONS: Record<string, Unstable_IconComponent> = {
 }

 export const DIRECTIVE_POPOVER_CLASS =
-  'absolute bottom-24 left-1/2 z-50 w-[min(calc(100vw-1.5rem),28rem)] max-h-[min(28rem,calc(100vh-8rem))] -translate-x-1/2 overflow-y-auto overscroll-contain rounded-2xl border border-border/70 bg-popover p-1.5 text-popover-foreground shadow-2xl'
+  'absolute bottom-24 left-1/2 z-50 w-[min(calc(100vw-1.5rem),26rem)] max-h-[min(24rem,calc(100vh-8rem))] -translate-x-1/2 overflow-y-auto overscroll-contain rounded-2xl border border-border/60 bg-popover/95 p-1.5 text-popover-foreground shadow-2xl backdrop-blur-md ring-1 ring-black/5'

 export const PROMPT_SNIPPETS = [
  {
@ -64,37 +63,6 @@ export const ASK_PLACEHOLDERS = [
  'Duck mode: gentle debugging, together.'
 ]

-export const REF_ITEMS: Unstable_TriggerItem[] = [
-  {
-    id: 'file:',
-    type: 'file',
-    label: 'File',
-    description: 'Attach a file path',
-    metadata: { icon: 'file' }
-  },
-  {
-    id: 'folder:',
-    type: 'folder',
-    label: 'Folder',
-    description: 'Attach a folder path',
-    metadata: { icon: 'folder' }
-  },
-  {
-    id: 'url:',
-    type: 'url',
-    label: 'URL',
-    description: 'Attach a web page',
-    metadata: { icon: 'url' }
-  },
-  {
-    id: 'image:',
-    type: 'image',
-    label: 'Image',
-    description: 'Attach an image path',
-    metadata: { icon: 'image' }
-  }
-]
-
 export const EDGE_NEWLINES_RE = /^[\t ]*(?:\r\n|\r|\n)+|(?:\r\n|\r|\n)+[\t ]*$/g
 export const DEFAULT_MAX_RECORDING_SECONDS = 120

--- a/apps/desktop/src/app/chat/composer/context-menu.tsx
+++ b/apps/desktop/src/app/chat/composer/context-menu.tsx
@ -15,11 +15,10 @@ import {
 import { cn } from '@/lib/utils'

 import { GHOST_ICON_BTN, PROMPT_SNIPPETS } from './constants'
-import type { ChatBarState, ContextSuggestion } from './types'
+import type { ChatBarState } from './types'

 export function ContextMenu({
  state,
-  onAddContextRef,
  onInsertText,
  onOpenUrlDialog,
  onPasteClipboardImage,
@ -28,7 +27,6 @@ export function ContextMenu({
  onPickImages
 }: {
  state: ChatBarState
-  onAddContextRef?: (refText: string, label?: string, detail?: string) => void
  onInsertText: (text: string) => void
  onOpenUrlDialog: () => void
  onPasteClipboardImage?: () => void
@ -36,11 +34,6 @@ export function ContextMenu({
  onPickFolders?: () => void
  onPickImages?: () => void
 }) {
-  const choose = (item: ContextSuggestion) =>
-    onAddContextRef ? onAddContextRef(item.text, item.display, item.meta) : onInsertText(item.text)
-
-  const suggestions = state.tools.suggestions?.slice(0, 8) ?? []
-
  return (
    <DropdownMenu>
      <DropdownMenuTrigger asChild>
@ -56,48 +49,28 @@ export function ContextMenu({
          <Plus size={18} />
        </Button>
      </DropdownMenuTrigger>
-      <DropdownMenuContent align="start" className="w-64" side="top" sideOffset={10}>
-        <DropdownMenuLabel className="text-xs text-muted-foreground">Add context</DropdownMenuLabel>
+      <DropdownMenuContent align="start" className="w-60" side="top" sideOffset={10}>
+        <DropdownMenuLabel className="text-[0.7rem] font-medium uppercase tracking-wide text-muted-foreground/85">
+          Attach
+        </DropdownMenuLabel>
        <ContextMenuItem disabled={!onPickFiles} icon={FileText} onSelect={onPickFiles}>
-          Files
+          Files…
        </ContextMenuItem>
        <ContextMenuItem disabled={!onPickFolders} icon={FolderOpen} onSelect={onPickFolders}>
-          Folders
+          Folder…
        </ContextMenuItem>
        <ContextMenuItem disabled={!onPickImages} icon={ImageIcon} onSelect={onPickImages}>
-          Images
+          Images…
        </ContextMenuItem>
        <ContextMenuItem disabled={!onPasteClipboardImage} icon={Clipboard} onSelect={onPasteClipboardImage}>
-          Image from clipboard
+          Paste image
        </ContextMenuItem>
        <ContextMenuItem icon={Link} onSelect={onOpenUrlDialog}>
-          URL
+          URL…
        </ContextMenuItem>

        <DropdownMenuSeparator />

-        <DropdownMenuSub>
-          <DropdownMenuSubTrigger>
-            <FileText />
-            <span>Suggested files</span>
-          </DropdownMenuSubTrigger>
-          <DropdownMenuSubContent className="w-72">
-            {suggestions.length === 0 ? (
-              <DropdownMenuItem disabled>
-                <span className="text-muted-foreground">No suggestions</span>
-              </DropdownMenuItem>
-            ) : (
-              suggestions.map(item => (
-                <DropdownMenuItem key={item.text} onSelect={() => choose(item)}>
-                  <FileText />
-                  <span className="min-w-0 flex-1 truncate">{item.display}</span>
-                  {item.meta && <span className="max-w-28 truncate text-xs text-muted-foreground">{item.meta}</span>}
-                </DropdownMenuItem>
-              ))
-            )}
-          </DropdownMenuSubContent>
-        </DropdownMenuSub>
-
        <DropdownMenuSub>
          <DropdownMenuSubTrigger>
            <MessageSquareText />
@ -111,6 +84,13 @@ export function ContextMenu({
            ))}
          </DropdownMenuSubContent>
        </DropdownMenuSub>
+
+        <DropdownMenuSeparator />
+
+        <div className="px-2 py-1 text-[0.7rem] text-muted-foreground/80">
+          Tip: type <kbd className="rounded bg-muted/70 px-1 py-px font-mono text-[0.65rem]">@</kbd> to reference files
+          inline.
+        </div>
      </DropdownMenuContent>
    </DropdownMenu>
  )
--- a/apps/desktop/src/app/chat/composer/controls.tsx
+++ b/apps/desktop/src/app/chat/composer/controls.tsx
@ -15,6 +15,7 @@ interface ConversationProps {
  status: ConversationStatus
  onEnd: () => void
  onStart: () => void
+  onStopTurn: () => void
  onToggleMute: () => void
 }

@ -80,6 +81,7 @@ function ConversationPill({
  level,
  muted,
  onEnd,
+  onStopTurn,
  onToggleMute,
  status
 }: ConversationProps & { disabled: boolean }) {
@ -104,10 +106,10 @@ function ConversationPill({
        aria-pressed={muted}
        className={cn(GHOST_ICON_BTN, 'p-0', muted && 'bg-muted text-muted-foreground')}
        disabled={disabled}
-      onClick={() => {
-        triggerHaptic('selection')
-        onToggleMute()
-      }}
+        onClick={() => {
+          triggerHaptic('selection')
+          onToggleMute()
+        }}
        size="icon"
        title={muted ? 'Unmute microphone' : 'Mute microphone'}
        type="button"
@ -115,6 +117,23 @@ function ConversationPill({
      >
        {muted ? <MicOff size={16} /> : <Mic size={16} />}
      </Button>
+      {listening && (
+        <Button
+          aria-label="Stop listening and send"
+          className="h-8 shrink-0 gap-1.5 rounded-full px-2.5 text-xs text-muted-foreground hover:bg-accent hover:text-foreground"
+          disabled={disabled}
+          onClick={() => {
+            triggerHaptic('submit')
+            onStopTurn()
+          }}
+          title="Stop listening and send"
+          type="button"
+          variant="ghost"
+        >
+          <Square className="fill-current" size={11} />
+          <span>Stop</span>
+        </Button>
+      )}
      <Button
        aria-label="End voice conversation"
        className="h-8 gap-1.5 rounded-full bg-primary px-3 text-xs font-medium text-primary-foreground hover:bg-primary/90"
--- a/apps/desktop/src/app/chat/composer/directive-popover.tsx
+++ b/apps/desktop/src/app/chat/composer/directive-popover.tsx
@ -5,9 +5,9 @@ import {
  type Unstable_MentionCategory,
  type Unstable_MentionDirective
 } from '@assistant-ui/react'
-import { ChevronDown } from 'lucide-react'
+import { FileText } from 'lucide-react'

-import { DIRECTIVE_POPOVER_CLASS, REF_ITEMS } from './constants'
+import { DIRECTIVE_POPOVER_CLASS } from './constants'
 import type { ContextSuggestion } from './types'

 export function DirectivePopover({
@ -24,80 +24,73 @@ export function DirectivePopover({
  return (
    <ComposerPrimitive.Unstable_TriggerPopover adapter={adapter} char="@" className={DIRECTIVE_POPOVER_CLASS}>
      <ComposerPrimitive.Unstable_TriggerPopover.Directive {...directive} />
-      <ComposerPrimitive.Unstable_TriggerPopoverCategories>
-        {categories => (
-          <div className="grid gap-1">
-            {categories.map(c => (
-              <ComposerPrimitive.Unstable_TriggerPopoverCategoryItem
-                categoryId={c.id}
-                className="flex w-full items-center justify-between rounded-xl px-3 py-2 text-left text-sm hover:bg-accent data-highlighted:bg-accent"
-                key={c.id}
-              >
-                <span>{c.label}</span>
-                <ChevronDown className="-rotate-90 size-3.5 text-muted-foreground" />
-              </ComposerPrimitive.Unstable_TriggerPopoverCategoryItem>
-            ))}
-          </div>
-        )}
-      </ComposerPrimitive.Unstable_TriggerPopoverCategories>
      <ComposerPrimitive.Unstable_TriggerPopoverItems>
        {items => (
-          <div className="grid gap-1">
-            <ComposerPrimitive.Unstable_TriggerPopoverBack className="mb-1 text-xs text-muted-foreground hover:text-foreground">
-              Back
-            </ComposerPrimitive.Unstable_TriggerPopoverBack>
-            {items.map((item, index) => {
-              const Icon = directiveIcon(item, iconMap, Fallback)
+          <div className="grid gap-0.5">
+            <div className="px-2 pb-1 pt-0.5 text-[0.7rem] font-medium uppercase tracking-wide text-muted-foreground/80">
+              Reference a file
+            </div>
+            {items.length === 0 ? (
+              <div className="px-3 py-3 text-sm text-muted-foreground">
+                <p>No file suggestions yet.</p>
+                <p className="mt-1 text-xs text-muted-foreground/80">
+                  Keep typing to filter, or click <span className="font-medium text-foreground/80">+</span> to attach
+                  files, folders, or a URL.
+                </p>
+              </div>
+            ) : (
+              items.map((item, index) => {
+                const Icon = directiveIcon(item, iconMap, Fallback)

-              return (
-                <ComposerPrimitive.Unstable_TriggerPopoverItem
-                  className="flex w-full items-center gap-2 rounded-xl px-3 py-2 text-left text-sm hover:bg-accent data-highlighted:bg-accent"
-                  index={index}
-                  item={item}
-                  key={`${item.type}:${item.id}`}
-                >
-                  <Icon className="size-4 shrink-0 text-muted-foreground" />
-                  <span className="grid min-w-0 flex-1 gap-0.5">
-                    <span className="truncate font-medium">{item.label}</span>
-                    {item.description && (
-                      <span className="truncate text-xs text-muted-foreground">{item.description}</span>
-                    )}
-                  </span>
-                </ComposerPrimitive.Unstable_TriggerPopoverItem>
-              )
-            })}
+                return (
+                  <ComposerPrimitive.Unstable_TriggerPopoverItem
+                    className="flex w-full items-center gap-2 rounded-xl px-2.5 py-1.5 text-left text-sm transition-colors hover:bg-accent/70 data-highlighted:bg-accent"
+                    index={index}
+                    item={item}
+                    key={`${item.type}:${item.id}`}
+                  >
+                    <Icon className="size-4 shrink-0 text-muted-foreground/80" />
+                    <span className="grid min-w-0 flex-1 gap-0.5">
+                      <span className="truncate font-medium text-foreground">{item.label}</span>
+                      {item.description && (
+                        <span className="truncate text-[0.72rem] text-muted-foreground/85">{item.description}</span>
+                      )}
+                    </span>
+                  </ComposerPrimitive.Unstable_TriggerPopoverItem>
+                )
+              })
+            )}
          </div>
        )}
      </ComposerPrimitive.Unstable_TriggerPopoverItems>
    </ComposerPrimitive.Unstable_TriggerPopover>
  )
 }
+
 export function buildMentionCategories(suggestions: ContextSuggestion[] | undefined): Unstable_MentionCategory[] {
-  const items = (suggestions ?? [])
-    .map(s => {
-      const match = s.text.match(/^@(file|folder|url|image):(.+)$/)
+  const items: Unstable_TriggerItem[] = []

-      if (!match) {
-        return null
-      }
+  for (const s of suggestions ?? []) {
+    const match = s.text.match(/^@(file|folder|url|image):(.+)$/)

-      const [, type, id] = match
+    if (!match) {
+      continue
+    }

-      return {
-        id,
-        type,
-        label: s.display || id,
-        description: s.meta,
-        metadata: { icon: type }
-      }
+    const [, type, id] = match
+
+    items.push({
+      id,
+      type,
+      label: s.display || id,
+      description: s.meta,
+      metadata: { icon: type }
    })
-    .filter((item): item is NonNullable<typeof item> => Boolean(item))
+  }

-  return [
-    { id: 'refs', label: 'Hermes refs', items: REF_ITEMS },
-    ...(items.length ? [{ id: 'context', label: 'Suggested files', items }] : [])
-  ]
+  return [{ id: 'context', label: 'References', items }]
 }
+
 function directiveIcon(
  item: Unstable_TriggerItem,
  iconMap: Record<string, Unstable_IconComponent>,
@ -106,5 +99,5 @@ function directiveIcon(
  const meta = item.metadata as Record<string, unknown> | undefined
  const key = typeof meta?.icon === 'string' ? meta.icon : item.type

-  return iconMap[key] ?? iconMap[item.type] ?? fallback
+  return iconMap[key] ?? iconMap[item.type] ?? fallback ?? FileText
 }
--- a/apps/desktop/src/app/chat/composer/hooks/use-voice-conversation.ts
+++ b/apps/desktop/src/app/chat/composer/hooks/use-voice-conversation.ts
@ -1,6 +1,6 @@
 import { useCallback, useEffect, useRef, useState } from 'react'

-import { speakText } from '@/hermes'
+import { playSpeechText, stopVoicePlayback } from '@/lib/voice-playback'
 import { notify, notifyError } from '@/store/notifications'

 import {
@ -14,13 +14,19 @@ import { useMicRecorder } from './use-mic-recorder'

 export type ConversationStatus = 'idle' | 'listening' | 'transcribing' | 'thinking' | 'speaking'

+/** Snapshot of the assistant message the voice loop is currently reading aloud. */
+interface PendingVoiceResponse {
+  // Message id; a change of id makes the hook reset its speech buffer.
+  id: string
+  // True while the message is still marked pending; once false (and the agent
+  // is no longer busy) the remaining buffered text is flushed.
+  pending: boolean
+  // Message text so far; the hook only consumes the part beyond what it has already read.
+  text: string
+}
+
 interface VoiceConversationOptions {
  busy: boolean
  enabled: boolean
  onFatalError?: () => void
-  onSubmit: (text: string) => void
+  onSubmit: (text: string) => Promise<void> | void
  onTranscribeAudio?: (audio: Blob) => Promise<string>
-  pendingResponseText: () => string | null
+  pendingResponse: () => PendingVoiceResponse | null
  consumePendingResponse: () => void
 }

@ -30,16 +36,19 @@ export function useVoiceConversation({
  onFatalError,
  onSubmit,
  onTranscribeAudio,
-  pendingResponseText,
+  pendingResponse,
  consumePendingResponse
 }: VoiceConversationOptions) {
  const { handle, level } = useMicRecorder()
  const [status, setStatus] = useState<ConversationStatus>('idle')
  const [muted, setMuted] = useState(false)
-  const audioRef = useRef<HTMLAudioElement | null>(null)
  const turnTimeoutRef = useRef<number | null>(null)
  const pendingStartRef = useRef(false)
-  const lastSpokenRef = useRef<string | null>(null)
+  const turnClosingRef = useRef(false)
+  const awaitingSpokenResponseRef = useRef(false)
+  const responseIdRef = useRef<string | null>(null)
+  const spokenSourceLengthRef = useRef(0)
+  const speechBufferRef = useRef('')
  const enabledRef = useRef(enabled)
  const mutedRef = useRef(muted)
  const busyRef = useRef(busy)
@ -69,36 +78,74 @@ export function useVoiceConversation({
    }
  }

-  const stopAudio = useCallback(() => {
-    const audio = audioRef.current
+  // Forget all per-response speech state: the tracked response id, how much of
+  // its text has been consumed, and any buffered text not yet spoken.
+  const resetSpeechBuffer = () => {
+    responseIdRef.current = null
+    spokenSourceLengthRef.current = 0
+    speechBufferRef.current = ''
+  }

-    if (audio) {
-      audio.pause()
-      audio.src = ''
-      audioRef.current = null
-    }
-  }, [])
-
-  const handleTurn = useCallback(async () => {
-    clearTurnTimeout()
-    setStatus('transcribing')
-    const result = await handle.stop()
-
-    if (!result || !result.heardSpeech || !onTranscribeAudio) {
-      if (enabledRef.current && !mutedRef.current && !busyRef.current && statusRef.current !== 'speaking') {
-        pendingStartRef.current = true
-      }
-
-      setStatus('idle')
+  // Appends newly streamed response text to the speech buffer.
+  // Deltas are arbitrary slices of the growing response and may end mid-word,
+  // so they are concatenated verbatim: inserting a separator here would split
+  // words ("Hel" + "lo" -> "Hel lo").
+  const appendSpeechText = (text: string) => {
+    const cleaned = text

+    if (!cleaned) {
      return
    }

-    try {
-      const transcript = (await onTranscribeAudio(result.audio)).trim()
+    speechBufferRef.current += cleaned
+  }

-      if (!transcript) {
-        if (enabledRef.current) {
+  // Removes and returns the next speakable chunk from the speech buffer, or
+  // null when nothing is ready yet.
+  //   1. A leading sentence (ends in . ! ? or the CJK equivalents) of at least
+  //      8 characters; with `force`, any length.
+  //   2. Without `force`, once more than 220 characters are buffered: the text
+  //      up to the last ", " / "; " / ": " starting at or before index 180,
+  //      provided that boundary lies past index 80.
+  //   3. With `force`: whatever is left.
+  const takeSpeechChunk = (force = false): string | null => {
+    // Collapse whitespace runs but keep a trailing space: it is the word
+    // boundary the next appended delta relies on.
+    const buffer = speechBufferRef.current.replace(/\s+/g, ' ').trimStart()
+
+    if (!buffer.trim()) {
+      speechBufferRef.current = ''
+
+      return null
+    }
+
+    const sentence = buffer.match(/^(.+?[.!?。！？])(?:\s+|$)/)
+
+    if (sentence?.[1] && (sentence[1].length >= 8 || force)) {
+      const chunk = sentence[1].trim()
+      speechBufferRef.current = buffer.slice(sentence[1].length).trimStart()
+
+      return chunk
+    }
+
+    if (!force && buffer.length > 220) {
+      const softBoundary = Math.max(buffer.lastIndexOf(', ', 180), buffer.lastIndexOf('; ', 180), buffer.lastIndexOf(': ', 180))
+
+      if (softBoundary > 80) {
+        const chunk = buffer.slice(0, softBoundary + 1).trim()
+        speechBufferRef.current = buffer.slice(softBoundary + 1).trimStart()
+
+        return chunk
+      }
+    }
+
+    if (!force) {
+      return null
+    }
+
+    speechBufferRef.current = ''
+
+    return buffer.trim()
+  }
+
+  const handleTurn = useCallback(async (forceTranscribe = false) => {
+    if (turnClosingRef.current) {
+      return
+    }
+
+    turnClosingRef.current = true
+    clearTurnTimeout()
+    setStatus('transcribing')
+
+    try {
+      const result = await handle.stop()
+
+      if (!result || (!result.heardSpeech && !forceTranscribe) || !onTranscribeAudio) {
+        if (enabledRef.current && !mutedRef.current && !busyRef.current && statusRef.current !== 'speaking') {
          pendingStartRef.current = true
        }

@ -107,16 +154,34 @@ export function useVoiceConversation({
        return
      }

-      onSubmit(transcript)
-      setStatus('thinking')
-    } catch (error) {
-      notifyError(error, 'Voice transcription failed')
+      try {
+        const transcript = (await onTranscribeAudio(result.audio)).trim()

-      if (enabledRef.current && !mutedRef.current && !busyRef.current) {
-        pendingStartRef.current = true
+        if (!transcript) {
+          if (enabledRef.current) {
+            pendingStartRef.current = true
+          }
+
+          setStatus('idle')
+
+          return
+        }
+
+        awaitingSpokenResponseRef.current = true
+        resetSpeechBuffer()
+        await onSubmit(transcript)
+        setStatus('thinking')
+      } catch (error) {
+        notifyError(error, 'Voice transcription failed')
+
+        if (enabledRef.current && !mutedRef.current && !busyRef.current) {
+          pendingStartRef.current = true
+        }
+
+        setStatus('idle')
      }
-
-      setStatus('idle')
+    } finally {
+      turnClosingRef.current = false
    }
  }, [handle, onSubmit, onTranscribeAudio])

@ -158,24 +223,13 @@ export function useVoiceConversation({

  const speak = useCallback(
    async (text: string) => {
-      stopAudio()
      setStatus('speaking')

      try {
-        const response = await speakText(text)
-        const audio = new Audio(response.data_url)
-        audioRef.current = audio
-
-        await new Promise<void>((resolve, reject) => {
-          audio.addEventListener('ended', () => resolve(), { once: true })
-          audio.addEventListener('error', () => reject(new Error('Playback failed')), { once: true })
-          void audio.play().catch(reject)
-        })
+        await playSpeechText(text, { source: 'voice-conversation' })
      } catch (error) {
        notifyError(error, 'Voice playback failed')
      } finally {
-        audioRef.current = null
-
        if (enabledRef.current) {
          pendingStartRef.current = true
          setStatus('idle')
@ -184,7 +238,7 @@ export function useVoiceConversation({
        }
      }
    },
-    [stopAudio]
+    []
  )

  const start = useCallback(async () => {
@ -200,20 +254,31 @@ export function useVoiceConversation({
    }

    setMuted(false)
-    lastSpokenRef.current = null
+    awaitingSpokenResponseRef.current = false
+    resetSpeechBuffer()
+    consumePendingResponse()
    pendingStartRef.current = true
-  }, [onFatalError, onTranscribeAudio])
+    await startListening()
+  }, [consumePendingResponse, onFatalError, onTranscribeAudio, startListening])

  const end = useCallback(async () => {
    pendingStartRef.current = false
    clearTurnTimeout()
-    stopAudio()
+    stopVoicePlayback()
    handle.cancel()
-    lastSpokenRef.current = null
+    turnClosingRef.current = false
+    awaitingSpokenResponseRef.current = false
+    resetSpeechBuffer()
    consumePendingResponse()
    setMuted(false)
    setStatus('idle')
-  }, [consumePendingResponse, handle, stopAudio])
+  }, [consumePendingResponse, handle])
+
+  // Manually close the current turn. Passing `true` makes handleTurn
+  // transcribe even when the recorder did not report speech. Does nothing
+  // unless the conversation is currently listening.
+  const stopTurn = useCallback(() => {
+    if (statusRef.current === 'listening') {
+      void handleTurn(true)
+    }
+  }, [handleTurn])

  const toggleMute = useCallback(() => {
    setMuted(value => {
@ -231,22 +296,77 @@ export function useVoiceConversation({
    })
  }, [handle])

-  // Drive the loop: speak any new assistant response, otherwise start listening
-  // when the agent is idle and we're between turns.
+  // Keyboard shortcut: while the conversation is enabled and listening, an
+  // unmodified, non-repeating Space press ends the turn via stopTurn().
+  // NOTE(review): the listener runs in the capture phase on window and calls
+  // preventDefault(), so Space typed into a focused text field is also
+  // intercepted while listening — confirm this is intended.
+  useEffect(() => {
+    if (!enabled) {
+      return
+    }
+
+    const onKeyDown = (event: KeyboardEvent) => {
+      if (event.code !== 'Space' || event.repeat || event.metaKey || event.ctrlKey || event.altKey) {
+        return
+      }
+
+      if (statusRef.current !== 'listening') {
+        return
+      }
+
+      event.preventDefault()
+      stopTurn()
+    }
+
+    window.addEventListener('keydown', onKeyDown, { capture: true })
+
+    // Remove with the same capture flag so the listener is actually detached.
+    return () => window.removeEventListener('keydown', onKeyDown, { capture: true })
+  }, [enabled, stopTurn])
+
+  // Drive the loop: after a voice-submitted turn, speak stable chunks as the
+  // assistant stream grows. Otherwise start listening when idle between turns.
  useEffect(() => {
    if (!enabled || muted) {
      return
    }

-    const text = pendingResponseText()
-    const trimmed = text?.trim() ?? ''
+    if (awaitingSpokenResponseRef.current && status !== 'speaking') {
+      const response = pendingResponse()

-    if (trimmed && trimmed !== lastSpokenRef.current && status !== 'speaking') {
-      lastSpokenRef.current = trimmed
-      consumePendingResponse()
-      void speak(trimmed)
+      if (response) {
+        if (response.id !== responseIdRef.current) {
+          resetSpeechBuffer()
+          responseIdRef.current = response.id
+        }

-      return
+        if (response.text.length > spokenSourceLengthRef.current) {
+          appendSpeechText(response.text.slice(spokenSourceLengthRef.current))
+          spokenSourceLengthRef.current = response.text.length
+        }
+
+        const chunk = takeSpeechChunk(!response.pending && !busy)
+
+        if (chunk) {
+          void speak(chunk)
+
+          return
+        }
+
+        if (!response.pending && !busy) {
+          awaitingSpokenResponseRef.current = false
+          consumePendingResponse()
+          resetSpeechBuffer()
+          pendingStartRef.current = true
+          setStatus('idle')
+
+          return
+        }
+      }
+
+      if (!busy && status === 'thinking') {
+        awaitingSpokenResponseRef.current = false
+        resetSpeechBuffer()
+        pendingStartRef.current = true
+        setStatus('idle')
+
+        return
+      }
    }

    if (busy || status !== 'idle') {
@ -256,7 +376,7 @@ export function useVoiceConversation({
    if (pendingStartRef.current) {
      void startListening()
    }
-  }, [busy, consumePendingResponse, enabled, muted, pendingResponseText, speak, startListening, status])
+  }, [busy, consumePendingResponse, enabled, muted, pendingResponse, speak, startListening, status])

  useEffect(() => {
    if (enabled && !wasEnabledRef.current) {
@ -270,5 +390,5 @@ export function useVoiceConversation({
    wasEnabledRef.current = enabled
  }, [enabled, end, start])

-  return { end, level, muted, start, status, toggleMute }
+  return { end, level, muted, start, status, stopTurn, toggleMute }
 }
--- a/apps/desktop/src/app/chat/composer/index.tsx
+++ b/apps/desktop/src/app/chat/composer/index.tsx
@ -32,7 +32,7 @@ import { useVoiceConversation } from './hooks/use-voice-conversation'
 import { useVoiceRecorder } from './hooks/use-voice-recorder'
 import type { ChatBarProps } from './types'
 import { UrlDialog } from './url-dialog'
-import { VoiceActivity } from './voice-activity'
+import { VoiceActivity, VoicePlaybackActivity } from './voice-activity'

 function trimPastedEdgeNewlines(text: string): string {
  return text.replace(EDGE_NEWLINES_RE, '')
@ -45,7 +45,6 @@ export function ChatBar({
  maxRecordingSeconds = DEFAULT_MAX_RECORDING_SECONDS,
  state,
  onCancel,
-  onAddContextRef,
  onAddUrl,
  onPasteClipboardImage,
  onPickFiles,
@ -203,7 +202,7 @@ export function ChatBar({
      onCancel()
    } else if (draft.trim() || attachments.length > 0) {
      triggerHaptic('submit')
-      onSubmit(draft)
+      void onSubmit(draft)
      aui.composer().setText('')
    }

@ -235,9 +234,9 @@ export function ChatBar({
    onTranscribeAudio
  })

-  const pendingResponseText = () => {
+  const pendingResponse = () => {
    const messages = $messages.get()
-    const last = messages.findLast(m => m.role === 'assistant' && !m.pending && !m.hidden)
+    const last = messages.findLast(m => m.role === 'assistant' && !m.hidden)

    if (!last || last.id === lastSpokenIdRef.current) {
      return null
@ -249,9 +248,11 @@ export function ChatBar({
      return null
    }

-    lastSpokenIdRef.current = last.id
-
-    return text
+    return {
+      id: last.id,
+      pending: Boolean(last.pending),
+      text
+    }
  }

  const consumePendingResponse = () => {
@ -263,13 +264,13 @@ export function ChatBar({
    }
  }

-  const submitVoiceTurn = (text: string) => {
+  const submitVoiceTurn = async (text: string) => {
    if (busy) {
      return
    }

    triggerHaptic('submit')
-    onSubmit(text)
+    await onSubmit(text)
    aui.composer().setText('')
    draftRef.current = ''
  }
@ -281,12 +282,11 @@ export function ChatBar({
    onFatalError: () => setVoiceConversationActive(false),
    onSubmit: submitVoiceTurn,
    onTranscribeAudio,
-    pendingResponseText
+    pendingResponse
  })

  const contextMenu = (
    <ContextMenu
-      onAddContextRef={onAddContextRef}
      onInsertText={insertText}
      onOpenUrlDialog={() => {
        triggerHaptic('open')
@ -313,6 +313,7 @@ export function ChatBar({
          void conversation.end()
        },
        onStart: () => setVoiceConversationActive(true),
+        onStopTurn: conversation.stopTurn,
        onToggleMute: conversation.toggleMute,
        status: conversation.status
      }}
@ -343,14 +344,12 @@ export function ChatBar({
  return (
    <>
      <ComposerPrimitive.Unstable_TriggerPopoverRoot>
-        {mentionCategories.length > 0 && (
-          <DirectivePopover
-            adapter={mention.adapter}
-            directive={mention.directive}
-            fallbackIcon={mention.fallbackIcon ?? FileText}
-            iconMap={mention.iconMap ?? DIRECTIVE_ICONS}
-          />
-        )}
+        <DirectivePopover
+          adapter={mention.adapter}
+          directive={mention.directive}
+          fallbackIcon={mention.fallbackIcon ?? FileText}
+          iconMap={mention.iconMap ?? DIRECTIVE_ICONS}
+        />
        <ComposerPrimitive.Root
          className={cn(SHELL, 'group/composer pb-8 pt-2')}
          onSubmit={e => {
@ -407,6 +406,7 @@ export function ChatBar({
              style={{ ...COMPOSER_BACKDROP_STYLE, borderRadius: `${glassTweaks.liquid.cornerRadius}px` }}
            >
              <VoiceActivity state={voiceActivityState} />
+              <VoicePlaybackActivity />
              {attachments.length > 0 && <AttachmentList attachments={attachments} onRemove={onRemoveAttachment} />}
              {stacked ? (
                <>
--- a/apps/desktop/src/app/chat/composer/types.ts
+++ b/apps/desktop/src/app/chat/composer/types.ts
@ -36,7 +36,7 @@ export interface ChatBarProps {
  onPickFolders?: () => void
  onPickImages?: () => void
  onRemoveAttachment?: (id: string) => void
-  onSubmit: (value: string) => void
+  onSubmit: (value: string) => Promise<void> | void
  onTranscribeAudio?: (audio: Blob) => Promise<string>
 }

--- a/apps/desktop/src/app/chat/composer/url-dialog.tsx
+++ b/apps/desktop/src/app/chat/composer/url-dialog.tsx
@ -1,9 +1,12 @@
+import { Globe } from 'lucide-react'
 import type * as React from 'react'

 import { Button } from '@/components/ui/button'
 import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle } from '@/components/ui/dialog'
 import { Input } from '@/components/ui/input'

+const URL_HINT = /^https?:\/\//i
+
 export function UrlDialog({
  inputRef,
  onChange,
@ -19,14 +22,23 @@ export function UrlDialog({
  open: boolean
  value: string
 }) {
+  const trimmed = value.trim()
+  const looksLikeUrl = trimmed.length > 0 && URL_HINT.test(trimmed)
+
  return (
    <Dialog onOpenChange={onOpenChange} open={open}>
-      <DialogContent className="max-w-md">
-        <DialogHeader>
-          <DialogTitle>Add URL Context</DialogTitle>
-          <DialogDescription>
-            Hermes will fetch this URL via the existing @url context resolver when you send the prompt.
-          </DialogDescription>
+      <DialogContent className="max-w-md gap-5">
+        <DialogHeader className="flex-row items-center gap-3 sm:items-center">
+          <span
+            aria-hidden
+            className="grid size-9 shrink-0 place-items-center rounded-xl bg-[color-mix(in_srgb,var(--dt-primary)_14%,transparent)] text-primary ring-1 ring-inset ring-primary/15"
+          >
+            <Globe className="size-4" />
+          </span>
+          <div className="grid gap-0.5 text-left">
+            <DialogTitle>Attach a URL</DialogTitle>
+            <DialogDescription>Hermes will fetch the page and include it as context for this turn.</DialogDescription>
+          </div>
        </DialogHeader>
        <form
          className="grid gap-4"
@ -35,18 +47,29 @@ export function UrlDialog({
            onSubmit()
          }}
        >
-          <Input
-            onChange={e => onChange(e.target.value)}
-            placeholder="https://example.com"
-            ref={inputRef}
-            value={value}
-          />
+          <div className="grid gap-1.5">
+            <Input
+              autoComplete="off"
+              autoCorrect="off"
+              inputMode="url"
+              onChange={e => onChange(e.target.value)}
+              placeholder="https://example.com/post"
+              ref={inputRef}
+              spellCheck={false}
+              value={value}
+            />
+            {trimmed.length > 0 && !looksLikeUrl && (
+              <p className="text-xs text-muted-foreground/85">
+                Include the full URL, e.g. <span className="font-mono">https://…</span>
+              </p>
+            )}
+          </div>
          <DialogFooter>
            <Button onClick={() => onOpenChange(false)} type="button" variant="ghost">
              Cancel
            </Button>
-            <Button disabled={!value.trim()} type="submit">
-              Add URL
+            <Button disabled={!looksLikeUrl} type="submit">
+              Attach
            </Button>
          </DialogFooter>
        </form>
--- a/apps/desktop/src/app/chat/composer/voice-activity.tsx
+++ b/apps/desktop/src/app/chat/composer/voice-activity.tsx
@ -1,6 +1,10 @@
-import { Loader2, Mic } from 'lucide-react'
+import { useStore } from '@nanostores/react'
+import { Loader2, Mic, Volume2, VolumeX } from 'lucide-react'

+import { Button } from '@/components/ui/button'
 import { cn } from '@/lib/utils'
+import { stopVoicePlayback } from '@/lib/voice-playback'
+import { $voicePlayback } from '@/store/voice-playback'

 import type { VoiceActivityState } from './types'

@ -36,6 +40,25 @@ function VoiceLevelBars({ level, active }: { active: boolean; level: number }) {
  )
 }

+function PlaybackBars() { // Decorative animated bars rendered next to the playback label; aria-hidden keeps them out of the accessibility tree.
+  const bars = [820, 940, 760, 880, 700, 980, 790] // one entry per bar: that bar's animation duration in milliseconds
+
+  return (
+    <div aria-hidden="true" className="flex h-4 items-center gap-0.75">
+      {bars.map((duration, index) => (
+        <span
+          className="voice-wave-bar h-full w-0.5 rounded-full bg-current"
+          key={index}
+          style={{
+            animationDelay: `${index * -110}ms`, // negative, index-scaled delay so each bar gets a different offset
+            animationDuration: `${duration}ms`
+          }}
+        />
+      ))}
+    </div>
+  )
+}
+
 export function VoiceActivity({
  state
 }: {
@ -75,3 +98,50 @@ export function VoiceActivity({
    </div>
  )
 }
+
+export function VoicePlaybackActivity() { // Status row for voice playback: reads the $voicePlayback store and offers a Stop button.
+  const playback = useStore($voicePlayback)
+
+  if (playback.status === 'idle') { // nothing playing or preparing: render nothing
+    return null
+  }
+
+  const preparing = playback.status === 'preparing' // preparing swaps the speaker icon for a spinner and hides the bars
+
+  const title = preparing // label depends on status first, then on which feature started playback
+    ? 'Preparing audio'
+    : playback.source === 'voice-conversation'
+      ? 'Speaking response'
+      : 'Reading aloud'
+
+  return (
+    <div
+      aria-live="polite"
+      className={cn(
+        'flex h-8 items-center gap-2 rounded-xl border border-primary/20 bg-primary/10 px-2.5 text-xs text-primary',
+        'shadow-[inset_0_1px_0_rgba(255,255,255,0.35)] backdrop-blur-sm'
+      )}
+      role="status"
+    >
+      <div className="flex size-5 shrink-0 items-center justify-center rounded-full bg-primary/15 text-primary">
+        {preparing ? <Loader2 className="animate-spin" size={12} /> : <Volume2 size={12} />}
+      </div>
+
+      <div className="flex min-w-0 flex-1 items-center gap-2">
+        <span className="truncate font-medium text-foreground/85">{title}</span>
+        {!preparing && <PlaybackBars />}
+      </div>
+
+      <Button
+        className="h-6 shrink-0 gap-1 rounded-full px-2 text-[0.6875rem]"
+        onClick={stopVoicePlayback}
+        size="sm"
+        type="button"
+        variant="ghost"
+      >
+        <VolumeX size={12} />
+        Stop
+      </Button>
+    </div>
+  )
+}
--- a/apps/desktop/src/app/chat/hooks/use-composer-actions.ts
+++ b/apps/desktop/src/app/chat/hooks/use-composer-actions.ts
@ -1,5 +1,6 @@
 import { useCallback } from 'react'

+import { formatRefValue } from '@/components/assistant-ui/directive-text'
 import { attachmentId, contextPath, pathLabel } from '@/lib/chat-runtime'
 import {
  addComposerAttachment,
@ -57,7 +58,7 @@ export function useComposerActions({ activeSessionId, currentCwd, requestGateway
          kind,
          label: pathLabel(path),
          detail: rel,
-          refText: `@${kind}:${rel}`,
+          refText: `@${kind}:${formatRefValue(rel)}`,
          path
        })
      }
--- a/apps/desktop/src/app/chat/index.tsx
+++ b/apps/desktop/src/app/chat/index.tsx
@ -8,13 +8,14 @@ import { useStore } from '@nanostores/react'
 import { useQuery } from '@tanstack/react-query'
 import { ChevronDown } from 'lucide-react'
 import type * as React from 'react'
-import { Suspense, useMemo } from 'react'
+import { Suspense, useMemo, useRef } from 'react'
 import { useLocation } from 'react-router-dom'

 import { Thread } from '@/components/assistant-ui/thread'
 import { NotificationStack } from '@/components/notifications'
 import { Button } from '@/components/ui/button'
 import { getGlobalModelOptions, type HermesGateway } from '@/hermes'
+import type { ChatMessage } from '@/lib/chat-messages'
 import { quickModelOptions, sessionTitle, toRuntimeMessage } from '@/lib/chat-runtime'
 import { cn } from '@/lib/utils'
 import { $pinnedSessionIds } from '@/store/layout'
@ -57,7 +58,7 @@ interface ChatViewProps extends Omit<React.ComponentProps<'div'>, 'onSubmit'> {
  onPickFolders: () => void
  onPickImages: () => void
  onRemoveAttachment: (id: string) => void
-  onSubmit: (text: string) => void
+  onSubmit: (text: string) => Promise<void> | void
  onChangeCwd: (cwd: string) => void
  onBrowseCwd: () => void
  onOpenModelPicker: () => void
@ -118,6 +119,7 @@ export function ChatView({
  const pinnedSessionIds = useStore($pinnedSessionIds)
  const selectedSessionId = useStore($selectedStoredSessionId)
  const sessions = useStore($sessions)
+  const runtimeMessageCacheRef = useRef(new WeakMap<ChatMessage, ThreadMessage>())
  const activeStoredSession = sessions.find(session => session.id === selectedSessionId) || null
  const isRoutedSessionView = Boolean(routeSessionId(location.pathname))
  const selectedIsPinned = selectedSessionId ? pinnedSessionIds.includes(selectedSessionId) : false
@ -128,6 +130,7 @@ export function ChatView({
  const loadingSession = isRoutedSessionView && messages.length === 0
  const threadLoading = threadLoadingState(loadingSession, busy, awaitingResponse)
  const showChatBar = !loadingSession
+  const threadKey = selectedSessionId || activeSessionId || (isRoutedSessionView ? location.pathname : 'new')
  const title = activeStoredSession ? sessionTitle(activeStoredSession) : ''

  const modelOptionsQuery = useQuery<ModelOptionsResponse>({
@ -190,7 +193,14 @@ export function ChatView({
        parentId = branchParentByGroup.get(message.branchGroupId) ?? null
      }

-      items.push({ message: toRuntimeMessage(message), parentId })
+      const cachedMessage = runtimeMessageCacheRef.current.get(message)
+      const runtimeMessage = cachedMessage ?? toRuntimeMessage(message)
+
+      if (!cachedMessage) {
+        runtimeMessageCacheRef.current.set(message, runtimeMessage)
+      }
+
+      items.push({ message: runtimeMessage, parentId })

      if (!message.hidden) {
        visibleParentId = message.id
@ -248,6 +258,7 @@ export function ChatView({
              intro={showIntro ? { personality: introPersonality, seed: introSeed } : undefined}
              loading={threadLoading}
              onBranchInNewChat={onBranchInNewChat}
+              sessionKey={threadKey}
            />
            {showChatBar && (
              <Suspense fallback={<ChatBarFallback />}>
--- a/apps/desktop/src/app/desktop-controller.tsx
+++ b/apps/desktop/src/app/desktop-controller.tsx
@ -14,6 +14,7 @@ import {
  listSessions,
  setGlobalModel
 } from '../hermes'
+import { formatRefValue } from '../components/assistant-ui/directive-text'
 import { toChatMessages } from '../lib/chat-messages'
 import { BUILTIN_PERSONALITIES, normalizePersonalityValue, personalityNamesFromConfig } from '../lib/chat-runtime'
 import { $pinnedSessionIds, pinSession, unpinSession } from '../store/layout'
@ -571,7 +572,7 @@ export function DesktopController() {
      gateway={gatewayRef.current}
      maxVoiceRecordingSeconds={voiceMaxRecordingSeconds}
      onAddContextRef={addContextRefAttachment}
-      onAddUrl={url => addContextRefAttachment(`@url:${url}`, url)}
+      onAddUrl={url => addContextRefAttachment(`@url:${formatRefValue(url)}`, url)}
      onBranchInNewChat={messageId => void branchInNewChat(messageId)}
      onBrowseCwd={() => void browseSessionCwd()}
      onCancel={() => void cancelRun()}
@ -589,7 +590,7 @@ export function DesktopController() {
      onReload={reloadFromMessage}
      onRemoveAttachment={id => void removeAttachment(id)}
      onSelectPersonality={name => void selectPersonality(name)}
-      onSubmit={text => void submitText(text)}
+      onSubmit={submitText}
      onThreadMessagesChange={handleThreadMessagesChange}
      onToggleSelectedPin={toggleSelectedPin}
      onTranscribeAudio={transcribeVoiceAudio}
--- a/apps/desktop/src/app/session/hooks/use-message-stream.ts
+++ b/apps/desktop/src/app/session/hooks/use-message-stream.ts
@ -1,6 +1,5 @@
 import type { QueryClient } from '@tanstack/react-query'
 import { type MutableRefObject, useCallback } from 'react'
-import { flushSync } from 'react-dom'

 import {
  appendReasoningPart,
@ -60,7 +59,6 @@ export function useMessageStream({
      transform: (parts: ChatMessagePart[], message: ChatMessage) => ChatMessagePart[],
      seed: () => ChatMessagePart[],
      opts: {
-        sync?: boolean
        pending?: (message: ChatMessage) => boolean
      } = {}
    ) => {
@ -112,7 +110,7 @@ export function useMessageStream({
        })
      }

-      opts.sync ? flushSync(apply) : apply()
+      apply()
    },
    [updateSessionState]
  )
@ -126,8 +124,7 @@ export function useMessageStream({
      mutateStream(
        sessionId,
        parts => appendTextPart(parts, delta),
-        () => [textPart(delta)],
-        { sync: true }
+        () => [textPart(delta)]
      )
    },
    [mutateStream]
@ -152,8 +149,7 @@ export function useMessageStream({

          return appendReasoningPart(parts, delta)
        },
-        () => [reasoningPart(delta)],
-        { sync: true }
+        () => [reasoningPart(delta)]
      )
    },
    [mutateStream]
@ -299,6 +295,7 @@ export function useMessageStream({
        const apply = explicitSid ? isActiveEvent : !activeSessionIdRef.current
        const modelChanged = typeof payload?.model === 'string'
        const providerChanged = typeof payload?.provider === 'string'
+        const runningChanged = typeof payload?.running === 'boolean'

        if (apply) {
          if (modelChanged) {
@ -320,6 +317,35 @@ export function useMessageStream({
          if (typeof payload?.personality === 'string') {
            setCurrentPersonality(normalizePersonalityValue(payload.personality))
          }
+
+          if (runningChanged && sessionId) {
+            updateSessionState(sessionId, state => {
+              const busy = Boolean(payload!.running)
+
+              if (state.busy === busy && (busy || !state.awaitingResponse)) {
+                return state
+              }
+
+              if (busy) {
+                return {
+                  ...state,
+                  busy
+                }
+              }
+
+              if (state.awaitingResponse && !state.sawAssistantPayload) {
+                return state
+              }
+
+              return {
+                ...state,
+                awaitingResponse: false,
+                busy,
+                pendingBranchGroup: null,
+                streamId: null
+              }
+            })
+          }
        }

        void refreshHermesConfig()
@ -355,11 +381,11 @@ export function useMessageStream({
        }
      } else if (event.type === 'reasoning.delta') {
        if (sessionId) {
-          appendReasoningDelta(sessionId, coerceGatewayText(payload?.text))
+          appendReasoningDelta(sessionId, coerceThinkingText(payload?.text))
        }
      } else if (event.type === 'reasoning.available') {
        if (sessionId) {
-          appendReasoningDelta(sessionId, coerceGatewayText(payload?.text), true)
+          appendReasoningDelta(sessionId, coerceThinkingText(payload?.text), true)
        }
      } else if (event.type === 'message.complete') {
        if (!sessionId) {
--- a/apps/desktop/src/app/session/hooks/use-prompt-actions.ts
+++ b/apps/desktop/src/app/session/hooks/use-prompt-actions.ts
@ -13,7 +13,7 @@ import {
 import { triggerHaptic } from '@/lib/haptics'
 import { $composerAttachments, clearComposerAttachments } from '@/store/composer'
 import { clearNotifications, notify, notifyError } from '@/store/notifications'
-import { $busy, $messages, setAwaitingResponse, setBusy } from '@/store/session'
+import { $busy, $messages, setAwaitingResponse, setBusy, setMessages } from '@/store/session'

 import type { ClientSessionState, SlashExecResponse } from '../../types'

@ -296,12 +296,34 @@ export function usePromptActions({
  )

  const cancelRun = useCallback(async () => {
-    if (!activeSessionId) {
+    const sessionId = activeSessionId || activeSessionIdRef.current
+
+    busyRef.current = false
+    setBusy(false)
+    setAwaitingResponse(false)
+
+    const finalizeMessages = (messages: ChatMessage[]) =>
+      messages.map(message =>
+        message.pending
+          ? {
+              ...message,
+              parts: chatMessageText(message).trim()
+                ? appendTextPart(message.parts, INTERRUPTED_MARKER)
+                : [...message.parts, textPart(INTERRUPTED_MARKER.trim())],
+              pending: false
+            }
+          : message
+      )
+
+    if (!sessionId) {
+      setMessages(finalizeMessages($messages.get()))
+
      return
    }

-    updateSessionState(activeSessionId, state => {
+    updateSessionState(sessionId, state => {
      const streamId = state.streamId
+
      const messages = streamId
        ? state.messages.map(message =>
            message.id === streamId
@ -314,7 +336,7 @@ export function usePromptActions({
                }
              : message
          )
-        : state.messages
+        : finalizeMessages(state.messages)

      return {
        ...state,
@ -328,11 +350,11 @@ export function usePromptActions({
    })

    try {
-      await requestGateway('session.interrupt', { session_id: activeSessionId })
+      await requestGateway('session.interrupt', { session_id: sessionId })
    } catch (err) {
      notifyError(err, 'Stop failed')
    }
-  }, [activeSessionId, requestGateway, updateSessionState])
+  }, [activeSessionId, activeSessionIdRef, busyRef, requestGateway, updateSessionState])

  const reloadFromMessage = useCallback(
    async (parentId: string | null) => {
--- a/apps/desktop/src/app/session/hooks/use-session-actions.ts
+++ b/apps/desktop/src/app/session/hooks/use-session-actions.ts
@ -87,6 +87,11 @@ export function useSessionActions({

  const createBackendSessionForSend = useCallback(async (): Promise<string | null> => {
    const created = await requestGateway<SessionCreateResponse>('session.create', { cols: 96 })
+
+    if (created.stored_session_id) {
+      navigate(sessionRoute(created.stored_session_id), { replace: true })
+    }
+
    setActiveSessionId(created.session_id)
    activeSessionIdRef.current = created.session_id
    ensureSessionState(created.session_id, created.stored_session_id ?? null)
@ -94,7 +99,6 @@ export function useSessionActions({
    if (created.stored_session_id) {
      setSelectedStoredSessionId(created.stored_session_id)
      selectedStoredSessionIdRef.current = created.stored_session_id
-      navigate(sessionRoute(created.stored_session_id), { replace: true })
    }

    if (created.info?.model) {
--- a/apps/desktop/src/app/settings/constants.ts
+++ b/apps/desktop/src/app/settings/constants.ts
@ -60,6 +60,7 @@ export const ENUM_OPTIONS: Record<string, string[]> = {
  'context.engine': ['compressor', 'default', 'custom'],
  'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'],
  'memory.provider': ['', 'builtin', 'honcho'],
+  'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'],
  'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'],
  'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
 }
@ -101,6 +102,10 @@ export const FIELD_LABELS: Record<string, string> = {
  'stt.provider': 'Speech-To-Text Provider',
  'stt.local.model': 'Local Transcription Model',
  'stt.local.language': 'Transcription Language',
+  'stt.elevenlabs.model_id': 'ElevenLabs STT Model',
+  'stt.elevenlabs.language_code': 'ElevenLabs Language',
+  'stt.elevenlabs.tag_audio_events': 'Tag Audio Events',
+  'stt.elevenlabs.diarize': 'Speaker Diarization',
  'tts.provider': 'Text-To-Speech Provider',
  'tts.edge.voice': 'Edge Voice',
  'tts.openai.model': 'OpenAI TTS Model',
@ -157,6 +162,7 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = {
  'compression.enabled': 'Summarize older context when conversations get large.',
  'voice.auto_tts': 'Automatically speak assistant responses.',
  'stt.enabled': 'Enable local or provider-backed speech transcription.',
+  'stt.elevenlabs.language_code': 'Optional ISO-639-3 language code. Blank lets ElevenLabs auto-detect.',
  'agent.max_turns': 'Upper bound for tool-calling turns before Hermes stops a run.'
 }

@ -241,6 +247,10 @@ export const SECTIONS: DesktopConfigSection[] = [
      'tts.elevenlabs.model_id',
      'stt.local.model',
      'stt.local.language',
+      'stt.elevenlabs.model_id',
+      'stt.elevenlabs.language_code',
+      'stt.elevenlabs.tag_audio_events',
+      'stt.elevenlabs.diarize',
      'voice.record_key',
      'voice.max_recording_seconds'
    ]
--- a/apps/desktop/src/components/assistant-ui/directive-text.test.ts
+++ b/apps/desktop/src/components/assistant-ui/directive-text.test.ts
@ -0,0 +1,39 @@
+import { describe, expect, it } from 'vitest'
+
+import { formatRefValue, hermesDirectiveFormatter } from './directive-text'
+
+describe('formatRefValue', () => { // quoting rules applied to a reference value before it is written as `@type:value`
+  it('leaves simple paths untouched', () => {
+    expect(formatRefValue('src/index.ts')).toBe('src/index.ts')
+    expect(formatRefValue('https://example.com/post')).toBe('https://example.com/post')
+  })
+
+  it('wraps paths with whitespace in backticks', () => {
+    expect(formatRefValue('apple-touch-icon (1).png')).toBe('`apple-touch-icon (1).png`') // backticks are the first-choice wrapper
+  })
+
+  it('falls back to double quotes when value contains backticks', () => {
+    expect(formatRefValue('weird `name` (1).md')).toBe('"weird `name` (1).md"') // value already has backticks, so they cannot delimit it
+  })
+})
+
+describe('hermesDirectiveFormatter.parse', () => { // parsing `@type:value` text into text and mention segments
+  it('keeps quoted file paths whole when parsing', () => {
+    const segments = hermesDirectiveFormatter.parse('see @image:`apple-touch-icon (1).png` for the icon')
+
+    expect(segments).toEqual([ // the wrapping backticks are removed from both label and id
+      { kind: 'text', text: 'see ' },
+      { kind: 'mention', type: 'image', label: 'apple-touch-icon (1).png', id: 'apple-touch-icon (1).png' },
+      { kind: 'text', text: ' for the icon' }
+    ])
+  })
+
+  it('still parses unquoted paths', () => {
+    const segments = hermesDirectiveFormatter.parse('@file:src/main.tsx the entry point')
+
+    expect(segments).toEqual([ // a bare value ends at the first whitespace; the label here is the last path segment
+      { kind: 'mention', type: 'file', label: 'main.tsx', id: 'src/main.tsx' },
+      { kind: 'text', text: ' the entry point' }
+    ])
+  })
+})
--- a/apps/desktop/src/components/assistant-ui/directive-text.tsx
+++ b/apps/desktop/src/components/assistant-ui/directive-text.tsx
@ -24,10 +24,63 @@ const ICONS: Record<HermesRefType, ComponentType<{ className?: string }>> = {
 * so they render as inline chips in user messages instead of raw text.
 *
 * Supported types: file, folder, url, image. Anything else stays plain text.
+ *
+ * Mirrors the Python `agent/context_references.REFERENCE_PATTERN` syntax:
+ * the value may be wrapped in backticks, single quotes, or double quotes so
+ * paths with spaces/parens/etc. survive parsing intact.
 */
-const CANONICAL_DIRECTIVE_RE = /:([\w-]{1,64})\[([^\]\n]{1,1024})\](?:\{name=([^}\n]{1,1024})\})?/gu
+const CANONICAL_DIRECTIVE_RE = /:([\w-]{1,64})\[([^\]\n]{1,1024})\](?:\{name=([^}\n]{1,1024})\})?/g

-const HERMES_DIRECTIVE_RE = /@(file|folder|url|image|tool):(\S+)/gu
+const HERMES_DIRECTIVE_RE = new RegExp( // assembled from string pieces so each accepted value form sits on its own line
+  '@(file|folder|url|image|tool):(' + // group 1: reference type; group 2 opens here
+    '`[^`\\n]+`' + // backtick-quoted value, non-empty, single line
+    '|"[^"\\n]+"' + // double-quoted value, non-empty, single line
+    "|'[^'\\n]+'" + // single-quoted value, non-empty, single line
+    '|\\S+' + // bare value: everything up to the next whitespace
+    ')',
+  'g'
+)
+
+const TRAILING_PUNCTUATION_RE = /[,.;!?]+$/
+
+function unwrapRefValue(raw: string): string { // Removes one pair of matching quotes, or trailing punctuation from an unquoted value.
+  if (raw.length < 2) { // too short to be a quoted pair; returned as-is, without any punctuation trimming
+    return raw
+  }
+
+  const head = raw[0]
+  const tail = raw[raw.length - 1]
+
+  if ((head === '`' && tail === '`') || (head === '"' && tail === '"') || (head === "'" && tail === "'")) {
+    return raw.slice(1, -1) // quoted: inner text is kept verbatim, trailing punctuation included
+  }
+
+  return raw.replace(TRAILING_PUNCTUATION_RE, '') // unquoted: drop a trailing run of , . ; ! ? (presumably sentence punctuation)
+}
+
+function needsQuoting(value: string): boolean { // Whether a reference value must be wrapped in quotes to survive being parsed back.
+  return /[\s()\[\]{}<>"'`]|[,.;!?]$/.test(value) // whitespace/brackets/quotes anywhere, or a trailing , . ; ! ? that unwrapRefValue would strip from a bare value
+}
+
+export function formatRefValue(value: string): string { // Returns the value as it should appear after `@type:`, quoted only when needsQuoting says so.
+  if (!needsQuoting(value)) {
+    return value
+  }
+
+  if (!value.includes('`')) { // first choice: backticks
+    return `\`${value}\``
+  }
+
+  if (!value.includes('"')) { // second choice: double quotes
+    return `"${value}"`
+  }
+
+  if (!value.includes("'")) { // third choice: single quotes
+    return `'${value}'`
+  }
+
+  return value // value contains all three quote characters: no wrapper is usable, so it is returned unquoted
+}

 export const hermesDirectiveFormatter: Unstable_DirectiveFormatter = {
  serialize(item: Unstable_TriggerItem): string {
@ -35,7 +88,7 @@ export const hermesDirectiveFormatter: Unstable_DirectiveFormatter = {
      return `@${item.id}`
    }

-    return `@${item.type}:${item.id}`
+    return `@${item.type}:${formatRefValue(item.id)}`
  },
  parse(text: string): readonly Unstable_DirectiveSegment[] {
    return parseDirectiveText(text)
@ -51,13 +104,17 @@ function parseDirectiveText(text: string): Unstable_DirectiveSegment[] {
      label: match[2] || match[3] || '',
      id: match[3] || match[2] || ''
    })),
-    ...Array.from(text.matchAll(HERMES_DIRECTIVE_RE)).map(match => ({
-      start: match.index ?? 0,
-      end: (match.index ?? 0) + match[0].length,
-      type: match[1] || 'file',
-      label: shortLabel(match[1] as HermesRefType, match[2] || ''),
-      id: match[2] || ''
-    }))
+    ...Array.from(text.matchAll(HERMES_DIRECTIVE_RE)).map(match => {
+      const id = unwrapRefValue(match[2] || '')
+
+      return {
+        start: match.index ?? 0,
+        end: (match.index ?? 0) + match[0].length,
+        type: match[1] || 'file',
+        label: shortLabel(match[1] as HermesRefType, id),
+        id
+      }
+    })
  ]
    .filter(match => match.id)
    .sort((a, b) => a.start - b.start)
@ -136,14 +193,14 @@ const DirectiveChip: FC<{
  return (
    <span
      className={cn(
-        'mx-0.5 inline-flex max-w-56 items-center gap-1 rounded-full border border-border/80 bg-background/95 px-1.5 py-0.5 align-[0.05em] text-[0.82em] font-medium leading-none text-foreground shadow-sm ring-1 ring-black/3'
+        'mx-0.5 inline-flex max-w-64 items-center gap-1 rounded-full bg-[color-mix(in_srgb,var(--dt-primary)_16%,transparent)] px-2 py-0.5 align-[0.02em] text-[0.92em] font-semibold leading-tight text-primary ring-1 ring-inset ring-primary/10'
      )}
      data-directive-id={id}
      data-directive-type={type}
      data-slot="aui_directive-chip"
      title={id}
    >
-      {Icon && <Icon className="size-3 shrink-0 text-muted-foreground" />}
+      {Icon && <Icon className="size-3.5 shrink-0 text-primary" />}
      <span className="truncate">{label}</span>
    </span>
  )
--- a/apps/desktop/src/components/assistant-ui/intro.tsx
+++ b/apps/desktop/src/components/assistant-ui/intro.tsx
@ -19,6 +19,7 @@ export type IntroProps = {
 const NEUTRAL_PERSONALITIES = new Set(['', 'default', 'none', 'neutral'])

 const HERMES_FRAME_COUNT = 8
+const ASSET_BASE_URL = import.meta.env.BASE_URL || '/'

 const FALLBACK_COPY: IntroCopy[] = [
  {
@ -154,6 +155,10 @@ function resolveCopy(personality?: string, seed?: number): IntroCopy {
  return pickCopy(copies, seed)
 }

+function publicAssetPath(path: string): string { // Prefixes a public asset path with ASSET_BASE_URL.
+  return `${ASSET_BASE_URL}${path}`.replace(/([^:]\/)\/+/g, '$1') // collapse repeated slashes into one while leaving a '://' pair intact
+}
+
 export const Intro: FC<IntroProps> = ({ personality, seed }) => {
  const [mountSeed] = useState(() => Math.floor(Math.random() * 100000))
  const [frameOffset, setFrameOffset] = useState(0)
@ -184,7 +189,7 @@ export const Intro: FC<IntroProps> = ({ personality, seed }) => {
          aria-hidden="true"
          className="h-full w-full scale-110 object-contain select-none"
          draggable={false}
-          src={`/hermes-frames/hermes-frame-${frameIndex}.png?v=matte-clean-6`}
+          src={publicAssetPath(`hermes-frames/hermes-frame-${frameIndex}.png?v=matte-clean-6`)}
        />
      </button>
      <p className="mb-3 text-xs font-medium uppercase tracking-[0.18em] text-muted-foreground/75">Hermes Agent</p>
--- a/apps/desktop/src/components/assistant-ui/streaming.test.tsx
+++ b/apps/desktop/src/components/assistant-ui/streaming.test.tsx
@ -1,19 +1,53 @@
 import { AssistantRuntimeProvider, type ThreadMessage, useExternalStoreRuntime } from '@assistant-ui/react'
-import { act, render, screen, waitFor } from '@testing-library/react'
+import { act, fireEvent, render, screen, waitFor } from '@testing-library/react'
 import { useEffect, useState } from 'react'
-import { describe, expect, it, vi } from 'vitest'
+import { beforeEach, describe, expect, it, vi } from 'vitest'

 import { Thread } from './thread'

 const createdAt = new Date('2026-05-01T00:00:00.000Z')

+const resizeObservers = new Set<TestResizeObserver>()
+
 class TestResizeObserver {
-  observe() {}
+  private target: Element | null = null
+
+  constructor(private readonly callback: ResizeObserverCallback) { // each instance registers itself so a test can reach every live observer
+    resizeObservers.add(this)
+  }
+
+  observe(target: Element) { // remembers only the most recently observed element
+    this.target = target
+  }
+
   unobserve() {}
-  disconnect() {}
+
+  disconnect() { // removes this instance from the shared registry
+    resizeObservers.delete(this)
+  }
+
+  trigger(height: number) { // test hook: calls the stored callback with one synthetic entry carrying the given height
+    if (!this.target) { // nothing has been observed yet, so there is no element to report on
+      return
+    }
+
+    this.callback(
+      [
+        {
+          contentRect: { height } as DOMRectReadOnly,
+          target: this.target
+        } as ResizeObserverEntry
+      ],
+      this as unknown as ResizeObserver
+    )
+  }
 }

 vi.stubGlobal('ResizeObserver', TestResizeObserver)
+vi.stubGlobal('requestAnimationFrame', (callback: FrameRequestCallback) =>
+  window.setTimeout(() => callback(performance.now()), 0)
+)
+vi.stubGlobal('cancelAnimationFrame', (id: number) => window.clearTimeout(id))

 Element.prototype.scrollTo = function scrollTo() {}

@ -90,6 +124,10 @@ function StreamingHarness() {
 }

 describe('assistant-ui streaming renderer', () => {
+  beforeEach(() => {
+    resizeObservers.clear()
+  })
+
  it('renders assistant text incrementally before completion', async () => {
    const { container } = render(<StreamingHarness />)

@ -115,4 +153,42 @@ describe('assistant-ui streaming renderer', () => {
      expect(container.textContent).toContain('first chunk second chunk')
    })
  })
+
+  it('does not pull the viewport back down after the user scrolls up during streaming', async () => {
+    const { container } = render(<StreamingHarness />)
+
+    const viewport = container.querySelector('[data-slot="aui_thread-viewport"]') as HTMLDivElement
+    let scrollHeight = 1_000
+
+    Object.defineProperty(viewport, 'clientHeight', { configurable: true, value: 200 })
+    Object.defineProperty(viewport, 'scrollHeight', {
+      configurable: true,
+      get: () => scrollHeight
+    })
+
+    await wait(80)
+
+    await act(async () => {
+      viewport.scrollTop = 800
+      fireEvent.scroll(viewport)
+    })
+    await wait(0)
+
+    await act(async () => {
+      fireEvent.wheel(viewport, { deltaY: -120 })
+      viewport.scrollTop = 420
+      fireEvent.scroll(viewport)
+    })
+
+    scrollHeight = 1_200
+
+    await act(async () => {
+      for (const observer of resizeObservers) {
+        observer.trigger(1_200)
+      }
+    })
+    await wait(0)
+
+    expect(viewport.scrollTop).toBe(420)
+  })
 })
--- a/apps/desktop/src/components/assistant-ui/thread.tsx
+++ b/apps/desktop/src/components/assistant-ui/thread.tsx
@ -8,18 +8,28 @@ import {
  type ToolCallMessagePartProps,
  useAuiState
 } from '@assistant-ui/react'
+import { useStore } from '@nanostores/react'
 import {
  CheckIcon,
  ChevronLeftIcon,
  ChevronRightIcon,
  CopyIcon,
  GitBranchIcon,
+  Loader2Icon,
  MoreHorizontalIcon,
  RefreshCwIcon,
  Volume2Icon,
  VolumeXIcon
 } from 'lucide-react'
-import { type FC, type ReactNode, useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
+import {
+  type FC,
+  type ReactNode,
+  useCallback,
+  useEffect,
+  useLayoutEffect,
+  useRef,
+  useState
+} from 'react'

 import { useElapsedSeconds } from '@/components/assistant-ui/activity-timer'
 import { ActivityTimerText } from '@/components/assistant-ui/activity-timer-text'
@ -38,11 +48,12 @@ import {
  DropdownMenuTrigger
 } from '@/components/ui/dropdown-menu'
 import { Loader } from '@/components/ui/loader'
-import { speakText } from '@/hermes'
 import { triggerHaptic } from '@/lib/haptics'
 import { cn } from '@/lib/utils'
+import { playSpeechText, stopVoicePlayback } from '@/lib/voice-playback'
 import { notifyError } from '@/store/notifications'
 import { setThreadScrolledUp } from '@/store/thread-scroll'
+import { $voicePlayback } from '@/store/voice-playback'

 const THINKING_FACES = [
  '(｡•́︿•̀｡)',
@ -119,12 +130,16 @@ export const Thread: FC<{
  intro?: IntroProps
  loading?: ThreadLoadingState
  onBranchInNewChat?: (messageId: string) => void
-}> = ({ intro, loading, onBranchInNewChat }) => {
+  sessionKey?: string | null
+}> = ({ intro, loading, onBranchInNewChat, sessionKey }) => {
  const viewportRef = useRef<HTMLDivElement | null>(null)
+  const contentRef = useRef<HTMLDivElement | null>(null)
  const messageCount = useAuiState(s => s.thread.messages.length)
  const isRunning = useAuiState(s => s.thread.isRunning)
  const lastMessageId = useAuiState(s => s.thread.messages.at(-1)?.id ?? '')
  const shouldStickToBottomRef = useRef(true)
+  const scrollFrameRef = useRef<number | null>(null)
+  const sessionKeyRef = useRef<string | null>(sessionKey ?? null)

  const handleScroll = useCallback((event: React.UIEvent<HTMLDivElement>) => {
    const nearBottom = isNearBottom(event.currentTarget)
@ -132,8 +147,44 @@ export const Thread: FC<{
    setThreadScrolledUp(!nearBottom)
  }, [])

+  const handleWheel = useCallback((event: React.WheelEvent<HTMLDivElement>) => {
+    if (event.deltaY < 0) {
+      shouldStickToBottomRef.current = false
+      setThreadScrolledUp(true)
+    }
+  }, [])
+
+  const scrollToBottom = useCallback(() => {
+    const viewport = viewportRef.current
+
+    if (!viewport) {
+      return
+    }
+
+    viewport.scrollTop = viewport.scrollHeight
+    shouldStickToBottomRef.current = true
+    setThreadScrolledUp(false)
+  }, [])
+
+  const scheduleScrollToBottom = useCallback(() => {
+    if (scrollFrameRef.current !== null) {
+      window.cancelAnimationFrame(scrollFrameRef.current)
+    }
+
+    scrollFrameRef.current = window.requestAnimationFrame(() => {
+      scrollFrameRef.current = null
+      scrollToBottom()
+    })
+  }, [scrollToBottom])
+
  useEffect(() => {
-    return () => setThreadScrolledUp(false)
+    return () => {
+      if (scrollFrameRef.current !== null) {
+        window.cancelAnimationFrame(scrollFrameRef.current)
+      }
+
+      setThreadScrolledUp(false)
+    }
  }, [])

  useLayoutEffect(() => {
@ -143,16 +194,48 @@ export const Thread: FC<{
      return
    }

-    const force = loading === 'session'
+    const nextSessionKey = sessionKey ?? null
+    const sessionChanged = sessionKeyRef.current !== nextSessionKey
+    sessionKeyRef.current = nextSessionKey
+    const force = loading === 'session' || sessionChanged

    if (!force && !shouldStickToBottomRef.current) {
      return
    }

-    viewport.scrollTop = viewport.scrollHeight
-    shouldStickToBottomRef.current = true
-    setThreadScrolledUp(false)
-  }, [isRunning, lastMessageId, loading, messageCount])
+    scheduleScrollToBottom()
+  }, [isRunning, lastMessageId, loading, messageCount, scheduleScrollToBottom, sessionKey])
+
+  useLayoutEffect(() => {
+    const content = contentRef.current
+    const viewport = viewportRef.current
+
+    if (!content || !viewport) {
+      return
+    }
+
+    let previousHeight = content.getBoundingClientRect().height
+
+    const observer = new ResizeObserver(entries => {
+      const height = entries[0]?.contentRect.height ?? content.getBoundingClientRect().height
+
+      if (height === previousHeight) {
+        return
+      }
+
+      previousHeight = height
+
+      if (!shouldStickToBottomRef.current && !isNearBottom(viewport)) {
+        return
+      }
+
+      scheduleScrollToBottom()
+    })
+
+    observer.observe(content)
+
+    return () => observer.disconnect()
+  }, [scheduleScrollToBottom])

  return (
    <GeneratedImageProvider>
@ -160,15 +243,17 @@ export const Thread: FC<{
        <AuiIf condition={s => Boolean(intro) && s.thread.isEmpty}>{intro && <Intro {...intro} />}</AuiIf>

        <ThreadPrimitive.Viewport
-          className="h-full min-h-0 overflow-y-auto overscroll-contain px-[clamp(1rem,10%,12rem)] pt-[calc(var(--vsq)*19)] scroll-smooth"
+          autoScroll={false}
+          className="h-full min-h-0 overflow-y-auto overscroll-contain px-[clamp(1rem,10%,12rem)] pt-[calc(var(--vsq)*19)]"
          data-slot="aui_thread-viewport"
          onScroll={handleScroll}
+          onWheel={handleWheel}
          ref={viewportRef}
          scrollToBottomOnInitialize
          scrollToBottomOnRunStart
          scrollToBottomOnThreadSwitch
        >
-          <div className="flex w-full flex-col gap-3">
+          <div className="flex w-full flex-col gap-3" ref={contentRef}>
            <ThreadPrimitive.Messages>{() => <ThreadMessage onBranchInNewChat={onBranchInNewChat} />}</ThreadPrimitive.Messages>
            {loading === 'response' && <ResponseLoadingIndicator />}
            {loading === 'working' && <WorkingIndicator />}
@ -446,7 +531,7 @@ const AssistantActionBar: FC<MessageActionProps> = ({ messageId, messageText, on
              <GitBranchIcon />
              Branch in new chat
            </DropdownMenuItem>
-            <ReadAloudItem text={messageText} />
+            <ReadAloudItem messageId={messageId} text={messageText} />
          </DropdownMenuContent>
        </DropdownMenu>
      </ActionBarPrimitive.Root>
@ -479,80 +564,39 @@ const CopyMessageButton: FC<{ text: string }> = ({ text }) => {
  )
 }

-let currentAudio: HTMLAudioElement | null = null
+const ReadAloudItem: FC<{ messageId: string; text: string }> = ({ messageId, text }) => {
+  const voicePlayback = useStore($voicePlayback)

-function stopCurrentAudio() {
-  if (!currentAudio) {
-    return
-  }
+  const readAloudStatus =
+    voicePlayback.source === 'read-aloud' && voicePlayback.messageId === messageId ? voicePlayback.status : 'idle'

-  currentAudio.pause()
-  currentAudio.src = ''
-  currentAudio = null
-}
-
-const ReadAloudItem: FC<{ text: string }> = ({ text }) => {
-  const [reading, setReading] = useState(false)
-  const seqRef = useRef(0)
-
-  const stop = useCallback(() => {
-    seqRef.current += 1
-    stopCurrentAudio()
-    setReading(false)
-  }, [])
+  const isPreparing = readAloudStatus === 'preparing'
+  const isSpeaking = readAloudStatus === 'speaking'
+  const anyPlaybackActive = voicePlayback.status !== 'idle'
+  const Icon = isPreparing ? Loader2Icon : isSpeaking ? VolumeXIcon : Volume2Icon

  const read = useCallback(async () => {
-    if (!text) {
+    if (!text || $voicePlayback.get().status !== 'idle') {
      return
    }

-    stopCurrentAudio()
-    const seq = ++seqRef.current
-    const isCurrent = () => seq === seqRef.current
-
-    const finish = () => {
-      if (!isCurrent()) {
-        return
-      }
-
-      currentAudio = null
-      setReading(false)
-    }
-
-    setReading(true)
-
    try {
-      const { data_url } = await speakText(text)
-
-      if (!isCurrent()) {
-        return
-      }
-
-      const audio = new Audio(data_url)
-      currentAudio = audio
-      audio.addEventListener('ended', finish, { once: true })
-      audio.addEventListener('error', finish, { once: true })
-      await audio.play()
+      await playSpeechText(text, { messageId, source: 'read-aloud' })
    } catch (error) {
-      if (isCurrent()) {
-        notifyError(error, 'Read aloud failed')
-        finish()
-      }
+      notifyError(error, 'Read aloud failed')
    }
-  }, [text])
-
-  const Icon = reading ? VolumeXIcon : Volume2Icon
+  }, [messageId, text])

  return (
    <DropdownMenuItem
-      disabled={!reading && !text}
+      disabled={isPreparing || (!isSpeaking && (anyPlaybackActive || !text))}
      onSelect={e => {
        e.preventDefault()
-        void (reading ? stop() : read())
+        void (isSpeaking ? stopVoicePlayback() : read())
      }}
    >
-      <Icon />
-      {reading ? 'Stop reading' : 'Read aloud'}
+      <Icon className={isPreparing ? 'animate-spin' : undefined} />
+      {isPreparing ? 'Preparing audio...' : isSpeaking ? 'Stop reading' : 'Read aloud'}
    </DropdownMenuItem>
  )
 }
--- a/apps/desktop/src/lib/chat-messages.test.ts
+++ b/apps/desktop/src/lib/chat-messages.test.ts
@ -0,0 +1,18 @@
+import { describe, expect, it } from 'vitest'
+
+import { chatMessageText, toChatMessages } from './chat-messages'
+
+// Covers how stored user messages with an "--- Attached Context ---" section are shown.
+describe('toChatMessages', () => {
+  it('hides attached context payloads from user message display', () => {
+    // The stored content carries the typed question, the attached-context marker,
+    // and the attachment payload (a fenced JSON block).
+    const [message] = toChatMessages([
+      {
+        role: 'user',
+        content:
+          'what is this file\n\n--- Attached Context ---\n\n📄 @file:tsconfig.tsbuildinfo (981 tokens)\n```json\n{"root":["./src/main.tsx"]}\n```',
+        timestamp: 1
+      }
+    ])
+
+    // Only the `@file:` reference and the typed question remain, reference first.
+    expect(chatMessageText(message)).toBe('@file:tsconfig.tsbuildinfo\n\nwhat is this file')
+  })
+})
--- a/apps/desktop/src/lib/chat-messages.ts
+++ b/apps/desktop/src/lib/chat-messages.ts
@ -29,6 +29,7 @@ export type GatewayEventPayload = {
  todos?: unknown
  model?: string
  provider?: string
+  running?: boolean
  cwd?: string
  branch?: string
  personality?: string
@ -49,6 +50,28 @@ export function chatMessageText(message: ChatMessage): string {
    .join('')
 }

+// Line that introduces the attached-context payload appended to a user message.
+const ATTACHED_CONTEXT_MARKER_RE = /(?:^|\n)--- Attached Context ---\s*\n/
+// Matches the context-warnings section through the end of the string.
+const CONTEXT_WARNINGS_MARKER_RE = /(?:^|\n)--- Context Warnings ---[\s\S]*$/
+// `@kind:value` references; the value may be quoted with ", ' or `.
+const CONTEXT_REF_RE = /@(file|folder|url|image|tool):(?:"[^"\n]+"|'[^'\n]+'|`[^`\n]+`|\S+)/g
+
+/**
+ * Builds the text shown in the transcript for a stored message.
+ *
+ * Non-user messages are returned unchanged. For user messages the context-warnings
+ * section is removed; when an attached-context section exists, its payload is
+ * replaced by the de-duplicated `@kind:value` references found inside it, listed
+ * one per line ahead of the text the user typed.
+ *
+ * @param role Role of the stored message.
+ * @param content Raw stored message content.
+ * @returns The content to display.
+ */
+function displayContentForMessage(role: SessionMessage['role'], content: string): string {
+  if (role !== 'user') {
+    return content
+  }
+
+  const withoutWarnings = (value: string) => value.replace(CONTEXT_WARNINGS_MARKER_RE, '').trim()
+  const markerMatch = ATTACHED_CONTEXT_MARKER_RE.exec(content)
+
+  if (markerMatch === null) {
+    return withoutWarnings(content)
+  }
+
+  const typedText = withoutWarnings(content.slice(0, markerMatch.index))
+  const payload = content.slice(markerMatch.index + markerMatch[0].length)
+
+  // A Set keeps first-seen order while dropping repeated references.
+  const uniqueRefs = new Set<string>()
+
+  for (const hit of payload.matchAll(CONTEXT_REF_RE)) {
+    uniqueRefs.add(hit[0])
+  }
+
+  const refBlock = [...uniqueRefs].join('\n')
+
+  if (refBlock && typedText) {
+    return `${refBlock}\n\n${typedText}`
+  }
+
+  return refBlock || typedText
+}
+
 export function appendTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] {
  const next = [...parts]
  const last = next.at(-1)
@ -363,6 +386,7 @@ export function toChatMessages(messages: SessionMessage[]): ChatMessage[] {
    }

    const content = message.content || message.text || message.context || message.name || ''
+    const displayContent = displayContentForMessage(message.role, content)
    const parts: ChatMessagePart[] = []

    const reasoning =
@ -374,8 +398,8 @@ export function toChatMessages(messages: SessionMessage[]): ChatMessage[] {
      parts.push(reasoningPart(reasoning))
    }

-    if (content) {
-      parts.push(textPart(content))
+    if (displayContent) {
+      parts.push(textPart(displayContent))
    }

    if (message.role === 'assistant' && Array.isArray(message.tool_calls)) {
--- a/apps/desktop/src/lib/chat-runtime.test.ts
+++ b/apps/desktop/src/lib/chat-runtime.test.ts
@ -0,0 +1,18 @@
+import { describe, expect, it } from 'vitest'
+
+import { coerceThinkingText } from './chat-runtime'
+
+// Covers the cleanup applied to streamed thinking text before it is displayed.
+describe('coerceThinkingText', () => {
+  it('strips streaming status prefixes from thinking deltas', () => {
+    // A short face token followed by "<verb>..." is removed; the rest is kept.
+    expect(coerceThinkingText("◉_◉ processing... checking the user's request")).toBe("checking the user's request")
+    expect(coerceThinkingText('(¬‿¬) analyzing... reading the file')).toBe('reading the file')
+  })
+
+  it('drops empty thinking rewrite placeholder text', () => {
+    // Text that only says there is no thinking to rewrite collapses to an empty string.
+    expect(
+      coerceThinkingText(
+        "◉_◉ processing... I don't see any current rewritten thinking or next thinking to process. Could you provide the thinking content you'd like me to rewrite?"
+      )
+    ).toBe('')
+  })
+})
--- a/apps/desktop/src/lib/chat-runtime.ts
+++ b/apps/desktop/src/lib/chat-runtime.ts
@ -2,6 +2,7 @@ import type { ThreadMessage } from '@assistant-ui/react'

 import type { QuickModelOption } from '@/app/chat/composer/types'
 import type { ClientSessionState, CommandDispatchResponse } from '@/app/types'
+import { formatRefValue } from '@/components/assistant-ui/directive-text'
 import { type ChatMessage, type ChatMessagePart, chatMessageText, textPart } from '@/lib/chat-messages'
 import type { ComposerAttachment } from '@/store/composer'
 import type { ModelOptionsResponse, SessionInfo } from '@/types/hermes'
@ -25,7 +26,11 @@ export const BUILTIN_PERSONALITIES = [
  'hype'
 ]

-const SPINNER_STATUS_RE = /^\s*[（(][^\s)）]{1,8}[)）]\s+[^.\n]{2,48}\.\.\.\s*/
+const THINKING_STATUS_PREFIX_RE =
+  /^\s*(?:(?:[^\s.]{1,16})\s+)?(?:processing|thinking|reasoning|analyzing|pondering|contemplating|musing|cogitating|ruminating|deliberating|mulling|reflecting|computing|synthesizing|formulating|brainstorming)\.\.\.\s*/i
+
+const EMPTY_THINKING_PLACEHOLDER_RE =
+  /\b(?:current rewritten thinking|next thinking to process|provide the thinking content|don't see any .*thinking)\b/i

 export function createClientSessionState(
  storedSessionId: string | null = null,
@ -102,7 +107,9 @@ export function coerceGatewayText(value: unknown): string {
 }

 export function coerceThinkingText(value: unknown): string {
-  return coerceGatewayText(value).replace(SPINNER_STATUS_RE, '').trim()
+  const text = coerceGatewayText(value).replace(THINKING_STATUS_PREFIX_RE, '').trim()
+
+  return EMPTY_THINKING_PLACEHOLDER_RE.test(text) ? '' : text
 }

 export function isImageGenerationTool(name?: string): boolean {
@ -135,7 +142,7 @@ export function attachmentDisplayText(attachment: ComposerAttachment): string |
  if (attachment.kind === 'image') {
    const id = attachment.detail || attachment.path || attachment.label

-    return id ? `@image:${id}` : null
+    return id ? `@image:${formatRefValue(id)}` : null
  }

  return null
--- a/apps/desktop/src/lib/speech-text.ts
+++ b/apps/desktop/src/lib/speech-text.ts
@ -0,0 +1,19 @@
+// Pictographic emoji plus the variation selector / zero-width joiner that glue sequences together.
+const EMOJI_RE = /[\p{Extended_Pictographic}\uFE0F\u200D]+/gu
+// A fenced code block; an unterminated fence runs to the end of the text.
+const FENCED_CODE_RE = /```[\s\S]*?(?:```|$)/g
+const INLINE_CODE_RE = /`([^`]+)`/g
+// Markdown link or image. The optional leading `!` is consumed so that an image
+// (`![alt](src)`) does not leave a stray "!" in front of its alt text.
+const MARKDOWN_LINK_RE = /!?\[([^\]]+)\]\(([^)]+)\)/g
+const URL_RE = /\bhttps?:\/\/\S+/gi
+
+/**
+ * Reduces markdown-flavoured text to a single line of plain words for text-to-speech.
+ *
+ * Steps, in order: drop fenced code blocks, replace links/images with their label,
+ * unwrap inline code, replace bare URLs with the word "link", drop emoji, strip
+ * heading markers, emphasis/quote characters and list bullets, then collapse all
+ * whitespace runs into single spaces.
+ *
+ * @param text Raw message text.
+ * @returns The cleaned text; an empty string when nothing remains.
+ */
+export function sanitizeTextForSpeech(text: string): string {
+  return text
+    .replace(FENCED_CODE_RE, ' ')
+    .replace(MARKDOWN_LINK_RE, '$1')
+    .replace(INLINE_CODE_RE, '$1')
+    .replace(URL_RE, ' link ')
+    .replace(EMOJI_RE, ' ')
+    .replace(/^#{1,6}\s+/gm, '')
+    .replace(/[*_~>#]/g, '')
+    .replace(/^\s*[-+*]\s+/gm, '')
+    .replace(/\s+/g, ' ')
+    .trim()
+}
--- a/apps/desktop/src/lib/voice-playback.ts
+++ b/apps/desktop/src/lib/voice-playback.ts
@ -0,0 +1,96 @@
+import { speakText } from '@/hermes'
+import {
+  $voicePlayback,
+  setVoicePlaybackState,
+  type VoicePlaybackSource,
+  type VoicePlaybackState
+} from '@/store/voice-playback'
+
+import { sanitizeTextForSpeech } from './speech-text'
+
+let currentAudio: HTMLAudioElement | null = null
+let sequence = 0
+
+/**
+ * Builds a store snapshot for `status`, stamped with the module's current `sequence`.
+ * `messageId` and `source` fall back to `null` when `options` is omitted.
+ */
+function currentState(status: VoicePlaybackState['status'], options?: VoicePlaybackOptions): VoicePlaybackState {
+  const messageId = options?.messageId ?? null
+  const source = options?.source ?? null
+
+  return { messageId, sequence, source, status }
+}
+
+/** Describes who requested playback; copied into the playback store while audio is prepared and played. */
+export interface VoicePlaybackOptions {
+  /** Message the audio belongs to, if any; stored as `null` when omitted. */
+  messageId?: string | null
+  /** Feature that started the playback. */
+  source: VoicePlaybackSource
+}
+
+/**
+ * Stops any audio owned by this module and resets the playback store to idle.
+ *
+ * Bumping `sequence` first invalidates every in-flight `playSpeechText` call,
+ * since each call compares its captured sequence against the current one.
+ */
+export function stopVoicePlayback() {
+  sequence += 1
+
+  const playing = currentAudio
+
+  if (playing !== null) {
+    playing.pause()
+    playing.src = ''
+    currentAudio = null
+  }
+
+  const idleState: VoicePlaybackState = { messageId: null, sequence, source: null, status: 'idle' }
+
+  setVoicePlaybackState(idleState)
+}
+
+/**
+ * Synthesizes `text` and plays it, replacing whatever is currently playing.
+ *
+ * The store moves through `preparing` (while `speakText` is pending) and
+ * `speaking` (while the audio element plays), then back to `idle`.
+ *
+ * @param text Raw message text; it is passed through `sanitizeTextForSpeech` first.
+ * @param options Origin of the request, recorded in the playback store.
+ * @returns `true` when the audio played to the end; `false` when there was nothing
+ *   speakable or when this request was stopped or superseded before it finished.
+ * @throws When synthesis or playback fails while this request is still the current one.
+ */
+export async function playSpeechText(text: string, options: VoicePlaybackOptions): Promise<boolean> {
+  stopVoicePlayback()
+
+  const speakableText = sanitizeTextForSpeech(text)
+
+  if (!speakableText) {
+    return false
+  }
+
+  // `stopVoicePlayback` increments `sequence`, so a mismatch means this request
+  // has been stopped or replaced by a newer one.
+  const ownSequence = sequence
+  const isCurrent = () => ownSequence === sequence
+
+  setVoicePlaybackState(currentState('preparing', options))
+
+  try {
+    const response = await speakText(speakableText)
+
+    if (!isCurrent()) {
+      return false
+    }
+
+    const audio = new Audio(response.data_url)
+    currentAudio = audio
+    setVoicePlaybackState(currentState('speaking', options))
+
+    await new Promise<void>((resolve, reject) => {
+      audio.addEventListener('ended', () => resolve(), { once: true })
+      audio.addEventListener('error', () => reject(new Error('Playback failed')), { once: true })
+      void audio.play().catch(reject)
+    })
+
+    if (!isCurrent()) {
+      return false
+    }
+
+    currentAudio = null
+    setVoicePlaybackState(currentState('idle'))
+
+    return true
+  } catch (error) {
+    // A stop or a newer request pauses the element and clears its `src`, which may
+    // reject the playback promise above. That is a cancellation, not a failure, so
+    // it must not reach the caller as an error (the store is already idle/owned by
+    // the newer request).
+    if (!isCurrent()) {
+      return false
+    }
+
+    currentAudio = null
+    setVoicePlaybackState(currentState('idle'))
+
+    throw error
+  }
+}
+
+/** Reports whether the playback store is in any state other than `idle`. */
+export function isVoicePlaybackActive() {
+  const { status } = $voicePlayback.get()
+
+  return status !== 'idle'
+}
--- a/apps/desktop/src/store/notifications.ts
+++ b/apps/desktop/src/store/notifications.ts
@ -50,6 +50,13 @@ const ERROR_SUMMARIES: { test: (msg: string) => boolean; summarize: (msg: string
    test: msg => /neither voice_tools_openai_key nor openai_api_key is set/i.test(msg),
    summarize: () => 'OpenAI TTS needs VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY.'
  },
+  {
+    test: msg => /ELEVENLABS_API_KEY not set/i.test(msg) || /ElevenLabs STT API error \(HTTP 401\)/i.test(msg),
+    summarize: msg =>
+      /ELEVENLABS_API_KEY not set/i.test(msg)
+        ? 'ElevenLabs STT needs ELEVENLABS_API_KEY.'
+        : 'ElevenLabs rejected the API key (401).'
+  },
  {
    test: msg => /method not allowed/i.test(msg),
    summarize: () => 'The desktop backend does not support that audio endpoint yet. Restart Hermes Desktop.'
--- a/apps/desktop/src/store/voice-playback.ts
+++ b/apps/desktop/src/store/voice-playback.ts
@ -0,0 +1,22 @@
+import { atom } from 'nanostores'
+
+/** Feature that requested speech playback. */
+export type VoicePlaybackSource = 'read-aloud' | 'voice-conversation'
+/** Playback lifecycle: nothing active, audio being prepared, or audio playing. */
+export type VoicePlaybackStatus = 'idle' | 'preparing' | 'speaking'
+
+/** Snapshot of the speech playback shared through the `$voicePlayback` store. */
+export interface VoicePlaybackState {
+  /** Message the audio belongs to, or `null`. */
+  messageId: string | null
+  /** Counter value supplied by whoever writes the state. */
+  sequence: number
+  /** Feature that started the playback, or `null`. */
+  source: VoicePlaybackSource | null
+  status: VoicePlaybackStatus
+}
+
+/** Store holding the current speech playback state; starts idle with no owner. */
+export const $voicePlayback = atom<VoicePlaybackState>({
+  messageId: null,
+  sequence: 0,
+  source: null,
+  status: 'idle'
+})
+
+/** Replaces the whole playback state in `$voicePlayback` with `next`. */
+export function setVoicePlaybackState(next: VoicePlaybackState) {
+  $voicePlayback.set(next)
+}
--- a/apps/desktop/src/styles.css
+++ b/apps/desktop/src/styles.css
@ -184,6 +184,29 @@ button {
  -webkit-app-region: no-drag;
 }

+/* Pulses an element's height (scaleY) and opacity: low at the start and end,
+   full at 35%, and a partial dip at 62%. */
+@keyframes voice-wave {
+  0%,
+  100% {
+    opacity: 0.45;
+    transform: scaleY(0.28);
+  }
+
+  35% {
+    opacity: 0.95;
+    transform: scaleY(1);
+  }
+
+  62% {
+    opacity: 0.7;
+    transform: scaleY(0.52);
+  }
+}
+
+/* Runs the voice-wave animation in an endless 860ms loop, scaling from the element's center. */
+.voice-wave-bar {
+  animation: voice-wave 860ms ease-in-out infinite;
+  transform-origin: center;
+}
+
 .composer-liquid-shell-wrap {
  pointer-events: none;
  border-radius: var(--composer-glass-radius, 20px);
--- a/apps/desktop/src/types/hermes.ts
+++ b/apps/desktop/src/types/hermes.ts
@ -168,6 +168,7 @@ export interface SessionRuntimeInfo {
  personality?: string
  provider?: string
  reasoning_effort?: string
+  running?: boolean
  service_tier?: string
  skills?: Record<string, string[]> | string[]
  tools?: Record<string, string[]>
--- a/apps/desktop/vite.config.ts
+++ b/apps/desktop/vite.config.ts
@ -4,6 +4,7 @@ import tailwindcss from '@tailwindcss/vite'
 import path from 'path'

 export default defineConfig({
+  base: './',
  plugins: [react(), tailwindcss()],
  resolve: {
    alias: {
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -830,7 +830,7 @@ DEFAULT_CONFIG = {
    
    "stt": {
        "enabled": True,
-        "provider": "local",  # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) | "mistral" (Voxtral Transcribe)
+        "provider": "local",  # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) | "mistral" (Voxtral Transcribe) | "elevenlabs" (Scribe)
        "local": {
            "model": "base",  # tiny, base, small, medium, large-v3
            "language": "",  # auto-detect by default; set to "en", "es", "fr", etc. to force
@ -841,6 +841,12 @@ DEFAULT_CONFIG = {
        "mistral": {
            "model": "voxtral-mini-latest",  # voxtral-mini-latest, voxtral-mini-2602
        },
+        "elevenlabs": {
+            "model_id": "scribe_v2",  # scribe_v2, scribe_v1
+            "language_code": "",  # auto-detect by default; set to "eng", "spa", "fra", etc. to force
+            "tag_audio_events": False,
+            "diarize": False,
+        },
    },

    "voice": {
@ -1791,9 +1797,10 @@ OPTIONAL_ENV_VARS = {
        "category": "tool",
    },
    "ELEVENLABS_API_KEY": {
-        "description": "ElevenLabs API key for premium text-to-speech voices",
+        "description": "ElevenLabs API key for premium text-to-speech voices and Scribe transcription",
        "prompt": "ElevenLabs API key",
        "url": "https://elevenlabs.io/",
+        "tools": ["elevenlabs_tts", "voice_transcription"],
        "password": True,
        "category": "tool",
    },
--- a/hermes_cli/web_server.py
+++ b/hermes_cli/web_server.py
@ -280,7 +280,12 @@ _SCHEMA_OVERRIDES: Dict[str, Dict[str, Any]] = {
    "stt.provider": {
        "type": "select",
        "description": "Speech-to-text provider",
-        "options": ["local", "openai", "mistral"],
+        "options": ["local", "groq", "openai", "mistral", "xai", "elevenlabs"],
+    },
+    "stt.elevenlabs.model_id": {
+        "type": "select",
+        "description": "ElevenLabs Scribe model",
+        "options": ["scribe_v2", "scribe_v1"],
    },
    "display.skin": {
        "type": "select",
--- a/tests/tools/test_transcription_dotenv_fallback.py
+++ b/tests/tools/test_transcription_dotenv_fallback.py
@ -24,6 +24,8 @@ def isolate_env(monkeypatch):
        "MISTRAL_API_KEY",
        "XAI_API_KEY",
        "XAI_STT_BASE_URL",
+        "ELEVENLABS_API_KEY",
+        "ELEVENLABS_STT_BASE_URL",
    ):
        monkeypatch.delenv(key, raising=False)

@ -87,6 +89,15 @@ class TestProviderSelectionGate:
                   return_value={"XAI_API_KEY": "dotenv-secret"}):
            assert tt._get_provider({"enabled": True, "provider": "xai"}) == "xai"

+    def test_explicit_elevenlabs_sees_dotenv(self):
+        """Explicit ``provider: elevenlabs`` is selected when the key is only
+        returned by the patched ``hermes_cli.config.load_env``."""
+        from tools import transcription_tools as tt
+
+        # Local backends are patched off for this check.
+        with patch.object(tt, "_HAS_FASTER_WHISPER", False), \
+             patch.object(tt, "_has_local_command", return_value=False), \
+             patch("hermes_cli.config.load_env",
+                   return_value={"ELEVENLABS_API_KEY": "dotenv-secret"}):
+            assert tt._get_provider({"enabled": True, "provider": "elevenlabs"}) == "elevenlabs"
+
    def test_auto_detect_sees_dotenv_groq(self):
        """No local backend, no explicit provider — auto-detect should fall
        through to Groq when its key lives in dotenv only. Before the fix
@ -193,6 +204,33 @@ class TestTranscribeCallSitesReadDotenv:
        assert result["success"] is True
        assert captured["headers"]["Authorization"] == "Bearer xai-dotenv-key"

+    def test_transcribe_elevenlabs_forwards_dotenv_key(self):
+        """The key returned by ``get_env_value`` is sent in the ``xi-api-key``
+        header of the ElevenLabs request."""
+        from tools import transcription_tools as tt
+
+        captured: dict = {}
+
+        # Stand-in for requests.post that records the call and returns a 200 response.
+        def fake_post(url, **kwargs):
+            captured["url"] = url
+            captured["headers"] = kwargs.get("headers", {})
+            response = MagicMock()
+            response.status_code = 200
+            response.json.return_value = {"text": "hello"}
+            return response
+
+        # Only the ElevenLabs key resolves; every other lookup returns None.
+        def fake_get_env_value(name, default=None):
+            if name == "ELEVENLABS_API_KEY":
+                return "elevenlabs-dotenv-key"
+            return None
+
+        # builtins.open is mocked so the fake path never has to exist on disk.
+        with patch.object(tt, "get_env_value", side_effect=fake_get_env_value), \
+             patch.object(tt, "_load_stt_config", return_value={}), \
+             patch("requests.post", side_effect=fake_post), \
+             patch("builtins.open", MagicMock()):
+            result = tt._transcribe_elevenlabs("/tmp/fake.mp3", "scribe_v2")
+
+        assert result["success"] is True
+        assert captured["headers"]["xi-api-key"] == "elevenlabs-dotenv-key"
+

 class TestEndToEndRegressionGuard:
    """End-to-end probe: patch ``hermes_cli.config.load_env`` to simulate
--- a/tests/tools/test_transcription_tools.py
+++ b/tests/tools/test_transcription_tools.py
@ -49,6 +49,7 @@ def clean_env(monkeypatch):
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    monkeypatch.delenv("GROQ_API_KEY", raising=False)
    monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
+    monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
    monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False)
    monkeypatch.delenv("HERMES_LOCAL_STT_LANGUAGE", raising=False)

@ -1342,3 +1343,161 @@ class TestTranscribeAudioXAIDispatch:
            transcribe_audio(sample_ogg, model="custom-stt")

        assert mock_xai.call_args[0][1] == "custom-stt"
+
+
+# ============================================================================
+# _transcribe_elevenlabs
+# ============================================================================
+
+class TestTranscribeElevenLabs:
+    """Unit tests for ``_transcribe_elevenlabs`` with ``requests.post`` mocked."""
+
+    def test_no_key(self, monkeypatch):
+        """Without ELEVENLABS_API_KEY the call fails and the error names the variable."""
+        monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
+        from tools.transcription_tools import _transcribe_elevenlabs
+        result = _transcribe_elevenlabs("/tmp/test.ogg", "scribe_v2")
+        assert result["success"] is False
+        assert "ELEVENLABS_API_KEY" in result["error"]
+
+    def test_successful_transcription(self, monkeypatch, sample_ogg):
+        """A 200 response yields the transcript, and the configured options are
+        sent as form fields (booleans as the string ``"true"``)."""
+        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key")
+
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {"text": "hello from elevenlabs"}
+
+        config = {
+            "elevenlabs": {
+                "language_code": "eng",
+                "tag_audio_events": True,
+                "diarize": True,
+            }
+        }
+        with patch("tools.transcription_tools._load_stt_config", return_value=config), \
+             patch("requests.post", return_value=mock_response) as mock_post:
+            from tools.transcription_tools import _transcribe_elevenlabs
+            result = _transcribe_elevenlabs(sample_ogg, "scribe_v2")
+
+        assert result["success"] is True
+        assert result["transcript"] == "hello from elevenlabs"
+        assert result["provider"] == "elevenlabs"
+        call_kwargs = mock_post.call_args.kwargs
+        assert call_kwargs["headers"]["xi-api-key"] == "eleven-test-key"
+        assert call_kwargs["data"]["model_id"] == "scribe_v2"
+        assert call_kwargs["data"]["language_code"] == "eng"
+        assert call_kwargs["data"]["tag_audio_events"] == "true"
+        assert call_kwargs["data"]["diarize"] == "true"
+
+    def test_api_error_returns_failure(self, monkeypatch, sample_ogg):
+        """A 401 response is reported as a failure whose error carries the HTTP
+        status and the API's message."""
+        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key")
+
+        mock_response = MagicMock()
+        mock_response.status_code = 401
+        mock_response.json.return_value = {"detail": {"message": "Invalid API key"}}
+        mock_response.text = '{"detail": {"message": "Invalid API key"}}'
+
+        with patch("tools.transcription_tools._load_stt_config", return_value={}), \
+             patch("requests.post", return_value=mock_response):
+            from tools.transcription_tools import _transcribe_elevenlabs
+            result = _transcribe_elevenlabs(sample_ogg, "scribe_v2")
+
+        assert result["success"] is False
+        assert "HTTP 401" in result["error"]
+        assert "Invalid API key" in result["error"]
+
+    def test_empty_transcript_returns_failure(self, monkeypatch, sample_ogg):
+        """A 200 response whose text is only whitespace counts as a failure."""
+        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key")
+
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {"text": "   "}
+
+        with patch("tools.transcription_tools._load_stt_config", return_value={}), \
+             patch("requests.post", return_value=mock_response):
+            from tools.transcription_tools import _transcribe_elevenlabs
+            result = _transcribe_elevenlabs(sample_ogg, "scribe_v2")
+
+        assert result["success"] is False
+        assert "empty transcript" in result["error"]
+
+
+# ============================================================================
+# _get_provider — ElevenLabs
+# ============================================================================
+
+class TestGetProviderElevenLabs:
+    """ElevenLabs-specific provider selection tests."""
+
+    def test_elevenlabs_when_key_set(self, monkeypatch):
+        """Explicit elevenlabs with the key present resolves to elevenlabs."""
+        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test")
+        from tools.transcription_tools import _get_provider
+        assert _get_provider({"provider": "elevenlabs"}) == "elevenlabs"
+
+    def test_elevenlabs_explicit_no_key_returns_none(self, monkeypatch):
+        """Explicit elevenlabs with no key returns none — no cross-provider fallback."""
+        monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
+        from tools.transcription_tools import _get_provider
+        assert _get_provider({"provider": "elevenlabs"}) == "none"
+
+    def test_auto_detect_elevenlabs_after_xai(self, monkeypatch):
+        """Auto-detect: elevenlabs is tried after xai when all above are unavailable."""
+        # Clear every other provider key so only ElevenLabs can be picked.
+        monkeypatch.delenv("GROQ_API_KEY", raising=False)
+        monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
+        monkeypatch.delenv("XAI_API_KEY", raising=False)
+        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test")
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
+             patch("tools.transcription_tools._has_local_command", return_value=False), \
+             patch("tools.transcription_tools._HAS_OPENAI", False), \
+             patch("tools.transcription_tools._HAS_MISTRAL", False):
+            from tools.transcription_tools import _get_provider
+            assert _get_provider({}) == "elevenlabs"
+
+    def test_auto_detect_xai_preferred_over_elevenlabs(self, monkeypatch):
+        """Auto-detect: xai is preferred over elevenlabs."""
+        monkeypatch.setenv("XAI_API_KEY", "xai-test")
+        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test")
+        with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \
+             patch("tools.transcription_tools._has_local_command", return_value=False), \
+             patch("tools.transcription_tools._HAS_OPENAI", False), \
+             patch("tools.transcription_tools._HAS_MISTRAL", False):
+            from tools.transcription_tools import _get_provider
+            assert _get_provider({}) == "xai"
+
+
+# ============================================================================
+# transcribe_audio — ElevenLabs dispatch
+# ============================================================================
+
+class TestTranscribeAudioElevenLabsDispatch:
+    """Checks that ``transcribe_audio`` routes to ``_transcribe_elevenlabs`` and
+    which model name it passes as the second positional argument."""
+
+    def test_dispatches_to_elevenlabs(self, sample_ogg):
+        """When the resolved provider is elevenlabs, its transcriber is called once."""
+        with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "elevenlabs"}), \
+             patch("tools.transcription_tools._get_provider", return_value="elevenlabs"), \
+             patch("tools.transcription_tools._transcribe_elevenlabs",
+                   return_value={"success": True, "transcript": "hi", "provider": "elevenlabs"}) as mock_elevenlabs:
+            from tools.transcription_tools import transcribe_audio
+            result = transcribe_audio(sample_ogg)
+
+        assert result["success"] is True
+        assert result["provider"] == "elevenlabs"
+        mock_elevenlabs.assert_called_once()
+
+    def test_config_elevenlabs_model_used(self, sample_ogg):
+        """With no model argument, ``elevenlabs.model_id`` from the config is used."""
+        config = {"provider": "elevenlabs", "elevenlabs": {"model_id": "scribe_v1"}}
+        with patch("tools.transcription_tools._load_stt_config", return_value=config), \
+             patch("tools.transcription_tools._get_provider", return_value="elevenlabs"), \
+             patch("tools.transcription_tools._transcribe_elevenlabs",
+                   return_value={"success": True, "transcript": "hi"}) as mock_elevenlabs:
+            from tools.transcription_tools import transcribe_audio
+            transcribe_audio(sample_ogg, model=None)
+
+        assert mock_elevenlabs.call_args[0][1] == "scribe_v1"
+
+    def test_model_override_passed_to_elevenlabs(self, sample_ogg):
+        """An explicit ``model`` argument is forwarded unchanged."""
+        with patch("tools.transcription_tools._load_stt_config", return_value={}), \
+             patch("tools.transcription_tools._get_provider", return_value="elevenlabs"), \
+             patch("tools.transcription_tools._transcribe_elevenlabs",
+                   return_value={"success": True, "transcript": "hi"}) as mock_elevenlabs:
+            from tools.transcription_tools import transcribe_audio
+            transcribe_audio(sample_ogg, model="scribe_v2")
+
+        assert mock_elevenlabs.call_args[0][1] == "scribe_v2"
--- a/tools/transcription_tools.py
+++ b/tools/transcription_tools.py
@ -11,6 +11,7 @@ Provides speech-to-text transcription with six providers:
  - **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``.
  - **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy,
    Inverse Text Normalization, diarization, 21 languages.
+  - **elevenlabs** — ElevenLabs Scribe API, requires ``ELEVENLABS_API_KEY``.

 Used by the messaging gateway to automatically transcribe voice messages
 sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
@ -84,6 +85,7 @@ DEFAULT_LOCAL_STT_LANGUAGE = "en"
 DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
 DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
 DEFAULT_MISTRAL_STT_MODEL = os.getenv("STT_MISTRAL_MODEL", "voxtral-mini-latest")
+DEFAULT_ELEVENLABS_STT_MODEL = os.getenv("STT_ELEVENLABS_MODEL", "scribe_v2")
 LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND"
 LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE"
 COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
@ -91,6 +93,7 @@ COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
 GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
 OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
 XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1")
+ELEVENLABS_STT_BASE_URL = os.getenv("ELEVENLABS_STT_BASE_URL", "https://api.elevenlabs.io/v1")

 SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"}
 LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
@ -268,9 +271,17 @@ def _get_provider(stt_config: dict) -> str:
            )
            return "none"

+        if provider == "elevenlabs":
+            if get_env_value("ELEVENLABS_API_KEY"):
+                return "elevenlabs"
+            logger.warning(
+                "STT provider 'elevenlabs' configured but ELEVENLABS_API_KEY not set"
+            )
+            return "none"
+
        return provider  # Unknown — let it fail downstream

-    # --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai -
+    # --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai > elevenlabs -

    if _HAS_FASTER_WHISPER:
        return "local"
@ -288,6 +299,9 @@ def _get_provider(stt_config: dict) -> str:
    if get_env_value("XAI_API_KEY"):
        logger.info("No local STT available, using xAI Grok STT API")
        return "xai"
+    if get_env_value("ELEVENLABS_API_KEY"):
+        logger.info("No local STT available, using ElevenLabs Scribe STT API")
+        return "elevenlabs"
    return "none"

 # ---------------------------------------------------------------------------
@ -781,6 +795,92 @@ def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]:
        return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"}


+# ---------------------------------------------------------------------------
+# Provider: ElevenLabs (Scribe STT API)
+# ---------------------------------------------------------------------------
+
+
+def _transcribe_elevenlabs(file_path: str, model_name: str) -> Dict[str, Any]:
+    """Transcribe an audio file with the ElevenLabs Scribe speech-to-text API.
+
+    Optional settings are read from the ``elevenlabs`` section of the STT
+    config: ``base_url``, ``language_code``, ``tag_audio_events`` and
+    ``diarize``. The base URL falls back to the ``ELEVENLABS_STT_BASE_URL``
+    environment value and then to the module-level default.
+
+    Args:
+        file_path: Path of the audio file to upload.
+        model_name: Value sent as the ``model_id`` form field.
+
+    Returns:
+        A dict with ``success`` and ``transcript`` keys. On success it also
+        carries ``provider="elevenlabs"``; on failure it carries ``error``.
+        Request and response-parsing errors are caught and reported in the
+        returned dict rather than raised.
+    """
+    api_key = get_env_value("ELEVENLABS_API_KEY")
+    if not api_key:
+        return {"success": False, "transcript": "", "error": "ELEVENLABS_API_KEY not set"}
+
+    stt_config = _load_stt_config()
+    # ``.get(key, {})`` only applies the default when the key is absent. A bare
+    # ``elevenlabs:`` entry in config.yaml loads as None (and a scalar value is
+    # equally unusable), so normalise anything that is not a mapping to {}.
+    elevenlabs_config = stt_config.get("elevenlabs")
+    if not isinstance(elevenlabs_config, dict):
+        elevenlabs_config = {}
+
+    # Precedence: config file > environment lookup > module default.
+    base_url = str(
+        elevenlabs_config.get("base_url")
+        or get_env_value("ELEVENLABS_STT_BASE_URL")
+        or ELEVENLABS_STT_BASE_URL
+    ).strip().rstrip("/")
+    language_code = str(elevenlabs_config.get("language_code") or "").strip()
+    tag_audio_events = is_truthy_value(elevenlabs_config.get("tag_audio_events", False))
+    diarize = is_truthy_value(elevenlabs_config.get("diarize", False))
+
+    try:
+        import requests
+
+        # Multipart form fields are strings, so booleans are sent as "true"/"false".
+        data: Dict[str, str] = {
+            "model_id": model_name,
+            "tag_audio_events": "true" if tag_audio_events else "false",
+            "diarize": "true" if diarize else "false",
+        }
+        # Only send a language when one is configured.
+        if language_code:
+            data["language_code"] = language_code
+
+        with open(file_path, "rb") as audio_file:
+            response = requests.post(
+                f"{base_url}/speech-to-text",
+                headers={"xi-api-key": api_key},
+                files={"file": (Path(file_path).name, audio_file)},
+                data=data,
+                timeout=120,
+            )
+
+        if response.status_code != 200:
+            # Prefer a structured message from the JSON body; fall back to the
+            # first 300 characters of the raw body when it is missing or the
+            # body is not a JSON object.
+            detail = ""
+            try:
+                err_body = response.json()
+                error_value = err_body.get("detail") or err_body.get("error")
+                if isinstance(error_value, dict):
+                    detail = str(error_value.get("message") or error_value)
+                elif error_value:
+                    detail = str(error_value)
+                else:
+                    detail = response.text[:300]
+            except Exception:
+                detail = response.text[:300]
+            return {
+                "success": False,
+                "transcript": "",
+                "error": f"ElevenLabs STT API error (HTTP {response.status_code}): {detail}",
+            }
+
+        result = response.json()
+        transcript_text = _extract_transcript_text(result)
+        # An empty transcript is reported as a failure, not a silent success.
+        if not transcript_text:
+            return {
+                "success": False,
+                "transcript": "",
+                "error": "ElevenLabs STT returned empty transcript",
+            }
+
+        logger.info(
+            "Transcribed %s via ElevenLabs Scribe (%s, %d chars)",
+            Path(file_path).name,
+            model_name,
+            len(transcript_text),
+        )
+
+        return {"success": True, "transcript": transcript_text, "provider": "elevenlabs"}
+
+    except PermissionError:
+        return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
+    except Exception as e:
+        logger.error("ElevenLabs STT transcription failed: %s", e, exc_info=True)
+        return {"success": False, "transcript": "", "error": f"ElevenLabs STT transcription failed: {e}"}
+
+
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
@ -792,7 +892,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A

    Provider priority:
      1. User config (``stt.provider`` in config.yaml)
-      2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid)
+      2. Auto-detect: local > Groq > OpenAI > Mistral > xAI > ElevenLabs

    Args:
        file_path: Absolute path to the audio file to transcribe.
@ -854,6 +954,11 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
        model_name = model or "grok-stt"
        return _transcribe_xai(file_path, model_name)

+    if provider == "elevenlabs":
+        elevenlabs_cfg = stt_config.get("elevenlabs", {})
+        model_name = model or elevenlabs_cfg.get("model_id", DEFAULT_ELEVENLABS_STT_MODEL)
+        return _transcribe_elevenlabs(file_path, model_name)
+
    # No provider available
    return {
        "success": False,
@ -862,8 +967,9 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
            "No STT provider available. Install faster-whisper for free local "
            f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
            "set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral "
-            "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY "
-            "or OPENAI_API_KEY for the OpenAI Whisper API."
+            "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, set ELEVENLABS_API_KEY "
+            "for ElevenLabs Scribe, or set VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY for "
+            "the OpenAI Whisper API."
        ),
    }

--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -1409,6 +1409,7 @@ def _session_info(agent, session: dict | None = None) -> dict:
        "cwd": cwd,
        "branch": _git_branch_for_cwd(cwd),
        "personality": str(personality or ""),
+        "running": bool((session or {}).get("running")),
        "version": "",
        "release_date": "",
        "update_behind": None,