diff --git a/.env.example b/.env.example index 589978e6b5..d35c829d41 100644 --- a/.env.example +++ b/.env.example @@ -384,9 +384,9 @@ IMAGE_TOOLS_DEBUG=false # Default STT provider is "local" (faster-whisper) — runs on your machine, no API key needed. # Install with: pip install faster-whisper # Model downloads automatically on first use (~150 MB for "base"). -# To use cloud providers instead, set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY above. -# Provider priority: local > groq > openai -# Configure in config.yaml: stt.provider: local | groq | openai +# To use cloud providers instead, set GROQ_API_KEY, VOICE_TOOLS_OPENAI_KEY, or ELEVENLABS_API_KEY above. +# Provider priority: local > groq > openai > mistral > xai > elevenlabs +# Configure in config.yaml: stt.provider: local | groq | openai | mistral | xai | elevenlabs # ============================================================================= # STT ADVANCED OVERRIDES (optional) @@ -394,10 +394,12 @@ IMAGE_TOOLS_DEBUG=false # Override default STT models per provider (normally set via stt.model in config.yaml) # STT_GROQ_MODEL=whisper-large-v3-turbo # STT_OPENAI_MODEL=whisper-1 +# STT_ELEVENLABS_MODEL=scribe_v2 # Override STT provider endpoints (for proxies or self-hosted instances) # GROQ_BASE_URL=https://api.groq.com/openai/v1 # STT_OPENAI_BASE_URL=https://api.openai.com/v1 +# ELEVENLABS_STT_BASE_URL=https://api.elevenlabs.io/v1 # ============================================================================= # MICROSOFT TEAMS INTEGRATION diff --git a/apps/desktop/package-lock.json b/apps/desktop/package-lock.json index 2dabfaca5c..b8e9e9e77c 100644 --- a/apps/desktop/package-lock.json +++ b/apps/desktop/package-lock.json @@ -10,6 +10,7 @@ "dependencies": { "@assistant-ui/react": "^0.12.28", "@assistant-ui/react-streamdown": "^0.1.11", + "@audiowave/react": "^0.6.2", "@chenglou/pretext": "^0.0.6", "@nanostores/react": "^1.1.0", "@radix-ui/react-slot": "^1.2.4", @@ -305,6 +306,25 @@ } } }, + "node_modules/@audiowave/core": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/@audiowave/core/-/core-0.3.1.tgz", + "integrity": "sha512-KtC2MTWKp6Orkedty3I8IklVBVQ2IFaFWDJ1cz+UsACpX2x1gINwZGTRZT7bw/dx8KazNSMuVK5lm1jL67KQkQ==", + "license": "MIT" + }, + "node_modules/@audiowave/react": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/@audiowave/react/-/react-0.6.2.tgz", + "integrity": "sha512-hajG2Iv3mVxived9wXad8L0ZQF+HmYnB3IrfOkIdkTv4RxOJDXwFWMAd0zb7ZU1Qz0IEYZXCbASFWyuxEQ7PAw==", + "license": "MIT", + "dependencies": { + "@audiowave/core": "0.3.1" + }, + "peerDependencies": { + "react": ">=16.8.0", + "react-dom": ">=16.8.0" + } + }, "node_modules/@babel/code-frame": { "version": "7.29.0", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz", diff --git a/apps/desktop/package.json b/apps/desktop/package.json index 7a2672a007..26e12d29dc 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -23,6 +23,7 @@ "dependencies": { "@assistant-ui/react": "^0.12.28", "@assistant-ui/react-streamdown": "^0.1.11", + "@audiowave/react": "^0.6.2", "@chenglou/pretext": "^0.0.6", "@nanostores/react": "^1.1.0", "@radix-ui/react-slot": "^1.2.4", diff --git a/apps/desktop/src/app/chat/composer/constants.ts b/apps/desktop/src/app/chat/composer/constants.ts index 60b0d1d1e1..945f2a59e2 100644 --- a/apps/desktop/src/app/chat/composer/constants.ts +++ b/apps/desktop/src/app/chat/composer/constants.ts @@ -1,4 +1,3 @@ -import type { Unstable_TriggerItem } from '@assistant-ui/core' import type { Unstable_IconComponent } from '@assistant-ui/react' import { FileText, FolderOpen, ImageIcon, Link, type LucideIcon } from 'lucide-react' import type { CSSProperties } from 'react' @@ -37,7 +36,7 @@ export const DIRECTIVE_ICONS: Record = { } export const DIRECTIVE_POPOVER_CLASS = - 'absolute bottom-24 left-1/2 z-50 w-[min(calc(100vw-1.5rem),28rem)] max-h-[min(28rem,calc(100vh-8rem))] -translate-x-1/2 overflow-y-auto overscroll-contain rounded-2xl border border-border/70 bg-popover p-1.5 text-popover-foreground shadow-2xl' + 'absolute bottom-24 left-1/2 z-50 w-[min(calc(100vw-1.5rem),26rem)] max-h-[min(24rem,calc(100vh-8rem))] -translate-x-1/2 overflow-y-auto overscroll-contain rounded-2xl border border-border/60 bg-popover/95 p-1.5 text-popover-foreground shadow-2xl backdrop-blur-md ring-1 ring-black/5' export const PROMPT_SNIPPETS = [ { @@ -64,37 +63,6 @@ export const ASK_PLACEHOLDERS = [ 'Duck mode: gentle debugging, together.' ] -export const REF_ITEMS: Unstable_TriggerItem[] = [ - { - id: 'file:', - type: 'file', - label: 'File', - description: 'Attach a file path', - metadata: { icon: 'file' } - }, - { - id: 'folder:', - type: 'folder', - label: 'Folder', - description: 'Attach a folder path', - metadata: { icon: 'folder' } - }, - { - id: 'url:', - type: 'url', - label: 'URL', - description: 'Attach a web page', - metadata: { icon: 'url' } - }, - { - id: 'image:', - type: 'image', - label: 'Image', - description: 'Attach an image path', - metadata: { icon: 'image' } - } -] - export const EDGE_NEWLINES_RE = /^[\t ]*(?:\r\n|\r|\n)+|(?:\r\n|\r|\n)+[\t ]*$/g export const DEFAULT_MAX_RECORDING_SECONDS = 120 diff --git a/apps/desktop/src/app/chat/composer/context-menu.tsx b/apps/desktop/src/app/chat/composer/context-menu.tsx index 96851f9ac3..253b70e5a7 100644 --- a/apps/desktop/src/app/chat/composer/context-menu.tsx +++ b/apps/desktop/src/app/chat/composer/context-menu.tsx @@ -15,11 +15,10 @@ import { import { cn } from '@/lib/utils' import { GHOST_ICON_BTN, PROMPT_SNIPPETS } from './constants' -import type { ChatBarState, ContextSuggestion } from './types' +import type { ChatBarState } from './types' export function ContextMenu({ state, - onAddContextRef, onInsertText, onOpenUrlDialog, onPasteClipboardImage, @@ -28,7 +27,6 @@ export function ContextMenu({ onPickImages }: { state: ChatBarState - onAddContextRef?: (refText: string, label?: string, detail?: string) => void onInsertText: (text: string) => void onOpenUrlDialog: () => void onPasteClipboardImage?: () => void @@ -36,11 +34,6 @@ export function ContextMenu({ onPickFolders?: () => void onPickImages?: () => void }) { - const choose = (item: ContextSuggestion) => - onAddContextRef ? onAddContextRef(item.text, item.display, item.meta) : onInsertText(item.text) - - const suggestions = state.tools.suggestions?.slice(0, 8) ?? [] - return ( @@ -56,48 +49,28 @@ export function ContextMenu({ - - Add context + + + Attach + - Files + Files… - Folders + Folder… - Images + Images… - Image from clipboard + Paste image - URL + URL… - - - - Suggested files - - - {suggestions.length === 0 ? ( - - No suggestions - - ) : ( - suggestions.map(item => ( - choose(item)}> - - {item.display} - {item.meta && {item.meta}} - - )) - )} - - - @@ -111,6 +84,13 @@ export function ContextMenu({ ))} + + + +
+ Tip: type @ to reference files + inline. +
) diff --git a/apps/desktop/src/app/chat/composer/controls.tsx b/apps/desktop/src/app/chat/composer/controls.tsx index 56cf5a8a9d..c191762cd4 100644 --- a/apps/desktop/src/app/chat/composer/controls.tsx +++ b/apps/desktop/src/app/chat/composer/controls.tsx @@ -15,6 +15,7 @@ interface ConversationProps { status: ConversationStatus onEnd: () => void onStart: () => void + onStopTurn: () => void onToggleMute: () => void } @@ -80,6 +81,7 @@ function ConversationPill({ level, muted, onEnd, + onStopTurn, onToggleMute, status }: ConversationProps & { disabled: boolean }) { @@ -104,10 +106,10 @@ function ConversationPill({ aria-pressed={muted} className={cn(GHOST_ICON_BTN, 'p-0', muted && 'bg-muted text-muted-foreground')} disabled={disabled} - onClick={() => { - triggerHaptic('selection') - onToggleMute() - }} + onClick={() => { + triggerHaptic('selection') + onToggleMute() + }} size="icon" title={muted ? 'Unmute microphone' : 'Mute microphone'} type="button" @@ -115,6 +117,23 @@ function ConversationPill({ > {muted ? : } + {listening && ( + + )} - diff --git a/apps/desktop/src/app/chat/composer/voice-activity.tsx b/apps/desktop/src/app/chat/composer/voice-activity.tsx index f0f28ec3df..2f653bc198 100644 --- a/apps/desktop/src/app/chat/composer/voice-activity.tsx +++ b/apps/desktop/src/app/chat/composer/voice-activity.tsx @@ -1,6 +1,10 @@ -import { Loader2, Mic } from 'lucide-react' +import { useStore } from '@nanostores/react' +import { Loader2, Mic, Volume2, VolumeX } from 'lucide-react' +import { Button } from '@/components/ui/button' import { cn } from '@/lib/utils' +import { stopVoicePlayback } from '@/lib/voice-playback' +import { $voicePlayback } from '@/store/voice-playback' import type { VoiceActivityState } from './types' @@ -36,6 +40,25 @@ function VoiceLevelBars({ level, active }: { active: boolean; level: number }) { ) } +function PlaybackBars() { + const bars = [820, 940, 760, 880, 700, 980, 790] + + return ( + + ) +} + export function VoiceActivity({ state }: { @@ -75,3 +98,50 @@ export function VoiceActivity({ ) } + +export function VoicePlaybackActivity() { + const playback = useStore($voicePlayback) + + if (playback.status === 'idle') { + return null + } + + const preparing = playback.status === 'preparing' + + const title = preparing + ? 'Preparing audio' + : playback.source === 'voice-conversation' + ? 'Speaking response' + : 'Reading aloud' + + return ( +
+
+ {preparing ? : } +
+ +
+ {title} + {!preparing && } +
+ + +
+ ) +} diff --git a/apps/desktop/src/app/chat/hooks/use-composer-actions.ts b/apps/desktop/src/app/chat/hooks/use-composer-actions.ts index 279db6bd1e..16b3b3e930 100644 --- a/apps/desktop/src/app/chat/hooks/use-composer-actions.ts +++ b/apps/desktop/src/app/chat/hooks/use-composer-actions.ts @@ -1,5 +1,6 @@ import { useCallback } from 'react' +import { formatRefValue } from '@/components/assistant-ui/directive-text' import { attachmentId, contextPath, pathLabel } from '@/lib/chat-runtime' import { addComposerAttachment, @@ -57,7 +58,7 @@ export function useComposerActions({ activeSessionId, currentCwd, requestGateway kind, label: pathLabel(path), detail: rel, - refText: `@${kind}:${rel}`, + refText: `@${kind}:${formatRefValue(rel)}`, path }) } diff --git a/apps/desktop/src/app/chat/index.tsx b/apps/desktop/src/app/chat/index.tsx index 3d467a781e..0c7be68b31 100644 --- a/apps/desktop/src/app/chat/index.tsx +++ b/apps/desktop/src/app/chat/index.tsx @@ -8,13 +8,14 @@ import { useStore } from '@nanostores/react' import { useQuery } from '@tanstack/react-query' import { ChevronDown } from 'lucide-react' import type * as React from 'react' -import { Suspense, useMemo } from 'react' +import { Suspense, useMemo, useRef } from 'react' import { useLocation } from 'react-router-dom' import { Thread } from '@/components/assistant-ui/thread' import { NotificationStack } from '@/components/notifications' import { Button } from '@/components/ui/button' import { getGlobalModelOptions, type HermesGateway } from '@/hermes' +import type { ChatMessage } from '@/lib/chat-messages' import { quickModelOptions, sessionTitle, toRuntimeMessage } from '@/lib/chat-runtime' import { cn } from '@/lib/utils' import { $pinnedSessionIds } from '@/store/layout' @@ -57,7 +58,7 @@ interface ChatViewProps extends Omit, 'onSubmit'> { onPickFolders: () => void onPickImages: () => void onRemoveAttachment: (id: string) => void - onSubmit: (text: string) => void + onSubmit: (text: string) => Promise | void onChangeCwd: (cwd: string) => void onBrowseCwd: () => void onOpenModelPicker: () => void @@ -118,6 +119,7 @@ export function ChatView({ const pinnedSessionIds = useStore($pinnedSessionIds) const selectedSessionId = useStore($selectedStoredSessionId) const sessions = useStore($sessions) + const runtimeMessageCacheRef = useRef(new WeakMap()) const activeStoredSession = sessions.find(session => session.id === selectedSessionId) || null const isRoutedSessionView = Boolean(routeSessionId(location.pathname)) const selectedIsPinned = selectedSessionId ? pinnedSessionIds.includes(selectedSessionId) : false @@ -128,6 +130,7 @@ export function ChatView({ const loadingSession = isRoutedSessionView && messages.length === 0 const threadLoading = threadLoadingState(loadingSession, busy, awaitingResponse) const showChatBar = !loadingSession + const threadKey = selectedSessionId || activeSessionId || (isRoutedSessionView ? location.pathname : 'new') const title = activeStoredSession ? sessionTitle(activeStoredSession) : '' const modelOptionsQuery = useQuery({ @@ -190,7 +193,14 @@ export function ChatView({ parentId = branchParentByGroup.get(message.branchGroupId) ?? null } - items.push({ message: toRuntimeMessage(message), parentId }) + const cachedMessage = runtimeMessageCacheRef.current.get(message) + const runtimeMessage = cachedMessage ?? toRuntimeMessage(message) + + if (!cachedMessage) { + runtimeMessageCacheRef.current.set(message, runtimeMessage) + } + + items.push({ message: runtimeMessage, parentId }) if (!message.hidden) { visibleParentId = message.id @@ -248,6 +258,7 @@ export function ChatView({ intro={showIntro ? { personality: introPersonality, seed: introSeed } : undefined} loading={threadLoading} onBranchInNewChat={onBranchInNewChat} + sessionKey={threadKey} /> {showChatBar && ( }> diff --git a/apps/desktop/src/app/desktop-controller.tsx b/apps/desktop/src/app/desktop-controller.tsx index ca70bf4a91..2e407a2816 100644 --- a/apps/desktop/src/app/desktop-controller.tsx +++ b/apps/desktop/src/app/desktop-controller.tsx @@ -14,6 +14,7 @@ import { listSessions, setGlobalModel } from '../hermes' +import { formatRefValue } from '../components/assistant-ui/directive-text' import { toChatMessages } from '../lib/chat-messages' import { BUILTIN_PERSONALITIES, normalizePersonalityValue, personalityNamesFromConfig } from '../lib/chat-runtime' import { $pinnedSessionIds, pinSession, unpinSession } from '../store/layout' @@ -571,7 +572,7 @@ export function DesktopController() { gateway={gatewayRef.current} maxVoiceRecordingSeconds={voiceMaxRecordingSeconds} onAddContextRef={addContextRefAttachment} - onAddUrl={url => addContextRefAttachment(`@url:${url}`, url)} + onAddUrl={url => addContextRefAttachment(`@url:${formatRefValue(url)}`, url)} onBranchInNewChat={messageId => void branchInNewChat(messageId)} onBrowseCwd={() => void browseSessionCwd()} onCancel={() => void cancelRun()} @@ -589,7 +590,7 @@ export function DesktopController() { onReload={reloadFromMessage} onRemoveAttachment={id => void removeAttachment(id)} onSelectPersonality={name => void selectPersonality(name)} - onSubmit={text => void submitText(text)} + onSubmit={submitText} onThreadMessagesChange={handleThreadMessagesChange} onToggleSelectedPin={toggleSelectedPin} onTranscribeAudio={transcribeVoiceAudio} diff --git a/apps/desktop/src/app/session/hooks/use-message-stream.ts b/apps/desktop/src/app/session/hooks/use-message-stream.ts index c783ab0ce1..c95f87f742 100644 --- a/apps/desktop/src/app/session/hooks/use-message-stream.ts +++ b/apps/desktop/src/app/session/hooks/use-message-stream.ts @@ -1,6 +1,5 @@ import type { QueryClient } from '@tanstack/react-query' import { type MutableRefObject, useCallback } from 'react' -import { flushSync } from 'react-dom' import { appendReasoningPart, @@ -60,7 +59,6 @@ export function useMessageStream({ transform: (parts: ChatMessagePart[], message: ChatMessage) => ChatMessagePart[], seed: () => ChatMessagePart[], opts: { - sync?: boolean pending?: (message: ChatMessage) => boolean } = {} ) => { @@ -112,7 +110,7 @@ export function useMessageStream({ }) } - opts.sync ? flushSync(apply) : apply() + apply() }, [updateSessionState] ) @@ -126,8 +124,7 @@ export function useMessageStream({ mutateStream( sessionId, parts => appendTextPart(parts, delta), - () => [textPart(delta)], - { sync: true } + () => [textPart(delta)] ) }, [mutateStream] @@ -152,8 +149,7 @@ export function useMessageStream({ return appendReasoningPart(parts, delta) }, - () => [reasoningPart(delta)], - { sync: true } + () => [reasoningPart(delta)] ) }, [mutateStream] @@ -299,6 +295,7 @@ export function useMessageStream({ const apply = explicitSid ? isActiveEvent : !activeSessionIdRef.current const modelChanged = typeof payload?.model === 'string' const providerChanged = typeof payload?.provider === 'string' + const runningChanged = typeof payload?.running === 'boolean' if (apply) { if (modelChanged) { @@ -320,6 +317,35 @@ export function useMessageStream({ if (typeof payload?.personality === 'string') { setCurrentPersonality(normalizePersonalityValue(payload.personality)) } + + if (runningChanged && sessionId) { + updateSessionState(sessionId, state => { + const busy = Boolean(payload!.running) + + if (state.busy === busy && (busy || !state.awaitingResponse)) { + return state + } + + if (busy) { + return { + ...state, + busy + } + } + + if (state.awaitingResponse && !state.sawAssistantPayload) { + return state + } + + return { + ...state, + awaitingResponse: false, + busy, + pendingBranchGroup: null, + streamId: null + } + }) + } } void refreshHermesConfig() @@ -355,11 +381,11 @@ export function useMessageStream({ } } else if (event.type === 'reasoning.delta') { if (sessionId) { - appendReasoningDelta(sessionId, coerceGatewayText(payload?.text)) + appendReasoningDelta(sessionId, coerceThinkingText(payload?.text)) } } else if (event.type === 'reasoning.available') { if (sessionId) { - appendReasoningDelta(sessionId, coerceGatewayText(payload?.text), true) + appendReasoningDelta(sessionId, coerceThinkingText(payload?.text), true) } } else if (event.type === 'message.complete') { if (!sessionId) { diff --git a/apps/desktop/src/app/session/hooks/use-prompt-actions.ts b/apps/desktop/src/app/session/hooks/use-prompt-actions.ts index 625cd56ec7..d7d527d929 100644 --- a/apps/desktop/src/app/session/hooks/use-prompt-actions.ts +++ b/apps/desktop/src/app/session/hooks/use-prompt-actions.ts @@ -13,7 +13,7 @@ import { import { triggerHaptic } from '@/lib/haptics' import { $composerAttachments, clearComposerAttachments } from '@/store/composer' import { clearNotifications, notify, notifyError } from '@/store/notifications' -import { $busy, $messages, setAwaitingResponse, setBusy } from '@/store/session' +import { $busy, $messages, setAwaitingResponse, setBusy, setMessages } from '@/store/session' import type { ClientSessionState, SlashExecResponse } from '../../types' @@ -296,12 +296,34 @@ export function usePromptActions({ ) const cancelRun = useCallback(async () => { - if (!activeSessionId) { + const sessionId = activeSessionId || activeSessionIdRef.current + + busyRef.current = false + setBusy(false) + setAwaitingResponse(false) + + const finalizeMessages = (messages: ChatMessage[]) => + messages.map(message => + message.pending + ? { + ...message, + parts: chatMessageText(message).trim() + ? appendTextPart(message.parts, INTERRUPTED_MARKER) + : [...message.parts, textPart(INTERRUPTED_MARKER.trim())], + pending: false + } + : message + ) + + if (!sessionId) { + setMessages(finalizeMessages($messages.get())) + return } - updateSessionState(activeSessionId, state => { + updateSessionState(sessionId, state => { const streamId = state.streamId + const messages = streamId ? state.messages.map(message => message.id === streamId @@ -314,7 +336,7 @@ export function usePromptActions({ } : message ) - : state.messages + : finalizeMessages(state.messages) return { ...state, @@ -328,11 +350,11 @@ export function usePromptActions({ }) try { - await requestGateway('session.interrupt', { session_id: activeSessionId }) + await requestGateway('session.interrupt', { session_id: sessionId }) } catch (err) { notifyError(err, 'Stop failed') } - }, [activeSessionId, requestGateway, updateSessionState]) + }, [activeSessionId, activeSessionIdRef, busyRef, requestGateway, updateSessionState]) const reloadFromMessage = useCallback( async (parentId: string | null) => { diff --git a/apps/desktop/src/app/session/hooks/use-session-actions.ts b/apps/desktop/src/app/session/hooks/use-session-actions.ts index 8ab9e91b3b..a0737ea9d8 100644 --- a/apps/desktop/src/app/session/hooks/use-session-actions.ts +++ b/apps/desktop/src/app/session/hooks/use-session-actions.ts @@ -87,6 +87,11 @@ export function useSessionActions({ const createBackendSessionForSend = useCallback(async (): Promise => { const created = await requestGateway('session.create', { cols: 96 }) + + if (created.stored_session_id) { + navigate(sessionRoute(created.stored_session_id), { replace: true }) + } + setActiveSessionId(created.session_id) activeSessionIdRef.current = created.session_id ensureSessionState(created.session_id, created.stored_session_id ?? null) @@ -94,7 +99,6 @@ export function useSessionActions({ if (created.stored_session_id) { setSelectedStoredSessionId(created.stored_session_id) selectedStoredSessionIdRef.current = created.stored_session_id - navigate(sessionRoute(created.stored_session_id), { replace: true }) } if (created.info?.model) { diff --git a/apps/desktop/src/app/settings/constants.ts b/apps/desktop/src/app/settings/constants.ts index a854842dca..c663eb5c6e 100644 --- a/apps/desktop/src/app/settings/constants.ts +++ b/apps/desktop/src/app/settings/constants.ts @@ -60,6 +60,7 @@ export const ENUM_OPTIONS: Record = { 'context.engine': ['compressor', 'default', 'custom'], 'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'], 'memory.provider': ['', 'builtin', 'honcho'], + 'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'], 'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'], 'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'] } @@ -101,6 +102,10 @@ export const FIELD_LABELS: Record = { 'stt.provider': 'Speech-To-Text Provider', 'stt.local.model': 'Local Transcription Model', 'stt.local.language': 'Transcription Language', + 'stt.elevenlabs.model_id': 'ElevenLabs STT Model', + 'stt.elevenlabs.language_code': 'ElevenLabs Language', + 'stt.elevenlabs.tag_audio_events': 'Tag Audio Events', + 'stt.elevenlabs.diarize': 'Speaker Diarization', 'tts.provider': 'Text-To-Speech Provider', 'tts.edge.voice': 'Edge Voice', 'tts.openai.model': 'OpenAI TTS Model', @@ -157,6 +162,7 @@ export const FIELD_DESCRIPTIONS: Record = { 'compression.enabled': 'Summarize older context when conversations get large.', 'voice.auto_tts': 'Automatically speak assistant responses.', 'stt.enabled': 'Enable local or provider-backed speech transcription.', + 'stt.elevenlabs.language_code': 'Optional ISO-639-3 language code. Blank lets ElevenLabs auto-detect.', 'agent.max_turns': 'Upper bound for tool-calling turns before Hermes stops a run.' } @@ -241,6 +247,10 @@ export const SECTIONS: DesktopConfigSection[] = [ 'tts.elevenlabs.model_id', 'stt.local.model', 'stt.local.language', + 'stt.elevenlabs.model_id', + 'stt.elevenlabs.language_code', + 'stt.elevenlabs.tag_audio_events', + 'stt.elevenlabs.diarize', 'voice.record_key', 'voice.max_recording_seconds' ] diff --git a/apps/desktop/src/components/assistant-ui/directive-text.test.ts b/apps/desktop/src/components/assistant-ui/directive-text.test.ts new file mode 100644 index 0000000000..60c89f18b1 --- /dev/null +++ b/apps/desktop/src/components/assistant-ui/directive-text.test.ts @@ -0,0 +1,39 @@ +import { describe, expect, it } from 'vitest' + +import { formatRefValue, hermesDirectiveFormatter } from './directive-text' + +describe('formatRefValue', () => { + it('leaves simple paths untouched', () => { + expect(formatRefValue('src/index.ts')).toBe('src/index.ts') + expect(formatRefValue('https://example.com/post')).toBe('https://example.com/post') + }) + + it('wraps paths with whitespace in backticks', () => { + expect(formatRefValue('apple-touch-icon (1).png')).toBe('`apple-touch-icon (1).png`') + }) + + it('falls back to double quotes when value contains backticks', () => { + expect(formatRefValue('weird `name` (1).md')).toBe('"weird `name` (1).md"') + }) +}) + +describe('hermesDirectiveFormatter.parse', () => { + it('keeps quoted file paths whole when parsing', () => { + const segments = hermesDirectiveFormatter.parse('see @image:`apple-touch-icon (1).png` for the icon') + + expect(segments).toEqual([ + { kind: 'text', text: 'see ' }, + { kind: 'mention', type: 'image', label: 'apple-touch-icon (1).png', id: 'apple-touch-icon (1).png' }, + { kind: 'text', text: ' for the icon' } + ]) + }) + + it('still parses unquoted paths', () => { + const segments = hermesDirectiveFormatter.parse('@file:src/main.tsx the entry point') + + expect(segments).toEqual([ + { kind: 'mention', type: 'file', label: 'main.tsx', id: 'src/main.tsx' }, + { kind: 'text', text: ' the entry point' } + ]) + }) +}) diff --git a/apps/desktop/src/components/assistant-ui/directive-text.tsx b/apps/desktop/src/components/assistant-ui/directive-text.tsx index 383baed7c2..2c5c40d7e5 100644 --- a/apps/desktop/src/components/assistant-ui/directive-text.tsx +++ b/apps/desktop/src/components/assistant-ui/directive-text.tsx @@ -24,10 +24,63 @@ const ICONS: Record> = { * so they render as inline chips in user messages instead of raw text. * * Supported types: file, folder, url, image. Anything else stays plain text. + * + * Mirrors the Python `agent/context_references.REFERENCE_PATTERN` syntax: + * the value may be wrapped in backticks, single quotes, or double quotes so + * paths with spaces/parens/etc. survive parsing intact. */ -const CANONICAL_DIRECTIVE_RE = /:([\w-]{1,64})\[([^\]\n]{1,1024})\](?:\{name=([^}\n]{1,1024})\})?/gu +const CANONICAL_DIRECTIVE_RE = /:([\w-]{1,64})\[([^\]\n]{1,1024})\](?:\{name=([^}\n]{1,1024})\})?/g -const HERMES_DIRECTIVE_RE = /@(file|folder|url|image|tool):(\S+)/gu +const HERMES_DIRECTIVE_RE = new RegExp( + '@(file|folder|url|image|tool):(' + + '`[^`\\n]+`' + + '|"[^"\\n]+"' + + "|'[^'\\n]+'" + + '|\\S+' + + ')', + 'g' +) + +const TRAILING_PUNCTUATION_RE = /[,.;!?]+$/ + +function unwrapRefValue(raw: string): string { + if (raw.length < 2) { + return raw + } + + const head = raw[0] + const tail = raw[raw.length - 1] + + if ((head === '`' && tail === '`') || (head === '"' && tail === '"') || (head === "'" && tail === "'")) { + return raw.slice(1, -1) + } + + return raw.replace(TRAILING_PUNCTUATION_RE, '') +} + +function needsQuoting(value: string): boolean { + return /[\s()\[\]{}<>"'`]/.test(value) +} + +export function formatRefValue(value: string): string { + if (!needsQuoting(value)) { + return value + } + + if (!value.includes('`')) { + return `\`${value}\`` + } + + if (!value.includes('"')) { + return `"${value}"` + } + + if (!value.includes("'")) { + return `'${value}'` + } + + return value +} export const hermesDirectiveFormatter: Unstable_DirectiveFormatter = { serialize(item: Unstable_TriggerItem): string { @@ -35,7 +88,7 @@ export const hermesDirectiveFormatter: Unstable_DirectiveFormatter = { return `@${item.id}` } - return `@${item.type}:${item.id}` + return `@${item.type}:${formatRefValue(item.id)}` }, parse(text: string): readonly Unstable_DirectiveSegment[] { return parseDirectiveText(text) @@ -51,13 +104,17 @@ function parseDirectiveText(text: string): Unstable_DirectiveSegment[] { label: match[2] || match[3] || '', id: match[3] || match[2] || '' })), - ...Array.from(text.matchAll(HERMES_DIRECTIVE_RE)).map(match => ({ - start: match.index ?? 0, - end: (match.index ?? 0) + match[0].length, - type: match[1] || 'file', - label: shortLabel(match[1] as HermesRefType, match[2] || ''), - id: match[2] || '' - })) + ...Array.from(text.matchAll(HERMES_DIRECTIVE_RE)).map(match => { + const id = unwrapRefValue(match[2] || '') + + return { + start: match.index ?? 0, + end: (match.index ?? 0) + match[0].length, + type: match[1] || 'file', + label: shortLabel(match[1] as HermesRefType, id), + id + } + }) ] .filter(match => match.id) .sort((a, b) => a.start - b.start) @@ -136,14 +193,14 @@ const DirectiveChip: FC<{ return ( - {Icon && } + {Icon && } {label} ) diff --git a/apps/desktop/src/components/assistant-ui/intro.tsx b/apps/desktop/src/components/assistant-ui/intro.tsx index 22f1c50803..ab509ad6d6 100644 --- a/apps/desktop/src/components/assistant-ui/intro.tsx +++ b/apps/desktop/src/components/assistant-ui/intro.tsx @@ -19,6 +19,7 @@ export type IntroProps = { const NEUTRAL_PERSONALITIES = new Set(['', 'default', 'none', 'neutral']) const HERMES_FRAME_COUNT = 8 +const ASSET_BASE_URL = import.meta.env.BASE_URL || '/' const FALLBACK_COPY: IntroCopy[] = [ { @@ -154,6 +155,10 @@ function resolveCopy(personality?: string, seed?: number): IntroCopy { return pickCopy(copies, seed) } +function publicAssetPath(path: string): string { + return `${ASSET_BASE_URL}${path}`.replace(/([^:]\/)\/+/g, '$1') +} + export const Intro: FC = ({ personality, seed }) => { const [mountSeed] = useState(() => Math.floor(Math.random() * 100000)) const [frameOffset, setFrameOffset] = useState(0) @@ -184,7 +189,7 @@ export const Intro: FC = ({ personality, seed }) => { aria-hidden="true" className="h-full w-full scale-110 object-contain select-none" draggable={false} - src={`/hermes-frames/hermes-frame-${frameIndex}.png?v=matte-clean-6`} + src={publicAssetPath(`hermes-frames/hermes-frame-${frameIndex}.png?v=matte-clean-6`)} />

Hermes Agent

diff --git a/apps/desktop/src/components/assistant-ui/streaming.test.tsx b/apps/desktop/src/components/assistant-ui/streaming.test.tsx index e45c0d98df..80683a6984 100644 --- a/apps/desktop/src/components/assistant-ui/streaming.test.tsx +++ b/apps/desktop/src/components/assistant-ui/streaming.test.tsx @@ -1,19 +1,53 @@ import { AssistantRuntimeProvider, type ThreadMessage, useExternalStoreRuntime } from '@assistant-ui/react' -import { act, render, screen, waitFor } from '@testing-library/react' +import { act, fireEvent, render, screen, waitFor } from '@testing-library/react' import { useEffect, useState } from 'react' -import { describe, expect, it, vi } from 'vitest' +import { beforeEach, describe, expect, it, vi } from 'vitest' import { Thread } from './thread' const createdAt = new Date('2026-05-01T00:00:00.000Z') +const resizeObservers = new Set() + class TestResizeObserver { - observe() {} + private target: Element | null = null + + constructor(private readonly callback: ResizeObserverCallback) { + resizeObservers.add(this) + } + + observe(target: Element) { + this.target = target + } + unobserve() {} - disconnect() {} + + disconnect() { + resizeObservers.delete(this) + } + + trigger(height: number) { + if (!this.target) { + return + } + + this.callback( + [ + { + contentRect: { height } as DOMRectReadOnly, + target: this.target + } as ResizeObserverEntry + ], + this as unknown as ResizeObserver + ) + } } vi.stubGlobal('ResizeObserver', TestResizeObserver) +vi.stubGlobal('requestAnimationFrame', (callback: FrameRequestCallback) => + window.setTimeout(() => callback(performance.now()), 0) +) +vi.stubGlobal('cancelAnimationFrame', (id: number) => window.clearTimeout(id)) Element.prototype.scrollTo = function scrollTo() {} @@ -90,6 +124,10 @@ function StreamingHarness() { } describe('assistant-ui streaming renderer', () => { + beforeEach(() => { + resizeObservers.clear() + }) + it('renders assistant text incrementally before completion', async () => { const { container } = render() @@ -115,4 +153,42 @@ describe('assistant-ui streaming renderer', () => { expect(container.textContent).toContain('first chunk second chunk') }) }) + + it('does not pull the viewport back down after the user scrolls up during streaming', async () => { + const { container } = render() + + const viewport = container.querySelector('[data-slot="aui_thread-viewport"]') as HTMLDivElement + let scrollHeight = 1_000 + + Object.defineProperty(viewport, 'clientHeight', { configurable: true, value: 200 }) + Object.defineProperty(viewport, 'scrollHeight', { + configurable: true, + get: () => scrollHeight + }) + + await wait(80) + + await act(async () => { + viewport.scrollTop = 800 + fireEvent.scroll(viewport) + }) + await wait(0) + + await act(async () => { + fireEvent.wheel(viewport, { deltaY: -120 }) + viewport.scrollTop = 420 + fireEvent.scroll(viewport) + }) + + scrollHeight = 1_200 + + await act(async () => { + for (const observer of resizeObservers) { + observer.trigger(1_200) + } + }) + await wait(0) + + expect(viewport.scrollTop).toBe(420) + }) }) diff --git a/apps/desktop/src/components/assistant-ui/thread.tsx b/apps/desktop/src/components/assistant-ui/thread.tsx index 6218788fe0..33632ef0f0 100644 --- a/apps/desktop/src/components/assistant-ui/thread.tsx +++ b/apps/desktop/src/components/assistant-ui/thread.tsx @@ -8,18 +8,28 @@ import { type ToolCallMessagePartProps, useAuiState } from '@assistant-ui/react' +import { useStore } from '@nanostores/react' import { CheckIcon, ChevronLeftIcon, ChevronRightIcon, CopyIcon, GitBranchIcon, + Loader2Icon, MoreHorizontalIcon, RefreshCwIcon, Volume2Icon, VolumeXIcon } from 'lucide-react' -import { type FC, type ReactNode, useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react' +import { + type FC, + type ReactNode, + useCallback, + useEffect, + useLayoutEffect, + useRef, + useState +} from 'react' import { useElapsedSeconds } from '@/components/assistant-ui/activity-timer' import { ActivityTimerText } from '@/components/assistant-ui/activity-timer-text' @@ -38,11 +48,12 @@ import { DropdownMenuTrigger } from '@/components/ui/dropdown-menu' import { Loader } from '@/components/ui/loader' -import { speakText } from '@/hermes' import { triggerHaptic } from '@/lib/haptics' import { cn } from '@/lib/utils' +import { playSpeechText, stopVoicePlayback } from '@/lib/voice-playback' import { notifyError } from '@/store/notifications' import { setThreadScrolledUp } from '@/store/thread-scroll' +import { $voicePlayback } from '@/store/voice-playback' const THINKING_FACES = [ '(。•́︿•̀。)', @@ -119,12 +130,16 @@ export const Thread: FC<{ intro?: IntroProps loading?: ThreadLoadingState onBranchInNewChat?: (messageId: string) => void -}> = ({ intro, loading, onBranchInNewChat }) => { + sessionKey?: string | null +}> = ({ intro, loading, onBranchInNewChat, sessionKey }) => { const viewportRef = useRef(null) + const contentRef = useRef(null) const messageCount = useAuiState(s => s.thread.messages.length) const isRunning = useAuiState(s => s.thread.isRunning) const lastMessageId = useAuiState(s => s.thread.messages.at(-1)?.id ?? '') const shouldStickToBottomRef = useRef(true) + const scrollFrameRef = useRef(null) + const sessionKeyRef = useRef(sessionKey ?? null) const handleScroll = useCallback((event: React.UIEvent) => { const nearBottom = isNearBottom(event.currentTarget) @@ -132,8 +147,44 @@ export const Thread: FC<{ setThreadScrolledUp(!nearBottom) }, []) + const handleWheel = useCallback((event: React.WheelEvent) => { + if (event.deltaY < 0) { + shouldStickToBottomRef.current = false + setThreadScrolledUp(true) + } + }, []) + + const scrollToBottom = useCallback(() => { + const viewport = viewportRef.current + + if (!viewport) { + return + } + + viewport.scrollTop = viewport.scrollHeight + shouldStickToBottomRef.current = true + setThreadScrolledUp(false) + }, []) + + const scheduleScrollToBottom = useCallback(() => { + if (scrollFrameRef.current !== null) { + window.cancelAnimationFrame(scrollFrameRef.current) + } + + scrollFrameRef.current = window.requestAnimationFrame(() => { + scrollFrameRef.current = null + scrollToBottom() + }) + }, [scrollToBottom]) + useEffect(() => { - return () => setThreadScrolledUp(false) + return () => { + if (scrollFrameRef.current !== null) { + window.cancelAnimationFrame(scrollFrameRef.current) + } + + setThreadScrolledUp(false) + } }, []) useLayoutEffect(() => { @@ -143,16 +194,48 @@ export const Thread: FC<{ return } - const force = loading === 'session' + const nextSessionKey = sessionKey ?? null + const sessionChanged = sessionKeyRef.current !== nextSessionKey + sessionKeyRef.current = nextSessionKey + const force = loading === 'session' || sessionChanged if (!force && !shouldStickToBottomRef.current) { return } - viewport.scrollTop = viewport.scrollHeight - shouldStickToBottomRef.current = true - setThreadScrolledUp(false) - }, [isRunning, lastMessageId, loading, messageCount]) + scheduleScrollToBottom() + }, [isRunning, lastMessageId, loading, messageCount, scheduleScrollToBottom, sessionKey]) + + useLayoutEffect(() => { + const content = contentRef.current + const viewport = viewportRef.current + + if (!content || !viewport) { + return + } + + let previousHeight = content.getBoundingClientRect().height + + const observer = new ResizeObserver(entries => { + const height = entries[0]?.contentRect.height ?? content.getBoundingClientRect().height + + if (height === previousHeight) { + return + } + + previousHeight = height + + if (!shouldStickToBottomRef.current && !isNearBottom(viewport)) { + return + } + + scheduleScrollToBottom() + }) + + observer.observe(content) + + return () => observer.disconnect() + }, [scheduleScrollToBottom]) return ( @@ -160,15 +243,17 @@ export const Thread: FC<{ Boolean(intro) && s.thread.isEmpty}>{intro && } -
+
{() => } {loading === 'response' && } {loading === 'working' && } @@ -446,7 +531,7 @@ const AssistantActionBar: FC = ({ messageId, messageText, on Branch in new chat - + @@ -479,80 +564,39 @@ const CopyMessageButton: FC<{ text: string }> = ({ text }) => { ) } -let currentAudio: HTMLAudioElement | null = null +const ReadAloudItem: FC<{ messageId: string; text: string }> = ({ messageId, text }) => { + const voicePlayback = useStore($voicePlayback) -function stopCurrentAudio() { - if (!currentAudio) { - return - } + const readAloudStatus = + voicePlayback.source === 'read-aloud' && voicePlayback.messageId === messageId ? voicePlayback.status : 'idle' - currentAudio.pause() - currentAudio.src = '' - currentAudio = null -} - -const ReadAloudItem: FC<{ text: string }> = ({ text }) => { - const [reading, setReading] = useState(false) - const seqRef = useRef(0) - - const stop = useCallback(() => { - seqRef.current += 1 - stopCurrentAudio() - setReading(false) - }, []) + const isPreparing = readAloudStatus === 'preparing' + const isSpeaking = readAloudStatus === 'speaking' + const anyPlaybackActive = voicePlayback.status !== 'idle' + const Icon = isPreparing ? Loader2Icon : isSpeaking ? VolumeXIcon : Volume2Icon const read = useCallback(async () => { - if (!text) { + if (!text || $voicePlayback.get().status !== 'idle') { return } - stopCurrentAudio() - const seq = ++seqRef.current - const isCurrent = () => seq === seqRef.current - - const finish = () => { - if (!isCurrent()) { - return - } - - currentAudio = null - setReading(false) - } - - setReading(true) - try { - const { data_url } = await speakText(text) - - if (!isCurrent()) { - return - } - - const audio = new Audio(data_url) - currentAudio = audio - audio.addEventListener('ended', finish, { once: true }) - audio.addEventListener('error', finish, { once: true }) - await audio.play() + await playSpeechText(text, { messageId, source: 'read-aloud' }) } catch (error) { - if (isCurrent()) { - notifyError(error, 'Read aloud failed') - finish() - } + notifyError(error, 'Read aloud failed') } - }, [text]) - - const Icon = reading ? VolumeXIcon : Volume2Icon + }, [messageId, text]) return ( { e.preventDefault() - void (reading ? stop() : read()) + void (isSpeaking ? stopVoicePlayback() : read()) }} > - - {reading ? 'Stop reading' : 'Read aloud'} + + {isPreparing ? 'Preparing audio...' : isSpeaking ? 'Stop reading' : 'Read aloud'} ) } diff --git a/apps/desktop/src/lib/chat-messages.test.ts b/apps/desktop/src/lib/chat-messages.test.ts new file mode 100644 index 0000000000..f0b742de03 --- /dev/null +++ b/apps/desktop/src/lib/chat-messages.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, it } from 'vitest' + +import { chatMessageText, toChatMessages } from './chat-messages' + +describe('toChatMessages', () => { + it('hides attached context payloads from user message display', () => { + const [message] = toChatMessages([ + { + role: 'user', + content: + 'what is this file\n\n--- Attached Context ---\n\n📄 @file:tsconfig.tsbuildinfo (981 tokens)\n```json\n{"root":["./src/main.tsx"]}\n```', + timestamp: 1 + } + ]) + + expect(chatMessageText(message)).toBe('@file:tsconfig.tsbuildinfo\n\nwhat is this file') + }) +}) diff --git a/apps/desktop/src/lib/chat-messages.ts b/apps/desktop/src/lib/chat-messages.ts index c02f3f02d4..d891df7688 100644 --- a/apps/desktop/src/lib/chat-messages.ts +++ b/apps/desktop/src/lib/chat-messages.ts @@ -29,6 +29,7 @@ export type GatewayEventPayload = { todos?: unknown model?: string provider?: string + running?: boolean cwd?: string branch?: string personality?: string @@ -49,6 +50,28 @@ export function chatMessageText(message: ChatMessage): string { .join('') } +const ATTACHED_CONTEXT_MARKER_RE = /(?:^|\n)--- Attached Context ---\s*\n/ +const CONTEXT_WARNINGS_MARKER_RE = /(?:^|\n)--- Context Warnings ---[\s\S]*$/ +const CONTEXT_REF_RE = /@(file|folder|url|image|tool):(?:"[^"\n]+"|'[^'\n]+'|`[^`\n]+`|\S+)/g + +function displayContentForMessage(role: SessionMessage['role'], content: string): string { + if (role !== 'user') { + return content + } + + const marker = content.match(ATTACHED_CONTEXT_MARKER_RE) + + if (!marker || marker.index === undefined) { + return content.replace(CONTEXT_WARNINGS_MARKER_RE, '').trim() + } + + const visibleText = content.slice(0, marker.index).replace(CONTEXT_WARNINGS_MARKER_RE, '').trim() + const attachedContext = content.slice(marker.index + marker[0].length) + const refs = [...new Set(Array.from(attachedContext.matchAll(CONTEXT_REF_RE)).map(match => match[0]))] + + return [refs.join('\n'), visibleText].filter(Boolean).join('\n\n') || visibleText +} + export function appendTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] { const next = [...parts] const last = next.at(-1) @@ -363,6 +386,7 @@ export function toChatMessages(messages: SessionMessage[]): ChatMessage[] { } const content = message.content || message.text || message.context || message.name || '' + const displayContent = displayContentForMessage(message.role, content) const parts: ChatMessagePart[] = [] const reasoning = @@ -374,8 +398,8 @@ export function toChatMessages(messages: SessionMessage[]): ChatMessage[] { parts.push(reasoningPart(reasoning)) } - if (content) { - parts.push(textPart(content)) + if (displayContent) { + parts.push(textPart(displayContent)) } if (message.role === 'assistant' && Array.isArray(message.tool_calls)) { diff --git a/apps/desktop/src/lib/chat-runtime.test.ts b/apps/desktop/src/lib/chat-runtime.test.ts new file mode 100644 index 0000000000..c06ea6f324 --- /dev/null +++ b/apps/desktop/src/lib/chat-runtime.test.ts @@ -0,0 +1,18 @@ +import { describe, expect, it } from 'vitest' + +import { coerceThinkingText } from './chat-runtime' + +describe('coerceThinkingText', () => { + it('strips streaming status prefixes from thinking deltas', () => { + expect(coerceThinkingText("◉_◉ processing... checking the user's request")).toBe("checking the user's request") + expect(coerceThinkingText('(¬‿¬) analyzing... reading the file')).toBe('reading the file') + }) + + it('drops empty thinking rewrite placeholder text', () => { + expect( + coerceThinkingText( + "◉_◉ processing... I don't see any current rewritten thinking or next thinking to process. Could you provide the thinking content you'd like me to rewrite?" + ) + ).toBe('') + }) +}) diff --git a/apps/desktop/src/lib/chat-runtime.ts b/apps/desktop/src/lib/chat-runtime.ts index 488155de05..011cd6001a 100644 --- a/apps/desktop/src/lib/chat-runtime.ts +++ b/apps/desktop/src/lib/chat-runtime.ts @@ -2,6 +2,7 @@ import type { ThreadMessage } from '@assistant-ui/react' import type { QuickModelOption } from '@/app/chat/composer/types' import type { ClientSessionState, CommandDispatchResponse } from '@/app/types' +import { formatRefValue } from '@/components/assistant-ui/directive-text' import { type ChatMessage, type ChatMessagePart, chatMessageText, textPart } from '@/lib/chat-messages' import type { ComposerAttachment } from '@/store/composer' import type { ModelOptionsResponse, SessionInfo } from '@/types/hermes' @@ -25,7 +26,11 @@ export const BUILTIN_PERSONALITIES = [ 'hype' ] -const SPINNER_STATUS_RE = /^\s*[((][^\s))]{1,8}[))]\s+[^.\n]{2,48}\.\.\.\s*/ +const THINKING_STATUS_PREFIX_RE = + /^\s*(?:(?:[^\s.]{1,16})\s+)?(?:processing|thinking|reasoning|analyzing|pondering|contemplating|musing|cogitating|ruminating|deliberating|mulling|reflecting|computing|synthesizing|formulating|brainstorming)\.\.\.\s*/i + +const EMPTY_THINKING_PLACEHOLDER_RE = + /\b(?:current rewritten thinking|next thinking to process|provide the thinking content|don't see any .*thinking)\b/i export function createClientSessionState( storedSessionId: string | null = null, @@ -102,7 +107,9 @@ export function coerceGatewayText(value: unknown): string { } export function coerceThinkingText(value: unknown): string { - return coerceGatewayText(value).replace(SPINNER_STATUS_RE, '').trim() + const text = coerceGatewayText(value).replace(THINKING_STATUS_PREFIX_RE, '').trim() + + return EMPTY_THINKING_PLACEHOLDER_RE.test(text) ? '' : text } export function isImageGenerationTool(name?: string): boolean { @@ -135,7 +142,7 @@ export function attachmentDisplayText(attachment: ComposerAttachment): string | if (attachment.kind === 'image') { const id = attachment.detail || attachment.path || attachment.label - return id ? `@image:${id}` : null + return id ? `@image:${formatRefValue(id)}` : null } return null diff --git a/apps/desktop/src/lib/speech-text.ts b/apps/desktop/src/lib/speech-text.ts new file mode 100644 index 0000000000..d2b6a5852f --- /dev/null +++ b/apps/desktop/src/lib/speech-text.ts @@ -0,0 +1,19 @@ +const EMOJI_RE = /[\p{Extended_Pictographic}\uFE0F\u200D]+/gu +const FENCED_CODE_RE = /```[\s\S]*?(?:```|$)/g +const INLINE_CODE_RE = /`([^`]+)`/g +const MARKDOWN_LINK_RE = /\[([^\]]+)\]\(([^)]+)\)/g +const URL_RE = /\bhttps?:\/\/\S+/gi + +export function sanitizeTextForSpeech(text: string): string { + return text + .replace(FENCED_CODE_RE, ' ') + .replace(MARKDOWN_LINK_RE, '$1') + .replace(INLINE_CODE_RE, '$1') + .replace(URL_RE, ' link ') + .replace(EMOJI_RE, ' ') + .replace(/^#{1,6}\s+/gm, '') + .replace(/[*_~>#]/g, '') + .replace(/^\s*[-+*]\s+/gm, '') + .replace(/\s+/g, ' ') + .trim() +} diff --git a/apps/desktop/src/lib/voice-playback.ts b/apps/desktop/src/lib/voice-playback.ts new file mode 100644 index 0000000000..5afffe4ae6 --- /dev/null +++ b/apps/desktop/src/lib/voice-playback.ts @@ -0,0 +1,96 @@ +import { speakText } from '@/hermes' +import { + $voicePlayback, + setVoicePlaybackState, + type VoicePlaybackSource, + type VoicePlaybackState +} from '@/store/voice-playback' + +import { sanitizeTextForSpeech } from './speech-text' + +let currentAudio: HTMLAudioElement | null = null +let sequence = 0 + +function currentState(status: VoicePlaybackState['status'], options?: VoicePlaybackOptions): VoicePlaybackState { + return { + messageId: options?.messageId ?? null, + sequence, + source: options?.source ?? null, + status + } +} + +export interface VoicePlaybackOptions { + messageId?: string | null + source: VoicePlaybackSource +} + +export function stopVoicePlayback() { + sequence += 1 + + if (currentAudio) { + currentAudio.pause() + currentAudio.src = '' + currentAudio = null + } + + setVoicePlaybackState({ + messageId: null, + sequence, + source: null, + status: 'idle' + }) +} + +export async function playSpeechText(text: string, options: VoicePlaybackOptions): Promise { + stopVoicePlayback() + + const speakableText = sanitizeTextForSpeech(text) + + if (!speakableText) { + return false + } + + const ownSequence = sequence + const isCurrent = () => ownSequence === sequence + + setVoicePlaybackState(currentState('preparing', options)) + + try { + const response = await speakText(speakableText) + + if (!isCurrent()) { + return false + } + + const audio = new Audio(response.data_url) + currentAudio = audio + setVoicePlaybackState(currentState('speaking', options)) + + await new Promise((resolve, reject) => { + audio.addEventListener('ended', () => resolve(), { once: true }) + audio.addEventListener('error', () => reject(new Error('Playback failed')), { once: true }) + void audio.play().catch(reject) + }) + + if (!isCurrent()) { + return false + } + + currentAudio = null + setVoicePlaybackState(currentState('idle')) + + return true + } catch (error) { + if (isCurrent()) { + currentAudio = null + setVoicePlaybackState(currentState('idle')) + } + + throw error + } +} + +export function isVoicePlaybackActive() { + return $voicePlayback.get().status !== 'idle' +} diff --git a/apps/desktop/src/store/notifications.ts b/apps/desktop/src/store/notifications.ts index b2afaab9ba..91adbf9279 100644 --- a/apps/desktop/src/store/notifications.ts +++ b/apps/desktop/src/store/notifications.ts @@ -50,6 +50,13 @@ const ERROR_SUMMARIES: { test: (msg: string) => boolean; summarize: (msg: string test: msg => /neither voice_tools_openai_key nor openai_api_key is set/i.test(msg), summarize: () => 'OpenAI TTS needs VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY.' }, + { + test: msg => /ELEVENLABS_API_KEY not set/i.test(msg) || /ElevenLabs STT API error \(HTTP 401\)/i.test(msg), + summarize: msg => + /ELEVENLABS_API_KEY not set/i.test(msg) + ? 'ElevenLabs STT needs ELEVENLABS_API_KEY.' + : 'ElevenLabs rejected the API key (401).' + }, { test: msg => /method not allowed/i.test(msg), summarize: () => 'The desktop backend does not support that audio endpoint yet. Restart Hermes Desktop.' diff --git a/apps/desktop/src/store/voice-playback.ts b/apps/desktop/src/store/voice-playback.ts new file mode 100644 index 0000000000..475a8c0daf --- /dev/null +++ b/apps/desktop/src/store/voice-playback.ts @@ -0,0 +1,22 @@ +import { atom } from 'nanostores' + +export type VoicePlaybackSource = 'read-aloud' | 'voice-conversation' +export type VoicePlaybackStatus = 'idle' | 'preparing' | 'speaking' + +export interface VoicePlaybackState { + messageId: string | null + sequence: number + source: VoicePlaybackSource | null + status: VoicePlaybackStatus +} + +export const $voicePlayback = atom({ + messageId: null, + sequence: 0, + source: null, + status: 'idle' +}) + +export function setVoicePlaybackState(next: VoicePlaybackState) { + $voicePlayback.set(next) +} diff --git a/apps/desktop/src/styles.css b/apps/desktop/src/styles.css index 3ddebe6d5f..9d63d7c7b4 100644 --- a/apps/desktop/src/styles.css +++ b/apps/desktop/src/styles.css @@ -184,6 +184,29 @@ button { -webkit-app-region: no-drag; } +@keyframes voice-wave { + 0%, + 100% { + opacity: 0.45; + transform: scaleY(0.28); + } + + 35% { + opacity: 0.95; + transform: scaleY(1); + } + + 62% { + opacity: 0.7; + transform: scaleY(0.52); + } +} + +.voice-wave-bar { + animation: voice-wave 860ms ease-in-out infinite; + transform-origin: center; +} + .composer-liquid-shell-wrap { pointer-events: none; border-radius: var(--composer-glass-radius, 20px); diff --git a/apps/desktop/src/types/hermes.ts b/apps/desktop/src/types/hermes.ts index a7628acdd4..5f125e6d32 100644 --- a/apps/desktop/src/types/hermes.ts +++ b/apps/desktop/src/types/hermes.ts @@ -168,6 +168,7 @@ export interface SessionRuntimeInfo { personality?: string provider?: string reasoning_effort?: string + running?: boolean service_tier?: string skills?: Record | string[] tools?: Record diff --git a/apps/desktop/vite.config.ts b/apps/desktop/vite.config.ts index 2307808397..e678a904b6 100644 --- a/apps/desktop/vite.config.ts +++ b/apps/desktop/vite.config.ts @@ -4,6 +4,7 @@ import tailwindcss from '@tailwindcss/vite' import path from 'path' export default defineConfig({ + base: './', plugins: [react(), tailwindcss()], resolve: { alias: { diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 605ab04de6..c7e7730e8b 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -830,7 +830,7 @@ DEFAULT_CONFIG = { "stt": { "enabled": True, - "provider": "local", # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) | "mistral" (Voxtral Transcribe) + "provider": "local", # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) | "mistral" (Voxtral Transcribe) | "elevenlabs" (Scribe) "local": { "model": "base", # tiny, base, small, medium, large-v3 "language": "", # auto-detect by default; set to "en", "es", "fr", etc. to force @@ -841,6 +841,12 @@ DEFAULT_CONFIG = { "mistral": { "model": "voxtral-mini-latest", # voxtral-mini-latest, voxtral-mini-2602 }, + "elevenlabs": { + "model_id": "scribe_v2", # scribe_v2, scribe_v1 + "language_code": "", # auto-detect by default; set to "eng", "spa", "fra", etc. to force + "tag_audio_events": False, + "diarize": False, + }, }, "voice": { @@ -1791,9 +1797,10 @@ OPTIONAL_ENV_VARS = { "category": "tool", }, "ELEVENLABS_API_KEY": { - "description": "ElevenLabs API key for premium text-to-speech voices", + "description": "ElevenLabs API key for premium text-to-speech voices and Scribe transcription", "prompt": "ElevenLabs API key", "url": "https://elevenlabs.io/", + "tools": ["elevenlabs_tts", "voice_transcription"], "password": True, "category": "tool", }, diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 652731a8bc..1f073ae62c 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -280,7 +280,12 @@ _SCHEMA_OVERRIDES: Dict[str, Dict[str, Any]] = { "stt.provider": { "type": "select", "description": "Speech-to-text provider", - "options": ["local", "openai", "mistral"], + "options": ["local", "groq", "openai", "mistral", "xai", "elevenlabs"], + }, + "stt.elevenlabs.model_id": { + "type": "select", + "description": "ElevenLabs Scribe model", + "options": ["scribe_v2", "scribe_v1"], }, "display.skin": { "type": "select", diff --git a/tests/tools/test_transcription_dotenv_fallback.py b/tests/tools/test_transcription_dotenv_fallback.py index 39f5ca108e..081aa483ce 100644 --- a/tests/tools/test_transcription_dotenv_fallback.py +++ b/tests/tools/test_transcription_dotenv_fallback.py @@ -24,6 +24,8 @@ def isolate_env(monkeypatch): "MISTRAL_API_KEY", "XAI_API_KEY", "XAI_STT_BASE_URL", + "ELEVENLABS_API_KEY", + "ELEVENLABS_STT_BASE_URL", ): monkeypatch.delenv(key, raising=False) @@ -87,6 +89,15 @@ class TestProviderSelectionGate: return_value={"XAI_API_KEY": "dotenv-secret"}): assert tt._get_provider({"enabled": True, "provider": "xai"}) == "xai" + def test_explicit_elevenlabs_sees_dotenv(self): + from tools import transcription_tools as tt + + with patch.object(tt, "_HAS_FASTER_WHISPER", False), \ + patch.object(tt, "_has_local_command", return_value=False), \ + patch("hermes_cli.config.load_env", + return_value={"ELEVENLABS_API_KEY": "dotenv-secret"}): + assert tt._get_provider({"enabled": True, "provider": "elevenlabs"}) == "elevenlabs" + def test_auto_detect_sees_dotenv_groq(self): """No local backend, no explicit provider — auto-detect should fall through to Groq when its key lives in dotenv only. Before the fix @@ -193,6 +204,33 @@ class TestTranscribeCallSitesReadDotenv: assert result["success"] is True assert captured["headers"]["Authorization"] == "Bearer xai-dotenv-key" + def test_transcribe_elevenlabs_forwards_dotenv_key(self): + from tools import transcription_tools as tt + + captured: dict = {} + + def fake_post(url, **kwargs): + captured["url"] = url + captured["headers"] = kwargs.get("headers", {}) + response = MagicMock() + response.status_code = 200 + response.json.return_value = {"text": "hello"} + return response + + def fake_get_env_value(name, default=None): + if name == "ELEVENLABS_API_KEY": + return "elevenlabs-dotenv-key" + return None + + with patch.object(tt, "get_env_value", side_effect=fake_get_env_value), \ + patch.object(tt, "_load_stt_config", return_value={}), \ + patch("requests.post", side_effect=fake_post), \ + patch("builtins.open", MagicMock()): + result = tt._transcribe_elevenlabs("/tmp/fake.mp3", "scribe_v2") + + assert result["success"] is True + assert captured["headers"]["xi-api-key"] == "elevenlabs-dotenv-key" + class TestEndToEndRegressionGuard: """End-to-end probe: patch ``hermes_cli.config.load_env`` to simulate diff --git a/tests/tools/test_transcription_tools.py b/tests/tools/test_transcription_tools.py index 5e4a9ad716..c075cfa9eb 100644 --- a/tests/tools/test_transcription_tools.py +++ b/tests/tools/test_transcription_tools.py @@ -49,6 +49,7 @@ def clean_env(monkeypatch): monkeypatch.delenv("OPENAI_API_KEY", raising=False) monkeypatch.delenv("GROQ_API_KEY", raising=False) monkeypatch.delenv("MISTRAL_API_KEY", raising=False) + monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False) monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False) monkeypatch.delenv("HERMES_LOCAL_STT_LANGUAGE", raising=False) @@ -1342,3 +1343,161 @@ class TestTranscribeAudioXAIDispatch: transcribe_audio(sample_ogg, model="custom-stt") assert mock_xai.call_args[0][1] == "custom-stt" + + +# ============================================================================ +# _transcribe_elevenlabs +# ============================================================================ + +class TestTranscribeElevenLabs: + def test_no_key(self, monkeypatch): + monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False) + from tools.transcription_tools import _transcribe_elevenlabs + result = _transcribe_elevenlabs("/tmp/test.ogg", "scribe_v2") + assert result["success"] is False + assert "ELEVENLABS_API_KEY" in result["error"] + + def test_successful_transcription(self, monkeypatch, sample_ogg): + monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"text": "hello from elevenlabs"} + + config = { + "elevenlabs": { + "language_code": "eng", + "tag_audio_events": True, + "diarize": True, + } + } + with patch("tools.transcription_tools._load_stt_config", return_value=config), \ + patch("requests.post", return_value=mock_response) as mock_post: + from tools.transcription_tools import _transcribe_elevenlabs + result = _transcribe_elevenlabs(sample_ogg, "scribe_v2") + + assert result["success"] is True + assert result["transcript"] == "hello from elevenlabs" + assert result["provider"] == "elevenlabs" + call_kwargs = mock_post.call_args.kwargs + assert call_kwargs["headers"]["xi-api-key"] == "eleven-test-key" + assert call_kwargs["data"]["model_id"] == "scribe_v2" + assert call_kwargs["data"]["language_code"] == "eng" + assert call_kwargs["data"]["tag_audio_events"] == "true" + assert call_kwargs["data"]["diarize"] == "true" + + def test_api_error_returns_failure(self, monkeypatch, sample_ogg): + monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key") + + mock_response = MagicMock() + mock_response.status_code = 401 + mock_response.json.return_value = {"detail": {"message": "Invalid API key"}} + mock_response.text = '{"detail": {"message": "Invalid API key"}}' + + with patch("tools.transcription_tools._load_stt_config", return_value={}), \ + patch("requests.post", return_value=mock_response): + from tools.transcription_tools import _transcribe_elevenlabs + result = _transcribe_elevenlabs(sample_ogg, "scribe_v2") + + assert result["success"] is False + assert "HTTP 401" in result["error"] + assert "Invalid API key" in result["error"] + + def test_empty_transcript_returns_failure(self, monkeypatch, sample_ogg): + monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key") + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"text": " "} + + with patch("tools.transcription_tools._load_stt_config", return_value={}), \ + patch("requests.post", return_value=mock_response): + from tools.transcription_tools import _transcribe_elevenlabs + result = _transcribe_elevenlabs(sample_ogg, "scribe_v2") + + assert result["success"] is False + assert "empty transcript" in result["error"] + + +# ============================================================================ +# _get_provider — ElevenLabs +# ============================================================================ + +class TestGetProviderElevenLabs: + """ElevenLabs-specific provider selection tests.""" + + def test_elevenlabs_when_key_set(self, monkeypatch): + monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test") + from tools.transcription_tools import _get_provider + assert _get_provider({"provider": "elevenlabs"}) == "elevenlabs" + + def test_elevenlabs_explicit_no_key_returns_none(self, monkeypatch): + """Explicit elevenlabs with no key returns none — no cross-provider fallback.""" + monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False) + from tools.transcription_tools import _get_provider + assert _get_provider({"provider": "elevenlabs"}) == "none" + + def test_auto_detect_elevenlabs_after_xai(self, monkeypatch): + """Auto-detect: elevenlabs is tried after xai when all above are unavailable.""" + monkeypatch.delenv("GROQ_API_KEY", raising=False) + monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("MISTRAL_API_KEY", raising=False) + monkeypatch.delenv("XAI_API_KEY", raising=False) + monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test") + with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \ + patch("tools.transcription_tools._has_local_command", return_value=False), \ + patch("tools.transcription_tools._HAS_OPENAI", False), \ + patch("tools.transcription_tools._HAS_MISTRAL", False): + from tools.transcription_tools import _get_provider + assert _get_provider({}) == "elevenlabs" + + def test_auto_detect_xai_preferred_over_elevenlabs(self, monkeypatch): + """Auto-detect: xai is preferred over elevenlabs.""" + monkeypatch.setenv("XAI_API_KEY", "xai-test") + monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test") + with patch("tools.transcription_tools._HAS_FASTER_WHISPER", False), \ + patch("tools.transcription_tools._has_local_command", return_value=False), \ + patch("tools.transcription_tools._HAS_OPENAI", False), \ + patch("tools.transcription_tools._HAS_MISTRAL", False): + from tools.transcription_tools import _get_provider + assert _get_provider({}) == "xai" + + +# ============================================================================ +# transcribe_audio — ElevenLabs dispatch +# ============================================================================ + +class TestTranscribeAudioElevenLabsDispatch: + def test_dispatches_to_elevenlabs(self, sample_ogg): + with patch("tools.transcription_tools._load_stt_config", return_value={"provider": "elevenlabs"}), \ + patch("tools.transcription_tools._get_provider", return_value="elevenlabs"), \ + patch("tools.transcription_tools._transcribe_elevenlabs", + return_value={"success": True, "transcript": "hi", "provider": "elevenlabs"}) as mock_elevenlabs: + from tools.transcription_tools import transcribe_audio + result = transcribe_audio(sample_ogg) + + assert result["success"] is True + assert result["provider"] == "elevenlabs" + mock_elevenlabs.assert_called_once() + + def test_config_elevenlabs_model_used(self, sample_ogg): + config = {"provider": "elevenlabs", "elevenlabs": {"model_id": "scribe_v1"}} + with patch("tools.transcription_tools._load_stt_config", return_value=config), \ + patch("tools.transcription_tools._get_provider", return_value="elevenlabs"), \ + patch("tools.transcription_tools._transcribe_elevenlabs", + return_value={"success": True, "transcript": "hi"}) as mock_elevenlabs: + from tools.transcription_tools import transcribe_audio + transcribe_audio(sample_ogg, model=None) + + assert mock_elevenlabs.call_args[0][1] == "scribe_v1" + + def test_model_override_passed_to_elevenlabs(self, sample_ogg): + with patch("tools.transcription_tools._load_stt_config", return_value={}), \ + patch("tools.transcription_tools._get_provider", return_value="elevenlabs"), \ + patch("tools.transcription_tools._transcribe_elevenlabs", + return_value={"success": True, "transcript": "hi"}) as mock_elevenlabs: + from tools.transcription_tools import transcribe_audio + transcribe_audio(sample_ogg, model="scribe_v2") + + assert mock_elevenlabs.call_args[0][1] == "scribe_v2" diff --git a/tools/transcription_tools.py b/tools/transcription_tools.py index 663345eb74..0323b1c27e 100644 --- a/tools/transcription_tools.py +++ b/tools/transcription_tools.py @@ -11,6 +11,7 @@ Provides speech-to-text transcription with six providers: - **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``. - **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy, Inverse Text Normalization, diarization, 21 languages. + - **elevenlabs** — ElevenLabs Scribe API, requires ``ELEVENLABS_API_KEY``. Used by the messaging gateway to automatically transcribe voice messages sent by users on Telegram, Discord, WhatsApp, Slack, and Signal. @@ -84,6 +85,7 @@ DEFAULT_LOCAL_STT_LANGUAGE = "en" DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1") DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo") DEFAULT_MISTRAL_STT_MODEL = os.getenv("STT_MISTRAL_MODEL", "voxtral-mini-latest") +DEFAULT_ELEVENLABS_STT_MODEL = os.getenv("STT_ELEVENLABS_MODEL", "scribe_v2") LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND" LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE" COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin") @@ -91,6 +93,7 @@ COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin") GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1") OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1") XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1") +ELEVENLABS_STT_BASE_URL = os.getenv("ELEVENLABS_STT_BASE_URL", "https://api.elevenlabs.io/v1") SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"} LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"} @@ -268,9 +271,17 @@ def _get_provider(stt_config: dict) -> str: ) return "none" + if provider == "elevenlabs": + if get_env_value("ELEVENLABS_API_KEY"): + return "elevenlabs" + logger.warning( + "STT provider 'elevenlabs' configured but ELEVENLABS_API_KEY not set" + ) + return "none" + return provider # Unknown — let it fail downstream - # --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai - + # --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai > elevenlabs - if _HAS_FASTER_WHISPER: return "local" @@ -288,6 +299,9 @@ def _get_provider(stt_config: dict) -> str: if get_env_value("XAI_API_KEY"): logger.info("No local STT available, using xAI Grok STT API") return "xai" + if get_env_value("ELEVENLABS_API_KEY"): + logger.info("No local STT available, using ElevenLabs Scribe STT API") + return "elevenlabs" return "none" # --------------------------------------------------------------------------- @@ -781,6 +795,92 @@ def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]: return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"} +# --------------------------------------------------------------------------- +# Provider: ElevenLabs (Scribe STT API) +# --------------------------------------------------------------------------- + + +def _transcribe_elevenlabs(file_path: str, model_name: str) -> Dict[str, Any]: + """Transcribe using ElevenLabs Scribe STT API.""" + api_key = get_env_value("ELEVENLABS_API_KEY") + if not api_key: + return {"success": False, "transcript": "", "error": "ELEVENLABS_API_KEY not set"} + + stt_config = _load_stt_config() + elevenlabs_config = stt_config.get("elevenlabs", {}) + base_url = str( + elevenlabs_config.get("base_url") + or get_env_value("ELEVENLABS_STT_BASE_URL") + or ELEVENLABS_STT_BASE_URL + ).strip().rstrip("/") + language_code = str(elevenlabs_config.get("language_code") or "").strip() + tag_audio_events = is_truthy_value(elevenlabs_config.get("tag_audio_events", False)) + diarize = is_truthy_value(elevenlabs_config.get("diarize", False)) + + try: + import requests + + data: Dict[str, str] = { + "model_id": model_name, + "tag_audio_events": "true" if tag_audio_events else "false", + "diarize": "true" if diarize else "false", + } + if language_code: + data["language_code"] = language_code + + with open(file_path, "rb") as audio_file: + response = requests.post( + f"{base_url}/speech-to-text", + headers={"xi-api-key": api_key}, + files={"file": (Path(file_path).name, audio_file)}, + data=data, + timeout=120, + ) + + if response.status_code != 200: + detail = "" + try: + err_body = response.json() + error_value = err_body.get("detail") or err_body.get("error") + if isinstance(error_value, dict): + detail = str(error_value.get("message") or error_value) + elif error_value: + detail = str(error_value) + else: + detail = response.text[:300] + except Exception: + detail = response.text[:300] + return { + "success": False, + "transcript": "", + "error": f"ElevenLabs STT API error (HTTP {response.status_code}): {detail}", + } + + result = response.json() + transcript_text = _extract_transcript_text(result) + if not transcript_text: + return { + "success": False, + "transcript": "", + "error": "ElevenLabs STT returned empty transcript", + } + + logger.info( + "Transcribed %s via ElevenLabs Scribe (%s, %d chars)", + Path(file_path).name, + model_name, + len(transcript_text), + ) + + return {"success": True, "transcript": transcript_text, "provider": "elevenlabs"} + + except PermissionError: + return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"} + except Exception as e: + logger.error("ElevenLabs STT transcription failed: %s", e, exc_info=True) + return {"success": False, "transcript": "", "error": f"ElevenLabs STT transcription failed: {e}"} + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- @@ -792,7 +892,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A Provider priority: 1. User config (``stt.provider`` in config.yaml) - 2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid) + 2. Auto-detect: local > Groq > OpenAI > Mistral > xAI > ElevenLabs Args: file_path: Absolute path to the audio file to transcribe. @@ -854,6 +954,11 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A model_name = model or "grok-stt" return _transcribe_xai(file_path, model_name) + if provider == "elevenlabs": + elevenlabs_cfg = stt_config.get("elevenlabs", {}) + model_name = model or elevenlabs_cfg.get("model_id", DEFAULT_ELEVENLABS_STT_MODEL) + return _transcribe_elevenlabs(file_path, model_name) + # No provider available return { "success": False, @@ -862,8 +967,9 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A "No STT provider available. Install faster-whisper for free local " f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, " "set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral " - "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY " - "or OPENAI_API_KEY for the OpenAI Whisper API." + "Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, set ELEVENLABS_API_KEY " + "for ElevenLabs Scribe, or set VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY for " + "the OpenAI Whisper API." ), } diff --git a/tui_gateway/server.py b/tui_gateway/server.py index cf14660c19..e7a72b7d61 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -1409,6 +1409,7 @@ def _session_info(agent, session: dict | None = None) -> dict: "cwd": cwd, "branch": _git_branch_for_cwd(cwd), "personality": str(personality or ""), + "running": bool((session or {}).get("running")), "version": "", "release_date": "", "update_behind": None,