mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-14 09:11:54 +00:00
feat: lots of speech stuff
This commit is contained in:
parent
9f3d393a4d
commit
d5d7b5c6dc
41 changed files with 1405 additions and 361 deletions
|
|
@ -384,9 +384,9 @@ IMAGE_TOOLS_DEBUG=false
|
|||
# Default STT provider is "local" (faster-whisper) — runs on your machine, no API key needed.
|
||||
# Install with: pip install faster-whisper
|
||||
# Model downloads automatically on first use (~150 MB for "base").
|
||||
# To use cloud providers instead, set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY above.
|
||||
# Provider priority: local > groq > openai
|
||||
# Configure in config.yaml: stt.provider: local | groq | openai
|
||||
# To use cloud providers instead, set GROQ_API_KEY, VOICE_TOOLS_OPENAI_KEY, or ELEVENLABS_API_KEY above.
|
||||
# Provider priority: local > groq > openai > mistral > xai > elevenlabs
|
||||
# Configure in config.yaml: stt.provider: local | groq | openai | mistral | xai | elevenlabs
|
||||
|
||||
# =============================================================================
|
||||
# STT ADVANCED OVERRIDES (optional)
|
||||
|
|
@ -394,10 +394,12 @@ IMAGE_TOOLS_DEBUG=false
|
|||
# Override default STT models per provider (normally set via stt.model in config.yaml)
|
||||
# STT_GROQ_MODEL=whisper-large-v3-turbo
|
||||
# STT_OPENAI_MODEL=whisper-1
|
||||
# STT_ELEVENLABS_MODEL=scribe_v2
|
||||
|
||||
# Override STT provider endpoints (for proxies or self-hosted instances)
|
||||
# GROQ_BASE_URL=https://api.groq.com/openai/v1
|
||||
# STT_OPENAI_BASE_URL=https://api.openai.com/v1
|
||||
# ELEVENLABS_STT_BASE_URL=https://api.elevenlabs.io/v1
|
||||
|
||||
# =============================================================================
|
||||
# MICROSOFT TEAMS INTEGRATION
|
||||
|
|
|
|||
20
apps/desktop/package-lock.json
generated
20
apps/desktop/package-lock.json
generated
|
|
@ -10,6 +10,7 @@
|
|||
"dependencies": {
|
||||
"@assistant-ui/react": "^0.12.28",
|
||||
"@assistant-ui/react-streamdown": "^0.1.11",
|
||||
"@audiowave/react": "^0.6.2",
|
||||
"@chenglou/pretext": "^0.0.6",
|
||||
"@nanostores/react": "^1.1.0",
|
||||
"@radix-ui/react-slot": "^1.2.4",
|
||||
|
|
@ -305,6 +306,25 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@audiowave/core": {
|
||||
"version": "0.3.1",
|
||||
"resolved": "https://registry.npmjs.org/@audiowave/core/-/core-0.3.1.tgz",
|
||||
"integrity": "sha512-KtC2MTWKp6Orkedty3I8IklVBVQ2IFaFWDJ1cz+UsACpX2x1gINwZGTRZT7bw/dx8KazNSMuVK5lm1jL67KQkQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@audiowave/react": {
|
||||
"version": "0.6.2",
|
||||
"resolved": "https://registry.npmjs.org/@audiowave/react/-/react-0.6.2.tgz",
|
||||
"integrity": "sha512-hajG2Iv3mVxived9wXad8L0ZQF+HmYnB3IrfOkIdkTv4RxOJDXwFWMAd0zb7ZU1Qz0IEYZXCbASFWyuxEQ7PAw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@audiowave/core": "0.3.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": ">=16.8.0",
|
||||
"react-dom": ">=16.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@babel/code-frame": {
|
||||
"version": "7.29.0",
|
||||
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz",
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@
|
|||
"dependencies": {
|
||||
"@assistant-ui/react": "^0.12.28",
|
||||
"@assistant-ui/react-streamdown": "^0.1.11",
|
||||
"@audiowave/react": "^0.6.2",
|
||||
"@chenglou/pretext": "^0.0.6",
|
||||
"@nanostores/react": "^1.1.0",
|
||||
"@radix-ui/react-slot": "^1.2.4",
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
import type { Unstable_TriggerItem } from '@assistant-ui/core'
|
||||
import type { Unstable_IconComponent } from '@assistant-ui/react'
|
||||
import { FileText, FolderOpen, ImageIcon, Link, type LucideIcon } from 'lucide-react'
|
||||
import type { CSSProperties } from 'react'
|
||||
|
|
@ -37,7 +36,7 @@ export const DIRECTIVE_ICONS: Record<string, Unstable_IconComponent> = {
|
|||
}
|
||||
|
||||
export const DIRECTIVE_POPOVER_CLASS =
|
||||
'absolute bottom-24 left-1/2 z-50 w-[min(calc(100vw-1.5rem),28rem)] max-h-[min(28rem,calc(100vh-8rem))] -translate-x-1/2 overflow-y-auto overscroll-contain rounded-2xl border border-border/70 bg-popover p-1.5 text-popover-foreground shadow-2xl'
|
||||
'absolute bottom-24 left-1/2 z-50 w-[min(calc(100vw-1.5rem),26rem)] max-h-[min(24rem,calc(100vh-8rem))] -translate-x-1/2 overflow-y-auto overscroll-contain rounded-2xl border border-border/60 bg-popover/95 p-1.5 text-popover-foreground shadow-2xl backdrop-blur-md ring-1 ring-black/5'
|
||||
|
||||
export const PROMPT_SNIPPETS = [
|
||||
{
|
||||
|
|
@ -64,37 +63,6 @@ export const ASK_PLACEHOLDERS = [
|
|||
'Duck mode: gentle debugging, together.'
|
||||
]
|
||||
|
||||
export const REF_ITEMS: Unstable_TriggerItem[] = [
|
||||
{
|
||||
id: 'file:',
|
||||
type: 'file',
|
||||
label: 'File',
|
||||
description: 'Attach a file path',
|
||||
metadata: { icon: 'file' }
|
||||
},
|
||||
{
|
||||
id: 'folder:',
|
||||
type: 'folder',
|
||||
label: 'Folder',
|
||||
description: 'Attach a folder path',
|
||||
metadata: { icon: 'folder' }
|
||||
},
|
||||
{
|
||||
id: 'url:',
|
||||
type: 'url',
|
||||
label: 'URL',
|
||||
description: 'Attach a web page',
|
||||
metadata: { icon: 'url' }
|
||||
},
|
||||
{
|
||||
id: 'image:',
|
||||
type: 'image',
|
||||
label: 'Image',
|
||||
description: 'Attach an image path',
|
||||
metadata: { icon: 'image' }
|
||||
}
|
||||
]
|
||||
|
||||
// Matches line breaks at either edge of a string: a leading run of CRLF/CR/LF
// (optionally preceded by tabs or spaces) or a trailing run (optionally
// followed by tabs or spaces). Global, so one replace() strips both edges.
export const EDGE_NEWLINES_RE = /^[\t ]*(?:\r\n|\r|\n)+|(?:\r\n|\r|\n)+[\t ]*$/g
|
||||
// Default value, in seconds, for the chat bar's `maxRecordingSeconds` prop.
export const DEFAULT_MAX_RECORDING_SECONDS = 120
|
||||
|
||||
|
|
|
|||
|
|
@ -15,11 +15,10 @@ import {
|
|||
import { cn } from '@/lib/utils'
|
||||
|
||||
import { GHOST_ICON_BTN, PROMPT_SNIPPETS } from './constants'
|
||||
import type { ChatBarState, ContextSuggestion } from './types'
|
||||
import type { ChatBarState } from './types'
|
||||
|
||||
export function ContextMenu({
|
||||
state,
|
||||
onAddContextRef,
|
||||
onInsertText,
|
||||
onOpenUrlDialog,
|
||||
onPasteClipboardImage,
|
||||
|
|
@ -28,7 +27,6 @@ export function ContextMenu({
|
|||
onPickImages
|
||||
}: {
|
||||
state: ChatBarState
|
||||
onAddContextRef?: (refText: string, label?: string, detail?: string) => void
|
||||
onInsertText: (text: string) => void
|
||||
onOpenUrlDialog: () => void
|
||||
onPasteClipboardImage?: () => void
|
||||
|
|
@ -36,11 +34,6 @@ export function ContextMenu({
|
|||
onPickFolders?: () => void
|
||||
onPickImages?: () => void
|
||||
}) {
|
||||
const choose = (item: ContextSuggestion) =>
|
||||
onAddContextRef ? onAddContextRef(item.text, item.display, item.meta) : onInsertText(item.text)
|
||||
|
||||
const suggestions = state.tools.suggestions?.slice(0, 8) ?? []
|
||||
|
||||
return (
|
||||
<DropdownMenu>
|
||||
<DropdownMenuTrigger asChild>
|
||||
|
|
@ -56,48 +49,28 @@ export function ContextMenu({
|
|||
<Plus size={18} />
|
||||
</Button>
|
||||
</DropdownMenuTrigger>
|
||||
<DropdownMenuContent align="start" className="w-64" side="top" sideOffset={10}>
|
||||
<DropdownMenuLabel className="text-xs text-muted-foreground">Add context</DropdownMenuLabel>
|
||||
<DropdownMenuContent align="start" className="w-60" side="top" sideOffset={10}>
|
||||
<DropdownMenuLabel className="text-[0.7rem] font-medium uppercase tracking-wide text-muted-foreground/85">
|
||||
Attach
|
||||
</DropdownMenuLabel>
|
||||
<ContextMenuItem disabled={!onPickFiles} icon={FileText} onSelect={onPickFiles}>
|
||||
Files
|
||||
Files…
|
||||
</ContextMenuItem>
|
||||
<ContextMenuItem disabled={!onPickFolders} icon={FolderOpen} onSelect={onPickFolders}>
|
||||
Folders
|
||||
Folder…
|
||||
</ContextMenuItem>
|
||||
<ContextMenuItem disabled={!onPickImages} icon={ImageIcon} onSelect={onPickImages}>
|
||||
Images
|
||||
Images…
|
||||
</ContextMenuItem>
|
||||
<ContextMenuItem disabled={!onPasteClipboardImage} icon={Clipboard} onSelect={onPasteClipboardImage}>
|
||||
Image from clipboard
|
||||
Paste image
|
||||
</ContextMenuItem>
|
||||
<ContextMenuItem icon={Link} onSelect={onOpenUrlDialog}>
|
||||
URL
|
||||
URL…
|
||||
</ContextMenuItem>
|
||||
|
||||
<DropdownMenuSeparator />
|
||||
|
||||
<DropdownMenuSub>
|
||||
<DropdownMenuSubTrigger>
|
||||
<FileText />
|
||||
<span>Suggested files</span>
|
||||
</DropdownMenuSubTrigger>
|
||||
<DropdownMenuSubContent className="w-72">
|
||||
{suggestions.length === 0 ? (
|
||||
<DropdownMenuItem disabled>
|
||||
<span className="text-muted-foreground">No suggestions</span>
|
||||
</DropdownMenuItem>
|
||||
) : (
|
||||
suggestions.map(item => (
|
||||
<DropdownMenuItem key={item.text} onSelect={() => choose(item)}>
|
||||
<FileText />
|
||||
<span className="min-w-0 flex-1 truncate">{item.display}</span>
|
||||
{item.meta && <span className="max-w-28 truncate text-xs text-muted-foreground">{item.meta}</span>}
|
||||
</DropdownMenuItem>
|
||||
))
|
||||
)}
|
||||
</DropdownMenuSubContent>
|
||||
</DropdownMenuSub>
|
||||
|
||||
<DropdownMenuSub>
|
||||
<DropdownMenuSubTrigger>
|
||||
<MessageSquareText />
|
||||
|
|
@ -111,6 +84,13 @@ export function ContextMenu({
|
|||
))}
|
||||
</DropdownMenuSubContent>
|
||||
</DropdownMenuSub>
|
||||
|
||||
<DropdownMenuSeparator />
|
||||
|
||||
<div className="px-2 py-1 text-[0.7rem] text-muted-foreground/80">
|
||||
Tip: type <kbd className="rounded bg-muted/70 px-1 py-px font-mono text-[0.65rem]">@</kbd> to reference files
|
||||
inline.
|
||||
</div>
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
)
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ interface ConversationProps {
|
|||
status: ConversationStatus
|
||||
onEnd: () => void
|
||||
onStart: () => void
|
||||
onStopTurn: () => void
|
||||
onToggleMute: () => void
|
||||
}
|
||||
|
||||
|
|
@ -80,6 +81,7 @@ function ConversationPill({
|
|||
level,
|
||||
muted,
|
||||
onEnd,
|
||||
onStopTurn,
|
||||
onToggleMute,
|
||||
status
|
||||
}: ConversationProps & { disabled: boolean }) {
|
||||
|
|
@ -104,10 +106,10 @@ function ConversationPill({
|
|||
aria-pressed={muted}
|
||||
className={cn(GHOST_ICON_BTN, 'p-0', muted && 'bg-muted text-muted-foreground')}
|
||||
disabled={disabled}
|
||||
onClick={() => {
|
||||
triggerHaptic('selection')
|
||||
onToggleMute()
|
||||
}}
|
||||
onClick={() => {
|
||||
triggerHaptic('selection')
|
||||
onToggleMute()
|
||||
}}
|
||||
size="icon"
|
||||
title={muted ? 'Unmute microphone' : 'Mute microphone'}
|
||||
type="button"
|
||||
|
|
@ -115,6 +117,23 @@ function ConversationPill({
|
|||
>
|
||||
{muted ? <MicOff size={16} /> : <Mic size={16} />}
|
||||
</Button>
|
||||
{listening && (
|
||||
<Button
|
||||
aria-label="Stop listening and send"
|
||||
className="h-8 shrink-0 gap-1.5 rounded-full px-2.5 text-xs text-muted-foreground hover:bg-accent hover:text-foreground"
|
||||
disabled={disabled}
|
||||
onClick={() => {
|
||||
triggerHaptic('submit')
|
||||
onStopTurn()
|
||||
}}
|
||||
title="Stop listening and send"
|
||||
type="button"
|
||||
variant="ghost"
|
||||
>
|
||||
<Square className="fill-current" size={11} />
|
||||
<span>Stop</span>
|
||||
</Button>
|
||||
)}
|
||||
<Button
|
||||
aria-label="End voice conversation"
|
||||
className="h-8 gap-1.5 rounded-full bg-primary px-3 text-xs font-medium text-primary-foreground hover:bg-primary/90"
|
||||
|
|
|
|||
|
|
@ -5,9 +5,9 @@ import {
|
|||
type Unstable_MentionCategory,
|
||||
type Unstable_MentionDirective
|
||||
} from '@assistant-ui/react'
|
||||
import { ChevronDown } from 'lucide-react'
|
||||
import { FileText } from 'lucide-react'
|
||||
|
||||
import { DIRECTIVE_POPOVER_CLASS, REF_ITEMS } from './constants'
|
||||
import { DIRECTIVE_POPOVER_CLASS } from './constants'
|
||||
import type { ContextSuggestion } from './types'
|
||||
|
||||
export function DirectivePopover({
|
||||
|
|
@ -24,80 +24,73 @@ export function DirectivePopover({
|
|||
return (
|
||||
<ComposerPrimitive.Unstable_TriggerPopover adapter={adapter} char="@" className={DIRECTIVE_POPOVER_CLASS}>
|
||||
<ComposerPrimitive.Unstable_TriggerPopover.Directive {...directive} />
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverCategories>
|
||||
{categories => (
|
||||
<div className="grid gap-1">
|
||||
{categories.map(c => (
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverCategoryItem
|
||||
categoryId={c.id}
|
||||
className="flex w-full items-center justify-between rounded-xl px-3 py-2 text-left text-sm hover:bg-accent data-highlighted:bg-accent"
|
||||
key={c.id}
|
||||
>
|
||||
<span>{c.label}</span>
|
||||
<ChevronDown className="-rotate-90 size-3.5 text-muted-foreground" />
|
||||
</ComposerPrimitive.Unstable_TriggerPopoverCategoryItem>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</ComposerPrimitive.Unstable_TriggerPopoverCategories>
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverItems>
|
||||
{items => (
|
||||
<div className="grid gap-1">
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverBack className="mb-1 text-xs text-muted-foreground hover:text-foreground">
|
||||
Back
|
||||
</ComposerPrimitive.Unstable_TriggerPopoverBack>
|
||||
{items.map((item, index) => {
|
||||
const Icon = directiveIcon(item, iconMap, Fallback)
|
||||
<div className="grid gap-0.5">
|
||||
<div className="px-2 pb-1 pt-0.5 text-[0.7rem] font-medium uppercase tracking-wide text-muted-foreground/80">
|
||||
Reference a file
|
||||
</div>
|
||||
{items.length === 0 ? (
|
||||
<div className="px-3 py-3 text-sm text-muted-foreground">
|
||||
<p>No file suggestions yet.</p>
|
||||
<p className="mt-1 text-xs text-muted-foreground/80">
|
||||
Keep typing to filter, or click <span className="font-medium text-foreground/80">+</span> to attach
|
||||
files, folders, or a URL.
|
||||
</p>
|
||||
</div>
|
||||
) : (
|
||||
items.map((item, index) => {
|
||||
const Icon = directiveIcon(item, iconMap, Fallback)
|
||||
|
||||
return (
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverItem
|
||||
className="flex w-full items-center gap-2 rounded-xl px-3 py-2 text-left text-sm hover:bg-accent data-highlighted:bg-accent"
|
||||
index={index}
|
||||
item={item}
|
||||
key={`${item.type}:${item.id}`}
|
||||
>
|
||||
<Icon className="size-4 shrink-0 text-muted-foreground" />
|
||||
<span className="grid min-w-0 flex-1 gap-0.5">
|
||||
<span className="truncate font-medium">{item.label}</span>
|
||||
{item.description && (
|
||||
<span className="truncate text-xs text-muted-foreground">{item.description}</span>
|
||||
)}
|
||||
</span>
|
||||
</ComposerPrimitive.Unstable_TriggerPopoverItem>
|
||||
)
|
||||
})}
|
||||
return (
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverItem
|
||||
className="flex w-full items-center gap-2 rounded-xl px-2.5 py-1.5 text-left text-sm transition-colors hover:bg-accent/70 data-highlighted:bg-accent"
|
||||
index={index}
|
||||
item={item}
|
||||
key={`${item.type}:${item.id}`}
|
||||
>
|
||||
<Icon className="size-4 shrink-0 text-muted-foreground/80" />
|
||||
<span className="grid min-w-0 flex-1 gap-0.5">
|
||||
<span className="truncate font-medium text-foreground">{item.label}</span>
|
||||
{item.description && (
|
||||
<span className="truncate text-[0.72rem] text-muted-foreground/85">{item.description}</span>
|
||||
)}
|
||||
</span>
|
||||
</ComposerPrimitive.Unstable_TriggerPopoverItem>
|
||||
)
|
||||
})
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</ComposerPrimitive.Unstable_TriggerPopoverItems>
|
||||
</ComposerPrimitive.Unstable_TriggerPopover>
|
||||
)
|
||||
}
|
||||
|
||||
export function buildMentionCategories(suggestions: ContextSuggestion[] | undefined): Unstable_MentionCategory[] {
|
||||
const items = (suggestions ?? [])
|
||||
.map(s => {
|
||||
const match = s.text.match(/^@(file|folder|url|image):(.+)$/)
|
||||
const items: Unstable_TriggerItem[] = []
|
||||
|
||||
if (!match) {
|
||||
return null
|
||||
}
|
||||
for (const s of suggestions ?? []) {
|
||||
const match = s.text.match(/^@(file|folder|url|image):(.+)$/)
|
||||
|
||||
const [, type, id] = match
|
||||
if (!match) {
|
||||
continue
|
||||
}
|
||||
|
||||
return {
|
||||
id,
|
||||
type,
|
||||
label: s.display || id,
|
||||
description: s.meta,
|
||||
metadata: { icon: type }
|
||||
}
|
||||
const [, type, id] = match
|
||||
|
||||
items.push({
|
||||
id,
|
||||
type,
|
||||
label: s.display || id,
|
||||
description: s.meta,
|
||||
metadata: { icon: type }
|
||||
})
|
||||
.filter((item): item is NonNullable<typeof item> => Boolean(item))
|
||||
}
|
||||
|
||||
return [
|
||||
{ id: 'refs', label: 'Hermes refs', items: REF_ITEMS },
|
||||
...(items.length ? [{ id: 'context', label: 'Suggested files', items }] : [])
|
||||
]
|
||||
return [{ id: 'context', label: 'References', items }]
|
||||
}
|
||||
|
||||
function directiveIcon(
|
||||
item: Unstable_TriggerItem,
|
||||
iconMap: Record<string, Unstable_IconComponent>,
|
||||
|
|
@ -106,5 +99,5 @@ function directiveIcon(
|
|||
const meta = item.metadata as Record<string, unknown> | undefined
|
||||
const key = typeof meta?.icon === 'string' ? meta.icon : item.type
|
||||
|
||||
return iconMap[key] ?? iconMap[item.type] ?? fallback
|
||||
return iconMap[key] ?? iconMap[item.type] ?? fallback ?? FileText
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import { useCallback, useEffect, useRef, useState } from 'react'
|
||||
|
||||
import { speakText } from '@/hermes'
|
||||
import { playSpeechText, stopVoicePlayback } from '@/lib/voice-playback'
|
||||
import { notify, notifyError } from '@/store/notifications'
|
||||
|
||||
import {
|
||||
|
|
@ -14,13 +14,19 @@ import { useMicRecorder } from './use-mic-recorder'
|
|||
|
||||
export type ConversationStatus = 'idle' | 'listening' | 'transcribing' | 'thinking' | 'speaking'
|
||||
|
||||
/**
 * Snapshot of the assistant message the voice loop may read aloud.
 */
interface PendingVoiceResponse {
  /** Message identifier; the hook resets its speech buffer when this changes. */
  id: string
  /** True while the message is still flagged as pending. */
  pending: boolean
  /** Message text available so far. */
  text: string
}
|
||||
|
||||
interface VoiceConversationOptions {
|
||||
busy: boolean
|
||||
enabled: boolean
|
||||
onFatalError?: () => void
|
||||
onSubmit: (text: string) => void
|
||||
onSubmit: (text: string) => Promise<void> | void
|
||||
onTranscribeAudio?: (audio: Blob) => Promise<string>
|
||||
pendingResponseText: () => string | null
|
||||
pendingResponse: () => PendingVoiceResponse | null
|
||||
consumePendingResponse: () => void
|
||||
}
|
||||
|
||||
|
|
@ -30,16 +36,19 @@ export function useVoiceConversation({
|
|||
onFatalError,
|
||||
onSubmit,
|
||||
onTranscribeAudio,
|
||||
pendingResponseText,
|
||||
pendingResponse,
|
||||
consumePendingResponse
|
||||
}: VoiceConversationOptions) {
|
||||
const { handle, level } = useMicRecorder()
|
||||
const [status, setStatus] = useState<ConversationStatus>('idle')
|
||||
const [muted, setMuted] = useState(false)
|
||||
const audioRef = useRef<HTMLAudioElement | null>(null)
|
||||
const turnTimeoutRef = useRef<number | null>(null)
|
||||
const pendingStartRef = useRef(false)
|
||||
const lastSpokenRef = useRef<string | null>(null)
|
||||
const turnClosingRef = useRef(false)
|
||||
const awaitingSpokenResponseRef = useRef(false)
|
||||
const responseIdRef = useRef<string | null>(null)
|
||||
const spokenSourceLengthRef = useRef(0)
|
||||
const speechBufferRef = useRef('')
|
||||
const enabledRef = useRef(enabled)
|
||||
const mutedRef = useRef(muted)
|
||||
const busyRef = useRef(busy)
|
||||
|
|
@ -69,36 +78,74 @@ export function useVoiceConversation({
|
|||
}
|
||||
}
|
||||
|
||||
const stopAudio = useCallback(() => {
|
||||
const audio = audioRef.current
|
||||
// Forget the response currently being read aloud: drop any text still waiting
// to be spoken, the count of source characters already queued, and its id.
const resetSpeechBuffer = (): void => {
  speechBufferRef.current = ''
  spokenSourceLengthRef.current = 0
  responseIdRef.current = null
}
|
||||
|
||||
if (audio) {
|
||||
audio.pause()
|
||||
audio.src = ''
|
||||
audioRef.current = null
|
||||
}
|
||||
}, [])
|
||||
|
||||
const handleTurn = useCallback(async () => {
|
||||
clearTurnTimeout()
|
||||
setStatus('transcribing')
|
||||
const result = await handle.stop()
|
||||
|
||||
if (!result || !result.heardSpeech || !onTranscribeAudio) {
|
||||
if (enabledRef.current && !mutedRef.current && !busyRef.current && statusRef.current !== 'speaking') {
|
||||
pendingStartRef.current = true
|
||||
}
|
||||
|
||||
setStatus('idle')
|
||||
// Queue newly streamed assistant text for speech.
//
// The slice is appended verbatim. Callers pass arbitrary slices of a growing
// response, so a slice can begin or end in the middle of a word; inserting a
// separator here would split that word ("organ" + "ization"). Whitespace is
// normalised later, when a chunk is taken.
const appendSpeechText = (text: string) => {
  if (!text) {
    return
  }

  speechBufferRef.current += text
}

// Remove and return the next piece of buffered text to speak, or null when
// nothing is ready.
//
// - A chunk ends at a sentence terminator (. ! ? 。 ! ?) that is followed by
//   whitespace or the end of the buffer. While streaming (force = false) the
//   chunk must be at least 8 characters long; a shorter opener such as "Sure!"
//   is merged with the sentence(s) after it instead of blocking them.
// - While streaming, a buffer longer than 220 characters with no usable
//   sentence end is cut after the last ", ", "; " or ": " found at or before
//   index 180, provided that cut lies beyond index 80.
// - With force = true (response finished) the first sentence is returned
//   whatever its length, and text without a terminator is flushed whole.
const takeSpeechChunk = (force = false): string | null => {
  // Collapse whitespace runs. Only the start is trimmed: a trailing space must
  // survive so the next appended slice stays separated from the last word.
  const buffer = speechBufferRef.current.replace(/\s+/g, ' ').trimStart()

  if (!buffer.trim()) {
    speechBufferRef.current = ''

    return null
  }

  for (const match of buffer.matchAll(/[.!?。!?](?=\s|$)/g)) {
    const terminatorIndex = match.index ?? 0

    // A terminator needs at least one character before it to end a sentence.
    if (terminatorIndex === 0) {
      continue
    }

    const end = terminatorIndex + 1

    if (force || end >= 8) {
      speechBufferRef.current = buffer.slice(end).trimStart()

      return buffer.slice(0, end)
    }
  }

  if (!force && buffer.length > 220) {
    const softBoundary = Math.max(
      buffer.lastIndexOf(', ', 180),
      buffer.lastIndexOf('; ', 180),
      buffer.lastIndexOf(': ', 180)
    )

    if (softBoundary > 80) {
      speechBufferRef.current = buffer.slice(softBoundary + 1).trimStart()

      return buffer.slice(0, softBoundary + 1).trim()
    }
  }

  if (!force) {
    return null
  }

  speechBufferRef.current = ''

  return buffer.trim()
}
|
||||
|
||||
const handleTurn = useCallback(async (forceTranscribe = false) => {
|
||||
if (turnClosingRef.current) {
|
||||
return
|
||||
}
|
||||
|
||||
turnClosingRef.current = true
|
||||
clearTurnTimeout()
|
||||
setStatus('transcribing')
|
||||
|
||||
try {
|
||||
const result = await handle.stop()
|
||||
|
||||
if (!result || (!result.heardSpeech && !forceTranscribe) || !onTranscribeAudio) {
|
||||
if (enabledRef.current && !mutedRef.current && !busyRef.current && statusRef.current !== 'speaking') {
|
||||
pendingStartRef.current = true
|
||||
}
|
||||
|
||||
|
|
@ -107,16 +154,34 @@ export function useVoiceConversation({
|
|||
return
|
||||
}
|
||||
|
||||
onSubmit(transcript)
|
||||
setStatus('thinking')
|
||||
} catch (error) {
|
||||
notifyError(error, 'Voice transcription failed')
|
||||
try {
|
||||
const transcript = (await onTranscribeAudio(result.audio)).trim()
|
||||
|
||||
if (enabledRef.current && !mutedRef.current && !busyRef.current) {
|
||||
pendingStartRef.current = true
|
||||
if (!transcript) {
|
||||
if (enabledRef.current) {
|
||||
pendingStartRef.current = true
|
||||
}
|
||||
|
||||
setStatus('idle')
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
awaitingSpokenResponseRef.current = true
|
||||
resetSpeechBuffer()
|
||||
await onSubmit(transcript)
|
||||
setStatus('thinking')
|
||||
} catch (error) {
|
||||
notifyError(error, 'Voice transcription failed')
|
||||
|
||||
if (enabledRef.current && !mutedRef.current && !busyRef.current) {
|
||||
pendingStartRef.current = true
|
||||
}
|
||||
|
||||
setStatus('idle')
|
||||
}
|
||||
|
||||
setStatus('idle')
|
||||
} finally {
|
||||
turnClosingRef.current = false
|
||||
}
|
||||
}, [handle, onSubmit, onTranscribeAudio])
|
||||
|
||||
|
|
@ -158,24 +223,13 @@ export function useVoiceConversation({
|
|||
|
||||
const speak = useCallback(
|
||||
async (text: string) => {
|
||||
stopAudio()
|
||||
setStatus('speaking')
|
||||
|
||||
try {
|
||||
const response = await speakText(text)
|
||||
const audio = new Audio(response.data_url)
|
||||
audioRef.current = audio
|
||||
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
audio.addEventListener('ended', () => resolve(), { once: true })
|
||||
audio.addEventListener('error', () => reject(new Error('Playback failed')), { once: true })
|
||||
void audio.play().catch(reject)
|
||||
})
|
||||
await playSpeechText(text, { source: 'voice-conversation' })
|
||||
} catch (error) {
|
||||
notifyError(error, 'Voice playback failed')
|
||||
} finally {
|
||||
audioRef.current = null
|
||||
|
||||
if (enabledRef.current) {
|
||||
pendingStartRef.current = true
|
||||
setStatus('idle')
|
||||
|
|
@ -184,7 +238,7 @@ export function useVoiceConversation({
|
|||
}
|
||||
}
|
||||
},
|
||||
[stopAudio]
|
||||
[]
|
||||
)
|
||||
|
||||
const start = useCallback(async () => {
|
||||
|
|
@ -200,20 +254,31 @@ export function useVoiceConversation({
|
|||
}
|
||||
|
||||
setMuted(false)
|
||||
lastSpokenRef.current = null
|
||||
awaitingSpokenResponseRef.current = false
|
||||
resetSpeechBuffer()
|
||||
consumePendingResponse()
|
||||
pendingStartRef.current = true
|
||||
}, [onFatalError, onTranscribeAudio])
|
||||
await startListening()
|
||||
}, [consumePendingResponse, onFatalError, onTranscribeAudio, startListening])
|
||||
|
||||
const end = useCallback(async () => {
|
||||
pendingStartRef.current = false
|
||||
clearTurnTimeout()
|
||||
stopAudio()
|
||||
stopVoicePlayback()
|
||||
handle.cancel()
|
||||
lastSpokenRef.current = null
|
||||
turnClosingRef.current = false
|
||||
awaitingSpokenResponseRef.current = false
|
||||
resetSpeechBuffer()
|
||||
consumePendingResponse()
|
||||
setMuted(false)
|
||||
setStatus('idle')
|
||||
}, [consumePendingResponse, handle, stopAudio])
|
||||
}, [consumePendingResponse, handle])
|
||||
|
||||
// Manually close the current turn. Only acts while the status is 'listening';
// the turn is then handled with forced transcription (handleTurn(true)).
const stopTurn = useCallback(() => {
  if (statusRef.current !== 'listening') {
    return
  }

  void handleTurn(true)
}, [handleTurn])
|
||||
|
||||
const toggleMute = useCallback(() => {
|
||||
setMuted(value => {
|
||||
|
|
@ -231,22 +296,77 @@ export function useVoiceConversation({
|
|||
})
|
||||
}, [handle])
|
||||
|
||||
// Drive the loop: speak any new assistant response, otherwise start listening
|
||||
// when the agent is idle and we're between turns.
|
||||
// While voice conversation is enabled, a Space key press (not auto-repeated,
// without Cmd/Ctrl/Alt) during the 'listening' status ends the turn via
// stopTurn. The listener is registered on window in the capture phase.
useEffect(() => {
  if (!enabled) {
    return
  }

  const handleSpace = (event: KeyboardEvent) => {
    const plainSpace =
      event.code === 'Space' && !event.repeat && !event.metaKey && !event.ctrlKey && !event.altKey

    if (!plainSpace || statusRef.current !== 'listening') {
      return
    }

    event.preventDefault()
    stopTurn()
  }

  window.addEventListener('keydown', handleSpace, { capture: true })

  return () => {
    window.removeEventListener('keydown', handleSpace, { capture: true })
  }
}, [enabled, stopTurn])
|
||||
|
||||
// Drive the loop: after a voice-submitted turn, speak stable chunks as the
|
||||
// assistant stream grows. Otherwise start listening when idle between turns.
|
||||
useEffect(() => {
|
||||
if (!enabled || muted) {
|
||||
return
|
||||
}
|
||||
|
||||
const text = pendingResponseText()
|
||||
const trimmed = text?.trim() ?? ''
|
||||
if (awaitingSpokenResponseRef.current && status !== 'speaking') {
|
||||
const response = pendingResponse()
|
||||
|
||||
if (trimmed && trimmed !== lastSpokenRef.current && status !== 'speaking') {
|
||||
lastSpokenRef.current = trimmed
|
||||
consumePendingResponse()
|
||||
void speak(trimmed)
|
||||
if (response) {
|
||||
if (response.id !== responseIdRef.current) {
|
||||
resetSpeechBuffer()
|
||||
responseIdRef.current = response.id
|
||||
}
|
||||
|
||||
return
|
||||
if (response.text.length > spokenSourceLengthRef.current) {
|
||||
appendSpeechText(response.text.slice(spokenSourceLengthRef.current))
|
||||
spokenSourceLengthRef.current = response.text.length
|
||||
}
|
||||
|
||||
const chunk = takeSpeechChunk(!response.pending && !busy)
|
||||
|
||||
if (chunk) {
|
||||
void speak(chunk)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
if (!response.pending && !busy) {
|
||||
awaitingSpokenResponseRef.current = false
|
||||
consumePendingResponse()
|
||||
resetSpeechBuffer()
|
||||
pendingStartRef.current = true
|
||||
setStatus('idle')
|
||||
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if (!busy && status === 'thinking') {
|
||||
awaitingSpokenResponseRef.current = false
|
||||
resetSpeechBuffer()
|
||||
pendingStartRef.current = true
|
||||
setStatus('idle')
|
||||
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if (busy || status !== 'idle') {
|
||||
|
|
@ -256,7 +376,7 @@ export function useVoiceConversation({
|
|||
if (pendingStartRef.current) {
|
||||
void startListening()
|
||||
}
|
||||
}, [busy, consumePendingResponse, enabled, muted, pendingResponseText, speak, startListening, status])
|
||||
}, [busy, consumePendingResponse, enabled, muted, pendingResponse, speak, startListening, status])
|
||||
|
||||
useEffect(() => {
|
||||
if (enabled && !wasEnabledRef.current) {
|
||||
|
|
@ -270,5 +390,5 @@ export function useVoiceConversation({
|
|||
wasEnabledRef.current = enabled
|
||||
}, [enabled, end, start])
|
||||
|
||||
return { end, level, muted, start, status, toggleMute }
|
||||
return { end, level, muted, start, status, stopTurn, toggleMute }
|
||||
}
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ import { useVoiceConversation } from './hooks/use-voice-conversation'
|
|||
import { useVoiceRecorder } from './hooks/use-voice-recorder'
|
||||
import type { ChatBarProps } from './types'
|
||||
import { UrlDialog } from './url-dialog'
|
||||
import { VoiceActivity } from './voice-activity'
|
||||
import { VoiceActivity, VoicePlaybackActivity } from './voice-activity'
|
||||
|
||||
function trimPastedEdgeNewlines(text: string): string {
|
||||
return text.replace(EDGE_NEWLINES_RE, '')
|
||||
|
|
@ -45,7 +45,6 @@ export function ChatBar({
|
|||
maxRecordingSeconds = DEFAULT_MAX_RECORDING_SECONDS,
|
||||
state,
|
||||
onCancel,
|
||||
onAddContextRef,
|
||||
onAddUrl,
|
||||
onPasteClipboardImage,
|
||||
onPickFiles,
|
||||
|
|
@ -203,7 +202,7 @@ export function ChatBar({
|
|||
onCancel()
|
||||
} else if (draft.trim() || attachments.length > 0) {
|
||||
triggerHaptic('submit')
|
||||
onSubmit(draft)
|
||||
void onSubmit(draft)
|
||||
aui.composer().setText('')
|
||||
}
|
||||
|
||||
|
|
@ -235,9 +234,9 @@ export function ChatBar({
|
|||
onTranscribeAudio
|
||||
})
|
||||
|
||||
const pendingResponseText = () => {
|
||||
const pendingResponse = () => {
|
||||
const messages = $messages.get()
|
||||
const last = messages.findLast(m => m.role === 'assistant' && !m.pending && !m.hidden)
|
||||
const last = messages.findLast(m => m.role === 'assistant' && !m.hidden)
|
||||
|
||||
if (!last || last.id === lastSpokenIdRef.current) {
|
||||
return null
|
||||
|
|
@ -249,9 +248,11 @@ export function ChatBar({
|
|||
return null
|
||||
}
|
||||
|
||||
lastSpokenIdRef.current = last.id
|
||||
|
||||
return text
|
||||
return {
|
||||
id: last.id,
|
||||
pending: Boolean(last.pending),
|
||||
text
|
||||
}
|
||||
}
|
||||
|
||||
const consumePendingResponse = () => {
|
||||
|
|
@ -263,13 +264,13 @@ export function ChatBar({
|
|||
}
|
||||
}
|
||||
|
||||
const submitVoiceTurn = (text: string) => {
|
||||
const submitVoiceTurn = async (text: string) => {
|
||||
if (busy) {
|
||||
return
|
||||
}
|
||||
|
||||
triggerHaptic('submit')
|
||||
onSubmit(text)
|
||||
await onSubmit(text)
|
||||
aui.composer().setText('')
|
||||
draftRef.current = ''
|
||||
}
|
||||
|
|
@ -281,12 +282,11 @@ export function ChatBar({
|
|||
onFatalError: () => setVoiceConversationActive(false),
|
||||
onSubmit: submitVoiceTurn,
|
||||
onTranscribeAudio,
|
||||
pendingResponseText
|
||||
pendingResponse
|
||||
})
|
||||
|
||||
const contextMenu = (
|
||||
<ContextMenu
|
||||
onAddContextRef={onAddContextRef}
|
||||
onInsertText={insertText}
|
||||
onOpenUrlDialog={() => {
|
||||
triggerHaptic('open')
|
||||
|
|
@ -313,6 +313,7 @@ export function ChatBar({
|
|||
void conversation.end()
|
||||
},
|
||||
onStart: () => setVoiceConversationActive(true),
|
||||
onStopTurn: conversation.stopTurn,
|
||||
onToggleMute: conversation.toggleMute,
|
||||
status: conversation.status
|
||||
}}
|
||||
|
|
@ -343,14 +344,12 @@ export function ChatBar({
|
|||
return (
|
||||
<>
|
||||
<ComposerPrimitive.Unstable_TriggerPopoverRoot>
|
||||
{mentionCategories.length > 0 && (
|
||||
<DirectivePopover
|
||||
adapter={mention.adapter}
|
||||
directive={mention.directive}
|
||||
fallbackIcon={mention.fallbackIcon ?? FileText}
|
||||
iconMap={mention.iconMap ?? DIRECTIVE_ICONS}
|
||||
/>
|
||||
)}
|
||||
<DirectivePopover
|
||||
adapter={mention.adapter}
|
||||
directive={mention.directive}
|
||||
fallbackIcon={mention.fallbackIcon ?? FileText}
|
||||
iconMap={mention.iconMap ?? DIRECTIVE_ICONS}
|
||||
/>
|
||||
<ComposerPrimitive.Root
|
||||
className={cn(SHELL, 'group/composer pb-8 pt-2')}
|
||||
onSubmit={e => {
|
||||
|
|
@ -407,6 +406,7 @@ export function ChatBar({
|
|||
style={{ ...COMPOSER_BACKDROP_STYLE, borderRadius: `${glassTweaks.liquid.cornerRadius}px` }}
|
||||
>
|
||||
<VoiceActivity state={voiceActivityState} />
|
||||
<VoicePlaybackActivity />
|
||||
{attachments.length > 0 && <AttachmentList attachments={attachments} onRemove={onRemoveAttachment} />}
|
||||
{stacked ? (
|
||||
<>
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ export interface ChatBarProps {
|
|||
onPickFolders?: () => void
|
||||
onPickImages?: () => void
|
||||
onRemoveAttachment?: (id: string) => void
|
||||
onSubmit: (value: string) => void
|
||||
onSubmit: (value: string) => Promise<void> | void
|
||||
onTranscribeAudio?: (audio: Blob) => Promise<string>
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,12 @@
|
|||
import { Globe } from 'lucide-react'
|
||||
import type * as React from 'react'
|
||||
|
||||
import { Button } from '@/components/ui/button'
|
||||
import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle } from '@/components/ui/dialog'
|
||||
import { Input } from '@/components/ui/input'
|
||||
|
||||
const URL_HINT = /^https?:\/\//i
|
||||
|
||||
export function UrlDialog({
|
||||
inputRef,
|
||||
onChange,
|
||||
|
|
@ -19,14 +22,23 @@ export function UrlDialog({
|
|||
open: boolean
|
||||
value: string
|
||||
}) {
|
||||
const trimmed = value.trim()
|
||||
const looksLikeUrl = trimmed.length > 0 && URL_HINT.test(trimmed)
|
||||
|
||||
return (
|
||||
<Dialog onOpenChange={onOpenChange} open={open}>
|
||||
<DialogContent className="max-w-md">
|
||||
<DialogHeader>
|
||||
<DialogTitle>Add URL Context</DialogTitle>
|
||||
<DialogDescription>
|
||||
Hermes will fetch this URL via the existing @url context resolver when you send the prompt.
|
||||
</DialogDescription>
|
||||
<DialogContent className="max-w-md gap-5">
|
||||
<DialogHeader className="flex-row items-center gap-3 sm:items-center">
|
||||
<span
|
||||
aria-hidden
|
||||
className="grid size-9 shrink-0 place-items-center rounded-xl bg-[color-mix(in_srgb,var(--dt-primary)_14%,transparent)] text-primary ring-1 ring-inset ring-primary/15"
|
||||
>
|
||||
<Globe className="size-4" />
|
||||
</span>
|
||||
<div className="grid gap-0.5 text-left">
|
||||
<DialogTitle>Attach a URL</DialogTitle>
|
||||
<DialogDescription>Hermes will fetch the page and include it as context for this turn.</DialogDescription>
|
||||
</div>
|
||||
</DialogHeader>
|
||||
<form
|
||||
className="grid gap-4"
|
||||
|
|
@ -35,18 +47,29 @@ export function UrlDialog({
|
|||
onSubmit()
|
||||
}}
|
||||
>
|
||||
<Input
|
||||
onChange={e => onChange(e.target.value)}
|
||||
placeholder="https://example.com"
|
||||
ref={inputRef}
|
||||
value={value}
|
||||
/>
|
||||
<div className="grid gap-1.5">
|
||||
<Input
|
||||
autoComplete="off"
|
||||
autoCorrect="off"
|
||||
inputMode="url"
|
||||
onChange={e => onChange(e.target.value)}
|
||||
placeholder="https://example.com/post"
|
||||
ref={inputRef}
|
||||
spellCheck={false}
|
||||
value={value}
|
||||
/>
|
||||
{trimmed.length > 0 && !looksLikeUrl && (
|
||||
<p className="text-xs text-muted-foreground/85">
|
||||
Include the full URL, e.g. <span className="font-mono">https://…</span>
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
<DialogFooter>
|
||||
<Button onClick={() => onOpenChange(false)} type="button" variant="ghost">
|
||||
Cancel
|
||||
</Button>
|
||||
<Button disabled={!value.trim()} type="submit">
|
||||
Add URL
|
||||
<Button disabled={!looksLikeUrl} type="submit">
|
||||
Attach
|
||||
</Button>
|
||||
</DialogFooter>
|
||||
</form>
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
import { Loader2, Mic } from 'lucide-react'
|
||||
import { useStore } from '@nanostores/react'
|
||||
import { Loader2, Mic, Volume2, VolumeX } from 'lucide-react'
|
||||
|
||||
import { Button } from '@/components/ui/button'
|
||||
import { cn } from '@/lib/utils'
|
||||
import { stopVoicePlayback } from '@/lib/voice-playback'
|
||||
import { $voicePlayback } from '@/store/voice-playback'
|
||||
|
||||
import type { VoiceActivityState } from './types'
|
||||
|
||||
|
|
@ -36,6 +40,25 @@ function VoiceLevelBars({ level, active }: { active: boolean; level: number }) {
|
|||
)
|
||||
}
|
||||
|
||||
/**
 * Decorative strip of animated bars shown while audio is playing.
 *
 * Renders one `<span>` per entry in `bars`. Each entry is used as that bar's
 * `animationDuration` (in ms), and each bar's `animationDelay` is
 * `index * -110` ms so the bars start out of phase with each other.
 * The wrapper is `aria-hidden`, so the strip is purely visual.
 *
 * NOTE(review): the animation itself presumably comes from the
 * `voice-wave-bar` CSS class, which is defined outside this view — confirm.
 */
function PlaybackBars() {
|
||||
// Per-bar animation durations in milliseconds; the array length sets the bar count.
const bars = [820, 940, 760, 880, 700, 980, 790]
|
||||
|
||||
return (
|
||||
<div aria-hidden="true" className="flex h-4 items-center gap-0.75">
|
||||
{bars.map((duration, index) => (
|
||||
<span
|
||||
className="voice-wave-bar h-full w-0.5 rounded-full bg-current"
|
||||
key={index}
|
||||
style={{
|
||||
animationDelay: `${index * -110}ms`,
|
||||
animationDuration: `${duration}ms`
|
||||
}}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
export function VoiceActivity({
|
||||
state
|
||||
}: {
|
||||
|
|
@ -75,3 +98,50 @@ export function VoiceActivity({
|
|||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Status pill for text-to-speech playback, driven by the `$voicePlayback` store.
 *
 * - Renders nothing while the store status is `'idle'`.
 * - While status is `'preparing'` it shows a spinner and "Preparing audio".
 * - For any other status it shows a speaker icon, the animated
 *   `PlaybackBars`, and a title chosen from `playback.source`
 *   ("Speaking response" for `'voice-conversation'`, otherwise "Reading aloud").
 *
 * The "Stop" button calls `stopVoicePlayback`. The container is exposed as a
 * polite live region (`role="status"`, `aria-live="polite"`).
 */
export function VoicePlaybackActivity() {
|
||||
const playback = useStore($voicePlayback)
|
||||
|
||||
// Nothing to show when no playback is in progress.
if (playback.status === 'idle') {
|
||||
return null
|
||||
}
|
||||
|
||||
const preparing = playback.status === 'preparing'
|
||||
|
||||
// Title precedence: preparing state first, then the playback source.
const title = preparing
|
||||
? 'Preparing audio'
|
||||
: playback.source === 'voice-conversation'
|
||||
? 'Speaking response'
|
||||
: 'Reading aloud'
|
||||
|
||||
return (
|
||||
<div
|
||||
aria-live="polite"
|
||||
className={cn(
|
||||
'flex h-8 items-center gap-2 rounded-xl border border-primary/20 bg-primary/10 px-2.5 text-xs text-primary',
|
||||
'shadow-[inset_0_1px_0_rgba(255,255,255,0.35)] backdrop-blur-sm'
|
||||
)}
|
||||
role="status"
|
||||
>
|
||||
<div className="flex size-5 shrink-0 items-center justify-center rounded-full bg-primary/15 text-primary">
|
||||
{preparing ? <Loader2 className="animate-spin" size={12} /> : <Volume2 size={12} />}
|
||||
</div>
|
||||
|
||||
<div className="flex min-w-0 flex-1 items-center gap-2">
|
||||
<span className="truncate font-medium text-foreground/85">{title}</span>
|
||||
{!preparing && <PlaybackBars />}
|
||||
</div>
|
||||
|
||||
<Button
|
||||
className="h-6 shrink-0 gap-1 rounded-full px-2 text-[0.6875rem]"
|
||||
onClick={stopVoicePlayback}
|
||||
size="sm"
|
||||
type="button"
|
||||
variant="ghost"
|
||||
>
|
||||
<VolumeX size={12} />
|
||||
Stop
|
||||
</Button>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { useCallback } from 'react'
|
||||
|
||||
import { formatRefValue } from '@/components/assistant-ui/directive-text'
|
||||
import { attachmentId, contextPath, pathLabel } from '@/lib/chat-runtime'
|
||||
import {
|
||||
addComposerAttachment,
|
||||
|
|
@ -57,7 +58,7 @@ export function useComposerActions({ activeSessionId, currentCwd, requestGateway
|
|||
kind,
|
||||
label: pathLabel(path),
|
||||
detail: rel,
|
||||
refText: `@${kind}:${rel}`,
|
||||
refText: `@${kind}:${formatRefValue(rel)}`,
|
||||
path
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -8,13 +8,14 @@ import { useStore } from '@nanostores/react'
|
|||
import { useQuery } from '@tanstack/react-query'
|
||||
import { ChevronDown } from 'lucide-react'
|
||||
import type * as React from 'react'
|
||||
import { Suspense, useMemo } from 'react'
|
||||
import { Suspense, useMemo, useRef } from 'react'
|
||||
import { useLocation } from 'react-router-dom'
|
||||
|
||||
import { Thread } from '@/components/assistant-ui/thread'
|
||||
import { NotificationStack } from '@/components/notifications'
|
||||
import { Button } from '@/components/ui/button'
|
||||
import { getGlobalModelOptions, type HermesGateway } from '@/hermes'
|
||||
import type { ChatMessage } from '@/lib/chat-messages'
|
||||
import { quickModelOptions, sessionTitle, toRuntimeMessage } from '@/lib/chat-runtime'
|
||||
import { cn } from '@/lib/utils'
|
||||
import { $pinnedSessionIds } from '@/store/layout'
|
||||
|
|
@ -57,7 +58,7 @@ interface ChatViewProps extends Omit<React.ComponentProps<'div'>, 'onSubmit'> {
|
|||
onPickFolders: () => void
|
||||
onPickImages: () => void
|
||||
onRemoveAttachment: (id: string) => void
|
||||
onSubmit: (text: string) => void
|
||||
onSubmit: (text: string) => Promise<void> | void
|
||||
onChangeCwd: (cwd: string) => void
|
||||
onBrowseCwd: () => void
|
||||
onOpenModelPicker: () => void
|
||||
|
|
@ -118,6 +119,7 @@ export function ChatView({
|
|||
const pinnedSessionIds = useStore($pinnedSessionIds)
|
||||
const selectedSessionId = useStore($selectedStoredSessionId)
|
||||
const sessions = useStore($sessions)
|
||||
const runtimeMessageCacheRef = useRef(new WeakMap<ChatMessage, ThreadMessage>())
|
||||
const activeStoredSession = sessions.find(session => session.id === selectedSessionId) || null
|
||||
const isRoutedSessionView = Boolean(routeSessionId(location.pathname))
|
||||
const selectedIsPinned = selectedSessionId ? pinnedSessionIds.includes(selectedSessionId) : false
|
||||
|
|
@ -128,6 +130,7 @@ export function ChatView({
|
|||
const loadingSession = isRoutedSessionView && messages.length === 0
|
||||
const threadLoading = threadLoadingState(loadingSession, busy, awaitingResponse)
|
||||
const showChatBar = !loadingSession
|
||||
const threadKey = selectedSessionId || activeSessionId || (isRoutedSessionView ? location.pathname : 'new')
|
||||
const title = activeStoredSession ? sessionTitle(activeStoredSession) : ''
|
||||
|
||||
const modelOptionsQuery = useQuery<ModelOptionsResponse>({
|
||||
|
|
@ -190,7 +193,14 @@ export function ChatView({
|
|||
parentId = branchParentByGroup.get(message.branchGroupId) ?? null
|
||||
}
|
||||
|
||||
items.push({ message: toRuntimeMessage(message), parentId })
|
||||
const cachedMessage = runtimeMessageCacheRef.current.get(message)
|
||||
const runtimeMessage = cachedMessage ?? toRuntimeMessage(message)
|
||||
|
||||
if (!cachedMessage) {
|
||||
runtimeMessageCacheRef.current.set(message, runtimeMessage)
|
||||
}
|
||||
|
||||
items.push({ message: runtimeMessage, parentId })
|
||||
|
||||
if (!message.hidden) {
|
||||
visibleParentId = message.id
|
||||
|
|
@ -248,6 +258,7 @@ export function ChatView({
|
|||
intro={showIntro ? { personality: introPersonality, seed: introSeed } : undefined}
|
||||
loading={threadLoading}
|
||||
onBranchInNewChat={onBranchInNewChat}
|
||||
sessionKey={threadKey}
|
||||
/>
|
||||
{showChatBar && (
|
||||
<Suspense fallback={<ChatBarFallback />}>
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import {
|
|||
listSessions,
|
||||
setGlobalModel
|
||||
} from '../hermes'
|
||||
import { formatRefValue } from '../components/assistant-ui/directive-text'
|
||||
import { toChatMessages } from '../lib/chat-messages'
|
||||
import { BUILTIN_PERSONALITIES, normalizePersonalityValue, personalityNamesFromConfig } from '../lib/chat-runtime'
|
||||
import { $pinnedSessionIds, pinSession, unpinSession } from '../store/layout'
|
||||
|
|
@ -571,7 +572,7 @@ export function DesktopController() {
|
|||
gateway={gatewayRef.current}
|
||||
maxVoiceRecordingSeconds={voiceMaxRecordingSeconds}
|
||||
onAddContextRef={addContextRefAttachment}
|
||||
onAddUrl={url => addContextRefAttachment(`@url:${url}`, url)}
|
||||
onAddUrl={url => addContextRefAttachment(`@url:${formatRefValue(url)}`, url)}
|
||||
onBranchInNewChat={messageId => void branchInNewChat(messageId)}
|
||||
onBrowseCwd={() => void browseSessionCwd()}
|
||||
onCancel={() => void cancelRun()}
|
||||
|
|
@ -589,7 +590,7 @@ export function DesktopController() {
|
|||
onReload={reloadFromMessage}
|
||||
onRemoveAttachment={id => void removeAttachment(id)}
|
||||
onSelectPersonality={name => void selectPersonality(name)}
|
||||
onSubmit={text => void submitText(text)}
|
||||
onSubmit={submitText}
|
||||
onThreadMessagesChange={handleThreadMessagesChange}
|
||||
onToggleSelectedPin={toggleSelectedPin}
|
||||
onTranscribeAudio={transcribeVoiceAudio}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
import type { QueryClient } from '@tanstack/react-query'
|
||||
import { type MutableRefObject, useCallback } from 'react'
|
||||
import { flushSync } from 'react-dom'
|
||||
|
||||
import {
|
||||
appendReasoningPart,
|
||||
|
|
@ -60,7 +59,6 @@ export function useMessageStream({
|
|||
transform: (parts: ChatMessagePart[], message: ChatMessage) => ChatMessagePart[],
|
||||
seed: () => ChatMessagePart[],
|
||||
opts: {
|
||||
sync?: boolean
|
||||
pending?: (message: ChatMessage) => boolean
|
||||
} = {}
|
||||
) => {
|
||||
|
|
@ -112,7 +110,7 @@ export function useMessageStream({
|
|||
})
|
||||
}
|
||||
|
||||
opts.sync ? flushSync(apply) : apply()
|
||||
apply()
|
||||
},
|
||||
[updateSessionState]
|
||||
)
|
||||
|
|
@ -126,8 +124,7 @@ export function useMessageStream({
|
|||
mutateStream(
|
||||
sessionId,
|
||||
parts => appendTextPart(parts, delta),
|
||||
() => [textPart(delta)],
|
||||
{ sync: true }
|
||||
() => [textPart(delta)]
|
||||
)
|
||||
},
|
||||
[mutateStream]
|
||||
|
|
@ -152,8 +149,7 @@ export function useMessageStream({
|
|||
|
||||
return appendReasoningPart(parts, delta)
|
||||
},
|
||||
() => [reasoningPart(delta)],
|
||||
{ sync: true }
|
||||
() => [reasoningPart(delta)]
|
||||
)
|
||||
},
|
||||
[mutateStream]
|
||||
|
|
@ -299,6 +295,7 @@ export function useMessageStream({
|
|||
const apply = explicitSid ? isActiveEvent : !activeSessionIdRef.current
|
||||
const modelChanged = typeof payload?.model === 'string'
|
||||
const providerChanged = typeof payload?.provider === 'string'
|
||||
const runningChanged = typeof payload?.running === 'boolean'
|
||||
|
||||
if (apply) {
|
||||
if (modelChanged) {
|
||||
|
|
@ -320,6 +317,35 @@ export function useMessageStream({
|
|||
if (typeof payload?.personality === 'string') {
|
||||
setCurrentPersonality(normalizePersonalityValue(payload.personality))
|
||||
}
|
||||
|
||||
if (runningChanged && sessionId) {
|
||||
updateSessionState(sessionId, state => {
|
||||
const busy = Boolean(payload!.running)
|
||||
|
||||
if (state.busy === busy && (busy || !state.awaitingResponse)) {
|
||||
return state
|
||||
}
|
||||
|
||||
if (busy) {
|
||||
return {
|
||||
...state,
|
||||
busy
|
||||
}
|
||||
}
|
||||
|
||||
if (state.awaitingResponse && !state.sawAssistantPayload) {
|
||||
return state
|
||||
}
|
||||
|
||||
return {
|
||||
...state,
|
||||
awaitingResponse: false,
|
||||
busy,
|
||||
pendingBranchGroup: null,
|
||||
streamId: null
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
void refreshHermesConfig()
|
||||
|
|
@ -355,11 +381,11 @@ export function useMessageStream({
|
|||
}
|
||||
} else if (event.type === 'reasoning.delta') {
|
||||
if (sessionId) {
|
||||
appendReasoningDelta(sessionId, coerceGatewayText(payload?.text))
|
||||
appendReasoningDelta(sessionId, coerceThinkingText(payload?.text))
|
||||
}
|
||||
} else if (event.type === 'reasoning.available') {
|
||||
if (sessionId) {
|
||||
appendReasoningDelta(sessionId, coerceGatewayText(payload?.text), true)
|
||||
appendReasoningDelta(sessionId, coerceThinkingText(payload?.text), true)
|
||||
}
|
||||
} else if (event.type === 'message.complete') {
|
||||
if (!sessionId) {
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ import {
|
|||
import { triggerHaptic } from '@/lib/haptics'
|
||||
import { $composerAttachments, clearComposerAttachments } from '@/store/composer'
|
||||
import { clearNotifications, notify, notifyError } from '@/store/notifications'
|
||||
import { $busy, $messages, setAwaitingResponse, setBusy } from '@/store/session'
|
||||
import { $busy, $messages, setAwaitingResponse, setBusy, setMessages } from '@/store/session'
|
||||
|
||||
import type { ClientSessionState, SlashExecResponse } from '../../types'
|
||||
|
||||
|
|
@ -296,12 +296,34 @@ export function usePromptActions({
|
|||
)
|
||||
|
||||
const cancelRun = useCallback(async () => {
|
||||
if (!activeSessionId) {
|
||||
const sessionId = activeSessionId || activeSessionIdRef.current
|
||||
|
||||
busyRef.current = false
|
||||
setBusy(false)
|
||||
setAwaitingResponse(false)
|
||||
|
||||
const finalizeMessages = (messages: ChatMessage[]) =>
|
||||
messages.map(message =>
|
||||
message.pending
|
||||
? {
|
||||
...message,
|
||||
parts: chatMessageText(message).trim()
|
||||
? appendTextPart(message.parts, INTERRUPTED_MARKER)
|
||||
: [...message.parts, textPart(INTERRUPTED_MARKER.trim())],
|
||||
pending: false
|
||||
}
|
||||
: message
|
||||
)
|
||||
|
||||
if (!sessionId) {
|
||||
setMessages(finalizeMessages($messages.get()))
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
updateSessionState(activeSessionId, state => {
|
||||
updateSessionState(sessionId, state => {
|
||||
const streamId = state.streamId
|
||||
|
||||
const messages = streamId
|
||||
? state.messages.map(message =>
|
||||
message.id === streamId
|
||||
|
|
@ -314,7 +336,7 @@ export function usePromptActions({
|
|||
}
|
||||
: message
|
||||
)
|
||||
: state.messages
|
||||
: finalizeMessages(state.messages)
|
||||
|
||||
return {
|
||||
...state,
|
||||
|
|
@ -328,11 +350,11 @@ export function usePromptActions({
|
|||
})
|
||||
|
||||
try {
|
||||
await requestGateway('session.interrupt', { session_id: activeSessionId })
|
||||
await requestGateway('session.interrupt', { session_id: sessionId })
|
||||
} catch (err) {
|
||||
notifyError(err, 'Stop failed')
|
||||
}
|
||||
}, [activeSessionId, requestGateway, updateSessionState])
|
||||
}, [activeSessionId, activeSessionIdRef, busyRef, requestGateway, updateSessionState])
|
||||
|
||||
const reloadFromMessage = useCallback(
|
||||
async (parentId: string | null) => {
|
||||
|
|
|
|||
|
|
@ -87,6 +87,11 @@ export function useSessionActions({
|
|||
|
||||
const createBackendSessionForSend = useCallback(async (): Promise<string | null> => {
|
||||
const created = await requestGateway<SessionCreateResponse>('session.create', { cols: 96 })
|
||||
|
||||
if (created.stored_session_id) {
|
||||
navigate(sessionRoute(created.stored_session_id), { replace: true })
|
||||
}
|
||||
|
||||
setActiveSessionId(created.session_id)
|
||||
activeSessionIdRef.current = created.session_id
|
||||
ensureSessionState(created.session_id, created.stored_session_id ?? null)
|
||||
|
|
@ -94,7 +99,6 @@ export function useSessionActions({
|
|||
if (created.stored_session_id) {
|
||||
setSelectedStoredSessionId(created.stored_session_id)
|
||||
selectedStoredSessionIdRef.current = created.stored_session_id
|
||||
navigate(sessionRoute(created.stored_session_id), { replace: true })
|
||||
}
|
||||
|
||||
if (created.info?.model) {
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ export const ENUM_OPTIONS: Record<string, string[]> = {
|
|||
'context.engine': ['compressor', 'default', 'custom'],
|
||||
'delegation.reasoning_effort': ['', 'minimal', 'low', 'medium', 'high', 'xhigh'],
|
||||
'memory.provider': ['', 'builtin', 'honcho'],
|
||||
'stt.elevenlabs.model_id': ['scribe_v2', 'scribe_v1'],
|
||||
'stt.local.model': ['tiny', 'base', 'small', 'medium', 'large-v3'],
|
||||
'tts.openai.voice': ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
|
||||
}
|
||||
|
|
@ -101,6 +102,10 @@ export const FIELD_LABELS: Record<string, string> = {
|
|||
'stt.provider': 'Speech-To-Text Provider',
|
||||
'stt.local.model': 'Local Transcription Model',
|
||||
'stt.local.language': 'Transcription Language',
|
||||
'stt.elevenlabs.model_id': 'ElevenLabs STT Model',
|
||||
'stt.elevenlabs.language_code': 'ElevenLabs Language',
|
||||
'stt.elevenlabs.tag_audio_events': 'Tag Audio Events',
|
||||
'stt.elevenlabs.diarize': 'Speaker Diarization',
|
||||
'tts.provider': 'Text-To-Speech Provider',
|
||||
'tts.edge.voice': 'Edge Voice',
|
||||
'tts.openai.model': 'OpenAI TTS Model',
|
||||
|
|
@ -157,6 +162,7 @@ export const FIELD_DESCRIPTIONS: Record<string, string> = {
|
|||
'compression.enabled': 'Summarize older context when conversations get large.',
|
||||
'voice.auto_tts': 'Automatically speak assistant responses.',
|
||||
'stt.enabled': 'Enable local or provider-backed speech transcription.',
|
||||
'stt.elevenlabs.language_code': 'Optional ISO-639-3 language code. Blank lets ElevenLabs auto-detect.',
|
||||
'agent.max_turns': 'Upper bound for tool-calling turns before Hermes stops a run.'
|
||||
}
|
||||
|
||||
|
|
@ -241,6 +247,10 @@ export const SECTIONS: DesktopConfigSection[] = [
|
|||
'tts.elevenlabs.model_id',
|
||||
'stt.local.model',
|
||||
'stt.local.language',
|
||||
'stt.elevenlabs.model_id',
|
||||
'stt.elevenlabs.language_code',
|
||||
'stt.elevenlabs.tag_audio_events',
|
||||
'stt.elevenlabs.diarize',
|
||||
'voice.record_key',
|
||||
'voice.max_recording_seconds'
|
||||
]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,39 @@
|
|||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import { formatRefValue, hermesDirectiveFormatter } from './directive-text'
|
||||
|
||||
// Pins the quoting rules of formatRefValue: plain values pass through,
// values with whitespace/parens are wrapped in backticks, and values that
// already contain a backtick fall back to double quotes.
describe('formatRefValue', () => {
|
||||
it('leaves simple paths untouched', () => {
|
||||
expect(formatRefValue('src/index.ts')).toBe('src/index.ts')
|
||||
expect(formatRefValue('https://example.com/post')).toBe('https://example.com/post')
|
||||
})
|
||||
|
||||
it('wraps paths with whitespace in backticks', () => {
|
||||
expect(formatRefValue('apple-touch-icon (1).png')).toBe('`apple-touch-icon (1).png`')
|
||||
})
|
||||
|
||||
it('falls back to double quotes when value contains backticks', () => {
|
||||
expect(formatRefValue('weird `name` (1).md')).toBe('"weird `name` (1).md"')
|
||||
})
|
||||
})
|
||||
|
||||
// Pins the parser side of the quoting scheme: a backtick-wrapped value is
// kept as a single mention whose id/label exclude the wrapper, and unquoted
// values still end at the first whitespace.
describe('hermesDirectiveFormatter.parse', () => {
|
||||
it('keeps quoted file paths whole when parsing', () => {
|
||||
const segments = hermesDirectiveFormatter.parse('see @image:`apple-touch-icon (1).png` for the icon')
|
||||
|
||||
expect(segments).toEqual([
|
||||
{ kind: 'text', text: 'see ' },
|
||||
{ kind: 'mention', type: 'image', label: 'apple-touch-icon (1).png', id: 'apple-touch-icon (1).png' },
|
||||
{ kind: 'text', text: ' for the icon' }
|
||||
])
|
||||
})
|
||||
|
||||
it('still parses unquoted paths', () => {
|
||||
const segments = hermesDirectiveFormatter.parse('@file:src/main.tsx the entry point')
|
||||
|
||||
expect(segments).toEqual([
|
||||
{ kind: 'mention', type: 'file', label: 'main.tsx', id: 'src/main.tsx' },
|
||||
{ kind: 'text', text: ' the entry point' }
|
||||
])
|
||||
})
|
||||
})
|
||||
|
|
@ -24,10 +24,63 @@ const ICONS: Record<HermesRefType, ComponentType<{ className?: string }>> = {
|
|||
* so they render as inline chips in user messages instead of raw text.
|
||||
*
|
||||
* Supported types: file, folder, url, image. Anything else stays plain text.
|
||||
*
|
||||
* Mirrors the Python `agent/context_references.REFERENCE_PATTERN` syntax:
|
||||
* the value may be wrapped in backticks, single quotes, or double quotes so
|
||||
* paths with spaces/parens/etc. survive parsing intact.
|
||||
*/
|
||||
const CANONICAL_DIRECTIVE_RE = /:([\w-]{1,64})\[([^\]\n]{1,1024})\](?:\{name=([^}\n]{1,1024})\})?/gu
|
||||
const CANONICAL_DIRECTIVE_RE = /:([\w-]{1,64})\[([^\]\n]{1,1024})\](?:\{name=([^}\n]{1,1024})\})?/g
|
||||
|
||||
const HERMES_DIRECTIVE_RE = /@(file|folder|url|image|tool):(\S+)/gu
|
||||
const HERMES_DIRECTIVE_RE = new RegExp(
|
||||
'@(file|folder|url|image|tool):(' +
|
||||
'`[^`\\n]+`' +
|
||||
'|"[^"\\n]+"' +
|
||||
"|'[^'\\n]+'" +
|
||||
'|\\S+' +
|
||||
')',
|
||||
'g'
|
||||
)
|
||||
|
||||
const TRAILING_PUNCTUATION_RE = /[,.;!?]+$/
|
||||
|
||||
/**
 * Normalizes the value captured after `@type:` in a directive.
 *
 * - Inputs shorter than two characters are returned unchanged.
 * - If the first and last characters are the same wrapper (backtick, double
 *   quote, or single quote), that one pair is removed and the inner text is
 *   returned as-is.
 * - Otherwise any trailing run matched by `TRAILING_PUNCTUATION_RE` is
 *   stripped, so sentence punctuation after an unquoted reference is not
 *   treated as part of the value.
 *
 * @param raw The captured value, possibly still wrapped in quotes.
 * @returns The unwrapped or punctuation-trimmed value.
 */
function unwrapRefValue(raw: string): string {
|
||||
// Too short to hold an opening and closing wrapper; leave untouched.
if (raw.length < 2) {
|
||||
return raw
|
||||
}
|
||||
|
||||
const head = raw[0]
|
||||
const tail = raw[raw.length - 1]
|
||||
|
||||
// Matching wrapper on both ends: drop exactly one character from each side.
if ((head === '`' && tail === '`') || (head === '"' && tail === '"') || (head === "'" && tail === "'")) {
|
||||
return raw.slice(1, -1)
|
||||
}
|
||||
|
||||
return raw.replace(TRAILING_PUNCTUATION_RE, '')
|
||||
}
|
||||
|
||||
/**
 * Reports whether a reference value must be wrapped before being serialized.
 *
 * Returns true when `value` contains whitespace, any of the bracket
 * characters `( ) [ ] { } < >`, or any quote character (`"`, `'`, backtick).
 */
function needsQuoting(value: string): boolean {
|
||||
return /[\s()\[\]{}<>"'`]/.test(value)
|
||||
}
|
||||
|
||||
/**
 * Serializes a reference value for use after `@type:` in directive text.
 *
 * Values for which `needsQuoting` is false are returned unchanged. Otherwise
 * the value is wrapped in the first delimiter it does not itself contain,
 * tried in this order: backtick, double quote, single quote.
 *
 * If the value contains all three delimiters it is returned unwrapped.
 * NOTE(review): such a value presumably will not parse back as a single
 * reference; confirm whether callers can ever produce one.
 *
 * @param value Raw path or URL to embed in a directive.
 * @returns The value, wrapped in a delimiter when required and possible.
 */
export function formatRefValue(value: string): string {
|
||||
if (!needsQuoting(value)) {
|
||||
return value
|
||||
}
|
||||
|
||||
// Preferred wrapper: backticks.
if (!value.includes('`')) {
|
||||
return `\`${value}\``
|
||||
}
|
||||
|
||||
// Value already contains a backtick; try double quotes next.
if (!value.includes('"')) {
|
||||
return `"${value}"`
|
||||
}
|
||||
|
||||
// Last wrapper option: single quotes.
if (!value.includes("'")) {
|
||||
return `'${value}'`
|
||||
}
|
||||
|
||||
// No delimiter is free; emit the value as-is.
return value
|
||||
}
|
||||
|
||||
export const hermesDirectiveFormatter: Unstable_DirectiveFormatter = {
|
||||
serialize(item: Unstable_TriggerItem): string {
|
||||
|
|
@ -35,7 +88,7 @@ export const hermesDirectiveFormatter: Unstable_DirectiveFormatter = {
|
|||
return `@${item.id}`
|
||||
}
|
||||
|
||||
return `@${item.type}:${item.id}`
|
||||
return `@${item.type}:${formatRefValue(item.id)}`
|
||||
},
|
||||
parse(text: string): readonly Unstable_DirectiveSegment[] {
|
||||
return parseDirectiveText(text)
|
||||
|
|
@ -51,13 +104,17 @@ function parseDirectiveText(text: string): Unstable_DirectiveSegment[] {
|
|||
label: match[2] || match[3] || '',
|
||||
id: match[3] || match[2] || ''
|
||||
})),
|
||||
...Array.from(text.matchAll(HERMES_DIRECTIVE_RE)).map(match => ({
|
||||
start: match.index ?? 0,
|
||||
end: (match.index ?? 0) + match[0].length,
|
||||
type: match[1] || 'file',
|
||||
label: shortLabel(match[1] as HermesRefType, match[2] || ''),
|
||||
id: match[2] || ''
|
||||
}))
|
||||
...Array.from(text.matchAll(HERMES_DIRECTIVE_RE)).map(match => {
|
||||
const id = unwrapRefValue(match[2] || '')
|
||||
|
||||
return {
|
||||
start: match.index ?? 0,
|
||||
end: (match.index ?? 0) + match[0].length,
|
||||
type: match[1] || 'file',
|
||||
label: shortLabel(match[1] as HermesRefType, id),
|
||||
id
|
||||
}
|
||||
})
|
||||
]
|
||||
.filter(match => match.id)
|
||||
.sort((a, b) => a.start - b.start)
|
||||
|
|
@ -136,14 +193,14 @@ const DirectiveChip: FC<{
|
|||
return (
|
||||
<span
|
||||
className={cn(
|
||||
'mx-0.5 inline-flex max-w-56 items-center gap-1 rounded-full border border-border/80 bg-background/95 px-1.5 py-0.5 align-[0.05em] text-[0.82em] font-medium leading-none text-foreground shadow-sm ring-1 ring-black/3'
|
||||
'mx-0.5 inline-flex max-w-64 items-center gap-1 rounded-full bg-[color-mix(in_srgb,var(--dt-primary)_16%,transparent)] px-2 py-0.5 align-[0.02em] text-[0.92em] font-semibold leading-tight text-primary ring-1 ring-inset ring-primary/10'
|
||||
)}
|
||||
data-directive-id={id}
|
||||
data-directive-type={type}
|
||||
data-slot="aui_directive-chip"
|
||||
title={id}
|
||||
>
|
||||
{Icon && <Icon className="size-3 shrink-0 text-muted-foreground" />}
|
||||
{Icon && <Icon className="size-3.5 shrink-0 text-primary" />}
|
||||
<span className="truncate">{label}</span>
|
||||
</span>
|
||||
)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ export type IntroProps = {
|
|||
const NEUTRAL_PERSONALITIES = new Set(['', 'default', 'none', 'neutral'])
|
||||
|
||||
const HERMES_FRAME_COUNT = 8
|
||||
const ASSET_BASE_URL = import.meta.env.BASE_URL || '/'
|
||||
|
||||
const FALLBACK_COPY: IntroCopy[] = [
|
||||
{
|
||||
|
|
@ -154,6 +155,10 @@ function resolveCopy(personality?: string, seed?: number): IntroCopy {
|
|||
return pickCopy(copies, seed)
|
||||
}
|
||||
|
||||
/**
 * Builds a URL for a public asset by prefixing `path` with `ASSET_BASE_URL`.
 *
 * After concatenation, any run of extra slashes that follows a
 * "non-colon character + slash" pair is collapsed to a single slash, so a
 * base ending in `/` joined to a path starting with `/` does not yield `//`
 * mid-string. Requiring a non-colon character before the slash leaves a
 * scheme separator such as `https://` intact.
 *
 * @param path Asset path relative to the base URL (may include a query string).
 * @returns The joined asset URL.
 */
function publicAssetPath(path: string): string {
|
||||
return `${ASSET_BASE_URL}${path}`.replace(/([^:]\/)\/+/g, '$1')
|
||||
}
|
||||
|
||||
export const Intro: FC<IntroProps> = ({ personality, seed }) => {
|
||||
const [mountSeed] = useState(() => Math.floor(Math.random() * 100000))
|
||||
const [frameOffset, setFrameOffset] = useState(0)
|
||||
|
|
@ -184,7 +189,7 @@ export const Intro: FC<IntroProps> = ({ personality, seed }) => {
|
|||
aria-hidden="true"
|
||||
className="h-full w-full scale-110 object-contain select-none"
|
||||
draggable={false}
|
||||
src={`/hermes-frames/hermes-frame-${frameIndex}.png?v=matte-clean-6`}
|
||||
src={publicAssetPath(`hermes-frames/hermes-frame-${frameIndex}.png?v=matte-clean-6`)}
|
||||
/>
|
||||
</button>
|
||||
<p className="mb-3 text-xs font-medium uppercase tracking-[0.18em] text-muted-foreground/75">Hermes Agent</p>
|
||||
|
|
|
|||
|
|
@ -1,19 +1,53 @@
|
|||
import { AssistantRuntimeProvider, type ThreadMessage, useExternalStoreRuntime } from '@assistant-ui/react'
|
||||
import { act, render, screen, waitFor } from '@testing-library/react'
|
||||
import { act, fireEvent, render, screen, waitFor } from '@testing-library/react'
|
||||
import { useEffect, useState } from 'react'
|
||||
import { describe, expect, it, vi } from 'vitest'
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest'
|
||||
|
||||
import { Thread } from './thread'
|
||||
|
||||
const createdAt = new Date('2026-05-01T00:00:00.000Z')
|
||||
|
||||
const resizeObservers = new Set<TestResizeObserver>()
|
||||
|
||||
class TestResizeObserver {
|
||||
observe() {}
|
||||
private target: Element | null = null
|
||||
|
||||
constructor(private readonly callback: ResizeObserverCallback) {
|
||||
resizeObservers.add(this)
|
||||
}
|
||||
|
||||
observe(target: Element) {
|
||||
this.target = target
|
||||
}
|
||||
|
||||
unobserve() {}
|
||||
disconnect() {}
|
||||
|
||||
disconnect() {
|
||||
resizeObservers.delete(this)
|
||||
}
|
||||
|
||||
trigger(height: number) {
|
||||
if (!this.target) {
|
||||
return
|
||||
}
|
||||
|
||||
this.callback(
|
||||
[
|
||||
{
|
||||
contentRect: { height } as DOMRectReadOnly,
|
||||
target: this.target
|
||||
} as ResizeObserverEntry
|
||||
],
|
||||
this as unknown as ResizeObserver
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
vi.stubGlobal('ResizeObserver', TestResizeObserver)
|
||||
vi.stubGlobal('requestAnimationFrame', (callback: FrameRequestCallback) =>
|
||||
window.setTimeout(() => callback(performance.now()), 0)
|
||||
)
|
||||
vi.stubGlobal('cancelAnimationFrame', (id: number) => window.clearTimeout(id))
|
||||
|
||||
Element.prototype.scrollTo = function scrollTo() {}
|
||||
|
||||
|
|
@ -90,6 +124,10 @@ function StreamingHarness() {
|
|||
}
|
||||
|
||||
describe('assistant-ui streaming renderer', () => {
|
||||
beforeEach(() => {
|
||||
resizeObservers.clear()
|
||||
})
|
||||
|
||||
it('renders assistant text incrementally before completion', async () => {
|
||||
const { container } = render(<StreamingHarness />)
|
||||
|
||||
|
|
@ -115,4 +153,42 @@ describe('assistant-ui streaming renderer', () => {
|
|||
expect(container.textContent).toContain('first chunk second chunk')
|
||||
})
|
||||
})
|
||||
|
||||
it('does not pull the viewport back down after the user scrolls up during streaming', async () => {
|
||||
const { container } = render(<StreamingHarness />)
|
||||
|
||||
const viewport = container.querySelector('[data-slot="aui_thread-viewport"]') as HTMLDivElement
|
||||
let scrollHeight = 1_000
|
||||
|
||||
Object.defineProperty(viewport, 'clientHeight', { configurable: true, value: 200 })
|
||||
Object.defineProperty(viewport, 'scrollHeight', {
|
||||
configurable: true,
|
||||
get: () => scrollHeight
|
||||
})
|
||||
|
||||
await wait(80)
|
||||
|
||||
await act(async () => {
|
||||
viewport.scrollTop = 800
|
||||
fireEvent.scroll(viewport)
|
||||
})
|
||||
await wait(0)
|
||||
|
||||
await act(async () => {
|
||||
fireEvent.wheel(viewport, { deltaY: -120 })
|
||||
viewport.scrollTop = 420
|
||||
fireEvent.scroll(viewport)
|
||||
})
|
||||
|
||||
scrollHeight = 1_200
|
||||
|
||||
await act(async () => {
|
||||
for (const observer of resizeObservers) {
|
||||
observer.trigger(1_200)
|
||||
}
|
||||
})
|
||||
await wait(0)
|
||||
|
||||
expect(viewport.scrollTop).toBe(420)
|
||||
})
|
||||
})
|
||||
|
|
|
|||
|
|
@ -8,18 +8,28 @@ import {
|
|||
type ToolCallMessagePartProps,
|
||||
useAuiState
|
||||
} from '@assistant-ui/react'
|
||||
import { useStore } from '@nanostores/react'
|
||||
import {
|
||||
CheckIcon,
|
||||
ChevronLeftIcon,
|
||||
ChevronRightIcon,
|
||||
CopyIcon,
|
||||
GitBranchIcon,
|
||||
Loader2Icon,
|
||||
MoreHorizontalIcon,
|
||||
RefreshCwIcon,
|
||||
Volume2Icon,
|
||||
VolumeXIcon
|
||||
} from 'lucide-react'
|
||||
import { type FC, type ReactNode, useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
|
||||
import {
|
||||
type FC,
|
||||
type ReactNode,
|
||||
useCallback,
|
||||
useEffect,
|
||||
useLayoutEffect,
|
||||
useRef,
|
||||
useState
|
||||
} from 'react'
|
||||
|
||||
import { useElapsedSeconds } from '@/components/assistant-ui/activity-timer'
|
||||
import { ActivityTimerText } from '@/components/assistant-ui/activity-timer-text'
|
||||
|
|
@ -38,11 +48,12 @@ import {
|
|||
DropdownMenuTrigger
|
||||
} from '@/components/ui/dropdown-menu'
|
||||
import { Loader } from '@/components/ui/loader'
|
||||
import { speakText } from '@/hermes'
|
||||
import { triggerHaptic } from '@/lib/haptics'
|
||||
import { cn } from '@/lib/utils'
|
||||
import { playSpeechText, stopVoicePlayback } from '@/lib/voice-playback'
|
||||
import { notifyError } from '@/store/notifications'
|
||||
import { setThreadScrolledUp } from '@/store/thread-scroll'
|
||||
import { $voicePlayback } from '@/store/voice-playback'
|
||||
|
||||
const THINKING_FACES = [
|
||||
'(。•́︿•̀。)',
|
||||
|
|
@ -119,12 +130,16 @@ export const Thread: FC<{
|
|||
intro?: IntroProps
|
||||
loading?: ThreadLoadingState
|
||||
onBranchInNewChat?: (messageId: string) => void
|
||||
}> = ({ intro, loading, onBranchInNewChat }) => {
|
||||
sessionKey?: string | null
|
||||
}> = ({ intro, loading, onBranchInNewChat, sessionKey }) => {
|
||||
const viewportRef = useRef<HTMLDivElement | null>(null)
|
||||
const contentRef = useRef<HTMLDivElement | null>(null)
|
||||
const messageCount = useAuiState(s => s.thread.messages.length)
|
||||
const isRunning = useAuiState(s => s.thread.isRunning)
|
||||
const lastMessageId = useAuiState(s => s.thread.messages.at(-1)?.id ?? '')
|
||||
const shouldStickToBottomRef = useRef(true)
|
||||
const scrollFrameRef = useRef<number | null>(null)
|
||||
const sessionKeyRef = useRef<string | null>(sessionKey ?? null)
|
||||
|
||||
const handleScroll = useCallback((event: React.UIEvent<HTMLDivElement>) => {
|
||||
const nearBottom = isNearBottom(event.currentTarget)
|
||||
|
|
@ -132,8 +147,44 @@ export const Thread: FC<{
|
|||
setThreadScrolledUp(!nearBottom)
|
||||
}, [])
|
||||
|
||||
const handleWheel = useCallback((event: React.WheelEvent<HTMLDivElement>) => {
|
||||
if (event.deltaY < 0) {
|
||||
shouldStickToBottomRef.current = false
|
||||
setThreadScrolledUp(true)
|
||||
}
|
||||
}, [])
|
||||
|
||||
const scrollToBottom = useCallback(() => {
|
||||
const viewport = viewportRef.current
|
||||
|
||||
if (!viewport) {
|
||||
return
|
||||
}
|
||||
|
||||
viewport.scrollTop = viewport.scrollHeight
|
||||
shouldStickToBottomRef.current = true
|
||||
setThreadScrolledUp(false)
|
||||
}, [])
|
||||
|
||||
const scheduleScrollToBottom = useCallback(() => {
|
||||
if (scrollFrameRef.current !== null) {
|
||||
window.cancelAnimationFrame(scrollFrameRef.current)
|
||||
}
|
||||
|
||||
scrollFrameRef.current = window.requestAnimationFrame(() => {
|
||||
scrollFrameRef.current = null
|
||||
scrollToBottom()
|
||||
})
|
||||
}, [scrollToBottom])
|
||||
|
||||
useEffect(() => {
|
||||
return () => setThreadScrolledUp(false)
|
||||
return () => {
|
||||
if (scrollFrameRef.current !== null) {
|
||||
window.cancelAnimationFrame(scrollFrameRef.current)
|
||||
}
|
||||
|
||||
setThreadScrolledUp(false)
|
||||
}
|
||||
}, [])
|
||||
|
||||
useLayoutEffect(() => {
|
||||
|
|
@ -143,16 +194,48 @@ export const Thread: FC<{
|
|||
return
|
||||
}
|
||||
|
||||
const force = loading === 'session'
|
||||
const nextSessionKey = sessionKey ?? null
|
||||
const sessionChanged = sessionKeyRef.current !== nextSessionKey
|
||||
sessionKeyRef.current = nextSessionKey
|
||||
const force = loading === 'session' || sessionChanged
|
||||
|
||||
if (!force && !shouldStickToBottomRef.current) {
|
||||
return
|
||||
}
|
||||
|
||||
viewport.scrollTop = viewport.scrollHeight
|
||||
shouldStickToBottomRef.current = true
|
||||
setThreadScrolledUp(false)
|
||||
}, [isRunning, lastMessageId, loading, messageCount])
|
||||
scheduleScrollToBottom()
|
||||
}, [isRunning, lastMessageId, loading, messageCount, scheduleScrollToBottom, sessionKey])
|
||||
|
||||
useLayoutEffect(() => {
|
||||
const content = contentRef.current
|
||||
const viewport = viewportRef.current
|
||||
|
||||
if (!content || !viewport) {
|
||||
return
|
||||
}
|
||||
|
||||
let previousHeight = content.getBoundingClientRect().height
|
||||
|
||||
const observer = new ResizeObserver(entries => {
|
||||
const height = entries[0]?.contentRect.height ?? content.getBoundingClientRect().height
|
||||
|
||||
if (height === previousHeight) {
|
||||
return
|
||||
}
|
||||
|
||||
previousHeight = height
|
||||
|
||||
if (!shouldStickToBottomRef.current && !isNearBottom(viewport)) {
|
||||
return
|
||||
}
|
||||
|
||||
scheduleScrollToBottom()
|
||||
})
|
||||
|
||||
observer.observe(content)
|
||||
|
||||
return () => observer.disconnect()
|
||||
}, [scheduleScrollToBottom])
|
||||
|
||||
return (
|
||||
<GeneratedImageProvider>
|
||||
|
|
@ -160,15 +243,17 @@ export const Thread: FC<{
|
|||
<AuiIf condition={s => Boolean(intro) && s.thread.isEmpty}>{intro && <Intro {...intro} />}</AuiIf>
|
||||
|
||||
<ThreadPrimitive.Viewport
|
||||
className="h-full min-h-0 overflow-y-auto overscroll-contain px-[clamp(1rem,10%,12rem)] pt-[calc(var(--vsq)*19)] scroll-smooth"
|
||||
autoScroll={false}
|
||||
className="h-full min-h-0 overflow-y-auto overscroll-contain px-[clamp(1rem,10%,12rem)] pt-[calc(var(--vsq)*19)]"
|
||||
data-slot="aui_thread-viewport"
|
||||
onScroll={handleScroll}
|
||||
onWheel={handleWheel}
|
||||
ref={viewportRef}
|
||||
scrollToBottomOnInitialize
|
||||
scrollToBottomOnRunStart
|
||||
scrollToBottomOnThreadSwitch
|
||||
>
|
||||
<div className="flex w-full flex-col gap-3">
|
||||
<div className="flex w-full flex-col gap-3" ref={contentRef}>
|
||||
<ThreadPrimitive.Messages>{() => <ThreadMessage onBranchInNewChat={onBranchInNewChat} />}</ThreadPrimitive.Messages>
|
||||
{loading === 'response' && <ResponseLoadingIndicator />}
|
||||
{loading === 'working' && <WorkingIndicator />}
|
||||
|
|
@ -446,7 +531,7 @@ const AssistantActionBar: FC<MessageActionProps> = ({ messageId, messageText, on
|
|||
<GitBranchIcon />
|
||||
Branch in new chat
|
||||
</DropdownMenuItem>
|
||||
<ReadAloudItem text={messageText} />
|
||||
<ReadAloudItem messageId={messageId} text={messageText} />
|
||||
</DropdownMenuContent>
|
||||
</DropdownMenu>
|
||||
</ActionBarPrimitive.Root>
|
||||
|
|
@ -479,80 +564,39 @@ const CopyMessageButton: FC<{ text: string }> = ({ text }) => {
|
|||
)
|
||||
}
|
||||
|
||||
let currentAudio: HTMLAudioElement | null = null
|
||||
const ReadAloudItem: FC<{ messageId: string; text: string }> = ({ messageId, text }) => {
|
||||
const voicePlayback = useStore($voicePlayback)
|
||||
|
||||
function stopCurrentAudio() {
|
||||
if (!currentAudio) {
|
||||
return
|
||||
}
|
||||
const readAloudStatus =
|
||||
voicePlayback.source === 'read-aloud' && voicePlayback.messageId === messageId ? voicePlayback.status : 'idle'
|
||||
|
||||
currentAudio.pause()
|
||||
currentAudio.src = ''
|
||||
currentAudio = null
|
||||
}
|
||||
|
||||
const ReadAloudItem: FC<{ text: string }> = ({ text }) => {
|
||||
const [reading, setReading] = useState(false)
|
||||
const seqRef = useRef(0)
|
||||
|
||||
const stop = useCallback(() => {
|
||||
seqRef.current += 1
|
||||
stopCurrentAudio()
|
||||
setReading(false)
|
||||
}, [])
|
||||
const isPreparing = readAloudStatus === 'preparing'
|
||||
const isSpeaking = readAloudStatus === 'speaking'
|
||||
const anyPlaybackActive = voicePlayback.status !== 'idle'
|
||||
const Icon = isPreparing ? Loader2Icon : isSpeaking ? VolumeXIcon : Volume2Icon
|
||||
|
||||
const read = useCallback(async () => {
|
||||
if (!text) {
|
||||
if (!text || $voicePlayback.get().status !== 'idle') {
|
||||
return
|
||||
}
|
||||
|
||||
stopCurrentAudio()
|
||||
const seq = ++seqRef.current
|
||||
const isCurrent = () => seq === seqRef.current
|
||||
|
||||
const finish = () => {
|
||||
if (!isCurrent()) {
|
||||
return
|
||||
}
|
||||
|
||||
currentAudio = null
|
||||
setReading(false)
|
||||
}
|
||||
|
||||
setReading(true)
|
||||
|
||||
try {
|
||||
const { data_url } = await speakText(text)
|
||||
|
||||
if (!isCurrent()) {
|
||||
return
|
||||
}
|
||||
|
||||
const audio = new Audio(data_url)
|
||||
currentAudio = audio
|
||||
audio.addEventListener('ended', finish, { once: true })
|
||||
audio.addEventListener('error', finish, { once: true })
|
||||
await audio.play()
|
||||
await playSpeechText(text, { messageId, source: 'read-aloud' })
|
||||
} catch (error) {
|
||||
if (isCurrent()) {
|
||||
notifyError(error, 'Read aloud failed')
|
||||
finish()
|
||||
}
|
||||
notifyError(error, 'Read aloud failed')
|
||||
}
|
||||
}, [text])
|
||||
|
||||
const Icon = reading ? VolumeXIcon : Volume2Icon
|
||||
}, [messageId, text])
|
||||
|
||||
return (
|
||||
<DropdownMenuItem
|
||||
disabled={!reading && !text}
|
||||
disabled={isPreparing || (!isSpeaking && (anyPlaybackActive || !text))}
|
||||
onSelect={e => {
|
||||
e.preventDefault()
|
||||
void (reading ? stop() : read())
|
||||
void (isSpeaking ? stopVoicePlayback() : read())
|
||||
}}
|
||||
>
|
||||
<Icon />
|
||||
{reading ? 'Stop reading' : 'Read aloud'}
|
||||
<Icon className={isPreparing ? 'animate-spin' : undefined} />
|
||||
{isPreparing ? 'Preparing audio...' : isSpeaking ? 'Stop reading' : 'Read aloud'}
|
||||
</DropdownMenuItem>
|
||||
)
|
||||
}
|
||||
|
|
|
|||
18
apps/desktop/src/lib/chat-messages.test.ts
Normal file
18
apps/desktop/src/lib/chat-messages.test.ts
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import { chatMessageText, toChatMessages } from './chat-messages'
|
||||
|
||||
describe('toChatMessages', () => {
  // A stored user message carries the expanded attachment payload after the
  // "--- Attached Context ---" marker; only the reference and the user's own
  // text should be shown.
  it('hides attached context payloads from user message display', () => {
    const storedContent =
      'what is this file\n\n--- Attached Context ---\n\n📄 @file:tsconfig.tsbuildinfo (981 tokens)\n```json\n{"root":["./src/main.tsx"]}\n```'

    const converted = toChatMessages([{ role: 'user', content: storedContent, timestamp: 1 }])
    const [message] = converted

    expect(chatMessageText(message)).toBe('@file:tsconfig.tsbuildinfo\n\nwhat is this file')
  })
})
|
||||
|
|
@ -29,6 +29,7 @@ export type GatewayEventPayload = {
|
|||
todos?: unknown
|
||||
model?: string
|
||||
provider?: string
|
||||
running?: boolean
|
||||
cwd?: string
|
||||
branch?: string
|
||||
personality?: string
|
||||
|
|
@ -49,6 +50,28 @@ export function chatMessageText(message: ChatMessage): string {
|
|||
.join('')
|
||||
}
|
||||
|
||||
// Line that separates the user's own text from the expanded attachment payload.
const ATTACHED_CONTEXT_MARKER_RE = /(?:^|\n)--- Attached Context ---\s*\n/
// Everything from the warnings marker to the end of the string.
const CONTEXT_WARNINGS_MARKER_RE = /(?:^|\n)--- Context Warnings ---[\s\S]*$/
// `@kind:value` references; the value may be quoted ("...", '...', `...`) or a bare token.
const CONTEXT_REF_RE = /@(file|folder|url|image|tool):(?:"[^"\n]+"|'[^'\n]+'|`[^`\n]+`|\S+)/g

/**
 * Reduce stored message content to the text that should be displayed.
 *
 * Non-user messages are returned untouched. For user messages the
 * "Context Warnings" tail is removed, and when an "Attached Context" section
 * is present it is replaced by the de-duplicated `@kind:value` references found
 * inside it, listed one per line above the user's own text.
 *
 * @param role - Role of the message being converted.
 * @param content - Raw stored content of the message.
 * @returns The display text (possibly empty).
 */
function displayContentForMessage(role: SessionMessage['role'], content: string): string {
  if (role !== 'user') {
    return content
  }

  const withoutWarnings = (text: string): string => text.replace(CONTEXT_WARNINGS_MARKER_RE, '').trim()
  const markerMatch = ATTACHED_CONTEXT_MARKER_RE.exec(content)

  if (markerMatch === null) {
    return withoutWarnings(content)
  }

  const markerStart = markerMatch.index
  const payloadStart = markerStart + markerMatch[0].length
  const visibleText = withoutWarnings(content.slice(0, markerStart))

  // A Set keeps first-seen order while dropping repeated references.
  const uniqueRefs = new Set<string>()

  for (const refMatch of content.slice(payloadStart).matchAll(CONTEXT_REF_RE)) {
    uniqueRefs.add(refMatch[0])
  }

  const refLines = Array.from(uniqueRefs).join('\n')

  if (refLines && visibleText) {
    return `${refLines}\n\n${visibleText}`
  }

  return refLines || visibleText
}
|
||||
|
||||
export function appendTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] {
|
||||
const next = [...parts]
|
||||
const last = next.at(-1)
|
||||
|
|
@ -363,6 +386,7 @@ export function toChatMessages(messages: SessionMessage[]): ChatMessage[] {
|
|||
}
|
||||
|
||||
const content = message.content || message.text || message.context || message.name || ''
|
||||
const displayContent = displayContentForMessage(message.role, content)
|
||||
const parts: ChatMessagePart[] = []
|
||||
|
||||
const reasoning =
|
||||
|
|
@ -374,8 +398,8 @@ export function toChatMessages(messages: SessionMessage[]): ChatMessage[] {
|
|||
parts.push(reasoningPart(reasoning))
|
||||
}
|
||||
|
||||
if (content) {
|
||||
parts.push(textPart(content))
|
||||
if (displayContent) {
|
||||
parts.push(textPart(displayContent))
|
||||
}
|
||||
|
||||
if (message.role === 'assistant' && Array.isArray(message.tool_calls)) {
|
||||
|
|
|
|||
18
apps/desktop/src/lib/chat-runtime.test.ts
Normal file
18
apps/desktop/src/lib/chat-runtime.test.ts
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import { coerceThinkingText } from './chat-runtime'
|
||||
|
||||
describe('coerceThinkingText', () => {
  it('strips streaming status prefixes from thinking deltas', () => {
    // [raw delta, expected text once the status prefix is removed]
    const cases: Array<[string, string]> = [
      ["◉_◉ processing... checking the user's request", "checking the user's request"],
      ['(¬‿¬) analyzing... reading the file', 'reading the file']
    ]

    for (const [delta, expected] of cases) {
      expect(coerceThinkingText(delta)).toBe(expected)
    }
  })

  it('drops empty thinking rewrite placeholder text', () => {
    const placeholder =
      "◉_◉ processing... I don't see any current rewritten thinking or next thinking to process. Could you provide the thinking content you'd like me to rewrite?"

    expect(coerceThinkingText(placeholder)).toBe('')
  })
})
|
||||
|
|
@ -2,6 +2,7 @@ import type { ThreadMessage } from '@assistant-ui/react'
|
|||
|
||||
import type { QuickModelOption } from '@/app/chat/composer/types'
|
||||
import type { ClientSessionState, CommandDispatchResponse } from '@/app/types'
|
||||
import { formatRefValue } from '@/components/assistant-ui/directive-text'
|
||||
import { type ChatMessage, type ChatMessagePart, chatMessageText, textPart } from '@/lib/chat-messages'
|
||||
import type { ComposerAttachment } from '@/store/composer'
|
||||
import type { ModelOptionsResponse, SessionInfo } from '@/types/hermes'
|
||||
|
|
@ -25,7 +26,11 @@ export const BUILTIN_PERSONALITIES = [
|
|||
'hype'
|
||||
]
|
||||
|
||||
const SPINNER_STATUS_RE = /^\s*[((][^\s))]{1,8}[))]\s+[^.\n]{2,48}\.\.\.\s*/
|
||||
const THINKING_STATUS_PREFIX_RE =
|
||||
/^\s*(?:(?:[^\s.]{1,16})\s+)?(?:processing|thinking|reasoning|analyzing|pondering|contemplating|musing|cogitating|ruminating|deliberating|mulling|reflecting|computing|synthesizing|formulating|brainstorming)\.\.\.\s*/i
|
||||
|
||||
const EMPTY_THINKING_PLACEHOLDER_RE =
|
||||
/\b(?:current rewritten thinking|next thinking to process|provide the thinking content|don't see any .*thinking)\b/i
|
||||
|
||||
export function createClientSessionState(
|
||||
storedSessionId: string | null = null,
|
||||
|
|
@ -102,7 +107,9 @@ export function coerceGatewayText(value: unknown): string {
|
|||
}
|
||||
|
||||
export function coerceThinkingText(value: unknown): string {
|
||||
return coerceGatewayText(value).replace(SPINNER_STATUS_RE, '').trim()
|
||||
const text = coerceGatewayText(value).replace(THINKING_STATUS_PREFIX_RE, '').trim()
|
||||
|
||||
return EMPTY_THINKING_PLACEHOLDER_RE.test(text) ? '' : text
|
||||
}
|
||||
|
||||
export function isImageGenerationTool(name?: string): boolean {
|
||||
|
|
@ -135,7 +142,7 @@ export function attachmentDisplayText(attachment: ComposerAttachment): string |
|
|||
if (attachment.kind === 'image') {
|
||||
const id = attachment.detail || attachment.path || attachment.label
|
||||
|
||||
return id ? `@image:${id}` : null
|
||||
return id ? `@image:${formatRefValue(id)}` : null
|
||||
}
|
||||
|
||||
return null
|
||||
|
|
|
|||
19
apps/desktop/src/lib/speech-text.ts
Normal file
19
apps/desktop/src/lib/speech-text.ts
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
// Pictographic code points plus the variation selector / zero-width joiner that glue emoji sequences.
const EMOJI_RE = /[\p{Extended_Pictographic}\uFE0F\u200D]+/gu
// A fenced block; an unterminated fence runs to the end of the text.
const FENCED_CODE_RE = /```[\s\S]*?(?:```|$)/g
const INLINE_CODE_RE = /`([^`]+)`/g
// Image syntax `![alt](target)`; alt may be empty. Must run before MARKDOWN_LINK_RE,
// which would otherwise consume everything except the leading "!".
const MARKDOWN_IMAGE_RE = /!\[([^\]]*)\]\([^)]*\)/g
const MARKDOWN_LINK_RE = /\[([^\]]+)\]\(([^)]+)\)/g
const URL_RE = /\bhttps?:\/\/\S+/gi

/**
 * Convert markdown-flavoured message text into plain text for speech synthesis.
 *
 * Steps, in order: drop fenced code blocks, reduce images to their alt text and
 * links to their label, unwrap inline code, replace bare URLs with the word
 * "link", remove emoji, strip heading markers, emphasis/quote characters and
 * list bullets, then collapse all whitespace to single spaces.
 *
 * @param text - Raw message text.
 * @returns The speakable text; an empty string when nothing speakable remains.
 */
export function sanitizeTextForSpeech(text: string): string {
  return text
    .replace(FENCED_CODE_RE, ' ')
    .replace(MARKDOWN_IMAGE_RE, '$1')
    .replace(MARKDOWN_LINK_RE, '$1')
    .replace(INLINE_CODE_RE, '$1')
    .replace(URL_RE, ' link ')
    .replace(EMOJI_RE, ' ')
    .replace(/^#{1,6}\s+/gm, '')
    .replace(/[*_~>#]/g, '')
    .replace(/^\s*[-+*]\s+/gm, '')
    .replace(/\s+/g, ' ')
    .trim()
}
|
||||
96
apps/desktop/src/lib/voice-playback.ts
Normal file
96
apps/desktop/src/lib/voice-playback.ts
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
import { speakText } from '@/hermes'
|
||||
import {
|
||||
$voicePlayback,
|
||||
setVoicePlaybackState,
|
||||
type VoicePlaybackSource,
|
||||
type VoicePlaybackState
|
||||
} from '@/store/voice-playback'
|
||||
|
||||
import { sanitizeTextForSpeech } from './speech-text'
|
||||
|
||||
/** Audio element for the playback in progress, or null when nothing is loaded. */
let currentAudio: HTMLAudioElement | null = null

/**
 * Playback generation counter. `stopVoicePlayback` increments it, and
 * `playSpeechText` captures it to detect that it has been superseded.
 */
let sequence = 0

/** Identifies who requested a playback and, optionally, for which message. */
export interface VoicePlaybackOptions {
  messageId?: string | null
  source: VoicePlaybackSource
}

/**
 * Build a store snapshot for `status`, stamped with the current `sequence`.
 * `messageId` and `source` fall back to null when `options` is omitted.
 */
function currentState(status: VoicePlaybackState['status'], options?: VoicePlaybackOptions): VoicePlaybackState {
  const messageId = options?.messageId ?? null
  const source = options?.source ?? null

  return { messageId, sequence, source, status }
}
|
||||
|
||||
/**
 * Halt any playback in progress and publish an idle state.
 *
 * Incrementing `sequence` first invalidates every in-flight `playSpeechText`
 * call, so none of them touches the store after this point.
 */
export function stopVoicePlayback() {
  sequence += 1

  const active = currentAudio

  if (active !== null) {
    active.pause()
    // Drop the source so the element lets go of the audio data.
    active.src = ''
    currentAudio = null
  }

  // No options: messageId and source are published as null.
  setVoicePlaybackState(currentState('idle'))
}
|
||||
|
||||
/**
 * Synthesize `text` and play it, replacing whatever is currently playing.
 *
 * The store goes through `preparing` (while `speakText` is pending) and
 * `speaking` (once the audio element exists), then back to `idle`.
 *
 * @param text - Raw message text; it is sanitized for speech before synthesis.
 * @param options - Source of the request and the message it belongs to.
 * @returns `true` when the audio played to the end; `false` when there was
 *   nothing speakable or this playback was superseded by a later
 *   `stopVoicePlayback` / `playSpeechText` call.
 * @throws When synthesis or playback fails while this playback is still the
 *   current one.
 */
export async function playSpeechText(text: string, options: VoicePlaybackOptions): Promise<boolean> {
  stopVoicePlayback()

  const speakableText = sanitizeTextForSpeech(text)

  if (!speakableText) {
    return false
  }

  // stopVoicePlayback() bumps `sequence`; a mismatch means someone else took over.
  const ownSequence = sequence
  const isCurrent = () => ownSequence === sequence

  setVoicePlaybackState(currentState('preparing', options))

  try {
    const response = await speakText(speakableText)

    if (!isCurrent()) {
      return false
    }

    const audio = new Audio(response.data_url)
    currentAudio = audio
    setVoicePlaybackState(currentState('speaking', options))

    await new Promise<void>((resolve, reject) => {
      audio.addEventListener('ended', () => resolve(), { once: true })
      audio.addEventListener('error', () => reject(new Error('Playback failed')), { once: true })
      void audio.play().catch(reject)
    })

    if (!isCurrent()) {
      return false
    }

    currentAudio = null
    setVoicePlaybackState(currentState('idle'))

    return true
  } catch (error) {
    if (!isCurrent()) {
      // A deliberate stop pauses the element and clears its src, which can
      // reject the promise above. That is a cancellation, not a failure, so
      // report it like every other superseded path instead of rethrowing.
      return false
    }

    currentAudio = null
    setVoicePlaybackState(currentState('idle'))

    throw error
  }
}
|
||||
|
||||
/** True while the playback store reports any status other than `idle`. */
export function isVoicePlaybackActive() {
  const { status } = $voicePlayback.get()

  return status !== 'idle'
}
|
||||
|
|
@ -50,6 +50,13 @@ const ERROR_SUMMARIES: { test: (msg: string) => boolean; summarize: (msg: string
|
|||
test: msg => /neither voice_tools_openai_key nor openai_api_key is set/i.test(msg),
|
||||
summarize: () => 'OpenAI TTS needs VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY.'
|
||||
},
|
||||
{
|
||||
test: msg => /ELEVENLABS_API_KEY not set/i.test(msg) || /ElevenLabs STT API error \(HTTP 401\)/i.test(msg),
|
||||
summarize: msg =>
|
||||
/ELEVENLABS_API_KEY not set/i.test(msg)
|
||||
? 'ElevenLabs STT needs ELEVENLABS_API_KEY.'
|
||||
: 'ElevenLabs rejected the API key (401).'
|
||||
},
|
||||
{
|
||||
test: msg => /method not allowed/i.test(msg),
|
||||
summarize: () => 'The desktop backend does not support that audio endpoint yet. Restart Hermes Desktop.'
|
||||
|
|
|
|||
22
apps/desktop/src/store/voice-playback.ts
Normal file
22
apps/desktop/src/store/voice-playback.ts
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import { atom } from 'nanostores'
|
||||
|
||||
/** Feature that requested the playback. */
export type VoicePlaybackSource = 'read-aloud' | 'voice-conversation'

/** Lifecycle phase of the shared voice playback. */
export type VoicePlaybackStatus = 'idle' | 'preparing' | 'speaking'

/** Snapshot published to subscribers of `$voicePlayback`. */
export interface VoicePlaybackState {
  messageId: string | null
  sequence: number
  source: VoicePlaybackSource | null
  status: VoicePlaybackStatus
}

// State before any playback has been requested.
const initialVoicePlaybackState: VoicePlaybackState = {
  messageId: null,
  sequence: 0,
  source: null,
  status: 'idle'
}

/** Store holding the current voice playback state. */
export const $voicePlayback = atom<VoicePlaybackState>(initialVoicePlaybackState)

/** Replace the whole playback state with `next`. */
export function setVoicePlaybackState(next: VoicePlaybackState) {
  $voicePlayback.set(next)
}
|
||||
|
|
@ -184,6 +184,29 @@ button {
|
|||
-webkit-app-region: no-drag;
|
||||
}
|
||||
|
||||
@keyframes voice-wave {
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.45;
|
||||
transform: scaleY(0.28);
|
||||
}
|
||||
|
||||
35% {
|
||||
opacity: 0.95;
|
||||
transform: scaleY(1);
|
||||
}
|
||||
|
||||
62% {
|
||||
opacity: 0.7;
|
||||
transform: scaleY(0.52);
|
||||
}
|
||||
}
|
||||
|
||||
.voice-wave-bar {
|
||||
animation: voice-wave 860ms ease-in-out infinite;
|
||||
transform-origin: center;
|
||||
}
|
||||
|
||||
.composer-liquid-shell-wrap {
|
||||
pointer-events: none;
|
||||
border-radius: var(--composer-glass-radius, 20px);
|
||||
|
|
|
|||
|
|
@ -168,6 +168,7 @@ export interface SessionRuntimeInfo {
|
|||
personality?: string
|
||||
provider?: string
|
||||
reasoning_effort?: string
|
||||
running?: boolean
|
||||
service_tier?: string
|
||||
skills?: Record<string, string[]> | string[]
|
||||
tools?: Record<string, string[]>
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import tailwindcss from '@tailwindcss/vite'
|
|||
import path from 'path'
|
||||
|
||||
export default defineConfig({
|
||||
base: './',
|
||||
plugins: [react(), tailwindcss()],
|
||||
resolve: {
|
||||
alias: {
|
||||
|
|
|
|||
|
|
@ -830,7 +830,7 @@ DEFAULT_CONFIG = {
|
|||
|
||||
"stt": {
|
||||
"enabled": True,
|
||||
"provider": "local", # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) | "mistral" (Voxtral Transcribe)
|
||||
"provider": "local", # "local" (free, faster-whisper) | "groq" | "openai" (Whisper API) | "mistral" (Voxtral Transcribe) | "elevenlabs" (Scribe)
|
||||
"local": {
|
||||
"model": "base", # tiny, base, small, medium, large-v3
|
||||
"language": "", # auto-detect by default; set to "en", "es", "fr", etc. to force
|
||||
|
|
@ -841,6 +841,12 @@ DEFAULT_CONFIG = {
|
|||
"mistral": {
|
||||
"model": "voxtral-mini-latest", # voxtral-mini-latest, voxtral-mini-2602
|
||||
},
|
||||
"elevenlabs": {
|
||||
"model_id": "scribe_v2", # scribe_v2, scribe_v1
|
||||
"language_code": "", # auto-detect by default; set to "eng", "spa", "fra", etc. to force
|
||||
"tag_audio_events": False,
|
||||
"diarize": False,
|
||||
},
|
||||
},
|
||||
|
||||
"voice": {
|
||||
|
|
@ -1791,9 +1797,10 @@ OPTIONAL_ENV_VARS = {
|
|||
"category": "tool",
|
||||
},
|
||||
"ELEVENLABS_API_KEY": {
|
||||
"description": "ElevenLabs API key for premium text-to-speech voices",
|
||||
"description": "ElevenLabs API key for premium text-to-speech voices and Scribe transcription",
|
||||
"prompt": "ElevenLabs API key",
|
||||
"url": "https://elevenlabs.io/",
|
||||
"tools": ["elevenlabs_tts", "voice_transcription"],
|
||||
"password": True,
|
||||
"category": "tool",
|
||||
},
|
||||
|
|
|
|||
|
|
@ -280,7 +280,12 @@ _SCHEMA_OVERRIDES: Dict[str, Dict[str, Any]] = {
|
|||
"stt.provider": {
|
||||
"type": "select",
|
||||
"description": "Speech-to-text provider",
|
||||
"options": ["local", "openai", "mistral"],
|
||||
"options": ["local", "groq", "openai", "mistral", "xai", "elevenlabs"],
|
||||
},
|
||||
"stt.elevenlabs.model_id": {
|
||||
"type": "select",
|
||||
"description": "ElevenLabs Scribe model",
|
||||
"options": ["scribe_v2", "scribe_v1"],
|
||||
},
|
||||
"display.skin": {
|
||||
"type": "select",
|
||||
|
|
|
|||
|
|
@ -24,6 +24,8 @@ def isolate_env(monkeypatch):
|
|||
"MISTRAL_API_KEY",
|
||||
"XAI_API_KEY",
|
||||
"XAI_STT_BASE_URL",
|
||||
"ELEVENLABS_API_KEY",
|
||||
"ELEVENLABS_STT_BASE_URL",
|
||||
):
|
||||
monkeypatch.delenv(key, raising=False)
|
||||
|
||||
|
|
@ -87,6 +89,15 @@ class TestProviderSelectionGate:
|
|||
return_value={"XAI_API_KEY": "dotenv-secret"}):
|
||||
assert tt._get_provider({"enabled": True, "provider": "xai"}) == "xai"
|
||||
|
||||
def test_explicit_elevenlabs_sees_dotenv(self):
|
||||
from tools import transcription_tools as tt
|
||||
|
||||
with patch.object(tt, "_HAS_FASTER_WHISPER", False), \
|
||||
patch.object(tt, "_has_local_command", return_value=False), \
|
||||
patch("hermes_cli.config.load_env",
|
||||
return_value={"ELEVENLABS_API_KEY": "dotenv-secret"}):
|
||||
assert tt._get_provider({"enabled": True, "provider": "elevenlabs"}) == "elevenlabs"
|
||||
|
||||
def test_auto_detect_sees_dotenv_groq(self):
|
||||
"""No local backend, no explicit provider — auto-detect should fall
|
||||
through to Groq when its key lives in dotenv only. Before the fix
|
||||
|
|
@ -193,6 +204,33 @@ class TestTranscribeCallSitesReadDotenv:
|
|||
assert result["success"] is True
|
||||
assert captured["headers"]["Authorization"] == "Bearer xai-dotenv-key"
|
||||
|
||||
def test_transcribe_elevenlabs_forwards_dotenv_key(self):
|
||||
from tools import transcription_tools as tt
|
||||
|
||||
captured: dict = {}
|
||||
|
||||
def fake_post(url, **kwargs):
|
||||
captured["url"] = url
|
||||
captured["headers"] = kwargs.get("headers", {})
|
||||
response = MagicMock()
|
||||
response.status_code = 200
|
||||
response.json.return_value = {"text": "hello"}
|
||||
return response
|
||||
|
||||
def fake_get_env_value(name, default=None):
|
||||
if name == "ELEVENLABS_API_KEY":
|
||||
return "elevenlabs-dotenv-key"
|
||||
return None
|
||||
|
||||
with patch.object(tt, "get_env_value", side_effect=fake_get_env_value), \
|
||||
patch.object(tt, "_load_stt_config", return_value={}), \
|
||||
patch("requests.post", side_effect=fake_post), \
|
||||
patch("builtins.open", MagicMock()):
|
||||
result = tt._transcribe_elevenlabs("/tmp/fake.mp3", "scribe_v2")
|
||||
|
||||
assert result["success"] is True
|
||||
assert captured["headers"]["xi-api-key"] == "elevenlabs-dotenv-key"
|
||||
|
||||
|
||||
class TestEndToEndRegressionGuard:
|
||||
"""End-to-end probe: patch ``hermes_cli.config.load_env`` to simulate
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ def clean_env(monkeypatch):
|
|||
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
|
||||
monkeypatch.delenv("GROQ_API_KEY", raising=False)
|
||||
monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
|
||||
monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
|
||||
monkeypatch.delenv("HERMES_LOCAL_STT_COMMAND", raising=False)
|
||||
monkeypatch.delenv("HERMES_LOCAL_STT_LANGUAGE", raising=False)
|
||||
|
||||
|
|
@ -1342,3 +1343,161 @@ class TestTranscribeAudioXAIDispatch:
|
|||
transcribe_audio(sample_ogg, model="custom-stt")
|
||||
|
||||
assert mock_xai.call_args[0][1] == "custom-stt"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# _transcribe_elevenlabs
|
||||
# ============================================================================
|
||||
|
||||
class TestTranscribeElevenLabs:
    """Tests for ``_transcribe_elevenlabs`` with ``requests.post`` mocked out."""

    @staticmethod
    def _fake_response(status_code, payload):
        """Return a response mock with the given status whose ``json()`` yields ``payload``."""
        fake = MagicMock()
        fake.status_code = status_code
        fake.json.return_value = payload
        return fake

    def test_no_key(self, monkeypatch):
        # Without a key the call must fail and name the missing variable.
        monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
        from tools.transcription_tools import _transcribe_elevenlabs

        outcome = _transcribe_elevenlabs("/tmp/test.ogg", "scribe_v2")

        assert outcome["success"] is False
        assert "ELEVENLABS_API_KEY" in outcome["error"]

    def test_successful_transcription(self, monkeypatch, sample_ogg):
        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key")

        fake = self._fake_response(200, {"text": "hello from elevenlabs"})
        stt_config = {
            "elevenlabs": {
                "language_code": "eng",
                "tag_audio_events": True,
                "diarize": True,
            }
        }

        with patch("tools.transcription_tools._load_stt_config", return_value=stt_config):
            with patch("requests.post", return_value=fake) as mock_post:
                from tools.transcription_tools import _transcribe_elevenlabs
                result = _transcribe_elevenlabs(sample_ogg, "scribe_v2")

        assert result["success"] is True
        assert result["transcript"] == "hello from elevenlabs"
        assert result["provider"] == "elevenlabs"

        # The key travels in the xi-api-key header; config options go out as form fields.
        sent = mock_post.call_args.kwargs
        assert sent["headers"]["xi-api-key"] == "eleven-test-key"
        form = sent["data"]
        assert form["model_id"] == "scribe_v2"
        assert form["language_code"] == "eng"
        assert form["tag_audio_events"] == "true"
        assert form["diarize"] == "true"

    def test_api_error_returns_failure(self, monkeypatch, sample_ogg):
        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key")

        fake = self._fake_response(401, {"detail": {"message": "Invalid API key"}})
        fake.text = '{"detail": {"message": "Invalid API key"}}'

        with patch("tools.transcription_tools._load_stt_config", return_value={}):
            with patch("requests.post", return_value=fake):
                from tools.transcription_tools import _transcribe_elevenlabs
                result = _transcribe_elevenlabs(sample_ogg, "scribe_v2")

        assert result["success"] is False
        assert "HTTP 401" in result["error"]
        assert "Invalid API key" in result["error"]

    def test_empty_transcript_returns_failure(self, monkeypatch, sample_ogg):
        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test-key")

        # A whitespace-only transcript is reported as a failure.
        fake = self._fake_response(200, {"text": "   "})

        with patch("tools.transcription_tools._load_stt_config", return_value={}):
            with patch("requests.post", return_value=fake):
                from tools.transcription_tools import _transcribe_elevenlabs
                result = _transcribe_elevenlabs(sample_ogg, "scribe_v2")

        assert result["success"] is False
        assert "empty transcript" in result["error"]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# _get_provider — ElevenLabs
|
||||
# ============================================================================
|
||||
|
||||
class TestGetProviderElevenLabs:
    """Provider selection tests specific to ElevenLabs."""

    @staticmethod
    def _backend_flag_patches():
        """Patchers that turn off the local backends and the OpenAI/Mistral availability flags."""
        return (
            patch("tools.transcription_tools._HAS_FASTER_WHISPER", False),
            patch("tools.transcription_tools._has_local_command", return_value=False),
            patch("tools.transcription_tools._HAS_OPENAI", False),
            patch("tools.transcription_tools._HAS_MISTRAL", False),
        )

    def test_elevenlabs_when_key_set(self, monkeypatch):
        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test")
        from tools.transcription_tools import _get_provider

        selected = _get_provider({"provider": "elevenlabs"})
        assert selected == "elevenlabs"

    def test_elevenlabs_explicit_no_key_returns_none(self, monkeypatch):
        """An explicit elevenlabs provider without a key yields "none" (no cross-provider fallback)."""
        monkeypatch.delenv("ELEVENLABS_API_KEY", raising=False)
        from tools.transcription_tools import _get_provider

        selected = _get_provider({"provider": "elevenlabs"})
        assert selected == "none"

    def test_auto_detect_elevenlabs_after_xai(self, monkeypatch):
        """Auto-detect falls through to elevenlabs when nothing ahead of it is available."""
        for earlier_key in (
            "GROQ_API_KEY",
            "VOICE_TOOLS_OPENAI_KEY",
            "OPENAI_API_KEY",
            "MISTRAL_API_KEY",
            "XAI_API_KEY",
        ):
            monkeypatch.delenv(earlier_key, raising=False)
        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test")

        no_whisper, no_command, no_openai, no_mistral = self._backend_flag_patches()
        with no_whisper, no_command, no_openai, no_mistral:
            from tools.transcription_tools import _get_provider
            assert _get_provider({}) == "elevenlabs"

    def test_auto_detect_xai_preferred_over_elevenlabs(self, monkeypatch):
        """Auto-detect picks xai over elevenlabs when both keys are set."""
        monkeypatch.setenv("XAI_API_KEY", "xai-test")
        monkeypatch.setenv("ELEVENLABS_API_KEY", "eleven-test")

        no_whisper, no_command, no_openai, no_mistral = self._backend_flag_patches()
        with no_whisper, no_command, no_openai, no_mistral:
            from tools.transcription_tools import _get_provider
            assert _get_provider({}) == "xai"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# transcribe_audio — ElevenLabs dispatch
|
||||
# ============================================================================
|
||||
|
||||
class TestTranscribeAudioElevenLabsDispatch:
    """Checks how ``transcribe_audio`` hands work to the ElevenLabs backend."""

    def test_dispatches_to_elevenlabs(self, sample_ogg):
        """The ElevenLabs backend is called once and its result is returned."""
        stub_config = patch(
            "tools.transcription_tools._load_stt_config",
            return_value={"provider": "elevenlabs"},
        )
        stub_provider = patch("tools.transcription_tools._get_provider", return_value="elevenlabs")
        stub_backend = patch(
            "tools.transcription_tools._transcribe_elevenlabs",
            return_value={"success": True, "transcript": "hi", "provider": "elevenlabs"},
        )
        with stub_config, stub_provider, stub_backend as backend_mock:
            from tools.transcription_tools import transcribe_audio
            outcome = transcribe_audio(sample_ogg)

        assert outcome["success"] is True
        assert outcome["provider"] == "elevenlabs"
        backend_mock.assert_called_once()

    def test_config_elevenlabs_model_used(self, sample_ogg):
        """With no explicit model, the configured ``model_id`` is passed on."""
        config = {"provider": "elevenlabs", "elevenlabs": {"model_id": "scribe_v1"}}
        stub_config = patch("tools.transcription_tools._load_stt_config", return_value=config)
        stub_provider = patch("tools.transcription_tools._get_provider", return_value="elevenlabs")
        stub_backend = patch(
            "tools.transcription_tools._transcribe_elevenlabs",
            return_value={"success": True, "transcript": "hi"},
        )
        with stub_config, stub_provider, stub_backend as backend_mock:
            from tools.transcription_tools import transcribe_audio
            transcribe_audio(sample_ogg, model=None)

        # Second positional argument of the backend call is the model name.
        assert backend_mock.call_args[0][1] == "scribe_v1"

    def test_model_override_passed_to_elevenlabs(self, sample_ogg):
        """An explicit ``model`` argument is forwarded to the backend."""
        stub_config = patch("tools.transcription_tools._load_stt_config", return_value={})
        stub_provider = patch("tools.transcription_tools._get_provider", return_value="elevenlabs")
        stub_backend = patch(
            "tools.transcription_tools._transcribe_elevenlabs",
            return_value={"success": True, "transcript": "hi"},
        )
        with stub_config, stub_provider, stub_backend as backend_mock:
            from tools.transcription_tools import transcribe_audio
            transcribe_audio(sample_ogg, model="scribe_v2")

        # Second positional argument of the backend call is the model name.
        assert backend_mock.call_args[0][1] == "scribe_v2"
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ Provides speech-to-text transcription with six providers:
|
|||
- **mistral** — Mistral Voxtral Transcribe API, requires ``MISTRAL_API_KEY``.
|
||||
- **xai** — xAI Grok STT API, requires ``XAI_API_KEY``. High accuracy,
|
||||
Inverse Text Normalization, diarization, 21 languages.
|
||||
- **elevenlabs** — ElevenLabs Scribe API, requires ``ELEVENLABS_API_KEY``.
|
||||
|
||||
Used by the messaging gateway to automatically transcribe voice messages
|
||||
sent by users on Telegram, Discord, WhatsApp, Slack, and Signal.
|
||||
|
|
@ -84,6 +85,7 @@ DEFAULT_LOCAL_STT_LANGUAGE = "en"
|
|||
DEFAULT_STT_MODEL = os.getenv("STT_OPENAI_MODEL", "whisper-1")
|
||||
DEFAULT_GROQ_STT_MODEL = os.getenv("STT_GROQ_MODEL", "whisper-large-v3-turbo")
|
||||
DEFAULT_MISTRAL_STT_MODEL = os.getenv("STT_MISTRAL_MODEL", "voxtral-mini-latest")
|
||||
DEFAULT_ELEVENLABS_STT_MODEL = os.getenv("STT_ELEVENLABS_MODEL", "scribe_v2")
|
||||
LOCAL_STT_COMMAND_ENV = "HERMES_LOCAL_STT_COMMAND"
|
||||
LOCAL_STT_LANGUAGE_ENV = "HERMES_LOCAL_STT_LANGUAGE"
|
||||
COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
|
||||
|
|
@ -91,6 +93,7 @@ COMMON_LOCAL_BIN_DIRS = ("/opt/homebrew/bin", "/usr/local/bin")
|
|||
GROQ_BASE_URL = os.getenv("GROQ_BASE_URL", "https://api.groq.com/openai/v1")
|
||||
OPENAI_BASE_URL = os.getenv("STT_OPENAI_BASE_URL", "https://api.openai.com/v1")
|
||||
XAI_STT_BASE_URL = os.getenv("XAI_STT_BASE_URL", "https://api.x.ai/v1")
|
||||
ELEVENLABS_STT_BASE_URL = os.getenv("ELEVENLABS_STT_BASE_URL", "https://api.elevenlabs.io/v1")
|
||||
|
||||
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg", ".aac", ".flac"}
|
||||
LOCAL_NATIVE_AUDIO_FORMATS = {".wav", ".aiff", ".aif"}
|
||||
|
|
@ -268,9 +271,17 @@ def _get_provider(stt_config: dict) -> str:
|
|||
)
|
||||
return "none"
|
||||
|
||||
if provider == "elevenlabs":
|
||||
if get_env_value("ELEVENLABS_API_KEY"):
|
||||
return "elevenlabs"
|
||||
logger.warning(
|
||||
"STT provider 'elevenlabs' configured but ELEVENLABS_API_KEY not set"
|
||||
)
|
||||
return "none"
|
||||
|
||||
return provider # Unknown — let it fail downstream
|
||||
|
||||
# --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai -
|
||||
# --- Auto-detect (no explicit provider): local > groq > openai > mistral > xai > elevenlabs -
|
||||
|
||||
if _HAS_FASTER_WHISPER:
|
||||
return "local"
|
||||
|
|
@ -288,6 +299,9 @@ def _get_provider(stt_config: dict) -> str:
|
|||
if get_env_value("XAI_API_KEY"):
|
||||
logger.info("No local STT available, using xAI Grok STT API")
|
||||
return "xai"
|
||||
if get_env_value("ELEVENLABS_API_KEY"):
|
||||
logger.info("No local STT available, using ElevenLabs Scribe STT API")
|
||||
return "elevenlabs"
|
||||
return "none"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -781,6 +795,92 @@ def _transcribe_xai(file_path: str, model_name: str) -> Dict[str, Any]:
|
|||
return {"success": False, "transcript": "", "error": f"xAI STT transcription failed: {e}"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider: ElevenLabs (Scribe STT API)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _transcribe_elevenlabs(file_path: str, model_name: str) -> Dict[str, Any]:
    """Transcribe an audio file with the ElevenLabs Scribe speech-to-text API.

    Optional settings are read from the ``elevenlabs`` section of the STT
    config: ``base_url``, ``language_code``, ``tag_audio_events`` and
    ``diarize``.  The base URL falls back to the ``ELEVENLABS_STT_BASE_URL``
    environment value and then to the module-level default.

    Args:
        file_path: Path of the audio file to upload.
        model_name: Value sent as the ``model_id`` form field.

    Returns:
        A dict with ``success`` and ``transcript``.  On success it also
        carries ``provider`` set to ``"elevenlabs"``; on failure it carries
        ``error`` describing what went wrong.  File and request errors are
        returned as failure dicts rather than raised.
    """
    api_key = get_env_value("ELEVENLABS_API_KEY")
    if not api_key:
        return {"success": False, "transcript": "", "error": "ELEVENLABS_API_KEY not set"}

    # The section may be present but null (e.g. a bare ``elevenlabs:`` key in
    # YAML) or not a mapping at all; ``.get("elevenlabs", {})`` would hand back
    # that value and the lookups below would raise outside the try block.
    stt_config = _load_stt_config() or {}
    elevenlabs_config = stt_config.get("elevenlabs")
    if not isinstance(elevenlabs_config, dict):
        elevenlabs_config = {}

    base_url = str(
        elevenlabs_config.get("base_url")
        or get_env_value("ELEVENLABS_STT_BASE_URL")
        or ELEVENLABS_STT_BASE_URL
    ).strip().rstrip("/")
    language_code = str(elevenlabs_config.get("language_code") or "").strip()
    tag_audio_events = is_truthy_value(elevenlabs_config.get("tag_audio_events", False))
    diarize = is_truthy_value(elevenlabs_config.get("diarize", False))

    try:
        # Imported inside the try so a missing package becomes a failure dict.
        import requests

        # Multipart form fields are strings, so booleans are spelled out.
        data: Dict[str, str] = {
            "model_id": model_name,
            "tag_audio_events": "true" if tag_audio_events else "false",
            "diarize": "true" if diarize else "false",
        }
        if language_code:
            data["language_code"] = language_code

        with open(file_path, "rb") as audio_file:
            response = requests.post(
                f"{base_url}/speech-to-text",
                headers={"xi-api-key": api_key},
                files={"file": (Path(file_path).name, audio_file)},
                data=data,
                timeout=120,
            )

        if response.status_code != 200:
            # Prefer a structured message from the body; fall back to raw text.
            detail = ""
            try:
                err_body = response.json()
                error_value = err_body.get("detail") or err_body.get("error")
                if isinstance(error_value, dict):
                    detail = str(error_value.get("message") or error_value)
                elif error_value:
                    detail = str(error_value)
                else:
                    detail = response.text[:300]
            except Exception:
                # Body was not JSON or not a mapping.
                detail = response.text[:300]
            return {
                "success": False,
                "transcript": "",
                "error": f"ElevenLabs STT API error (HTTP {response.status_code}): {detail}",
            }

        result = response.json()
        transcript_text = _extract_transcript_text(result)
        if not transcript_text:
            return {
                "success": False,
                "transcript": "",
                "error": "ElevenLabs STT returned empty transcript",
            }

        logger.info(
            "Transcribed %s via ElevenLabs Scribe (%s, %d chars)",
            Path(file_path).name,
            model_name,
            len(transcript_text),
        )

        return {"success": True, "transcript": transcript_text, "provider": "elevenlabs"}

    except PermissionError:
        return {"success": False, "transcript": "", "error": f"Permission denied: {file_path}"}
    except Exception as e:
        logger.error("ElevenLabs STT transcription failed: %s", e, exc_info=True)
        return {"success": False, "transcript": "", "error": f"ElevenLabs STT transcription failed: {e}"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -792,7 +892,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
|
||||
Provider priority:
|
||||
1. User config (``stt.provider`` in config.yaml)
|
||||
2. Auto-detect: local faster-whisper (free) > Groq (free tier) > OpenAI (paid)
|
||||
2. Auto-detect: local > Groq > OpenAI > Mistral > xAI > ElevenLabs
|
||||
|
||||
Args:
|
||||
file_path: Absolute path to the audio file to transcribe.
|
||||
|
|
@ -854,6 +954,11 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
model_name = model or "grok-stt"
|
||||
return _transcribe_xai(file_path, model_name)
|
||||
|
||||
if provider == "elevenlabs":
|
||||
elevenlabs_cfg = stt_config.get("elevenlabs", {})
|
||||
model_name = model or elevenlabs_cfg.get("model_id", DEFAULT_ELEVENLABS_STT_MODEL)
|
||||
return _transcribe_elevenlabs(file_path, model_name)
|
||||
|
||||
# No provider available
|
||||
return {
|
||||
"success": False,
|
||||
|
|
@ -862,8 +967,9 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A
|
|||
"No STT provider available. Install faster-whisper for free local "
|
||||
f"transcription, configure {LOCAL_STT_COMMAND_ENV} or install a local whisper CLI, "
|
||||
"set GROQ_API_KEY for free Groq Whisper, set MISTRAL_API_KEY for Mistral "
|
||||
"Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, or set VOICE_TOOLS_OPENAI_KEY "
|
||||
"or OPENAI_API_KEY for the OpenAI Whisper API."
|
||||
"Voxtral Transcribe, set XAI_API_KEY for xAI Grok STT, set ELEVENLABS_API_KEY "
|
||||
"for ElevenLabs Scribe, or set VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY for "
|
||||
"the OpenAI Whisper API."
|
||||
),
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1409,6 +1409,7 @@ def _session_info(agent, session: dict | None = None) -> dict:
|
|||
"cwd": cwd,
|
||||
"branch": _git_branch_for_cwd(cwd),
|
||||
"personality": str(personality or ""),
|
||||
"running": bool((session or {}).get("running")),
|
||||
"version": "",
|
||||
"release_date": "",
|
||||
"update_behind": None,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue