mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tui): match CLI's voice slash + VAD-continuous recording model
The TUI had drifted from the CLI's voice model in two ways:
- /voice on was lighting up the microphone immediately and Ctrl+B was
interpreted as a mode toggle. The CLI separates the two: /voice on
just flips the umbrella bit, recording only starts once the user
presses Ctrl+B, which also sets _voice_continuous so the VAD loop
auto-restarts until the user presses Ctrl+B again or three silent
cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply
speech on/off from inside the TUI.
This commit brings the TUI to parity.
Python
- hermes_cli/voice.py: continuous-mode API (start_continuous,
stop_continuous, is_continuous_active) layered on the existing PTT
wrappers. The silence callback transcribes, fires on_transcript,
tracks consecutive no-speech cycles, and auto-restarts — mirroring
cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
- voice.toggle now supports on / off / tts / status. The umbrella
bit lives in HERMES_VOICE + display.voice_enabled; tts lives in
HERMES_VOICE_TTS + display.voice_tts. /voice off also tears down
any active continuous loop so a toggle-off really releases the
microphone.
- voice.record start/stop now drives start_continuous/stop_continuous.
start is refused with a clear error when the mode is off, matching
cli.py:handle_voice_record's early return on `not _voice_mode`.
- New voice.transcript / voice.status events emit through
_voice_emit (remembers the sid that last enabled the mode so
events land in the right session).
TypeScript
- gatewayTypes.ts: voice.status + voice.transcript event
discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse
gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput +
submission.submitRef + voice.{setRecording, setProcessing,
setVoiceEnabled}; InputHandlerContext.voice gains enabled +
setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges;
voice.transcript auto-submits when the composer is empty (CLI
_pending_input.put parity) and appends when a draft is in flight.
A no_speech_limit payload flips voice off and prints a sys line explaining why.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop),
not voice.toggle, and nudges the user with a sys line when the
mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status
with CLI-matching output ("Voice mode enabled", "Voice TTS enabled.", and a "Voice Mode Status" block for status).
Backward compat preserved for voice.record (was always PTT shape;
gateway still honours start/stop with mode-gating added).
This commit is contained in:
parent
0bb460b070
commit
04c489b587
10 changed files with 861 additions and 78 deletions
|
|
@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
|
|||
composer: {
|
||||
dequeue: () => undefined,
|
||||
queueEditRef: ref<null | number>(null),
|
||||
sendQueued: vi.fn()
|
||||
sendQueued: vi.fn(),
|
||||
setInput: vi.fn()
|
||||
},
|
||||
gateway: {
|
||||
gw: { request: vi.fn() },
|
||||
|
|
@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
|
|||
resumeById: vi.fn(),
|
||||
setCatalog: vi.fn()
|
||||
},
|
||||
submission: {
|
||||
submitRef: { current: vi.fn() }
|
||||
},
|
||||
system: {
|
||||
bellOnComplete: false,
|
||||
sys: vi.fn()
|
||||
|
|
@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
|
|||
panel: (title: string, sections: any[]) =>
|
||||
appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
|
||||
setHistoryItems: vi.fn()
|
||||
},
|
||||
voice: {
|
||||
setProcessing: vi.fn(),
|
||||
setRecording: vi.fn(),
|
||||
setVoiceEnabled: vi.fn()
|
||||
}
|
||||
}) as any
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
|||
const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
|
||||
const { bellOnComplete, stdout, sys } = ctx.system
|
||||
const { appendMessage, panel, setHistoryItems } = ctx.transcript
|
||||
const { setInput } = ctx.composer
|
||||
const { submitRef } = ctx.submission
|
||||
const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice
|
||||
|
||||
let pendingThinkingStatus = ''
|
||||
let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
|
||||
|
|
@ -261,6 +264,60 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
|
|||
return
|
||||
}
|
||||
|
||||
case 'voice.status': {
|
||||
// Continuous VAD loop reports its internal state so the status bar
|
||||
// can show listening / transcribing / idle without polling.
|
||||
const state = String(ev.payload?.state ?? '')
|
||||
|
||||
if (state === 'listening') {
|
||||
setVoiceRecording(true)
|
||||
setVoiceProcessing(false)
|
||||
} else if (state === 'transcribing') {
|
||||
setVoiceRecording(false)
|
||||
setVoiceProcessing(true)
|
||||
} else {
|
||||
setVoiceRecording(false)
|
||||
setVoiceProcessing(false)
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
case 'voice.transcript': {
|
||||
// CLI parity: the 3-strikes silence detector turned voice mode off automatically.
|
||||
// Mirror that on the UI side and tell the user why the mode is off.
|
||||
if (ev.payload?.no_speech_limit) {
|
||||
setVoiceEnabled(false)
|
||||
setVoiceRecording(false)
|
||||
setVoiceProcessing(false)
|
||||
sys('voice: no speech detected 3 times, continuous mode stopped')
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
const text = String(ev.payload?.text ?? '').trim()
|
||||
|
||||
if (!text) {
|
||||
return
|
||||
}
|
||||
|
||||
// Match CLI's _pending_input.put(transcript): auto-submit when the
|
||||
// composer is empty, otherwise append so the user can keep editing
|
||||
// a partial draft they were working on.
|
||||
setInput(prev => {
|
||||
if (!prev) {
|
||||
// defer submit so React commits the state change first
|
||||
setTimeout(() => submitRef.current(text), 0)
|
||||
|
||||
return ''
|
||||
}
|
||||
|
||||
return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
|
||||
})
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
case 'gateway.start_timeout': {
|
||||
const { cwd, python } = ev.payload ?? {}
|
||||
const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''
|
||||
|
|
|
|||
|
|
@ -189,9 +189,11 @@ export interface InputHandlerContext {
|
|||
stdout?: NodeJS.WriteStream
|
||||
}
|
||||
voice: {
|
||||
enabled: boolean
|
||||
recording: boolean
|
||||
setProcessing: StateSetter<boolean>
|
||||
setRecording: StateSetter<boolean>
|
||||
setVoiceEnabled: StateSetter<boolean>
|
||||
}
|
||||
wheelStep: number
|
||||
}
|
||||
|
|
@ -201,6 +203,9 @@ export interface InputHandlerResult {
|
|||
}
|
||||
|
||||
export interface GatewayEventHandlerContext {
|
||||
composer: {
|
||||
setInput: StateSetter<string>
|
||||
}
|
||||
gateway: GatewayServices
|
||||
session: {
|
||||
STARTUP_RESUME_ID: string
|
||||
|
|
@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
|
|||
resumeById: (id: string) => void
|
||||
setCatalog: StateSetter<null | SlashCatalog>
|
||||
}
|
||||
submission: {
|
||||
submitRef: MutableRefObject<(value: string) => void>
|
||||
}
|
||||
system: {
|
||||
bellOnComplete: boolean
|
||||
stdout?: NodeJS.WriteStream
|
||||
|
|
@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
|
|||
panel: (title: string, sections: PanelSection[]) => void
|
||||
setHistoryItems: StateSetter<Msg[]>
|
||||
}
|
||||
voice: {
|
||||
setProcessing: StateSetter<boolean>
|
||||
setRecording: StateSetter<boolean>
|
||||
setVoiceEnabled: StateSetter<boolean>
|
||||
}
|
||||
}
|
||||
|
||||
export interface SlashHandlerContext {
|
||||
|
|
|
|||
|
|
@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
|
|||
},
|
||||
|
||||
{
|
||||
help: 'toggle voice input',
|
||||
help: 'voice mode: [on|off|tts|status]',
|
||||
name: 'voice',
|
||||
run: (arg, ctx) => {
|
||||
const action = arg === 'on' || arg === 'off' ? arg : 'status'
|
||||
const normalized = (arg ?? '').trim().toLowerCase()
|
||||
|
||||
const action =
|
||||
normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
|
||||
? normalized
|
||||
: 'status'
|
||||
|
||||
ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
|
||||
ctx.guarded<VoiceToggleResponse>(r => {
|
||||
ctx.voice.setVoiceEnabled(!!r.enabled)
|
||||
ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`)
|
||||
|
||||
// Match CLI's _show_voice_status / _enable_voice_mode /
|
||||
// _toggle_voice_tts output shape so users don't have to learn
|
||||
// two vocabularies.
|
||||
if (action === 'status') {
|
||||
const mode = r.enabled ? 'ON' : 'OFF'
|
||||
const tts = r.tts ? 'ON' : 'OFF'
|
||||
ctx.transcript.sys('Voice Mode Status')
|
||||
ctx.transcript.sys(` Mode: ${mode}`)
|
||||
ctx.transcript.sys(` TTS: ${tts}`)
|
||||
ctx.transcript.sys(' Record key: Ctrl+B')
|
||||
|
||||
// CLI's "Requirements:" block — surfaces STT/audio setup issues
|
||||
// so the user sees "STT provider: MISSING ..." instead of
|
||||
// silently failing on every Ctrl+B press.
|
||||
if (r.details) {
|
||||
ctx.transcript.sys('')
|
||||
ctx.transcript.sys(' Requirements:')
|
||||
|
||||
for (const line of r.details.split('\n')) {
|
||||
if (line.trim()) {
|
||||
ctx.transcript.sys(` ${line}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
if (action === 'tts') {
|
||||
ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// on/off — mirror cli.py:_enable_voice_mode's 3-line output
|
||||
if (r.enabled) {
|
||||
const tts = r.tts ? ' (TTS enabled)' : ''
|
||||
ctx.transcript.sys(`Voice mode enabled${tts}`)
|
||||
ctx.transcript.sys(' Ctrl+B to start/stop recording')
|
||||
ctx.transcript.sys(' /voice tts to toggle speech output')
|
||||
ctx.transcript.sys(' /voice off to disable voice mode')
|
||||
} else {
|
||||
ctx.transcript.sys('Voice mode disabled.')
|
||||
}
|
||||
})
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
|
|||
}
|
||||
}
|
||||
|
||||
const voiceStop = () => {
|
||||
voice.setRecording(false)
|
||||
voice.setProcessing(true)
|
||||
// CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
|
||||
// (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
|
||||
// Ctrl+B while the mode is off sys-nudges the user. While the mode is
|
||||
// on, the first press starts a continuous loop (gateway → start_continuous,
|
||||
// VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
|
||||
// The gateway publishes voice.status + voice.transcript events that
|
||||
// createGatewayEventHandler turns into UI badges and composer injection.
|
||||
const voiceRecordToggle = () => {
|
||||
if (!voice.enabled) {
|
||||
return actions.sys('voice: mode is off — enable with /voice on')
|
||||
}
|
||||
|
||||
const starting = !voice.recording
|
||||
const action = starting ? 'start' : 'stop'
|
||||
|
||||
// Optimistic UI — flip the REC badge immediately so the user gets
|
||||
// feedback while the RPC round-trips; the voice.status event is the
|
||||
// authoritative source and may correct us.
|
||||
if (starting) {
|
||||
voice.setRecording(true)
|
||||
} else {
|
||||
voice.setRecording(false)
|
||||
voice.setProcessing(false)
|
||||
}
|
||||
|
||||
gateway
|
||||
.rpc<VoiceRecordResponse>('voice.record', { action: 'stop' })
|
||||
.then(r => {
|
||||
if (!r) {
|
||||
return
|
||||
.rpc<VoiceRecordResponse>('voice.record', { action })
|
||||
.catch((e: Error) => {
|
||||
// Revert optimistic UI on failure.
|
||||
if (starting) {
|
||||
voice.setRecording(false)
|
||||
}
|
||||
|
||||
const transcript = String(r.text || '').trim()
|
||||
|
||||
if (!transcript) {
|
||||
return actions.sys('voice: no speech detected')
|
||||
}
|
||||
|
||||
cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript))
|
||||
})
|
||||
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
|
||||
.finally(() => {
|
||||
voice.setProcessing(false)
|
||||
patchUiState({ status: 'ready' })
|
||||
actions.sys(`voice error: ${e.message}`)
|
||||
})
|
||||
}
|
||||
|
||||
const voiceStart = () =>
|
||||
gateway
|
||||
.rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
|
||||
.then(r => {
|
||||
if (!r) {
|
||||
return
|
||||
}
|
||||
|
||||
voice.setRecording(true)
|
||||
patchUiState({ status: 'recording…' })
|
||||
})
|
||||
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
|
||||
|
||||
useInput((ch, key) => {
|
||||
const live = getUiState()
|
||||
|
||||
|
|
@ -371,7 +369,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
|
|||
}
|
||||
|
||||
if (isVoiceToggleKey(key, ch)) {
|
||||
return voice.recording ? voiceStop() : voiceStart()
|
||||
return voiceRecordToggle()
|
||||
}
|
||||
|
||||
if (isAction(key, ch, 'g')) {
|
||||
|
|
|
|||
|
|
@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
|
|||
composer: { actions: composerActions, refs: composerRefs, state: composerState },
|
||||
gateway,
|
||||
terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
|
||||
voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
|
||||
voice: {
|
||||
enabled: voiceEnabled,
|
||||
recording: voiceRecording,
|
||||
setProcessing: setVoiceProcessing,
|
||||
setRecording: setVoiceRecording,
|
||||
setVoiceEnabled
|
||||
},
|
||||
wheelStep: WHEEL_SCROLL_STEP
|
||||
})
|
||||
|
||||
const onEvent = useMemo(
|
||||
() =>
|
||||
createGatewayEventHandler({
|
||||
composer: { setInput: composerActions.setInput },
|
||||
gateway,
|
||||
session: {
|
||||
STARTUP_RESUME_ID,
|
||||
|
|
@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
|
|||
resumeById: session.resumeById,
|
||||
setCatalog
|
||||
},
|
||||
submission: { submitRef },
|
||||
system: { bellOnComplete, stdout, sys },
|
||||
transcript: { appendMessage, panel, setHistoryItems }
|
||||
transcript: { appendMessage, panel, setHistoryItems },
|
||||
voice: {
|
||||
setProcessing: setVoiceProcessing,
|
||||
setRecording: setVoiceRecording,
|
||||
setVoiceEnabled
|
||||
}
|
||||
}),
|
||||
[
|
||||
appendMessage,
|
||||
bellOnComplete,
|
||||
composerActions.setInput,
|
||||
gateway,
|
||||
panel,
|
||||
session.newSession,
|
||||
session.resetSession,
|
||||
session.resumeById,
|
||||
setVoiceEnabled,
|
||||
setVoiceProcessing,
|
||||
setVoiceRecording,
|
||||
stdout,
|
||||
submitRef,
|
||||
sys
|
||||
]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -236,10 +236,16 @@ export interface ImageAttachResponse {
|
|||
// ── Voice ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Response payload of the `voice.toggle` gateway RPC issued by the `/voice`
 * slash command. Every field is optional.
 *
 * - `enabled`: whether voice mode is on; the `/voice` handler copies it into
 *   UI state and uses it to pick the "enabled" / "disabled" message.
 * - `tts`: whether spoken replies are on; rendered as the TTS line of the
 *   status output and in the `/voice tts` confirmation.
 * - `details`: newline-separated text; `/voice status` prints each non-blank
 *   line under a "Requirements:" heading.
 * - `audio_available`, `available`, `stt_available`: not read by the `/voice`
 *   handler; presumably capability flags reported by the gateway — TODO
 *   confirm against the server.
 */
export interface VoiceToggleResponse {
|
||||
audio_available?: boolean
|
||||
available?: boolean
|
||||
details?: string
|
||||
enabled?: boolean
|
||||
stt_available?: boolean
|
||||
tts?: boolean
|
||||
}
|
||||
|
||||
/**
 * Response payload of the `voice.record` gateway RPC, which is called with
 * an `action` of `'start'` or `'stop'`. Both fields are optional.
 *
 * - `status`: presumably "started" / "stopped" for the start/stop actions —
 *   TODO confirm against the gateway.
 * - `text`: transcript text when the gateway returns one directly —
 *   NOTE(review): verify whether the gateway still populates this field.
 */
export interface VoiceRecordResponse {
|
||||
status?: string
|
||||
text?: string
|
||||
}
|
||||
|
||||
|
|
@ -368,6 +374,8 @@ export type GatewayEvent =
|
|||
| { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
|
||||
| { payload?: undefined; session_id?: string; type: 'message.start' }
|
||||
| { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
|
||||
| { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
|
||||
| { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
|
||||
| { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
|
||||
| { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
|
||||
| { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue