feat(tui): match CLI's voice slash + VAD-continuous recording model

The TUI had drifted from the CLI's voice model in two ways:

- /voice on was lighting up the microphone immediately and Ctrl+B was
  interpreted as a mode toggle.  The CLI separates the two: /voice on
  just flips the umbrella bit, recording only starts once the user
  presses Ctrl+B, which also sets _voice_continuous so the VAD loop
  auto-restarts until the user presses Ctrl+B again or three silent
  cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply
  speech on/off from inside the TUI.

This commit brings the TUI to parity.

Python

- hermes_cli/voice.py: continuous-mode API (start_continuous,
  stop_continuous, is_continuous_active) layered on the existing PTT
  wrappers. The silence callback transcribes, fires on_transcript,
  tracks consecutive no-speech cycles, and auto-restarts — mirroring
  cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
  - voice.toggle now supports on / off / tts / status.  The umbrella
    bit lives in HERMES_VOICE + display.voice_enabled; tts lives in
    HERMES_VOICE_TTS + display.voice_tts.  /voice off also tears down
    any active continuous loop so a toggle-off really releases the
    microphone.
  - voice.record start/stop now drives start_continuous/stop_continuous.
    start is refused with a clear error when the mode is off, matching
    cli.py:handle_voice_record's early return on `not _voice_mode`.
  - New voice.transcript / voice.status events emit through
    _voice_emit (remembers the sid that last enabled the mode so
    events land in the right session).

TypeScript

- gatewayTypes.ts: voice.status + voice.transcript event
  discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse
  gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput +
  submission.submitRef + voice.{setRecording, setProcessing,
  setVoiceEnabled}; InputHandlerContext.voice gains enabled +
  setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges;
  voice.transcript auto-submits when the composer is empty (CLI
  _pending_input.put parity) and appends when a draft is in flight.
  no_speech_limit flips voice off + sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop),
  not voice.toggle, and nudges the user with a sys line when the
  mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status
  with CLI-matching output ("voice: mode on · tts off").

Backward compat preserved for voice.record (was always PTT shape;
gateway still honours start/stop with mode-gating added).
This commit is contained in:
0xbyt4 2026-04-24 00:55:17 +03:00 committed by Teknium
parent 0bb460b070
commit 04c489b587
10 changed files with 861 additions and 78 deletions

View file

@@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
composer: {
dequeue: () => undefined,
queueEditRef: ref<null | number>(null),
sendQueued: vi.fn()
sendQueued: vi.fn(),
setInput: vi.fn()
},
gateway: {
gw: { request: vi.fn() },
@@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
resumeById: vi.fn(),
setCatalog: vi.fn()
},
submission: {
submitRef: { current: vi.fn() }
},
system: {
bellOnComplete: false,
sys: vi.fn()
@@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
panel: (title: string, sections: any[]) =>
appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
setHistoryItems: vi.fn()
},
voice: {
setProcessing: vi.fn(),
setRecording: vi.fn(),
setVoiceEnabled: vi.fn()
}
}) as any

View file

@@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
const { bellOnComplete, stdout, sys } = ctx.system
const { appendMessage, panel, setHistoryItems } = ctx.transcript
const { setInput } = ctx.composer
const { submitRef } = ctx.submission
const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice
let pendingThinkingStatus = ''
let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
@@ -261,6 +264,60 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
return
}
case 'voice.status': {
// Continuous VAD loop reports its internal state so the status bar
// can show listening / transcribing / idle without polling.
const state = String(ev.payload?.state ?? '')
if (state === 'listening') {
setVoiceRecording(true)
setVoiceProcessing(false)
} else if (state === 'transcribing') {
setVoiceRecording(false)
setVoiceProcessing(true)
} else {
setVoiceRecording(false)
setVoiceProcessing(false)
}
return
}
case 'voice.transcript': {
// CLI parity: the 3-strikes silence detector flipped off automatically.
// Mirror that on the UI side and tell the user why the mode is off.
if (ev.payload?.no_speech_limit) {
setVoiceEnabled(false)
setVoiceRecording(false)
setVoiceProcessing(false)
sys('voice: no speech detected 3 times, continuous mode stopped')
return
}
const text = String(ev.payload?.text ?? '').trim()
if (!text) {
return
}
// Match CLI's _pending_input.put(transcript): auto-submit when the
// composer is empty, otherwise append so the user can keep editing
// a partial draft they were working on.
setInput(prev => {
if (!prev) {
// defer submit so React commits the state change first
setTimeout(() => submitRef.current(text), 0)
return ''
}
return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
})
return
}
case 'gateway.start_timeout': {
const { cwd, python } = ev.payload ?? {}
const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''

View file

@@ -189,9 +189,11 @@ export interface InputHandlerContext {
stdout?: NodeJS.WriteStream
}
voice: {
enabled: boolean
recording: boolean
setProcessing: StateSetter<boolean>
setRecording: StateSetter<boolean>
setVoiceEnabled: StateSetter<boolean>
}
wheelStep: number
}
@@ -201,6 +203,9 @@ export interface InputHandlerResult {
}
export interface GatewayEventHandlerContext {
composer: {
setInput: StateSetter<string>
}
gateway: GatewayServices
session: {
STARTUP_RESUME_ID: string
@@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
resumeById: (id: string) => void
setCatalog: StateSetter<null | SlashCatalog>
}
submission: {
submitRef: MutableRefObject<(value: string) => void>
}
system: {
bellOnComplete: boolean
stdout?: NodeJS.WriteStream
@@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
panel: (title: string, sections: PanelSection[]) => void
setHistoryItems: StateSetter<Msg[]>
}
voice: {
setProcessing: StateSetter<boolean>
setRecording: StateSetter<boolean>
setVoiceEnabled: StateSetter<boolean>
}
}
export interface SlashHandlerContext {

View file

@@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
},
{
help: 'toggle voice input',
help: 'voice mode: [on|off|tts|status]',
name: 'voice',
run: (arg, ctx) => {
const action = arg === 'on' || arg === 'off' ? arg : 'status'
const normalized = (arg ?? '').trim().toLowerCase()
const action =
normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
? normalized
: 'status'
ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
ctx.guarded<VoiceToggleResponse>(r => {
ctx.voice.setVoiceEnabled(!!r.enabled)
ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`)
// Match CLI's _show_voice_status / _enable_voice_mode /
// _toggle_voice_tts output shape so users don't have to learn
// two vocabularies.
if (action === 'status') {
const mode = r.enabled ? 'ON' : 'OFF'
const tts = r.tts ? 'ON' : 'OFF'
ctx.transcript.sys('Voice Mode Status')
ctx.transcript.sys(` Mode: ${mode}`)
ctx.transcript.sys(` TTS: ${tts}`)
ctx.transcript.sys(' Record key: Ctrl+B')
// CLI's "Requirements:" block — surfaces STT/audio setup issues
// so the user sees "STT provider: MISSING ..." instead of
// silently failing on every Ctrl+B press.
if (r.details) {
ctx.transcript.sys('')
ctx.transcript.sys(' Requirements:')
for (const line of r.details.split('\n')) {
if (line.trim()) {
ctx.transcript.sys(` ${line}`)
}
}
}
return
}
if (action === 'tts') {
ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
return
}
// on/off — mirror cli.py:_enable_voice_mode's 3-line output
if (r.enabled) {
const tts = r.tts ? ' (TTS enabled)' : ''
ctx.transcript.sys(`Voice mode enabled${tts}`)
ctx.transcript.sys(' Ctrl+B to start/stop recording')
ctx.transcript.sys(' /voice tts to toggle speech output')
ctx.transcript.sys(' /voice off to disable voice mode')
} else {
ctx.transcript.sys('Voice mode disabled.')
}
})
)
}

View file

@@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
}
}
const voiceStop = () => {
voice.setRecording(false)
voice.setProcessing(true)
// CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
// (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
// Ctrl+B while the mode is off sys-nudges the user. While the mode is
// on, the first press starts a continuous loop (gateway → start_continuous,
// VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
// The gateway publishes voice.status + voice.transcript events that
// createGatewayEventHandler turns into UI badges and composer injection.
const voiceRecordToggle = () => {
if (!voice.enabled) {
return actions.sys('voice: mode is off — enable with /voice on')
}
const starting = !voice.recording
const action = starting ? 'start' : 'stop'
// Optimistic UI — flip the REC badge immediately so the user gets
// feedback while the RPC round-trips; the voice.status event is the
// authoritative source and may correct us.
if (starting) {
voice.setRecording(true)
} else {
voice.setRecording(false)
voice.setProcessing(false)
}
gateway
.rpc<VoiceRecordResponse>('voice.record', { action: 'stop' })
.then(r => {
if (!r) {
return
.rpc<VoiceRecordResponse>('voice.record', { action })
.catch((e: Error) => {
// Revert optimistic UI on failure.
if (starting) {
voice.setRecording(false)
}
const transcript = String(r.text || '').trim()
if (!transcript) {
return actions.sys('voice: no speech detected')
}
cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript))
})
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
.finally(() => {
voice.setProcessing(false)
patchUiState({ status: 'ready' })
actions.sys(`voice error: ${e.message}`)
})
}
const voiceStart = () =>
gateway
.rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
.then(r => {
if (!r) {
return
}
voice.setRecording(true)
patchUiState({ status: 'recording…' })
})
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
useInput((ch, key) => {
const live = getUiState()
@@ -371,7 +369,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
}
if (isVoiceToggleKey(key, ch)) {
return voice.recording ? voiceStop() : voiceStart()
return voiceRecordToggle()
}
if (isAction(key, ch, 'g')) {

View file

@@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
composer: { actions: composerActions, refs: composerRefs, state: composerState },
gateway,
terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
voice: {
enabled: voiceEnabled,
recording: voiceRecording,
setProcessing: setVoiceProcessing,
setRecording: setVoiceRecording,
setVoiceEnabled
},
wheelStep: WHEEL_SCROLL_STEP
})
const onEvent = useMemo(
() =>
createGatewayEventHandler({
composer: { setInput: composerActions.setInput },
gateway,
session: {
STARTUP_RESUME_ID,
@@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
resumeById: session.resumeById,
setCatalog
},
submission: { submitRef },
system: { bellOnComplete, stdout, sys },
transcript: { appendMessage, panel, setHistoryItems }
transcript: { appendMessage, panel, setHistoryItems },
voice: {
setProcessing: setVoiceProcessing,
setRecording: setVoiceRecording,
setVoiceEnabled
}
}),
[
appendMessage,
bellOnComplete,
composerActions.setInput,
gateway,
panel,
session.newSession,
session.resetSession,
session.resumeById,
setVoiceEnabled,
setVoiceProcessing,
setVoiceRecording,
stdout,
submitRef,
sys
]
)

View file

@@ -236,10 +236,16 @@ export interface ImageAttachResponse {
// ── Voice ────────────────────────────────────────────────────────────
export interface VoiceToggleResponse {
audio_available?: boolean
available?: boolean
details?: string
enabled?: boolean
stt_available?: boolean
tts?: boolean
}
export interface VoiceRecordResponse {
status?: string
text?: string
}
@@ -368,6 +374,8 @@ export type GatewayEvent =
| { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
| { payload?: undefined; session_id?: string; type: 'message.start' }
| { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
| { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
| { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
| { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
| { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
| { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }