feat(tui): match CLI's voice slash + VAD-continuous recording model

The TUI had drifted from the CLI's voice model in two ways:

- /voice on was lighting up the microphone immediately and Ctrl+B was
  interpreted as a mode toggle.  The CLI separates the two: /voice on
  just flips the umbrella bit, recording only starts once the user
  presses Ctrl+B, which also sets _voice_continuous so the VAD loop
  auto-restarts until the user presses Ctrl+B again or three silent
  cycles pass.
- /voice tts was missing entirely, so users couldn't turn agent reply
  speech on/off from inside the TUI.

This commit brings the TUI to parity.

Python

- hermes_cli/voice.py: continuous-mode API (start_continuous,
  stop_continuous, is_continuous_active) layered on the existing PTT
  wrappers. The silence callback transcribes, fires on_transcript,
  tracks consecutive no-speech cycles, and auto-restarts — mirroring
  cli.py:_voice_stop_and_transcribe + _restart_recording.
- tui_gateway/server.py:
  - voice.toggle now supports on / off / tts / status.  The umbrella
    bit lives in HERMES_VOICE + display.voice_enabled; tts lives in
    HERMES_VOICE_TTS + display.voice_tts.  /voice off also tears down
    any active continuous loop so a toggle-off really releases the
    microphone.
  - voice.record start/stop now drives start_continuous/stop_continuous.
    start is refused with a clear error when the mode is off, matching
    cli.py:handle_voice_record's early return on `not _voice_mode`.
  - New voice.transcript / voice.status events emit through
    _voice_emit (remembers the sid that last enabled the mode so
    events land in the right session).

TypeScript

- gatewayTypes.ts: voice.status + voice.transcript event
  discriminants; VoiceToggleResponse gains tts; VoiceRecordResponse
  gains status for the new "started/stopped" responses.
- interfaces.ts: GatewayEventHandlerContext gains composer.setInput +
  submission.submitRef + voice.{setRecording, setProcessing,
  setVoiceEnabled}; InputHandlerContext.voice gains enabled +
  setVoiceEnabled for the mode-aware Ctrl+B handler.
- createGatewayEventHandler.ts: voice.status drives REC/STT badges;
  voice.transcript auto-submits when the composer is empty (CLI
  _pending_input.put parity) and appends when a draft is in flight.
  no_speech_limit flips voice off + sys line.
- useInputHandlers.ts: Ctrl+B now calls voice.record (start/stop),
  not voice.toggle, and nudges the user with a sys line when the
  mode is off instead of silently flipping it on.
- useMainApp.ts: wires the new event-handler context fields.
- slash/commands/session.ts: /voice handles on / off / tts / status
  with CLI-matching output ("voice: mode on · tts off").

Backward compat preserved for voice.record (was always PTT shape;
gateway still honours start/stop with mode-gating added).
This commit is contained in:
0xbyt4 2026-04-24 00:55:17 +03:00 committed by Teknium
parent 0bb460b070
commit 04c489b587
10 changed files with 861 additions and 78 deletions

View file

@@ -15,7 +15,8 @@ const buildCtx = (appended: Msg[]) =>
composer: {
dequeue: () => undefined,
queueEditRef: ref<null | number>(null),
sendQueued: vi.fn()
sendQueued: vi.fn(),
setInput: vi.fn()
},
gateway: {
gw: { request: vi.fn() },
@@ -29,6 +30,9 @@ const buildCtx = (appended: Msg[]) =>
resumeById: vi.fn(),
setCatalog: vi.fn()
},
submission: {
submitRef: { current: vi.fn() }
},
system: {
bellOnComplete: false,
sys: vi.fn()
@@ -38,6 +42,11 @@ const buildCtx = (appended: Msg[]) =>
panel: (title: string, sections: any[]) =>
appended.push({ kind: 'panel', panelData: { sections, title }, role: 'system', text: '' }),
setHistoryItems: vi.fn()
},
voice: {
setProcessing: vi.fn(),
setRecording: vi.fn(),
setVoiceEnabled: vi.fn()
}
}) as any

View file

@@ -51,6 +51,9 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
const { STARTUP_RESUME_ID, newSession, resumeById, setCatalog } = ctx.session
const { bellOnComplete, stdout, sys } = ctx.system
const { appendMessage, panel, setHistoryItems } = ctx.transcript
const { setInput } = ctx.composer
const { submitRef } = ctx.submission
const { setProcessing: setVoiceProcessing, setRecording: setVoiceRecording, setVoiceEnabled } = ctx.voice
let pendingThinkingStatus = ''
let thinkingStatusTimer: null | ReturnType<typeof setTimeout> = null
@@ -261,6 +264,60 @@ export function createGatewayEventHandler(ctx: GatewayEventHandlerContext): (ev:
return
}
case 'voice.status': {
// Continuous VAD loop reports its internal state so the status bar
// can show listening / transcribing / idle without polling.
const state = String(ev.payload?.state ?? '')
if (state === 'listening') {
setVoiceRecording(true)
setVoiceProcessing(false)
} else if (state === 'transcribing') {
setVoiceRecording(false)
setVoiceProcessing(true)
} else {
setVoiceRecording(false)
setVoiceProcessing(false)
}
return
}
case 'voice.transcript': {
// CLI parity: the 3-strikes silence detector flipped off automatically.
// Mirror that on the UI side and tell the user why the mode is off.
if (ev.payload?.no_speech_limit) {
setVoiceEnabled(false)
setVoiceRecording(false)
setVoiceProcessing(false)
sys('voice: no speech detected 3 times, continuous mode stopped')
return
}
const text = String(ev.payload?.text ?? '').trim()
if (!text) {
return
}
// Match CLI's _pending_input.put(transcript): auto-submit when the
// composer is empty, otherwise append so the user can keep editing
// a partial draft they were working on.
setInput(prev => {
if (!prev) {
// defer submit so React commits the state change first
setTimeout(() => submitRef.current(text), 0)
return ''
}
return `${prev}${/\s$/.test(prev) ? '' : ' '}${text}`
})
return
}
case 'gateway.start_timeout': {
const { cwd, python } = ev.payload ?? {}
const trace = python || cwd ? ` · ${String(python || '')} ${String(cwd || '')}`.trim() : ''

View file

@@ -189,9 +189,11 @@ export interface InputHandlerContext {
stdout?: NodeJS.WriteStream
}
voice: {
enabled: boolean
recording: boolean
setProcessing: StateSetter<boolean>
setRecording: StateSetter<boolean>
setVoiceEnabled: StateSetter<boolean>
}
wheelStep: number
}
@@ -201,6 +203,9 @@ export interface InputHandlerResult {
}
export interface GatewayEventHandlerContext {
composer: {
setInput: StateSetter<string>
}
gateway: GatewayServices
session: {
STARTUP_RESUME_ID: string
@@ -210,6 +215,9 @@ export interface GatewayEventHandlerContext {
resumeById: (id: string) => void
setCatalog: StateSetter<null | SlashCatalog>
}
submission: {
submitRef: MutableRefObject<(value: string) => void>
}
system: {
bellOnComplete: boolean
stdout?: NodeJS.WriteStream
@@ -220,6 +228,11 @@ export interface GatewayEventHandlerContext {
panel: (title: string, sections: PanelSection[]) => void
setHistoryItems: StateSetter<Msg[]>
}
voice: {
setProcessing: StateSetter<boolean>
setRecording: StateSetter<boolean>
setVoiceEnabled: StateSetter<boolean>
}
}
export interface SlashHandlerContext {

View file

@@ -184,15 +184,64 @@ export const sessionCommands: SlashCommand[] = [
},
{
help: 'toggle voice input',
help: 'voice mode: [on|off|tts|status]',
name: 'voice',
run: (arg, ctx) => {
const action = arg === 'on' || arg === 'off' ? arg : 'status'
const normalized = (arg ?? '').trim().toLowerCase()
const action =
normalized === 'on' || normalized === 'off' || normalized === 'tts' || normalized === 'status'
? normalized
: 'status'
ctx.gateway.rpc<VoiceToggleResponse>('voice.toggle', { action }).then(
ctx.guarded<VoiceToggleResponse>(r => {
ctx.voice.setVoiceEnabled(!!r.enabled)
ctx.transcript.sys(`voice: ${r.enabled ? 'on — press Ctrl+B to record' : 'off'}`)
// Match CLI's _show_voice_status / _enable_voice_mode /
// _toggle_voice_tts output shape so users don't have to learn
// two vocabularies.
if (action === 'status') {
const mode = r.enabled ? 'ON' : 'OFF'
const tts = r.tts ? 'ON' : 'OFF'
ctx.transcript.sys('Voice Mode Status')
ctx.transcript.sys(` Mode: ${mode}`)
ctx.transcript.sys(` TTS: ${tts}`)
ctx.transcript.sys(' Record key: Ctrl+B')
// CLI's "Requirements:" block — surfaces STT/audio setup issues
// so the user sees "STT provider: MISSING ..." instead of
// silently failing on every Ctrl+B press.
if (r.details) {
ctx.transcript.sys('')
ctx.transcript.sys(' Requirements:')
for (const line of r.details.split('\n')) {
if (line.trim()) {
ctx.transcript.sys(` ${line}`)
}
}
}
return
}
if (action === 'tts') {
ctx.transcript.sys(`Voice TTS ${r.tts ? 'enabled' : 'disabled'}.`)
return
}
// on/off — mirror cli.py:_enable_voice_mode's 3-line output
if (r.enabled) {
const tts = r.tts ? ' (TTS enabled)' : ''
ctx.transcript.sys(`Voice mode enabled${tts}`)
ctx.transcript.sys(' Ctrl+B to start/stop recording')
ctx.transcript.sys(' /voice tts to toggle speech output')
ctx.transcript.sys(' /voice off to disable voice mode')
} else {
ctx.transcript.sys('Voice mode disabled.')
}
})
)
}

View file

@@ -134,45 +134,43 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
}
}
const voiceStop = () => {
voice.setRecording(false)
voice.setProcessing(true)
// CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop
// (NOT the voice-mode umbrella bit). The mode is enabled via /voice on;
// Ctrl+B while the mode is off sys-nudges the user. While the mode is
// on, the first press starts a continuous loop (gateway → start_continuous,
// VAD auto-stop → transcribe → auto-restart), a subsequent press stops it.
// The gateway publishes voice.status + voice.transcript events that
// createGatewayEventHandler turns into UI badges and composer injection.
const voiceRecordToggle = () => {
if (!voice.enabled) {
return actions.sys('voice: mode is off — enable with /voice on')
}
const starting = !voice.recording
const action = starting ? 'start' : 'stop'
// Optimistic UI — flip the REC badge immediately so the user gets
// feedback while the RPC round-trips; the voice.status event is the
// authoritative source and may correct us.
if (starting) {
voice.setRecording(true)
} else {
voice.setRecording(false)
voice.setProcessing(false)
}
gateway
.rpc<VoiceRecordResponse>('voice.record', { action: 'stop' })
.then(r => {
if (!r) {
return
.rpc<VoiceRecordResponse>('voice.record', { action })
.catch((e: Error) => {
// Revert optimistic UI on failure.
if (starting) {
voice.setRecording(false)
}
const transcript = String(r.text || '').trim()
if (!transcript) {
return actions.sys('voice: no speech detected')
}
cActions.setInput(prev => (prev ? `${prev}${/\s$/.test(prev) ? '' : ' '}${transcript}` : transcript))
})
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
.finally(() => {
voice.setProcessing(false)
patchUiState({ status: 'ready' })
actions.sys(`voice error: ${e.message}`)
})
}
const voiceStart = () =>
gateway
.rpc<VoiceRecordResponse>('voice.record', { action: 'start' })
.then(r => {
if (!r) {
return
}
voice.setRecording(true)
patchUiState({ status: 'recording…' })
})
.catch((e: Error) => actions.sys(`voice error: ${e.message}`))
useInput((ch, key) => {
const live = getUiState()
@@ -371,7 +369,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult {
}
if (isVoiceToggleKey(key, ch)) {
return voice.recording ? voiceStop() : voiceStart()
return voiceRecordToggle()
}
if (isAction(key, ch, 'g')) {

View file

@@ -454,13 +454,20 @@ export function useMainApp(gw: GatewayClient) {
composer: { actions: composerActions, refs: composerRefs, state: composerState },
gateway,
terminal: { hasSelection, scrollRef, scrollWithSelection, selection, stdout },
voice: { recording: voiceRecording, setProcessing: setVoiceProcessing, setRecording: setVoiceRecording },
voice: {
enabled: voiceEnabled,
recording: voiceRecording,
setProcessing: setVoiceProcessing,
setRecording: setVoiceRecording,
setVoiceEnabled
},
wheelStep: WHEEL_SCROLL_STEP
})
const onEvent = useMemo(
() =>
createGatewayEventHandler({
composer: { setInput: composerActions.setInput },
gateway,
session: {
STARTUP_RESUME_ID,
@@ -470,18 +477,29 @@ export function useMainApp(gw: GatewayClient) {
resumeById: session.resumeById,
setCatalog
},
submission: { submitRef },
system: { bellOnComplete, stdout, sys },
transcript: { appendMessage, panel, setHistoryItems }
transcript: { appendMessage, panel, setHistoryItems },
voice: {
setProcessing: setVoiceProcessing,
setRecording: setVoiceRecording,
setVoiceEnabled
}
}),
[
appendMessage,
bellOnComplete,
composerActions.setInput,
gateway,
panel,
session.newSession,
session.resetSession,
session.resumeById,
setVoiceEnabled,
setVoiceProcessing,
setVoiceRecording,
stdout,
submitRef,
sys
]
)

View file

@@ -236,10 +236,16 @@ export interface ImageAttachResponse {
// ── Voice ────────────────────────────────────────────────────────────
export interface VoiceToggleResponse {
audio_available?: boolean
available?: boolean
details?: string
enabled?: boolean
stt_available?: boolean
tts?: boolean
}
export interface VoiceRecordResponse {
status?: string
text?: string
}
@@ -368,6 +374,8 @@ export type GatewayEvent =
| { payload?: { text?: string }; session_id?: string; type: 'thinking.delta' }
| { payload?: undefined; session_id?: string; type: 'message.start' }
| { payload?: { kind?: string; text?: string }; session_id?: string; type: 'status.update' }
| { payload?: { state?: 'idle' | 'listening' | 'transcribing' }; session_id?: string; type: 'voice.status' }
| { payload?: { no_speech_limit?: boolean; text?: string }; session_id?: string; type: 'voice.transcript' }
| { payload: { line: string }; session_id?: string; type: 'gateway.stderr' }
| { payload?: { cwd?: string; python?: string }; session_id?: string; type: 'gateway.start_timeout' }
| { payload?: { preview?: string }; session_id?: string; type: 'gateway.protocol_error' }