fix(tui): harden voice.record + TextInput paste + super-mod reserved list

Three round-7 Copilot follow-ups on #19835:

- voice.record start handler used _load_cfg().get('voice', {}).get(...) without
  shape checks, so malformed YAML (bool/scalar/list) returned 5025 instead of
  using VAD defaults. Centralized _voice_cfg_dict() helper and type-guarded
  silence_threshold/silence_duration with numeric fallbacks.
- TextInput pass-through check moved above paste/copy handling so configured
  voice chords (ctrl+v / alt+v / cmd+v) beat the composer's paste/copy
  defaults.
- parser now also rejects super+{c,d,l,v} — on macOS those are
  copy/exit/clear/paste and would be advertised in /voice status but never
  actually toggle recording.
This commit is contained in:
Brooklyn Nicholson 2026-05-04 15:15:54 -05:00
parent ff4e478fcc
commit be8223130b
5 changed files with 100 additions and 22 deletions

View file

@ -156,6 +156,42 @@ def test_voice_toggle_handles_non_dict_voice_cfg(monkeypatch):
)
def test_voice_record_start_handles_non_dict_voice_cfg(monkeypatch):
"""Round-7 Copilot review regression on #19835.
The ``voice.record`` start path previously read
``_load_cfg().get("voice", {}).get(...)`` without any shape checks.
When ``voice`` is a non-dict (bool/scalar/list) ``get`` raises
AttributeError and the handler returns 5025 instead of falling
back to the VAD defaults. Now it uses ``_voice_cfg_dict()`` and
non-numeric silence values are coerced to the documented defaults.
"""
captured: dict = {}
def fake_start_continuous(**kwargs):
captured.update(kwargs)
monkeypatch.setitem(
sys.modules,
"hermes_cli.voice",
types.SimpleNamespace(start_continuous=fake_start_continuous, stop_continuous=lambda: None),
)
monkeypatch.setenv("HERMES_VOICE", "1")
for bad in (True, "cmd+b", None, 42, ["ctrl+b"], {"silence_threshold": "loud"}):
captured.clear()
monkeypatch.setattr(server, "_load_cfg", lambda b=bad: {"voice": b})
resp = server.dispatch(
{"id": "voice-record", "method": "voice.record", "params": {"action": "start"}}
)
assert "result" in resp, f"voice.record raised for voice={bad!r}: {resp.get('error')}"
assert resp["result"]["status"] == "recording"
assert captured["silence_threshold"] == 200
assert captured["silence_duration"] == 3.0
def test_voice_toggle_tts_branch_also_carries_record_key(monkeypatch):
"""Round-2 Copilot review regression on #19835.

View file

@ -5278,22 +5278,26 @@ def _voice_tts_enabled() -> bool:
return os.environ.get("HERMES_VOICE_TTS", "").strip() == "1"
def _voice_record_key() -> str:
"""Current ``voice.record_key`` value, documented default on error.
def _voice_cfg_dict() -> dict:
"""Shape-safe accessor for the ``voice:`` block in config.yaml.
``_load_cfg()`` returns raw ``yaml.safe_load()`` output, so both the
root AND ``voice`` may be any YAML scalar / list / None. A hand-edit
like ``voice: true`` or ``voice: cmd+b`` (string where a dict is
expected) or even a malformed top-level config that parses to a
scalar would otherwise break ``.get("record_key")`` and take every
``voice.toggle`` branch down with it (Copilot round-3/round-4
review on #19835). Coerce through ``isinstance`` at every level so
malformed config falls back to the documented default instead of
crashing /voice.
like ``voice: true`` or a malformed top-level config that parses to
a scalar would otherwise break ``.get("")`` and take every
``voice.*`` branch down with it (Copilot round-3..7 review on
#19835). Coerce through ``isinstance`` at every level so malformed
config falls back to an empty dict instead of crashing /voice.
"""
cfg = _load_cfg()
voice_cfg = cfg.get("voice") if isinstance(cfg, dict) else None
record_key = voice_cfg.get("record_key") if isinstance(voice_cfg, dict) else None
return voice_cfg if isinstance(voice_cfg, dict) else {}
def _voice_record_key() -> str:
"""Current ``voice.record_key`` value, documented default on error."""
record_key = _voice_cfg_dict().get("record_key")
return str(record_key) if isinstance(record_key, str) and record_key else "ctrl+b"
@ -5419,15 +5423,19 @@ def _(rid, params: dict) -> dict:
from hermes_cli.voice import start_continuous
voice_cfg = _load_cfg().get("voice", {})
# Shape-safe lookups: malformed ``voice:`` YAML (bool/scalar/list)
# must not crash /voice with a 5025 — fall back to VAD defaults.
voice_cfg = _voice_cfg_dict()
threshold = voice_cfg.get("silence_threshold")
duration = voice_cfg.get("silence_duration")
start_continuous(
on_transcript=lambda t: _voice_emit("voice.transcript", {"text": t}),
on_status=lambda s: _voice_emit("voice.status", {"state": s}),
on_silent_limit=lambda: _voice_emit(
"voice.transcript", {"no_speech_limit": True}
),
silence_threshold=voice_cfg.get("silence_threshold", 200),
silence_duration=voice_cfg.get("silence_duration", 3.0),
silence_threshold=threshold if isinstance(threshold, (int, float)) else 200,
silence_duration=duration if isinstance(duration, (int, float)) else 3.0,
)
return _ok(rid, {"status": "recording"})

View file

@ -207,10 +207,23 @@ describe('parseVoiceRecordKey (#18994)', () => {
expect(parseVoiceRecordKey('ctrl+c')).toEqual(DEFAULT_VOICE_RECORD_KEY)
expect(parseVoiceRecordKey('ctrl+d')).toEqual(DEFAULT_VOICE_RECORD_KEY)
expect(parseVoiceRecordKey('ctrl+l')).toEqual(DEFAULT_VOICE_RECORD_KEY)
// Alt- and super-modifier versions of those letters are NOT
// intercepted, so they remain usable.
// Alt-modifier versions of those letters are NOT intercepted, so
// they remain usable.
expect(parseVoiceRecordKey('alt+c').mod).toBe('alt')
expect(parseVoiceRecordKey('super+l').mod).toBe('super')
})
it('rejects super+{c,d,l,v} — mac action-mod chords are claimed before voice', async () => {
const { DEFAULT_VOICE_RECORD_KEY, parseVoiceRecordKey } = await importPlatform('linux')
// On macOS super+c/d/l/v are copy / exit / clear / paste. Reject at
// parse time so /voice status doesn't advertise dead bindings.
expect(parseVoiceRecordKey('super+c')).toEqual(DEFAULT_VOICE_RECORD_KEY)
expect(parseVoiceRecordKey('super+d')).toEqual(DEFAULT_VOICE_RECORD_KEY)
expect(parseVoiceRecordKey('super+l')).toEqual(DEFAULT_VOICE_RECORD_KEY)
expect(parseVoiceRecordKey('super+v')).toEqual(DEFAULT_VOICE_RECORD_KEY)
// Other super letters still work (no global chord claims them).
expect(parseVoiceRecordKey('super+b').mod).toBe('super')
expect(parseVoiceRecordKey('super+o').mod).toBe('super')
})
// Round-5 Copilot review regressions on #19835.

View file

@ -707,6 +707,15 @@ export function TextInput({
(inp: string, k: Key, event: InputEvent) => {
const eventRaw = event.keypress.raw
// Configured voice shortcut wins over composer-level defaults like
// paste/copy so users who bind voice to ctrl+v / alt+v / cmd+v
// actually get voice toggled instead of a paste (Copilot round-7
// follow-up on #19835). The pass-through predicate is a no-op for
// ordinary typing and plain paste when voice is unbound to 'v'.
if (shouldPassThroughToGlobalHandler(inp, k, voiceRecordKey)) {
return
}
if (
eventRaw === '\x1bv' ||
eventRaw === '\x1bV' ||
@ -752,12 +761,6 @@ export function TextInput({
return
}
// Chords claimed by useInputHandlers — pass through instead of letting
// them fall into text-editing behavior here.
if (shouldPassThroughToGlobalHandler(inp, k, voiceRecordKey)) {
return
}
if (k.return) {
if (k.shift || k.ctrl || (isMac ? isActionMod(k) : k.meta)) {
flushParentChange()

View file

@ -138,6 +138,16 @@ const _NAMED_KEY_ALIASES: Record<string, VoiceRecordKeyNamed> = {
* round-4 review on #19835). */
const _RESERVED_CTRL_CHARS = new Set(['c', 'd', 'l'])
/** On macOS the action-modifier also intercepts standard editor chords
* via ``isCopyShortcut`` / ``isAction`` in ``useInputHandlers()``:
* - super+c copy
* - super+d exit
* - super+l clear screen
* - super+v paste (also claimed at the TextInput layer)
* Reject at parse time for the same reason as ``_RESERVED_CTRL_CHARS``
* (Copilot round-7 review on #19835). */
const _RESERVED_SUPER_CHARS = new Set(['c', 'd', 'l', 'v'])
interface RuntimeKeyEvent {
alt?: boolean
backspace?: boolean
@ -247,6 +257,14 @@ export const parseVoiceRecordKey = (raw: unknown): ParsedVoiceRecordKey => {
return DEFAULT_VOICE_RECORD_KEY
}
// Same for ``super+c`` / ``super+d`` / ``super+l`` / ``super+v`` —
// on macOS those are copy / exit / clear / paste and get claimed
// by ``isCopyShortcut`` / ``isAction`` / the TextInput paste layer
// before voice has a chance to toggle.
if (mod === 'super' && last.length === 1 && _RESERVED_SUPER_CHARS.has(last)) {
return DEFAULT_VOICE_RECORD_KEY
}
if (last.length === 1) {
return { ch: last, mod, raw: lower }
}