diff --git a/tests/plugins/transcription/check_parity_vs_main.py b/tests/plugins/transcription/check_parity_vs_main.py index 2a0ac85dc8d..c6ad8370bcf 100644 --- a/tests/plugins/transcription/check_parity_vs_main.py +++ b/tests/plugins/transcription/check_parity_vs_main.py @@ -1,10 +1,9 @@ -"""Behavior-parity check for the STT plugin hook (follow-up to #30398). +"""Behavior-parity check for the STT plugin hook + command-provider registry. Spawns one subprocess per (version, scenario) cell — pinned to either -``origin/main`` (no plugin hook; ``stt.provider: openrouter`` falls -through to the "No STT provider available" error path) or this PR's -worktree (plugin hook present; same config routes through the plugin -registry when a plugin is registered). +``origin/main`` (no plugin hook, no STT command-provider registry; only +the legacy ``HERMES_LOCAL_STT_COMMAND`` escape hatch exists) or this PR's +worktree (both new surfaces present). Each subprocess clears all STT-related env vars + writes a ``config.yaml``, then asks the dispatcher how it would route a @@ -14,13 +13,18 @@ Each subprocess clears all STT-related env vars + writes a Where ``dispatch_kind`` ∈ ``{"builtin_local", "builtin_groq", "builtin_openai", ..., -"plugin", "plugin_unavailable", "no_provider_error", "stt_disabled"}``. +"plugin", "plugin_unavailable", "command_provider", +"no_provider_error", "stt_disabled"}``. Acceptable diffs: - ``no_provider_error → plugin`` for the ``plugin-installed`` scenario. - ``no_provider_error → plugin_unavailable`` for the ``plugin-installed-unavailable`` scenario (PR returns the cleaner unavailability envelope instead of the generic auto-detect error). +- ``no_provider_error → command_provider`` for the + ``command-provider-installed`` scenario (registry shipped with this PR). +- ``no_provider_error → command_provider`` for + ``command-vs-plugin-same-name`` (command wins precedence, same as TTS). 
Run from the PR worktree:: @@ -101,7 +105,7 @@ try: @property def name(self): return "openrouter" def transcribe(self, file_path, **kw): - return {"success": True, "transcript": "plugin transcript", "provider": "openrouter"} + return {"success": True, "transcript": "PLUGIN: openrouter transcript", "provider": "openrouter"} transcription_registry._reset_for_tests() transcription_registry.register_provider(_FakeProvider()) @@ -183,6 +187,13 @@ elif not success and "No STT provider" in error_text: dispatch_kind = "no_provider_error" elif provider_name in ("local", "local_command", "groq", "openai", "mistral", "xai"): dispatch_kind = "builtin_" + provider_name +elif success and isinstance(result, dict) and result.get("transcript", "").startswith("CMD:"): + # Command-provider scenarios below emit transcripts prefixed with "CMD:" + # so the harness can distinguish command-provider dispatch from a + # plugin dispatch even when they share a provider name. + dispatch_kind = "command_provider" +elif success and isinstance(result, dict) and result.get("transcript", "").startswith("PLUGIN:"): + dispatch_kind = "plugin" elif success and provider_name and provider_name not in ("local", "local_command", "groq", "openai", "mistral", "xai"): dispatch_kind = "plugin" else: @@ -197,6 +208,35 @@ print(json.dumps(shape)) """ +def _cmd_yaml(provider_name: str, transcript: str) -> str: + """Build a YAML snippet for an stt.providers.: type: command entry. + + Produces a shell command that writes ``transcript`` to {output_path}. + Backslashes in the venv python path are doubled for YAML, and the + inner double quotes around the python -c payload are YAML-escaped. + Keeps the test scenarios readable. + """ + interp = sys.executable.replace("\\", "\\\\") + # Inside the YAML double-quoted string, we use single quotes around + # the python -c body so we don't have to YAML-escape inner double + # quotes. 
Single quotes inside the body are not needed; the body uses + # double quotes for module references and string literals. + payload = ( + f"import sys; open(sys.argv[1], 'w').write('{transcript}')" + ) + command = f'{interp} -c "{payload}" {{output_path}}' + # YAML-escape: double-quote the whole thing, escape inner " and \. + yaml_escaped = command.replace("\\", "\\\\").replace('"', '\\"') + return ( + "stt:\n" + f" provider: {provider_name}\n" + " providers:\n" + f" {provider_name}:\n" + " type: command\n" + f' command: "{yaml_escaped}"\n' + ) + + SCENARIOS: list[tuple[str, str, dict[str, str], str]] = [ # (label, config.yaml body, scenario_env, plugin_register) ("stt-disabled", "stt:\n enabled: false\n", {}, "no"), @@ -215,9 +255,46 @@ SCENARIOS: list[tuple[str, str, dict[str, str], str]] = [ ("plugin-installed-unavailable", "stt:\n provider: openrouter\n", {}, "unavailable"), # Built-in name + plugin tries to shadow → both: built-in ("explicit-openai-with-plugin-registered", "stt:\n provider: openai\n", {}, "yes"), + # NEW (this PR): stt.providers.: type: command registry. + # Provider name "fake-cli" + transcript prefixed "CMD:" so dispatch_kind + # detection routes it to "command_provider". On main (no registry), + # this falls through to no_provider_error. + ( + "command-provider-installed", + _cmd_yaml("fake-cli", "CMD: fake-cli transcript"), + {}, + "no", + ), + # NEW (this PR): same name registered as BOTH a command provider and + # a plugin under "openrouter". Command must win (config more local + # than plugin install). The plugin emits "PLUGIN:..." — assertion is + # that the transcript is "CMD:...", proving command-wins precedence. + ( + "command-vs-plugin-same-name", + _cmd_yaml("openrouter", "CMD: openrouter via command wins"), + {}, + "yes", # also register a plugin under "openrouter" — must NOT fire + ), + # NEW (this PR): built-in name with a command provider declared under + # it → built-in still wins (built-in elif chain has precedence). 
+ # The command would write "CMD: HIJACK" if it fired — assertion is + # that built-in OpenAI dispatch fires instead. + ( + "explicit-openai-with-command-shadow", + _cmd_yaml("openai", "CMD: HIJACK"), + {}, + "no", + ), ] +# Subprocesses reset the registry between runs via ``_reset_for_tests`` so +# registrations from earlier scenarios don't leak. The command-provider +# scenarios also work on origin/main — the subprocess just executes the +# native dispatch path, which falls through to "no_provider_error" because +# main has no registry for stt.providers.. + + def _run_scenario(repo_path: Path, label: str, config_yaml: str, env: dict, plugin_register: str) -> dict: venv_python = repo_path / ".venv" / "bin" / "python" if not venv_python.exists(): @@ -297,7 +374,9 @@ def main() -> int: # On main, "plugin-installed" returns no_provider_error (no # plugin hook); on PR, plugin dispatches. Same shape for # "plugin-installed-unavailable" but PR returns the cleaner - # plugin_unavailable envelope. Both diffs are expected. + # plugin_unavailable envelope. The new command-provider scenarios + # also intentionally diff against main (which has no stt.providers + # registry yet). 
no_provider_to_plugin = ( main_reduced.get("dispatch_kind") == "no_provider_error" and pr_reduced.get("dispatch_kind") == "plugin" @@ -308,12 +387,20 @@ def main() -> int: and pr_reduced.get("dispatch_kind") == "plugin_unavailable" and label == "plugin-installed-unavailable" ) + no_provider_to_command = ( + main_reduced.get("dispatch_kind") == "no_provider_error" + and pr_reduced.get("dispatch_kind") == "command_provider" + and label in {"command-provider-installed", "command-vs-plugin-same-name"} + ) if no_provider_to_plugin: print(f" [DIFF] {label}: no_provider_error → plugin — expected") intentional_diffs.append((label, main_reduced, pr_reduced)) elif no_provider_to_unavailable: print(f" [DIFF] {label}: no_provider_error → plugin_unavailable — expected") intentional_diffs.append((label, main_reduced, pr_reduced)) + elif no_provider_to_command: + print(f" [DIFF] {label}: no_provider_error → command_provider — expected") + intentional_diffs.append((label, main_reduced, pr_reduced)) else: print(f" [FAIL] {label}") print(f" main: {main_reduced}") diff --git a/tests/tools/test_transcription_command_providers.py b/tests/tools/test_transcription_command_providers.py new file mode 100644 index 00000000000..6873b0389ea --- /dev/null +++ b/tests/tools/test_transcription_command_providers.py @@ -0,0 +1,607 @@ +""" +Tests for the STT command-provider registry (``stt.providers.``). + +Mirrors ``tests/tools/test_tts_command_providers.py`` — same shape, same +invariants, adapted for the input=audio → output=transcript flow. + +Covers: +- Resolution: built-in precedence, missing/unknown name, type/command gating +- Placeholder rendering: shell-quote-aware, doubled-brace preservation +- Helpers: timeout fallback, output_format validation, iter/has-any +- End-to-end via transcribe_audio(): command-provider wins when configured, + built-ins still win when name collides, plugin coexistence + +Nothing here talks to a real STT engine. 
The shell command writes a static +transcript to ``{output_path}`` using ``python -c`` so the tests run +identically on Linux, macOS, and Windows (with minor quoting differences). +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import tempfile +import wave +from pathlib import Path +from typing import Optional +from unittest.mock import patch + +import pytest + +from tools.transcription_tools import ( + BUILTIN_STT_PROVIDERS, + COMMAND_STT_OUTPUT_FORMATS, + DEFAULT_COMMAND_STT_LANGUAGE, + DEFAULT_COMMAND_STT_OUTPUT_FORMAT, + DEFAULT_COMMAND_STT_TIMEOUT_SECONDS, + _get_command_stt_output_format, + _get_command_stt_timeout, + _get_named_stt_provider_config, + _has_any_command_stt_provider, + _is_command_stt_provider_config, + _iter_command_stt_providers, + _quote_command_stt_placeholder, + _render_command_stt_template, + _resolve_command_stt_provider_config, + _shell_quote_context_stt, + _transcribe_command_stt, + transcribe_audio, +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_silent_wav(path: Path, seconds: float = 0.1) -> Path: + """Write a minimal silent .wav file so _validate_audio_file accepts it.""" + path.parent.mkdir(parents=True, exist_ok=True) + with wave.open(str(path), "wb") as w: + w.setnchannels(1) + w.setsampwidth(2) + w.setframerate(8000) + frames = b"\x00\x00" * int(8000 * seconds) + w.writeframes(frames) + return path + + +def _python_emit_command(transcript_text: str, output_placeholder: str = "{output_path}") -> str: + """Return a portable shell command that writes ``transcript_text`` to {output_path}.""" + interpreter = sys.executable + # Use repr() to embed the literal string safely; outer single quotes + # avoid shell expansion of $ / ` / etc. 
+ payload = ( + "import sys; " + f"open(sys.argv[1], 'w').write({transcript_text!r})" + ) + return f'"{interpreter}" -c "{payload}" {output_placeholder}' + + +def _python_emit_stdout_command(transcript_text: str) -> str: + """Return a portable shell command that writes transcript to stdout only.""" + interpreter = sys.executable + payload = f"import sys; sys.stdout.write({transcript_text!r})" + return f'"{interpreter}" -c "{payload}"' + + +# --------------------------------------------------------------------------- +# _resolve_command_stt_provider_config / built-in precedence +# --------------------------------------------------------------------------- + + +class TestResolveCommandSTTProviderConfig: + def test_builtin_names_are_never_command_providers(self): + cfg = { + "providers": { + "openai": {"type": "command", "command": "echo hi"}, + "groq": {"type": "command", "command": "echo hi"}, + "local": {"type": "command", "command": "echo hi"}, + "local_command": {"type": "command", "command": "echo hi"}, + "mistral": {"type": "command", "command": "echo hi"}, + "xai": {"type": "command", "command": "echo hi"}, + }, + } + for name in BUILTIN_STT_PROVIDERS: + assert _resolve_command_stt_provider_config(name, cfg) is None + + def test_missing_provider_returns_none(self): + cfg = {"providers": {}} + assert _resolve_command_stt_provider_config("nope", cfg) is None + + def test_empty_provider_returns_none(self): + assert _resolve_command_stt_provider_config("", {}) is None + assert _resolve_command_stt_provider_config(None, {}) is None # type: ignore[arg-type] + + def test_none_provider_short_circuits(self): + # "none" is the auto-detect-failed sentinel; never a command provider. 
+ cfg = { + "providers": { + "none": {"type": "command", "command": "echo hi"}, + }, + } + assert _resolve_command_stt_provider_config("none", cfg) is None + + def test_provider_without_command_field_returns_none(self): + cfg = {"providers": {"my-cli": {"type": "command"}}} + assert _resolve_command_stt_provider_config("my-cli", cfg) is None + + def test_provider_with_empty_command_returns_none(self): + cfg = {"providers": {"my-cli": {"type": "command", "command": " "}}} + assert _resolve_command_stt_provider_config("my-cli", cfg) is None + + def test_provider_with_explicit_type_other_than_command_returns_none(self): + cfg = {"providers": {"my-cli": {"type": "http", "command": "echo hi"}}} + assert _resolve_command_stt_provider_config("my-cli", cfg) is None + + def test_provider_with_command_string_and_no_type_resolves(self): + cfg = {"providers": {"my-cli": {"command": "whisper {input_path}"}}} + result = _resolve_command_stt_provider_config("my-cli", cfg) + assert result is not None + assert result["command"] == "whisper {input_path}" + + def test_provider_with_explicit_type_command_resolves(self): + cfg = {"providers": {"my-cli": {"type": "command", "command": "echo hi"}}} + result = _resolve_command_stt_provider_config("my-cli", cfg) + assert result is not None + + def test_resolution_is_case_insensitive(self): + cfg = {"providers": {"my-cli": {"type": "command", "command": "echo hi"}}} + assert _resolve_command_stt_provider_config("MY-CLI", cfg) is not None + assert _resolve_command_stt_provider_config(" my-cli ", cfg) is not None + + +# --------------------------------------------------------------------------- +# _get_named_stt_provider_config: legacy stt. 
class TestGetNamedSTTProviderConfig:
    """Section-lookup rules of ``_get_named_stt_provider_config``.

    The canonical location is ``stt.providers.<name>``; ``stt.<name>`` is
    accepted as a legacy fallback for non-built-in names only.
    """

    def test_canonical_stt_providers_lookup(self):
        stt_section = {"providers": {"my-cli": {"command": "whisper {input_path}"}}}
        found = _get_named_stt_provider_config(stt_section, "my-cli")
        assert found == {"command": "whisper {input_path}"}

    def test_legacy_stt_dot_name_fallback(self):
        # A custom provider declared directly under stt.<name>, the same
        # layout the built-in sections use, is still picked up.
        stt_section = {"my-cli": {"command": "whisper {input_path}"}}
        found = _get_named_stt_provider_config(stt_section, "my-cli")
        assert found == {"command": "whisper {input_path}"}

    def test_builtin_name_is_not_legacy_resolved(self):
        # stt.openai carries model/language but no command; the legacy
        # fallback must not hand it back as a command-provider section.
        stt_section = {"openai": {"model": "whisper-1", "language": "en"}}
        found = _get_named_stt_provider_config(stt_section, "openai")
        assert found == {}

    def test_missing_returns_empty(self):
        for stt_section in ({}, {"providers": {}}):
            assert _get_named_stt_provider_config(stt_section, "nope") == {}

    def test_canonical_wins_over_legacy(self):
        stt_section = {
            "my-cli": {"command": "legacy"},
            "providers": {"my-cli": {"command": "canonical"}},
        }
        resolved = _get_named_stt_provider_config(stt_section, "my-cli")
        assert resolved["command"] == "canonical"
_get_command_stt_timeout({"timeout": "not-a-number"}) == \ + DEFAULT_COMMAND_STT_TIMEOUT_SECONDS + assert _get_command_stt_timeout({"timeout": -5}) == \ + DEFAULT_COMMAND_STT_TIMEOUT_SECONDS + assert _get_command_stt_timeout({"timeout": 0}) == \ + DEFAULT_COMMAND_STT_TIMEOUT_SECONDS + + def test_timeout_legacy_key(self): + assert _get_command_stt_timeout({"timeout_seconds": 7}) == 7.0 + + def test_output_format_defaults_to_txt(self): + assert _get_command_stt_output_format({}) == DEFAULT_COMMAND_STT_OUTPUT_FORMAT + assert DEFAULT_COMMAND_STT_OUTPUT_FORMAT == "txt" + + def test_output_format_validates_against_allowed_set(self): + for fmt in COMMAND_STT_OUTPUT_FORMATS: + assert _get_command_stt_output_format({"format": fmt}) == fmt + + def test_output_format_rejects_unknown(self): + assert _get_command_stt_output_format({"format": "exe"}) == \ + DEFAULT_COMMAND_STT_OUTPUT_FORMAT + assert _get_command_stt_output_format({"format": "../etc/passwd"}) == \ + DEFAULT_COMMAND_STT_OUTPUT_FORMAT + + def test_output_format_strips_leading_dot(self): + assert _get_command_stt_output_format({"format": ".json"}) == "json" + + def test_output_format_legacy_key(self): + assert _get_command_stt_output_format({"output_format": "srt"}) == "srt" + + def test_iter_command_providers_yields_only_command_type(self): + cfg = { + "providers": { + "cmd-one": {"type": "command", "command": "x"}, + "no-cmd": {"type": "command"}, # no command field + "wrong-type": {"type": "http", "command": "x"}, + "cmd-two": {"command": "y"}, # implicit type + }, + } + names = {name for name, _ in _iter_command_stt_providers(cfg)} + assert names == {"cmd-one", "cmd-two"} + + def test_iter_command_providers_excludes_builtins(self): + # Defense in depth — a user trying to register a built-in name as + # a command provider should be silently ignored at iteration time. 
+ cfg = { + "providers": { + "openai": {"type": "command", "command": "x"}, + "groq": {"command": "y"}, + "custom": {"command": "z"}, + }, + } + names = {name for name, _ in _iter_command_stt_providers(cfg)} + assert names == {"custom"} + + def test_has_any_command_provider_false_when_none_configured(self): + assert _has_any_command_stt_provider({"providers": {}}) is False + + def test_has_any_command_provider_true_when_one_configured(self): + cfg = {"providers": {"custom": {"command": "x"}}} + assert _has_any_command_stt_provider(cfg) is True + + +# --------------------------------------------------------------------------- +# Template rendering +# --------------------------------------------------------------------------- + + +class TestRenderCommandSTTTemplate: + def test_renders_all_placeholders(self): + rendered = _render_command_stt_template( + "whisper {input_path} -o {output_path} --lang {language} --model {model}", + { + "input_path": "/tmp/audio.wav", + "output_path": "/tmp/out.txt", + "output_dir": "/tmp", + "format": "txt", + "language": "en", + "model": "base", + }, + ) + assert "/tmp/audio.wav" in rendered + assert "/tmp/out.txt" in rendered + assert "en" in rendered + assert "base" in rendered + + def test_preserves_doubled_braces(self): + rendered = _render_command_stt_template( + 'echo {{"foo": {input_path}}}', + {"input_path": "audio.wav"}, + ) + # Doubled braces collapse to single braces — JSON snippets survive. + assert rendered.startswith('echo {"foo":') + assert rendered.endswith('}') + assert "audio.wav" in rendered + + def test_shell_quote_outside_quotes_uses_shlex(self): + rendered = _render_command_stt_template( + "whisper {input_path}", + {"input_path": "/tmp/has space.wav"}, + ) + # shlex.quote wraps strings with whitespace in single quotes. 
class TestTranscribeCommandSTT:
    """Run real subprocesses through ``_transcribe_command_stt``.

    Every command is a ``python -c`` one-liner executed with the current
    interpreter, so no STT engine is involved.
    """

    @staticmethod
    def _echo_placeholder_command(placeholder: str) -> str:
        """Return a command that writes the rendered ``{placeholder}`` value to ``{output_path}``.

        The rendered placeholder arrives as ``sys.argv[1]`` and the output
        path as ``sys.argv[2]``, which lets a test read back exactly what
        the template renderer substituted.
        """
        payload = "import sys; open(sys.argv[2], 'w').write(sys.argv[1])"
        return f'"{sys.executable}" -c "{payload}" {{{placeholder}}} {{output_path}}'

    def test_writes_transcript_to_output_path(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {
            "type": "command",
            "command": _python_emit_command("hello world"),
        }
        result = _transcribe_command_stt(str(audio), "fake-cli", cfg, {})
        assert result["success"] is True
        assert result["transcript"] == "hello world"
        assert result["provider"] == "fake-cli"

    def test_reads_transcript_from_stdout_when_no_file(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {
            "type": "command",
            "command": _python_emit_stdout_command("stdout transcript"),
        }
        result = _transcribe_command_stt(str(audio), "fake-cli", cfg, {})
        assert result["success"] is True
        assert result["transcript"] == "stdout transcript"

    def test_missing_command_returns_error(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        result = _transcribe_command_stt(str(audio), "fake-cli", {}, {})
        assert result["success"] is False
        assert "command is not configured" in result["error"]

    def test_missing_audio_returns_error(self, tmp_path):
        cfg = {"command": _python_emit_command("x")}
        result = _transcribe_command_stt(
            str(tmp_path / "does-not-exist.wav"), "fake-cli", cfg, {},
        )
        assert result["success"] is False
        assert "Audio file not found" in result["error"]

    def test_nonzero_exit_returns_error_with_stderr(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        # A command that fails the same way on every platform.
        cfg = {
            "command": (
                f'"{sys.executable}" -c "import sys; sys.stderr.write(\'boom\'); sys.exit(7)"'
            ),
        }
        result = _transcribe_command_stt(str(audio), "fake-cli", cfg, {})
        assert result["success"] is False
        assert "exited with code 7" in result["error"]
        assert "boom" in result["error"]

    def test_timeout_returns_clean_error(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {
            "command": f'"{sys.executable}" -c "import time; time.sleep(5)"',
            "timeout": 0.5,
        }
        result = _transcribe_command_stt(str(audio), "slow-cli", cfg, {})
        assert result["success"] is False
        assert "timed out after" in result["error"]

    def test_model_override_passed_to_template(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        # The command echoes {model} into the transcript so the value that
        # reached the template can be asserted directly.
        cfg = {
            "command": self._echo_placeholder_command("model"),
            "model": "config-model",
        }
        result = _transcribe_command_stt(
            str(audio), "fake-cli", cfg, {}, model_override="override-model",
        )
        assert result["success"] is True
        assert result["transcript"] == "override-model"

    def test_config_model_used_when_no_override(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {
            "command": self._echo_placeholder_command("model"),
            "model": "config-model",
        }
        result = _transcribe_command_stt(str(audio), "fake-cli", cfg, {})
        assert result["transcript"] == "config-model"

    def test_language_from_provider_config_wins(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {
            "command": self._echo_placeholder_command("language"),
            "language": "fr",
        }
        # stt.language is "es" but the provider config says "fr"; the
        # provider-level value wins.
        result = _transcribe_command_stt(
            str(audio), "fake-cli", cfg, {"language": "es"},
        )
        assert result["transcript"] == "fr"

    def test_language_falls_back_to_stt_section(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {"command": self._echo_placeholder_command("language")}
        result = _transcribe_command_stt(
            str(audio), "fake-cli", cfg, {"language": "ja"},
        )
        assert result["transcript"] == "ja"

    def test_language_defaults_to_en(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "input.wav")
        cfg = {"command": self._echo_placeholder_command("language")}
        result = _transcribe_command_stt(str(audio), "fake-cli", cfg, {})
        assert result["transcript"] == DEFAULT_COMMAND_STT_LANGUAGE
+ """ + + def _config_with_command_provider(self, name: str, command: str) -> dict: + return { + "provider": name, + "providers": { + name: {"type": "command", "command": command}, + }, + } + + def test_command_provider_dispatches_via_transcribe_audio(self, tmp_path): + audio = _make_silent_wav(tmp_path / "audio.wav") + cfg = self._config_with_command_provider( + "fake-cli", _python_emit_command("dispatched via command") + ) + with patch("tools.transcription_tools._load_stt_config", return_value=cfg): + result = transcribe_audio(str(audio)) + assert result["success"] is True + assert result["transcript"] == "dispatched via command" + assert result["provider"] == "fake-cli" + + def test_builtin_name_shadow_does_not_route_to_command(self, tmp_path): + # User mis-configures stt.providers.openai as a command — must NOT + # hijack the real OpenAI built-in. The built-in elif chain owns + # the name; the command-provider resolver explicitly rejects it. + audio = _make_silent_wav(tmp_path / "audio.wav") + cfg = { + "provider": "openai", + "providers": { + "openai": {"type": "command", "command": _python_emit_command("HIJACK")}, + }, + } + with patch("tools.transcription_tools._load_stt_config", return_value=cfg): + # openai dispatch will likely fail with no API key — that's fine, + # what matters is the transcript is NOT "HIJACK" (which would + # mean the command-provider hijacked the built-in name). 
class TestCommandWinsOverPlugin:
    """When a name has BOTH a command provider AND a registered plugin, the
    command provider must win — same precedence rule as TTS PR #17843
    (config is more local than plugin install).
    """

    @staticmethod
    def _transcribe_with_plugin_registered(plugin_name: str, cfg: dict, audio) -> dict:
        """Register a fake plugin as ``plugin_name`` and run ``transcribe_audio``.

        The plugin registry is reset before registration and again
        afterwards, even when dispatch raises, so nothing leaks into other
        tests. ``_load_stt_config`` is patched to return ``cfg``. The fake
        plugin always answers with the transcript ``"FROM_PLUGIN"``.
        """
        from agent.transcription_provider import TranscriptionProvider
        from agent.transcription_registry import (
            _reset_for_tests,
            register_provider,
        )

        class FakePlugin(TranscriptionProvider):
            @property
            def name(self) -> str:
                return plugin_name

            def transcribe(self, file_path, *, model=None, language=None, **extra):
                return {
                    "success": True,
                    "transcript": "FROM_PLUGIN",
                    "provider": self.name,
                }

        _reset_for_tests()
        try:
            register_provider(FakePlugin())
            with patch("tools.transcription_tools._load_stt_config", return_value=cfg):
                return transcribe_audio(str(audio))
        finally:
            _reset_for_tests()

    def test_command_wins_when_both_configured(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "audio.wav")
        cfg = {
            "provider": "fake-cli",
            "providers": {
                "fake-cli": {
                    "type": "command",
                    "command": _python_emit_command("FROM_COMMAND"),
                },
            },
        }

        # The plugin is registered under the SAME name. It must NOT fire.
        result = self._transcribe_with_plugin_registered("fake-cli", cfg, audio)

        assert result["success"] is True
        assert result["transcript"] == "FROM_COMMAND"

    def test_plugin_fires_when_no_command_provider(self, tmp_path):
        audio = _make_silent_wav(tmp_path / "audio.wav")
        cfg = {"provider": "fake-plugin"}

        result = self._transcribe_with_plugin_registered("fake-plugin", cfg, audio)

        assert result["success"] is True
        assert result["transcript"] == "FROM_PLUGIN"
--------------------------------------------------------------------------- +# +# Mirrors the TTS command-provider registry shipped in PR #17843 — same +# placeholder grammar, same shell-quote-aware rendering, same process-tree +# termination on timeout. Lets any whisper CLI / ASR CLI / curl pipeline +# become an STT backend with zero Python. +# +# Resolution order: +# 1. Built-in (``local``, ``local_command``, ``groq``, ``openai``, +# ``mistral``, ``xai``) → native handler. **Always wins.** +# 2. ``stt.providers.: type: command`` → command-provider runner. +# 3. Plugin-registered TranscriptionProvider → plugin dispatch. +# 4. No match → "No STT provider available". +# +# The single-env-var ``HERMES_LOCAL_STT_COMMAND`` escape hatch is preserved +# untouched via the built-in ``local_command`` path. Use the command-provider +# registry when you want MULTIPLE shell-driven STT engines, or you want a +# named provider you can pick via ``stt.provider`` in config.yaml. +DEFAULT_COMMAND_STT_TIMEOUT_SECONDS = 300 +DEFAULT_COMMAND_STT_LANGUAGE = "en" +DEFAULT_COMMAND_STT_OUTPUT_FORMAT = "txt" +COMMAND_STT_OUTPUT_FORMATS = frozenset({"txt", "json", "srt", "vtt"}) + + +def _get_stt_section(stt_config: Dict[str, Any], name: str) -> Dict[str, Any]: + """Return an stt sub-section if it's a dict, else an empty dict.""" + if not isinstance(stt_config, dict): + return {} + section = stt_config.get(name) + return section if isinstance(section, dict) else {} + + +def _get_named_stt_provider_config( + stt_config: Dict[str, Any], + name: str, +) -> Dict[str, Any]: + """Return the config dict for a user-declared STT command provider. + + Looks up ``stt.providers.`` first (the canonical location), and + falls back to ``stt.`` so users who followed the built-in layout + still work. Returns an empty dict when the provider is not declared. 
+ + Built-in names are NOT special-cased here — the caller short-circuits + them before this is consulted, AND ``_is_command_stt_provider_config`` + requires an explicit ``command:`` value, so a built-in section like + ``stt.openai`` (which has ``model``/``language`` but no ``command``) + can't accidentally be treated as a command provider. + """ + providers = _get_stt_section(stt_config, "providers") + section = providers.get(name) if isinstance(providers, dict) else None + if isinstance(section, dict): + return section + # Back-compat: allow ``stt.`` for user-declared providers too, + # but only when the name is not a built-in (so a user's ``stt.openai`` + # block still means the OpenAI provider, not a custom command). + if name.lower() not in BUILTIN_STT_PROVIDERS: + legacy = _get_stt_section(stt_config, name) + if legacy: + return legacy + return {} + + +def _is_command_stt_provider_config(config: Dict[str, Any]) -> bool: + """Return True when *config* declares a command-type STT provider.""" + if not isinstance(config, dict): + return False + ptype = str(config.get("type") or "").strip().lower() + if ptype and ptype != "command": + return False + command = config.get("command") + return isinstance(command, str) and bool(command.strip()) + + +def _resolve_command_stt_provider_config( + provider: str, + stt_config: Dict[str, Any], +) -> Optional[Dict[str, Any]]: + """Return the provider config if *provider* resolves to a command type. + + Built-in provider names are rejected (they have native handlers). + Returns None when the name is a built-in, ``"none"``, unknown, or not + a command type. 
+ """ + if not provider: + return None + key = provider.lower().strip() + if key in BUILTIN_STT_PROVIDERS or key == "none": + return None + config = _get_named_stt_provider_config(stt_config, key) + if _is_command_stt_provider_config(config): + return config + return None + + +def _iter_command_stt_providers(stt_config: Dict[str, Any]): + """Yield (name, config) pairs for every declared command-type STT provider.""" + if not isinstance(stt_config, dict): + return + providers = _get_stt_section(stt_config, "providers") + for name, cfg in (providers or {}).items(): + if isinstance(name, str) and name.lower() not in BUILTIN_STT_PROVIDERS: + if _is_command_stt_provider_config(cfg): + yield name, cfg + + +def _has_any_command_stt_provider(stt_config: Optional[Dict[str, Any]] = None) -> bool: + """Return True when any command-type STT provider is configured.""" + if stt_config is None: + stt_config = _load_stt_config() + for _name, _cfg in _iter_command_stt_providers(stt_config): + return True + return False + + +def _get_command_stt_timeout(config: Dict[str, Any]) -> float: + """Return timeout in seconds, falling back when invalid.""" + raw = config.get("timeout", config.get("timeout_seconds", DEFAULT_COMMAND_STT_TIMEOUT_SECONDS)) + try: + value = float(raw) + except (TypeError, ValueError): + return float(DEFAULT_COMMAND_STT_TIMEOUT_SECONDS) + if value <= 0: + return float(DEFAULT_COMMAND_STT_TIMEOUT_SECONDS) + return value + + +def _get_command_stt_output_format(config: Dict[str, Any]) -> str: + """Return the validated output format (txt/json/srt/vtt).""" + raw = ( + config.get("format") + or config.get("output_format") + or DEFAULT_COMMAND_STT_OUTPUT_FORMAT + ) + fmt = str(raw).lower().strip().lstrip(".") + return fmt if fmt in COMMAND_STT_OUTPUT_FORMATS else DEFAULT_COMMAND_STT_OUTPUT_FORMAT + + +def _shell_quote_context_stt(command_template: str, position: int) -> Optional[str]: + """Return the shell quote character active right before *position*. 
+ + Mirrors ``tools.tts_tool._shell_quote_context`` — kept local to avoid + cross-module import of a private helper. Returns ``"'"`` / ``'"'`` when + inside a quoted region, ``None`` for bare context. + """ + quote: Optional[str] = None + escaped = False + i = 0 + while i < position: + char = command_template[i] + if quote == "'": + if char == "'": + quote = None + elif quote == '"': + if escaped: + escaped = False + elif char == "\\": + escaped = True + elif char == '"': + quote = None + elif char == "'": + quote = "'" + elif char == '"': + quote = '"' + elif char == "\\": + i += 1 + i += 1 + return quote + + +def _quote_command_stt_placeholder(value: str, quote_context: Optional[str]) -> str: + """Quote a placeholder value for its position in a shell command template. + + Mirrors ``tools.tts_tool._quote_command_tts_placeholder``. + """ + if quote_context == "'": + return value.replace("'", r"'\''") + if quote_context == '"': + return ( + value + .replace("\\", "\\\\") + .replace('"', r'\"') + .replace("$", r"\$") + .replace("`", r"\`") + ) + if os.name == "nt": + return subprocess.list2cmdline([value]) + return shlex.quote(value) + + +def _render_command_stt_template( + command_template: str, + placeholders: Dict[str, str], +) -> str: + """Replace supported placeholders while preserving ``{{`` / ``}}``. + + Mirrors ``tools.tts_tool._render_command_tts_template``. Placeholders + are shell-quote-aware: ``{voice}`` inside single quotes gets + single-quote-safe escaping, inside double quotes gets ``$``/`` ` ``/`` " `` + escaping, outside quotes gets ``shlex.quote``. Doubled braces ``{{`` and + ``}}`` are preserved as literal ``{`` / ``}`` for users who want to + embed JSON snippets in their command. 
+ """ + import re + + names = "|".join(re.escape(name) for name in placeholders) + pattern = re.compile( + rf"(?{names})\}}\}}|\{{(?P{names})\}})" + ) + replacements: list[tuple[str, str]] = [] + + def replace_match(match: "re.Match[str]") -> str: + name = match.group("double") or match.group("single") + token = f"__HERMES_STT_PLACEHOLDER_{len(replacements)}__" + replacements.append(( + token, + _quote_command_stt_placeholder( + placeholders[name], + _shell_quote_context_stt(command_template, match.start()), + ), + )) + return token + + rendered = pattern.sub(replace_match, command_template) + rendered = rendered.replace("{{", "{").replace("}}", "}") + for token, value in replacements: + rendered = rendered.replace(token, value) + return rendered + + +def _terminate_command_stt_process_tree(proc: subprocess.Popen) -> None: + """Best-effort termination of a shell process and all of its children. + + Mirrors ``tools.tts_tool._terminate_command_tts_process_tree``. + """ + if proc.poll() is not None: + return + + if os.name == "nt": + try: + subprocess.run( + ["taskkill", "/F", "/T", "/PID", str(proc.pid)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + timeout=5, + ) + except Exception: + proc.kill() + return + + try: + import psutil # type: ignore + except ImportError: + # psutil is optional — fall back to single-process terminate/kill + proc.terminate() + try: + proc.wait(timeout=2) + except subprocess.TimeoutExpired: + proc.kill() + return + + try: + parent = psutil.Process(proc.pid) + for child in parent.children(recursive=True): + try: + child.terminate() + except psutil.NoSuchProcess: + pass + parent.terminate() + except psutil.NoSuchProcess: + return + except Exception: + proc.terminate() + + try: + proc.wait(timeout=2) + return + except subprocess.TimeoutExpired: + pass + + try: + parent = psutil.Process(proc.pid) + for child in parent.children(recursive=True): + try: + child.kill() + except psutil.NoSuchProcess: + pass + parent.kill() + except 
psutil.NoSuchProcess: + return + except Exception: + proc.kill() + + +def _run_command_stt(command: str, timeout: float) -> subprocess.CompletedProcess: + """Run a command-provider shell command with process-tree timeout cleanup. + + Mirrors ``tools.tts_tool._run_command_tts``. + """ + popen_kwargs: Dict[str, Any] = { + "shell": True, + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + "text": True, + } + if os.name == "nt": + popen_kwargs["creationflags"] = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0) + else: + popen_kwargs["start_new_session"] = True + + proc = subprocess.Popen(command, **popen_kwargs) + try: + stdout, stderr = proc.communicate(timeout=timeout) + except subprocess.TimeoutExpired as exc: + _terminate_command_stt_process_tree(proc) + try: + stdout, stderr = proc.communicate(timeout=1) + except Exception: + stdout = getattr(exc, "output", None) + stderr = getattr(exc, "stderr", None) + raise subprocess.TimeoutExpired( + command, + timeout, + output=stdout, + stderr=stderr, + ) from exc + + if proc.returncode: + raise subprocess.CalledProcessError( + proc.returncode, + command, + output=stdout, + stderr=stderr, + ) + return subprocess.CompletedProcess(command, proc.returncode, stdout, stderr) + + +def _read_command_stt_output(output_path: Path, stdout: str, fmt: str) -> str: + """Return the transcript text from a command-provider invocation. + + Resolution: + 1. If ``output_path`` exists and is non-empty → read it (raw text). + 2. Else if ``stdout`` is non-empty → use stdout (lets users write + curl-style one-liners that emit transcript to stdout instead of + writing a file). + 3. Else → raise RuntimeError (no usable output produced). + + For JSON format, we still return the raw bytes — extracting a + ``text`` field is out of scope; users either configure ``format: txt`` + or post-process JSON downstream. (Same trade-off as TTS: the runner + doesn't try to be clever about output shape.) 
+ """ + if output_path.exists(): + try: + content = output_path.read_text(encoding="utf-8").strip() + except UnicodeDecodeError: + content = output_path.read_bytes().decode("utf-8", errors="replace").strip() + if content: + return content + if stdout and stdout.strip(): + return stdout.strip() + raise RuntimeError( + f"Command STT provider wrote no output file at {output_path} " + f"and produced no stdout" + ) + + +def _transcribe_command_stt( + file_path: str, + provider_name: str, + config: Dict[str, Any], + stt_config: Dict[str, Any], + model_override: Optional[str] = None, +) -> Dict[str, Any]: + """Transcribe via a user-declared ``stt.providers.: type: command``. + + Placeholder grammar: + + | Placeholder | Substituted with | + |-------------------|-----------------------------------------------------------| + | ``{input_path}`` | absolute path to the audio file (original location) | + | ``{output_path}`` | absolute path the provider should write its transcript to | + | ``{output_dir}`` | parent dir of ``{output_path}`` | + | ``{format}`` | configured output format (``txt`` / ``json`` / ``srt`` / ``vtt``) | + | ``{language}`` | configured language code (default ``en``) | + | ``{model}`` | configured model id (empty when not set) | + + All placeholders are shell-quote-aware (see ``_render_command_stt_template``). + Doubled braces ``{{`` and ``}}`` are preserved as literal braces. + + Returns the standard transcribe-response envelope (``success``, + ``transcript``, ``provider``, ``error``). 
+ """ + command_template = str(config.get("command") or "").strip() + if not command_template: + return { + "success": False, + "transcript": "", + "provider": provider_name, + "error": f"stt.providers.{provider_name}.command is not configured", + } + + audio = Path(file_path).expanduser() + if not audio.exists(): + return { + "success": False, + "transcript": "", + "provider": provider_name, + "error": f"Audio file not found: {file_path}", + } + + timeout = _get_command_stt_timeout(config) + output_format = _get_command_stt_output_format(config) + language = ( + config.get("language") + or stt_config.get("language") + or DEFAULT_COMMAND_STT_LANGUAGE + ) + model = model_override or config.get("model") or "" + + try: + with tempfile.TemporaryDirectory(prefix=f"hermes-cmd-stt-{provider_name}-") as tmpdir: + output_path = Path(tmpdir) / f"transcript.{output_format}" + placeholders = { + "input_path": str(audio.resolve()), + "output_path": str(output_path), + "output_dir": str(output_path.parent), + "format": output_format, + "language": str(language), + "model": str(model), + } + command = _render_command_stt_template(command_template, placeholders) + logger.info( + "Transcribing %s via command STT provider '%s'...", + audio.name, provider_name, + ) + try: + result = _run_command_stt(command, timeout) + except subprocess.TimeoutExpired: + return { + "success": False, + "transcript": "", + "provider": provider_name, + "error": ( + f"STT command provider '{provider_name}' timed out after " + f"{timeout:g}s" + ), + } + except subprocess.CalledProcessError as exc: + detail_parts = [] + if exc.stderr: + detail_parts.append(f"stderr: {exc.stderr.strip()}") + if exc.stdout: + detail_parts.append(f"stdout: {exc.stdout.strip()}") + detail = "; ".join(detail_parts) or "no command output" + return { + "success": False, + "transcript": "", + "provider": provider_name, + "error": ( + f"STT command provider '{provider_name}' exited with code " + f"{exc.returncode}: {detail}" + ), + 
} + + try: + transcript_text = _read_command_stt_output( + output_path, result.stdout or "", output_format, + ) + except RuntimeError as exc: + return { + "success": False, + "transcript": "", + "provider": provider_name, + "error": str(exc), + } + + except OSError as exc: + return { + "success": False, + "transcript": "", + "provider": provider_name, + "error": f"STT command provider '{provider_name}' failed: {exc}", + } + + logger.info( + "Transcribed %s via command STT provider '%s' (%d chars)", + audio.name, provider_name, len(transcript_text), + ) + return { + "success": True, + "transcript": transcript_text, + "provider": provider_name, + } + + def _get_provider(stt_config: dict) -> str: """Determine which STT provider to use. @@ -352,6 +849,7 @@ def _get_provider(stt_config: dict) -> str: def _dispatch_to_plugin_provider( file_path: str, provider: str, + stt_config: Optional[Dict[str, Any]] = None, *, model: Optional[str] = None, language: Optional[str] = None, @@ -370,12 +868,17 @@ def _dispatch_to_plugin_provider( function defensively rejects those names so a plugin can't be silently dispatched under a built-in name even if it somehow slipped past the registry's built-in shadow guard. - 2. Plugin dispatch fires only when ``provider`` matches a + 2. Same-name command-type provider declared under + ``stt.providers.: type: command`` wins over a plugin. The + caller short-circuits to the command runner before reaching us, + but we re-verify here so a refactor of the caller can't silently + break the invariant (matches TTS PR #17843 precedence rule). + 3. Plugin dispatch fires only when ``provider`` matches a registered :class:`TranscriptionProvider` whose ``name`` equals the configured value. Unknown names with no plugin registered return None (caller surfaces the legacy "No STT provider" message). - 3. Availability gating: when the matched plugin reports + 4. 
Availability gating: when the matched plugin reports ``is_available() == False`` (missing API key, missing optional SDK, etc.) this returns an error envelope identifying the plugin as unavailable — **not** ``None`` — because the user @@ -392,6 +895,13 @@ def _dispatch_to_plugin_provider( key = provider.lower().strip() if key in BUILTIN_STT_PROVIDERS or key == "none": return None + # Defense in depth: command-provider check should already have + # short-circuited the caller. If a same-name command config exists, + # bail so the command path wins. + if stt_config is not None and _is_command_stt_provider_config( + _get_named_stt_provider_config(stt_config, key) + ): + return None try: from agent.transcription_registry import get_provider from hermes_cli.plugins import _ensure_plugins_discovered @@ -1058,9 +1568,26 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A model_name = model or "grok-stt" return _transcribe_xai(file_path, model_name) + # User-declared command-type provider + # (``stt.providers.: type: command``). Fires after the built-in + # elif chain — built-in names short-circuit upstream so a user's + # ``stt.providers.openai.command`` can't override the real OpenAI + # handler — and BEFORE the plugin dispatcher, because config is more + # local than a plugin install (same precedence rule as TTS PR #17843). + command_provider_config = _resolve_command_stt_provider_config(provider, stt_config) + if command_provider_config is not None: + return _transcribe_command_stt( + file_path, + provider, + command_provider_config, + stt_config, + model_override=model, + ) + # Plugin-registered STT backend (e.g. OpenRouter, SenseAudio, # Gemini-STT). Fires only when ``provider`` is neither a built-in - # nor ``"none"``. The dispatcher enforces built-ins-always-win + # nor ``"none"`` AND there is no same-name command provider. The + # dispatcher enforces built-ins-always-win + command-wins-over-plugin # defensively. 
Returns None when no plugin is registered for the # configured name, falling through to the legacy "No STT provider" # error message below. @@ -1076,6 +1603,7 @@ def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, A plugin_result = _dispatch_to_plugin_provider( file_path, provider, + stt_config, model=plugin_model, language=plugin_language, ) diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 35d7ade0b59..fa879cac17f 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -455,17 +455,104 @@ If your configured provider isn't available, Hermes automatically falls back: - **Mistral key/SDK not set** → Skipped in auto-detect; falls through to next available provider - **Nothing available** → Voice messages pass through with an accurate note to the user +### STT custom command providers + +If the STT engine you want isn't natively supported (Doubao ASR, NVIDIA Parakeet, a whisper.cpp build, an open-source SenseVoice CLI, anything else that exposes a shell command), wire it in as a **command-type provider** without writing any Python. Hermes runs your shell command against the audio file and reads back the transcript. + +Declare one or more providers under `stt.providers.<name>` and switch between them with `stt.provider: <name>` — same shape as the TTS [command-provider registry](#custom-command-providers), adapted for the input=audio → output=transcript direction. 
+ +```yaml +stt: + provider: parakeet # pick any name under stt.providers + providers: + parakeet: + type: command + command: "parakeet-asr --model nvidia/parakeet-tdt-0.6b-v2 --in {input_path} --out {output_path}" + format: txt + language: en + timeout: 300 + + whispercpp: + type: command + command: "whisper-cli -m ~/models/ggml-large-v3.bin -f {input_path} -otxt -of {output_dir}/transcript" + format: txt + + sensevoice: + type: command + command: "sensevoice-cli {input_path} --json | tee {output_path}" + format: json +``` + +This complements the legacy `HERMES_LOCAL_STT_COMMAND` escape hatch — that env var still works untouched via the built-in `local_command` path. Use `stt.providers.<name>` when you want **multiple** shell-driven STT engines, a name you can pick via `stt.provider`, or anything that needs per-provider `language` / `model` / `timeout`. + +#### STT placeholders + +Your command template can reference these placeholders. Hermes substitutes them at render time and shell-quotes each value for the surrounding context (bare / single-quoted / double-quoted), so paths with spaces are safe. + +| Placeholder | Meaning | +|-------------------|----------------------------------------------------------------------| +| `{input_path}` | Absolute path to the input audio file (original location, read-only) | +| `{output_path}` | Absolute path the command should write the transcript to | +| `{output_dir}` | Parent directory of `{output_path}` (handy for whisper-style tools) | +| `{format}` | Configured output format: `txt` / `json` / `srt` / `vtt` | +| `{language}` | Configured language code (defaults to `en`) | +| `{model}` | `stt.providers.<name>.model`, empty when unset | + +Use `{{` and `}}` for literal braces (handy when embedding JSON snippets in the command). + +#### How the transcript is read back + +After your command exits successfully: + +1. If `{output_path}` exists and is non-empty → Hermes reads it as UTF-8 text. +2. 
Otherwise, if the command wrote to stdout → Hermes uses that. +3. Otherwise → error: "Command STT provider wrote no output file and produced no stdout". + +This lets you use the registry for both file-writing CLIs (`whisper-cli`, `parakeet-asr`) and curl-style one-liners that emit transcript to stdout (`curl … | jq -r .text`). + +For `format: json` / `srt` / `vtt`, Hermes returns the raw file content as the `transcript` field. Extracting `.text` from JSON is out of scope for the runner — either configure `format: txt`, or post-process JSON downstream. + +#### STT command-provider optional keys + +| Key | Default | Meaning | +|-----------------|---------|------------------------------------------------------------------------------------------------------| +| `timeout` | `300` | Seconds; the process tree is killed on expiry (Unix `start_new_session`, Windows `taskkill /T`). | +| `format` | `txt` | One of `txt` / `json` / `srt` / `vtt`. Sets the extension of `{output_path}`. | +| `language` | `en` | Forwarded to `{language}`. Defaults to `stt.language` then `en`. | +| `model` | empty | Forwarded to `{model}`. The `model=` argument to `transcribe_audio()` overrides this. | + +#### STT command-provider behavior notes + +- **Built-ins always win.** Declaring `stt.providers.openai: type: command` does NOT override the real OpenAI Whisper handler. The built-in name is short-circuited before the command-provider resolver runs. +- **Process-tree cleanup.** A command running over `timeout` has its entire process tree killed, not just the shell wrapper. Long-running ASR pipelines that fork model-loading subprocesses are reaped reliably. +- **Shell-quoting is automatic.** Placeholders inside `'…'` get single-quote-safe escaping; inside `"…"` get `$`/`` ` ``/`"` escaping; outside quotes get `shlex.quote`. Don't pre-quote placeholder values. 
+ +#### STT command-provider security + +The shell command runs under the same user as Hermes with full filesystem access — same trust model as `tts.providers.<name>: type: command` and `HERMES_LOCAL_STT_COMMAND`. Only declare command providers from sources you trust. + ### Python plugin providers (STT) -For STT engines that aren't built-in (OpenRouter, SenseAudio, Gemini-STT, Deepgram, custom proprietary backends), register a Python plugin via `ctx.register_transcription_provider()`. The plugin **coexists with** the 6 built-in providers (`local`, `local_command`, `groq`, `openai`, `mistral`, `xai`) — those keep their native implementations and always win on name collision. +For STT engines that aren't built-in AND can't be expressed as a shell command (need a Python SDK, OAuth-refreshing auth, streaming chunks, etc.), register a Python plugin via `ctx.register_transcription_provider()`. The plugin **coexists with** the 6 built-in providers (`local`, `local_command`, `groq`, `openai`, `mistral`, `xai`) and the `stt.providers.<name>: type: command` registry — built-ins keep their native implementations and always win on name collision; command providers win over plugins of the same name (config is more local than plugin install). + +#### When to pick which (STT) + +| Backend has… | Use | +|--------------------------------------------------------------|------------------------------------------------------------------| +| A single shell command that takes an audio file and emits text | `stt.providers.<name>: type: command` (no Python needed) | +| Only the legacy single-command escape hatch is wanted | `HERMES_LOCAL_STT_COMMAND` env var (preserved for back-compat) | +| A Python SDK with no CLI | `register_transcription_provider()` plugin | +| OAuth-refreshing auth, streaming chunks, voice-list metadata | `register_transcription_provider()` plugin | +| A built-in already covers it (`local`, `groq`, `openai`, …) | Set `stt.provider: <name>` — built-ins are inline | #### Resolution order 1. 
**`stt.provider` is a built-in name** → built-in dispatch. **Always wins.** -2. **`stt.provider` matches a plugin-registered `TranscriptionProvider`** → plugin dispatch: +2. **`stt.provider` matches `stt.providers.<name>` with `command:` set** → command-provider runner (see [STT custom command providers](#stt-custom-command-providers)). Wins over a same-name plugin. +3. **`stt.provider` matches a plugin-registered `TranscriptionProvider`** → plugin dispatch: - if the plugin's `is_available()` returns `False` (missing creds or SDK), the call surfaces an unavailability error envelope identifying the plugin — **not** the generic "No STT provider available" message. - otherwise the plugin's `transcribe()` is called with `model` (from the public `model=` arg, falling back to `stt.<name>.model`) and `language` (from `stt.<name>.language`). -3. **No match** → "No STT provider available" error. +4. **No match** → "No STT provider available" error. #### Per-provider config namespace