def _python_copy_command(output_placeholder: str = "{output_path}") -> str:
    """Build a shell command template that copies the input file to the output.

    The command invokes the running interpreter with an inline
    ``shutil.copyfile`` script, so no external tool is required.
    ``{input_path}`` is emitted as a literal template placeholder and
    *output_placeholder* is appended verbatim as the destination argument.
    """
    script = "import shutil, sys; shutil.copyfile(sys.argv[1], sys.argv[2])"
    pieces = [
        f'"{sys.executable}"',  # quoted so interpreter paths with spaces survive
        "-c",
        f'"{script}"',
        "{input_path}",
        output_placeholder,
    ]
    return " ".join(pieces)
"voxcpm") == {"command": "new"} + + def test_legacy_tts_name_block_still_resolves(self): + cfg = {"voxcpm": {"type": "command", "command": "legacy"}} + assert _get_named_provider_config(cfg, "voxcpm") == { + "type": "command", "command": "legacy" + } + + def test_builtin_names_do_not_leak_through_legacy_path(self): + """``tts.openai`` must never be mistaken for a command provider.""" + cfg = {"openai": {"command": "oops", "type": "command"}} + assert _get_named_provider_config(cfg, "openai") == {} + + +class TestIsCommandProviderConfig: + def test_empty_dict_is_false(self): + assert _is_command_provider_config({}) is False + + def test_non_dict_is_false(self): + assert _is_command_provider_config("foo") is False + assert _is_command_provider_config(None) is False + + def test_type_mismatch_is_false(self): + assert _is_command_provider_config({"type": "native", "command": "x"}) is False + + +# --------------------------------------------------------------------------- +# _iter_command_providers / _has_any_command_tts_provider +# --------------------------------------------------------------------------- + +class TestIterCommandProviders: + def test_iterates_only_user_command_providers(self): + cfg = { + "providers": { + "openai": {"type": "command", "command": "shouldnt show up"}, + "piper": {"type": "command", "command": "piper"}, + "voxcpm": {"type": "command", "command": "voxcpm"}, + "broken": {"type": "command", "command": ""}, + }, + } + names = sorted(name for name, _ in _iter_command_providers(cfg)) + assert names == ["piper", "voxcpm"] + + def test_has_any_command_provider_detects_declared(self): + cfg = {"providers": {"piper": {"type": "command", "command": "piper"}}} + assert _has_any_command_tts_provider(cfg) is True + + def test_has_any_command_provider_when_none(self): + assert _has_any_command_tts_provider({"providers": {}}) is False + assert _has_any_command_tts_provider({}) is False + + +# 
--------------------------------------------------------------------------- +# config getters +# --------------------------------------------------------------------------- + +class TestConfigGetters: + def test_timeout_defaults(self): + assert _get_command_tts_timeout({}) == float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS) + + def test_timeout_coerces_string(self): + assert _get_command_tts_timeout({"timeout": "45"}) == 45.0 + + def test_timeout_rejects_non_positive(self): + assert _get_command_tts_timeout({"timeout": 0}) == float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS) + assert _get_command_tts_timeout({"timeout": -1}) == float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS) + + def test_timeout_rejects_garbage(self): + assert _get_command_tts_timeout({"timeout": "fast"}) == float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS) + + def test_timeout_seconds_alias(self): + assert _get_command_tts_timeout({"timeout_seconds": 90}) == 90.0 + + def test_output_format_defaults(self): + assert _get_command_tts_output_format({}) == DEFAULT_COMMAND_TTS_OUTPUT_FORMAT + + def test_output_format_path_override(self): + assert _get_command_tts_output_format({}, "/tmp/clip.wav") == "wav" + + def test_output_format_unknown_path_falls_back_to_config(self): + assert _get_command_tts_output_format({"format": "ogg"}, "/tmp/clip.xyz") == "ogg" + + def test_output_format_rejects_unknown(self): + assert _get_command_tts_output_format({"format": "m4a"}) == DEFAULT_COMMAND_TTS_OUTPUT_FORMAT + + def test_output_format_supported_set(self): + assert COMMAND_TTS_OUTPUT_FORMATS == frozenset({"mp3", "wav", "ogg", "flac"}) + + def test_voice_compatible_boolean(self): + assert _is_command_tts_voice_compatible({"voice_compatible": True}) is True + assert _is_command_tts_voice_compatible({"voice_compatible": False}) is False + + def test_voice_compatible_string(self): + assert _is_command_tts_voice_compatible({"voice_compatible": "yes"}) is True + assert _is_command_tts_voice_compatible({"voice_compatible": "0"}) is False + + 
class TestShellQuoteContext:
    """Quote-state detection at the offset of a template placeholder."""

    _MARKER = "{output_path}"

    def _context_at_marker(self, template):
        # The quote context is evaluated at the offset where the marker starts.
        return _shell_quote_context(template, template.index(self._MARKER))

    def test_bare_context(self):
        assert self._context_at_marker('tts {output_path}') is None

    def test_inside_single_quotes(self):
        assert self._context_at_marker("tts '{output_path}'") == "'"

    def test_inside_double_quotes(self):
        assert self._context_at_marker('tts "{output_path}"') == '"'

    def test_escaped_double_quote_inside_double(self):
        # The backslash-escaped quote must not close the double-quoted region.
        assert self._context_at_marker(r'tts "foo \" {output_path}"') == '"'
test_substitutes_all_placeholders(self): + placeholders = { + "input_path": "/tmp/in.txt", + "text_path": "/tmp/in.txt", + "output_path": "/tmp/out.mp3", + "format": "mp3", + "voice": "af_sky", + "model": "tiny", + "speed": "1.0", + } + rendered = _render_command_tts_template( + "tts --voice {voice} --in {input_path} --out {output_path}", + placeholders, + ) + assert "af_sky" in rendered + assert "/tmp/out.mp3" in rendered + + def test_quotes_paths_with_spaces(self): + placeholders = { + "input_path": "/tmp/Jane Doe/in.txt", + "text_path": "/tmp/Jane Doe/in.txt", + "output_path": "/tmp/out.mp3", + "format": "mp3", + "voice": "", + "model": "", + "speed": "1.0", + } + rendered = _render_command_tts_template( + "tts --in {input_path} --out {output_path}", + placeholders, + ) + # shlex.quote wraps space-containing paths in single quotes on POSIX. + if os.name != "nt": + assert "'/tmp/Jane Doe/in.txt'" in rendered + + def test_literal_braces_survive(self): + placeholders = { + "input_path": "/tmp/in.txt", "text_path": "/tmp/in.txt", + "output_path": "/tmp/out.mp3", "format": "mp3", + "voice": "", "model": "", "speed": "1.0", + } + rendered = _render_command_tts_template( + "echo '{{not a placeholder}}' && tts --in {input_path}", + placeholders, + ) + assert "{not a placeholder}" in rendered + + def test_injection_is_neutralized(self): + """Embedded shell metacharacters in a placeholder value must be quoted.""" + placeholders = { + "input_path": "/tmp/in.txt", "text_path": "/tmp/in.txt", + "output_path": "/tmp/out; rm -rf /", + "format": "mp3", + "voice": "$(whoami)", "model": "", "speed": "1.0", + } + rendered = _render_command_tts_template( + "tts --voice {voice} --out {output_path}", + placeholders, + ) + # The injection payload must not appear unquoted in the rendered + # command. On POSIX shlex.quote wraps the value in single quotes. 
class TestGenerateCommandTts:
    """End-to-end checks that run small real shell commands."""

    @staticmethod
    def _synthesize(tmp_path, provider_name, provider_config):
        """Run *provider_config* for the text "hello" targeting ``x.mp3``."""
        return _generate_command_tts(
            "hello",
            str(tmp_path / "x.mp3"),
            provider_name,
            provider_config,
            {},
        )

    def test_writes_output_file(self, tmp_path):
        target = tmp_path / "clip.mp3"
        produced = _generate_command_tts(
            "hello world",
            str(target),
            "py-copy",
            {"command": _python_copy_command()},
            {},
        )
        assert produced == str(target)
        assert target.exists()
        # The copy command duplicates the input text file, so the "audio"
        # file holds the original UTF-8 text.
        assert target.read_text(encoding="utf-8") == "hello world"

    def test_empty_command_raises(self, tmp_path):
        with pytest.raises(ValueError, match="is not configured"):
            self._synthesize(tmp_path, "empty", {"command": " "})

    def test_nonzero_exit_raises_runtime(self, tmp_path):
        failing = {"command": f'"{sys.executable}" -c "import sys; sys.exit(3)"'}
        with pytest.raises(RuntimeError, match="exited with code 3"):
            self._synthesize(tmp_path, "failing", failing)

    def test_empty_output_raises_runtime(self, tmp_path):
        # Exit status is zero, but nothing is written to the output path.
        silent = {"command": f'"{sys.executable}" -c "pass"'}
        with pytest.raises(RuntimeError, match="produced no output"):
            self._synthesize(tmp_path, "silent", silent)

    @pytest.mark.skipif(os.name == "nt", reason="POSIX-only timeout semantics")
    def test_timeout_raises_runtime(self, tmp_path):
        slow = {
            "command": f'"{sys.executable}" -c "import time; time.sleep(10)"',
            "timeout": 1,
        }
        with pytest.raises(RuntimeError, match="timed out"):
            self._synthesize(tmp_path, "slow", slow)
+ def fake_load(): + return cfg["tts"] + + with patch("tools.tts_tool._load_tts_config", fake_load): + result = text_to_speech_tool(text="hi", output_path=str(out)) + data = json.loads(result) + assert data["success"] is True, data + assert data["provider"] == "py-copy" + assert data["voice_compatible"] is False + assert Path(data["file_path"]).exists() + + def test_voice_compatible_opt_in_toggles_flag(self, tmp_path): + """voice_compatible=true is reflected in the response when the + file is already .ogg (no ffmpeg needed).""" + cfg = { + "provider": "py-copy-ogg", + "providers": { + "py-copy-ogg": { + "type": "command", + "command": _python_copy_command(), + "output_format": "ogg", + "voice_compatible": True, + }, + }, + } + out = tmp_path / "clip.ogg" + + with patch("tools.tts_tool._load_tts_config", return_value=cfg): + result = text_to_speech_tool(text="hi", output_path=str(out)) + data = json.loads(result) + assert data["success"] is True + assert data["voice_compatible"] is True + assert data["media_tag"].startswith("[[audio_as_voice]]") + + def test_missing_command_falls_through_to_builtin(self, tmp_path): + """A provider entry with an empty command is not a command + provider; the tool should not raise a "command not configured" + error but fall through to the built-in resolution path.""" + cfg = { + "provider": "broken", + "providers": { + "broken": {"type": "command", "command": " "}, + }, + } + with patch("tools.tts_tool._load_tts_config", return_value=cfg): + result = text_to_speech_tool(text="hi", output_path=str(tmp_path / "x.mp3")) + data = json.loads(result) + # The response should not carry the command-provider error text. 
class TestCheckTtsRequirements:
    """``check_tts_requirements`` in the presence of a command provider."""

    def test_configured_command_provider_satisfies_requirement(self):
        provider = {"type": "command", "command": "echo x"}
        tts_config = {"providers": {"x": provider}}
        # Replace the config loader so the check sees only this provider.
        loader_patch = patch(
            "tools.tts_tool._load_tts_config", return_value=tts_config,
        )
        with loader_patch:
            outcome = check_tts_requirements()
        assert outcome is True
Output formats: - Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS) @@ -32,7 +41,9 @@ import logging import os import queue import re +import shlex import shutil +import signal import subprocess import tempfile import threading @@ -181,9 +192,13 @@ def _resolve_max_text_length( Resolution order: 1. ``tts..max_text_length`` (user override in config.yaml) - 2. ElevenLabs model-aware table (keyed on configured ``model_id``) - 3. ``PROVIDER_MAX_TEXT_LENGTH`` default - 4. ``FALLBACK_MAX_TEXT_LENGTH`` (4000) + 2. ``tts.providers..max_text_length`` for user-declared + command providers + 3. ElevenLabs model-aware table (keyed on configured ``model_id``) + 4. ``PROVIDER_MAX_TEXT_LENGTH`` default + 5. ``DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH`` when the provider is a + command-type user provider without an explicit cap + 6. ``FALLBACK_MAX_TEXT_LENGTH`` (4000) Non-positive or non-integer overrides fall through to the default so a broken config can't accidentally disable truncation entirely. @@ -192,11 +207,12 @@ def _resolve_max_text_length( return FALLBACK_MAX_TEXT_LENGTH key = provider.lower().strip() cfg = tts_config or {} - prov_cfg = cfg.get(key) if isinstance(cfg.get(key), dict) else {} + # Built-in-style override at tts..max_text_length wins first, + # matching historical behavior. + prov_cfg = cfg.get(key) if isinstance(cfg.get(key), dict) else {} override = prov_cfg.get("max_text_length") if prov_cfg else None if isinstance(override, bool): - # bool is an int subclass; treat explicit booleans as "not set" override = None if isinstance(override, int) and override > 0: return override @@ -207,7 +223,21 @@ def _resolve_max_text_length( if mapped: return mapped - return PROVIDER_MAX_TEXT_LENGTH.get(key, FALLBACK_MAX_TEXT_LENGTH) + if key in PROVIDER_MAX_TEXT_LENGTH: + return PROVIDER_MAX_TEXT_LENGTH[key] + + # User-declared command provider (under tts.providers.) 
+ if key not in BUILTIN_TTS_PROVIDERS: + named = _get_named_provider_config(cfg, key) + if _is_command_provider_config(named): + named_override = named.get("max_text_length") + if isinstance(named_override, bool): + named_override = None + if isinstance(named_override, int) and named_override > 0: + return named_override + return DEFAULT_COMMAND_TTS_MAX_TEXT_LENGTH + + return FALLBACK_MAX_TEXT_LENGTH # =========================================================================== @@ -237,6 +267,408 @@ def _get_provider(tts_config: Dict[str, Any]) -> str: return (tts_config.get("provider") or DEFAULT_PROVIDER).lower().strip() +# =========================================================================== +# Custom command providers (type: command under tts.providers.) +# =========================================================================== +# +# Users can declare any number of command-type providers alongside the +# built-ins so they can plug any local CLI (Piper, VoxCPM, Kokoro CLIs, +# custom voice-cloning scripts, etc.) into Hermes without any Python code +# changes. The config shape is:: +# +# tts: +# provider: piper-en +# providers: +# piper-en: +# type: command +# command: "piper -m ~/model.onnx -f {output_path} < {input_path}" +# output_format: wav +# +# Hermes writes the input text to a temp UTF-8 file, runs the command with +# placeholder substitution, and reads the audio file the command wrote to +# ``{output_path}``. Supported placeholders: ``{input_path}``, +# ``{text_path}`` (alias for input_path), ``{output_path}``, ``{format}``, +# ``{voice}``, ``{model}``, ``{speed}``. Use ``{{`` / ``}}`` for literal braces. +# +# Built-in provider names always win over an entry with the same name under +# ``tts.providers``, so user config can't silently shadow ``edge`` etc. +# +# Placeholder values are shell-quoted for their surrounding context +# (bare / single / double quote), so paths with spaces work transparently. + +# Built-in provider names. 
def _get_provider_section(tts_config: Dict[str, Any], name: str) -> Dict[str, Any]:
    """Fetch the mapping stored under *name* in *tts_config*.

    Anything that is not a dict -- a non-dict *tts_config*, a missing key,
    or a non-dict value -- yields an empty dict, so callers can chain
    ``.get()`` without type checks. A dict value is returned as-is (not
    copied).
    """
    candidate = tts_config.get(name) if isinstance(tts_config, dict) else None
    if isinstance(candidate, dict):
        return candidate
    return {}
def _get_command_tts_timeout(config: Dict[str, Any]) -> float:
    """Return the command timeout in seconds, falling back when invalid.

    ``timeout`` takes precedence over its alias ``timeout_seconds``; a key
    that is absent or explicitly ``None`` (e.g. an empty YAML value) is
    skipped so the alias can still apply.

    The default ``DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS`` is returned when the
    selected value is:

    * a boolean (``float(True)`` would otherwise become a 1-second timeout),
    * not convertible to ``float``,
    * non-finite (``nan`` never compares as expired, ``inf`` is not a usable
      deadline), or
    * zero or negative.
    """
    import math

    default = float(DEFAULT_COMMAND_TTS_TIMEOUT_SECONDS)

    raw = None
    for key in ("timeout", "timeout_seconds"):
        raw = config.get(key)
        if raw is not None:
            break
    if raw is None or isinstance(raw, bool):
        return default

    try:
        value = float(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large to be represented as a float.
        return default

    if not math.isfinite(value) or value <= 0:
        return default
    return value
def _shell_quote_context(command_template: str, position: int) -> Optional[str]:
    """Return the shell quote character active right before *position*.

    Scans ``command_template[:position]`` and tracks quoting state:

    * inside single quotes only a closing ``'`` ends the region (a
      backslash is an ordinary character there);
    * inside double quotes and in bare context a backslash consumes the
      following character, so an escaped quote does not toggle state.

    Returns ``"'"`` or ``'"'`` when *position* lies inside a single- or
    double-quoted region, ``None`` for bare context.
    """
    active: Optional[str] = None
    skip_next = False
    for char in command_template[:position]:
        if skip_next:
            # Character escaped by a preceding backslash.
            skip_next = False
        elif active == "'":
            if char == "'":
                active = None
        elif char == "\\":
            skip_next = True
        elif active == '"':
            if char == '"':
                active = None
        elif char in ("'", '"'):
            active = char
    return active


def _quote_command_tts_placeholder(value: str, quote_context: Optional[str]) -> str:
    """Quote a placeholder value for its position in a shell command template.

    Args:
        value: Raw placeholder value.
        quote_context: ``"'"`` / ``'"'`` when the placeholder sits inside a
            quoted region of the template, ``None`` for bare context.

    Returns:
        *value* escaped so it stays a single word in that context:

        * single-quoted: each ``'`` becomes ``'\\''`` (close, escaped quote,
          reopen);
        * double-quoted: backslash, ``"``, ``$`` and backtick are
          backslash-escaped;
        * bare: ``shlex.quote`` on POSIX, ``subprocess.list2cmdline`` on
          Windows.
    """
    if quote_context == "'":
        return value.replace("'", r"'\''")
    if quote_context == '"':
        # One pass: prefix every character that is special inside double
        # quotes with a backslash.
        return re.sub(r'([\\"$`])', r"\\\1", value)
    if os.name == "nt":
        # NOTE(review): list2cmdline applies MS C runtime argument quoting;
        # it does not appear to escape cmd.exe metacharacters -- confirm this
        # is sufficient for values that are not fully trusted.
        return subprocess.list2cmdline([value])
    return shlex.quote(value)


def _render_command_tts_template(
    command_template: str,
    placeholders: Dict[str, str],
) -> str:
    """Replace supported placeholders while preserving ``{{`` / ``}}``.

    Both ``{name}`` and ``{{name}}`` are substituted when *name* is a key of
    *placeholders*; every other ``{{`` / ``}}`` collapses to a literal
    ``{`` / ``}``. Each value is quoted for the shell-quote context found in
    *command_template* at the placeholder's offset.

    Substitution happens in a single regex pass, so inserted values are
    never rescanned: a value that contains braces or placeholder-like text
    is emitted exactly as quoted.
    """
    alternatives = []
    if placeholders:
        names = "|".join(re.escape(name) for name in placeholders)
        # The doubled form is listed first so ``{{name}}`` is consumed whole
        # instead of being split into ``{`` + ``{name}`` + ``}``.
        alternatives.append(rf"\{{\{{(?P<double>{names})\}}\}}")
        alternatives.append(rf"\{{(?P<single>{names})\}}")
    alternatives.append(r"(?P<literal>\{\{|\}\})")
    pattern = re.compile("|".join(alternatives))

    def substitute(match: "re.Match[str]") -> str:
        literal = match.group("literal")
        if literal is not None:
            # ``{{`` -> ``{`` and ``}}`` -> ``}``.
            return literal[0]
        name = match.group("double")
        if name is None:
            name = match.group("single")
        return _quote_command_tts_placeholder(
            str(placeholders[name]),
            _shell_quote_context(command_template, match.start()),
        )

    return pattern.sub(substitute, command_template)
def _run_command_tts(command: str, timeout: float) -> subprocess.CompletedProcess:
    """Run a command-provider shell command with process-tree timeout cleanup.

    The command is executed through the shell with stdout/stderr captured
    as text. Bytes that cannot be decoded are replaced rather than raising,
    so diagnostic output in an unexpected encoding cannot turn a successful
    run into a failure.

    Args:
        command: Fully rendered shell command line.
        timeout: Seconds to wait before the process tree is terminated.

    Returns:
        A ``CompletedProcess`` holding the captured output of a command that
        exited with status 0.

    Raises:
        subprocess.TimeoutExpired: The command exceeded *timeout*; the
            process tree is terminated first and any output collected is
            attached.
        subprocess.CalledProcessError: The command exited non-zero.
    """
    popen_kwargs: Dict[str, Any] = {
        "shell": True,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
        "text": True,
        "errors": "replace",
    }
    # Start the shell in its own group/session so the whole tree (shell plus
    # whatever it spawns) can be signalled together on timeout.
    if os.name == "nt":
        popen_kwargs["creationflags"] = getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0)
    else:
        popen_kwargs["start_new_session"] = True

    proc = subprocess.Popen(command, **popen_kwargs)
    try:
        stdout, stderr = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired as exc:
        _terminate_command_tts_process_tree(proc)
        try:
            # Drain whatever the process wrote before it was terminated.
            stdout, stderr = proc.communicate(timeout=1)
        except Exception:
            stdout = getattr(exc, "output", None)
            stderr = getattr(exc, "stderr", None)
        raise subprocess.TimeoutExpired(
            command,
            timeout,
            output=stdout,
            stderr=stderr,
        ) from exc
    except BaseException:
        # Any other interruption (e.g. KeyboardInterrupt) must not leave the
        # command running in the background.
        _terminate_command_tts_process_tree(proc)
        raise

    if proc.returncode:
        raise subprocess.CalledProcessError(
            proc.returncode,
            command,
            output=stdout,
            stderr=stderr,
        )
    return subprocess.CompletedProcess(command, proc.returncode, stdout, stderr)
provider_name: str, + config: Dict[str, Any], + tts_config: Dict[str, Any], +) -> str: + """Generate speech by running a user-configured shell command. + + Returns the absolute path of the audio file the command wrote. + Raises ``ValueError`` when the provider config is invalid, and + ``RuntimeError`` for timeouts / non-zero exits / empty output. + """ + command_template = str(config.get("command") or "").strip() + if not command_template: + raise ValueError( + f"tts.providers.{provider_name}.command is not configured" + ) + + output = Path(output_path).expanduser() + output.parent.mkdir(parents=True, exist_ok=True) + if output.exists(): + output.unlink() + + timeout = _get_command_tts_timeout(config) + output_format = _get_command_tts_output_format(config, str(output)) + speed = config.get("speed", tts_config.get("speed", "")) + + with tempfile.TemporaryDirectory() as tmpdir: + text_path = Path(tmpdir) / "input.txt" + text_path.write_text(text, encoding="utf-8") + + placeholders = { + "input_path": str(text_path), + "text_path": str(text_path), + "output_path": str(output), + "format": output_format, + "voice": str(config.get("voice", "")), + "model": str(config.get("model", "")), + "speed": str(speed), + } + command = _render_command_tts_template(command_template, placeholders) + + try: + _run_command_tts(command, timeout) + except subprocess.TimeoutExpired as exc: + raise RuntimeError( + f"TTS provider '{provider_name}' timed out after {timeout:g}s" + ) from exc + except subprocess.CalledProcessError as exc: + detail_parts = [] + if exc.stderr: + detail_parts.append(f"stderr: {exc.stderr.strip()}") + if exc.stdout: + detail_parts.append(f"stdout: {exc.stdout.strip()}") + detail = "; ".join(detail_parts) or "no command output" + raise RuntimeError( + f"TTS provider '{provider_name}' exited with code " + f"{exc.returncode}: {detail}" + ) from exc + + if not output.exists() or output.stat().st_size <= 0: + raise RuntimeError( + f"TTS provider '{provider_name}' 
produced no output at {output}" + ) + return str(output) + + +def _has_any_command_tts_provider(tts_config: Optional[Dict[str, Any]] = None) -> bool: + """Return True when any command-type TTS provider is configured.""" + if tts_config is None: + tts_config = _load_tts_config() + for _name, _cfg in _iter_command_providers(tts_config): + return True + return False + + # =========================================================================== # ffmpeg Opus conversion (Edge TTS MP3 -> OGG Opus for Telegram) # =========================================================================== @@ -954,6 +1386,12 @@ def text_to_speech_tool( tts_config = _load_tts_config() provider = _get_provider(tts_config) + # User-declared command provider (type: command under tts.providers.) + # resolves BEFORE the built-in dispatch. Built-in names short-circuit here + # so a user's ``tts.providers.openai.command`` can't override the real + # OpenAI handler. + command_provider_config = _resolve_command_provider_config(provider, tts_config) + # Truncate very long text with a warning. The cap is per-provider # (OpenAI 4096, xAI 15k, MiniMax 10k, ElevenLabs model-aware, etc.). max_len = _resolve_max_text_length(provider, tts_config) @@ -975,13 +1413,23 @@ def text_to_speech_tool( # Determine output path if output_path: file_path = Path(output_path).expanduser() + if command_provider_config is not None: + # Respect caller-supplied path but align the extension with the + # provider's configured output_format so the command writes to a + # path the caller actually expects. 
+ file_path = _configured_command_tts_output_path( + file_path, command_provider_config + ) else: timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") out_dir = Path(DEFAULT_OUTPUT_DIR) out_dir.mkdir(parents=True, exist_ok=True) + if command_provider_config is not None: + fmt = _get_command_tts_output_format(command_provider_config) + file_path = out_dir / f"tts_{timestamp}.{fmt}" # Use .ogg for Telegram with providers that support native Opus output, # otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later). - if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"): + elif want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"): file_path = out_dir / f"tts_{timestamp}.ogg" else: file_path = out_dir / f"tts_{timestamp}.mp3" @@ -992,7 +1440,15 @@ def text_to_speech_tool( try: # Generate audio with the configured provider - if provider == "elevenlabs": + if command_provider_config is not None: + logger.info( + "Generating speech with command TTS provider '%s'...", provider, + ) + file_str = _generate_command_tts( + text, file_str, provider, command_provider_config, tts_config, + ) + + elif provider == "elevenlabs": try: _import_elevenlabs() except ImportError: @@ -1100,7 +1556,17 @@ def text_to_speech_tool( # Try Opus conversion for Telegram compatibility # Edge TTS outputs MP3, NeuTTS/KittenTTS output WAV — all need ffmpeg conversion voice_compatible = False - if provider in ("edge", "neutts", "minimax", "xai", "kittentts") and not file_str.endswith(".ogg"): + if command_provider_config is not None: + # Command providers are documents by default. Voice-bubble + # delivery only kicks in when the user explicitly opts in + # via ``voice_compatible: true`` in their provider config. 
+ if _is_command_tts_voice_compatible(command_provider_config): + if not file_str.endswith(".ogg"): + opus_path = _convert_to_opus(file_str) + if opus_path: + file_str = opus_path + voice_compatible = file_str.endswith(".ogg") + elif provider in ("edge", "neutts", "minimax", "xai", "kittentts") and not file_str.endswith(".ogg"): opus_path = _convert_to_opus(file_str) if opus_path: file_str = opus_path @@ -1149,11 +1615,15 @@ def check_tts_requirements() -> bool: Check if at least one TTS provider is available. Edge TTS needs no API key and is the default, so if the package - is installed, TTS is available. + is installed, TTS is available. A user-declared command provider + also satisfies the requirement. Returns: bool: True if at least one provider can work. """ + # Any configured command provider counts as available. + if _has_any_command_tts_provider(): + return True try: _import_edge_tts() return True @@ -1499,7 +1969,7 @@ from tools.registry import registry, tool_error TTS_SCHEMA = { "name": "text_to_speech", - "description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as a voice message. On Telegram it plays as a voice bubble, on Discord/WhatsApp as an audio attachment. In CLI mode, saves to ~/voice-memos/. Voice and provider are user-configured, not model-selected.", + "description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as native audio. Compatible providers render as a voice bubble on Telegram; otherwise audio is sent as a regular attachment. In CLI mode, saves to ~/voice-memos/. 
Voice and provider are user-configured (built-in providers like edge/openai or custom command providers under tts.providers.), not model-selected.", "parameters": { "type": "object", "properties": { diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 0a49dc6983..2a77edc4c1 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -116,6 +116,73 @@ Without ffmpeg, Edge TTS, MiniMax TTS, NeuTTS, and KittenTTS audio are sent as r If you want voice bubbles without installing ffmpeg, switch to the OpenAI, ElevenLabs, or Mistral provider. ::: +### Custom command providers + +If a TTS engine you want isn't natively supported (Piper, VoxCPM, MLX-Kokoro, XTTS CLI, a voice-cloning script, anything else that exposes a CLI), you can wire it in as a **command-type provider** without writing any Python. Hermes writes the input text to a temp UTF-8 file, runs your shell command, and reads the audio file the command produced. + +Declare one or more providers under `tts.providers.<name>` and switch between them with `tts.provider: <name>` — the same way you switch between built-ins like `edge` and `openai`. + +```yaml +tts: + provider: piper-en # pick any name under tts.providers + providers: + piper-en: + type: command + command: "piper -m ~/models/en_US-amy.onnx -f {output_path} < {input_path}" + output_format: wav + + voxcpm: + type: command + command: "voxcpm --ref ~/voice.wav --text-file {input_path} --out {output_path}" + output_format: mp3 + timeout: 180 + voice_compatible: true # try to deliver as a Telegram voice bubble + + mlx-kokoro: + type: command + command: "python -m mlx_kokoro --in {input_path} --out {output_path} --voice {voice}" + voice: af_sky + output_format: wav +``` + +#### Placeholders + +Your command template can reference these placeholders. 
Hermes substitutes them at render time and shell-quotes each value for the surrounding context (bare / single-quoted / double-quoted), so paths with spaces and other shell-sensitive characters are safe. + +| Placeholder | Meaning | +|------------------|------------------------------------------------------| +| `{input_path}` | Path to the temp UTF-8 text file Hermes wrote | +| `{text_path}` | Alias for `{input_path}` | +| `{output_path}` | Path the command must write audio to | +| `{format}` | `mp3` / `wav` / `ogg` / `flac` | +| `{voice}` | `tts.providers.<name>.voice`, empty when unset | +| `{model}` | `tts.providers.<name>.model`, empty when unset | +| `{speed}` | Resolved speed multiplier (provider or global) | + +Use `{{` and `}}` for literal braces. + +#### Optional keys + +| Key | Default | Meaning | +|--------------------|---------|------------------------------------------------------------------------------------------------------------| +| `timeout` | `120` | Seconds; the process tree is killed on expiry (Unix `killpg`, Windows `taskkill /T`). | +| `output_format` | `mp3` | One of `mp3` / `wav` / `ogg` / `flac`. Auto-inferred from the output extension if Hermes picks a path. | +| `voice_compatible` | `false` | When `true`, Hermes converts MP3/WAV output to Opus/OGG via ffmpeg so Telegram renders a voice bubble. | +| `max_text_length` | `5000` | Input is truncated to this length before rendering the command. | +| `voice` / `model` | empty | Passed to the command as placeholder values only. | + +#### Behavior notes + +- **Built-in names always win.** A `tts.providers.openai` entry never shadows the native OpenAI provider, so no user config can silently replace a built-in. +- **Default delivery is a document.** Command providers deliver as regular audio attachments on every platform. Opt in to voice-bubble delivery per-provider with `voice_compatible: true`. 
+- **Command failures surface to the agent.** Non-zero exit, empty output, or timeout all return an error; for a non-zero exit the error includes the command's stderr/stdout so you can debug the provider from the conversation. +- **`type: command` is the default when `command:` is set.** Writing `type: command` explicitly is good practice but not required; an entry with a non-empty `command` string is treated as a command provider. +- **`{input_path}` / `{text_path}` are interchangeable.** Use whichever reads better in your command. + +#### Security + +Command-type providers run whatever shell command you configure, with your user's permissions. Hermes quotes placeholder values and enforces the configured timeout, but the command template itself is trusted local input — treat it the same way you would a shell script on your PATH. + ## Voice Message Transcription (STT) Voice messages sent on Telegram, Discord, WhatsApp, Slack, or Signal are automatically transcribed and injected as text into the conversation. The agent sees the transcript as normal text.