diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 971c5e780..c87b9f5a9 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -613,6 +613,10 @@ DEFAULT_CONFIG = { }, # Text-to-speech configuration + # Each provider supports an optional `max_text_length:` override for the + # per-request input-character cap. Omit it to use the provider's documented + # limit (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k model-aware, + # Gemini 5000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000). "tts": { "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local) "edge": { diff --git a/tests/tools/test_tts_max_text_length.py b/tests/tools/test_tts_max_text_length.py new file mode 100644 index 000000000..38a763ea7 --- /dev/null +++ b/tests/tools/test_tts_max_text_length.py @@ -0,0 +1,197 @@ +"""Tests for per-provider TTS input-character limits. + +Replaces the old global ``MAX_TEXT_LENGTH = 4000`` cap that truncated every +provider at 4000 chars even though OpenAI allows 4096, xAI allows 15000, +MiniMax allows 10000, and ElevenLabs allows 5000-40000 depending on model. 
import json
from unittest.mock import patch

import pytest

from tools.tts_tool import (
    ELEVENLABS_MODEL_MAX_TEXT_LENGTH,
    FALLBACK_MAX_TEXT_LENGTH,
    PROVIDER_MAX_TEXT_LENGTH,
    _resolve_max_text_length,
)


def _stub_generator(monkeypatch, generator_name, config_factory):
    """Swap a provider generator in ``tools.tts_tool`` for a recording stub.

    The stub stores the text it receives, writes a single byte to the output
    path, and returns that path. ``_load_tts_config`` is replaced with
    *config_factory*. Returns the dict the stub records into.
    """
    seen = {}

    def _fake_generate(t, out, cfg):
        seen["text"] = t
        with open(out, "wb") as handle:
            handle.write(b"\x00")
        return out

    monkeypatch.setattr(f"tools.tts_tool.{generator_name}", _fake_generate)
    monkeypatch.setattr("tools.tts_tool._load_tts_config", config_factory)
    return seen


def _speak(tmp_path, text):
    """Call ``text_to_speech_tool`` writing under *tmp_path*; return parsed JSON."""
    # Imported lazily, after the monkeypatches above have been installed.
    from tools.tts_tool import text_to_speech_tool

    target = str(tmp_path / "out.mp3")
    return json.loads(text_to_speech_tool(text=text, output_path=target))


class TestResolveMaxTextLength:
    """Unit tests for the resolution order of ``_resolve_max_text_length``."""

    def test_edge_default(self):
        limit = _resolve_max_text_length("edge", {})
        assert limit == PROVIDER_MAX_TEXT_LENGTH["edge"]

    def test_openai_default_is_4096(self):
        limit = _resolve_max_text_length("openai", {})
        assert limit == 4096

    def test_xai_default_is_15000(self):
        limit = _resolve_max_text_length("xai", {})
        assert limit == 15000

    def test_minimax_default_is_10000(self):
        limit = _resolve_max_text_length("minimax", {})
        assert limit == 10000

    def test_mistral_default(self):
        limit = _resolve_max_text_length("mistral", {})
        assert limit == PROVIDER_MAX_TEXT_LENGTH["mistral"]

    def test_gemini_default(self):
        limit = _resolve_max_text_length("gemini", {})
        assert limit == PROVIDER_MAX_TEXT_LENGTH["gemini"]

    def test_unknown_provider_falls_back(self):
        limit = _resolve_max_text_length("does-not-exist", {})
        assert limit == FALLBACK_MAX_TEXT_LENGTH

    def test_empty_provider_falls_back(self):
        for blank in ("", None):
            assert _resolve_max_text_length(blank, {}) == FALLBACK_MAX_TEXT_LENGTH

    def test_case_insensitive(self):
        mixed_case = _resolve_max_text_length("OpenAI", {})
        padded_upper = _resolve_max_text_length("  XAI  ", {})
        assert mixed_case == 4096
        assert padded_upper == 15000

    # --- Overrides ---

    def test_override_wins(self):
        config = {"openai": {"max_text_length": 9999}}
        assert _resolve_max_text_length("openai", config) == 9999

    def test_override_zero_falls_through(self):
        # A zero override must never switch truncation off.
        config = {"openai": {"max_text_length": 0}}
        assert _resolve_max_text_length("openai", config) == 4096

    def test_override_negative_falls_through(self):
        config = {"xai": {"max_text_length": -1}}
        assert _resolve_max_text_length("xai", config) == 15000

    def test_override_non_int_falls_through(self):
        config = {"minimax": {"max_text_length": "lots"}}
        assert _resolve_max_text_length("minimax", config) == 10000

    def test_override_bool_falls_through(self):
        # ``True`` is an ``int`` instance; it must not be read as "1 char".
        config = {"openai": {"max_text_length": True}}
        assert _resolve_max_text_length("openai", config) == 4096

    def test_missing_provider_section_uses_default(self):
        config = {"provider": "openai"}  # deliberately no "openai" section
        assert _resolve_max_text_length("openai", config) == 4096

    # --- ElevenLabs model-aware ---

    def test_elevenlabs_default_model_multilingual_v2(self):
        config = {"elevenlabs": {"model_id": "eleven_multilingual_v2"}}
        assert _resolve_max_text_length("elevenlabs", config) == 10000

    def test_elevenlabs_flash_v2_5_gets_40k(self):
        config = {"elevenlabs": {"model_id": "eleven_flash_v2_5"}}
        assert _resolve_max_text_length("elevenlabs", config) == 40000

    def test_elevenlabs_flash_v2_gets_30k(self):
        config = {"elevenlabs": {"model_id": "eleven_flash_v2"}}
        assert _resolve_max_text_length("elevenlabs", config) == 30000

    def test_elevenlabs_v3_gets_5k(self):
        config = {"elevenlabs": {"model_id": "eleven_v3"}}
        assert _resolve_max_text_length("elevenlabs", config) == 5000

    def test_elevenlabs_unknown_model_falls_back_to_provider_default(self):
        config = {"elevenlabs": {"model_id": "eleven_experimental_xyz"}}
        limit = _resolve_max_text_length("elevenlabs", config)
        assert limit == PROVIDER_MAX_TEXT_LENGTH["elevenlabs"]

    def test_elevenlabs_override_beats_model_lookup(self):
        section = {"model_id": "eleven_flash_v2_5", "max_text_length": 1000}
        assert _resolve_max_text_length("elevenlabs", {"elevenlabs": section}) == 1000

    def test_elevenlabs_no_model_id_uses_default_model_mapping(self):
        # With no model_id configured the resolver uses
        # DEFAULT_ELEVENLABS_MODEL_ID (eleven_multilingual_v2 -> 10000).
        assert _resolve_max_text_length("elevenlabs", {}) == 10000

    def test_provider_config_not_a_dict(self):
        config = {"openai": "not-a-dict"}
        assert _resolve_max_text_length("openai", config) == 4096

    # --- Sanity: the table covers every provider listed in the schema ---

    def test_all_documented_providers_have_defaults(self):
        documented = {
            "edge", "openai", "xai", "minimax", "mistral",
            "gemini", "elevenlabs", "neutts", "kittentts",
        }
        undocumented = documented - set(PROVIDER_MAX_TEXT_LENGTH)
        assert not undocumented


class TestTextToSpeechToolTruncation:
    """End-to-end checks that ``text_to_speech_tool`` truncates using the
    per-provider resolver instead of the old 4000-char global."""

    def test_openai_truncates_at_4096_not_4000(self, tmp_path, monkeypatch, caplog):
        import logging

        caplog.set_level(logging.WARNING, logger="tools.tts_tool")
        seen = _stub_generator(
            monkeypatch,
            "_generate_openai_tts",
            lambda: {"provider": "openai"},
        )

        # 5000 chars: above OpenAI's 4096 cap, below xAI's 15k.
        result = _speak(tmp_path, "A" * 5000)

        assert result["success"] is True
        # Cut at 4096, not the former 4000.
        assert len(seen["text"]) == 4096
        # The warning names the provider.
        logged = [rec.message.lower() for rec in caplog.records]
        assert any("openai" in line for line in logged)

    def test_xai_accepts_much_longer_input(self, tmp_path, monkeypatch):
        seen = _stub_generator(
            monkeypatch,
            "_generate_xai_tts",
            lambda: {"provider": "xai"},
        )

        # 12000 chars: above the old global 4000, below xAI's 15000.
        result = _speak(tmp_path, "B" * 12000)

        assert result["success"] is True
        # The full input reaches the generator untouched.
        assert len(seen["text"]) == 12000

    def test_user_override_is_respected(self, tmp_path, monkeypatch):
        # The user caps openai at 100 chars; that cap must be honored.
        seen = _stub_generator(
            monkeypatch,
            "_generate_openai_tts",
            lambda: {"provider": "openai",
                     "openai": {"max_text_length": 100}},
        )

        result = _speak(tmp_path, "C" * 500)

        assert result["success"] is True
        assert len(seen["text"]) == 100
+# --------------------------------------------------------------------------- +PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = { + "edge": 5000, # edge-tts practical sync limit + "openai": 4096, # https://platform.openai.com/docs/guides/text-to-speech + "xai": 15000, # https://docs.x.ai/developers/model-capabilities/audio/text-to-speech + "minimax": 10000, # https://platform.minimax.io/docs/api-reference/speech-t2a-http (sync) + "mistral": 4000, # conservative; no published per-request cap + "gemini": 5000, # Gemini TTS caps at ~8k input tokens / ~655s audio + "elevenlabs": 10000, # fallback when model-aware lookup can't resolve (multilingual_v2) + "neutts": 2000, # local model, quality falls off on long text + "kittentts": 2000, # local 25MB model +} + +# ElevenLabs caps vary by model_id. https://elevenlabs.io/docs/overview/models +ELEVENLABS_MODEL_MAX_TEXT_LENGTH: Dict[str, int] = { + "eleven_v3": 5000, + "eleven_ttv_v3": 5000, + "eleven_multilingual_v2": 10000, + "eleven_multilingual_v1": 10000, + "eleven_english_sts_v2": 10000, + "eleven_english_sts_v1": 10000, + "eleven_flash_v2": 30000, + "eleven_flash_v2_5": 40000, +} + +# Final fallback when provider isn't recognised at all. +FALLBACK_MAX_TEXT_LENGTH = 4000 + +# Back-compat alias. Prefer ``_resolve_max_text_length()`` for new code. +MAX_TEXT_LENGTH = FALLBACK_MAX_TEXT_LENGTH + + +def _resolve_max_text_length( + provider: Optional[str], + tts_config: Optional[Dict[str, Any]] = None, +) -> int: + """Return the input-character cap for *provider*. + + Resolution order: + 1. ``tts..max_text_length`` (user override in config.yaml) + 2. ElevenLabs model-aware table (keyed on configured ``model_id``) + 3. ``PROVIDER_MAX_TEXT_LENGTH`` default + 4. ``FALLBACK_MAX_TEXT_LENGTH`` (4000) + + Non-positive or non-integer overrides fall through to the default so a + broken config can't accidentally disable truncation entirely. 
+ """ + if not provider: + return FALLBACK_MAX_TEXT_LENGTH + key = provider.lower().strip() + cfg = tts_config or {} + prov_cfg = cfg.get(key) if isinstance(cfg.get(key), dict) else {} + + override = prov_cfg.get("max_text_length") if prov_cfg else None + if isinstance(override, bool): + # bool is an int subclass; treat explicit booleans as "not set" + override = None + if isinstance(override, int) and override > 0: + return override + + if key == "elevenlabs": + model_id = (prov_cfg or {}).get("model_id") or DEFAULT_ELEVENLABS_MODEL_ID + mapped = ELEVENLABS_MODEL_MAX_TEXT_LENGTH.get(str(model_id).strip()) + if mapped: + return mapped + + return PROVIDER_MAX_TEXT_LENGTH.get(key, FALLBACK_MAX_TEXT_LENGTH) # =========================================================================== @@ -865,14 +938,19 @@ def text_to_speech_tool( if not text or not text.strip(): return tool_error("Text is required", success=False) - # Truncate very long text with a warning - if len(text) > MAX_TEXT_LENGTH: - logger.warning("TTS text too long (%d chars), truncating to %d", len(text), MAX_TEXT_LENGTH) - text = text[:MAX_TEXT_LENGTH] - tts_config = _load_tts_config() provider = _get_provider(tts_config) + # Truncate very long text with a warning. The cap is per-provider + # (OpenAI 4096, xAI 15k, MiniMax 10k, ElevenLabs model-aware, etc.). + max_len = _resolve_max_text_length(provider, tts_config) + if len(text) > max_len: + logger.warning( + "TTS text too long for provider %s (%d chars), truncating to %d", + provider, len(text), max_len, + ) + text = text[:max_len] + # Detect platform from gateway env var to choose the best output format. # Telegram voice bubbles require Opus (.ogg); OpenAI and ElevenLabs can # produce Opus natively (no ffmpeg needed). 
Edge TTS always outputs MP3 @@ -1191,6 +1269,14 @@ def stream_tts_to_speaker( voice_id = el_config.get("voice_id", voice_id) model_id = el_config.get("streaming_model_id", el_config.get("model_id", model_id)) + # Per-sentence cap for the streaming path. Look up the cap against + # the *streaming* model_id (defaults to eleven_flash_v2_5 = 40k chars), + # not the sync model_id. A user override + # (tts.elevenlabs.max_text_length) still wins. + stream_max_len = _resolve_max_text_length( + "elevenlabs", + {**tts_config, "elevenlabs": {**el_config, "model_id": model_id}}, + ) api_key = os.getenv("ELEVENLABS_API_KEY", "") if not api_key: @@ -1246,9 +1332,9 @@ def stream_tts_to_speaker( # Skip audio generation if no TTS client available if client is None: return - # Truncate very long sentences - if len(cleaned) > MAX_TEXT_LENGTH: - cleaned = cleaned[:MAX_TEXT_LENGTH] + # Truncate very long sentences (ElevenLabs streaming path) + if len(cleaned) > stream_max_len: + cleaned = cleaned[:stream_max_len] try: audio_iter = client.text_to_speech.convert( text=cleaned, @@ -1406,7 +1492,7 @@ TTS_SCHEMA = { "properties": { "text": { "type": "string", - "description": "The text to convert to speech. Keep under 4000 characters." + "description": "The text to convert to speech. Provider-specific character caps apply and are enforced automatically (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k depending on model); over-long input is truncated." }, "output_path": { "type": "string",