mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(tts): use per-provider input-character caps instead of global 4000 (#13743)
A single global MAX_TEXT_LENGTH = 4000 truncated every TTS provider at
4000 chars, causing long inputs to be silently chopped even though the
underlying APIs allow much more:
- OpenAI: 4096
- xAI: 15000
- MiniMax: 10000
- ElevenLabs: 5000 / 10000 / 30000 / 40000 (model-aware)
- Gemini: ~5000
- Edge: ~5000
The schema description also told the model 'Keep under 4000 characters',
which encouraged the agent to self-chunk long briefs into multiple TTS
calls (producing 3 separate audio files instead of one).
New behavior:
- PROVIDER_MAX_TEXT_LENGTH table + ELEVENLABS_MODEL_MAX_TEXT_LENGTH
encode the documented per-provider limits.
- _resolve_max_text_length(provider, cfg) resolves:
1. tts.<provider>.max_text_length user override
2. ElevenLabs model_id lookup
3. provider default
4. 4000 fallback
- text_to_speech_tool() and stream_tts_to_speaker() both call the
resolver; old MAX_TEXT_LENGTH alias kept for back-compat.
- Schema description no longer hardcodes 4000.
Tests: 27 new unit + E2E tests; all 53 existing TTS tests and 253
voice-command/voice-cli tests still pass.
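
For reference, a minimal sketch of how the override and the per-provider defaults resolve (values here are illustrative; the keys and numbers come from the resolver and tests in the diff below):

    from tools.tts_tool import _resolve_max_text_length

    # A tts config section with an explicit user override for one provider.
    tts_config = {
        "provider": "openai",
        "openai": {"max_text_length": 2000},  # illustrative override value
    }

    assert _resolve_max_text_length("openai", tts_config) == 2000   # 1. override wins
    assert _resolve_max_text_length("xai", tts_config) == 15000     # 3. provider default
    assert _resolve_max_text_length("bogus", tts_config) == 4000    # 4. final fallback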
This commit is contained in:
parent 1e5daa4ece
commit 8f167e8791
3 changed files with 297 additions and 10 deletions
@@ -613,6 +613,10 @@ DEFAULT_CONFIG = {
     },
 
     # Text-to-speech configuration
+    # Each provider supports an optional `max_text_length:` override for the
+    # per-request input-character cap. Omit it to use the provider's documented
+    # limit (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k model-aware,
+    # Gemini 5000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000).
     "tts": {
         "provider": "edge",  # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "neutts" (local)
         "edge": {
tests/tools/test_tts_max_text_length.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""Tests for per-provider TTS input-character limits.

Replaces the old global ``MAX_TEXT_LENGTH = 4000`` cap that truncated every
provider at 4000 chars even though OpenAI allows 4096, xAI allows 15000,
MiniMax allows 10000, and ElevenLabs allows 5000-40000 depending on model.
"""

import json
from unittest.mock import patch

import pytest

from tools.tts_tool import (
    ELEVENLABS_MODEL_MAX_TEXT_LENGTH,
    FALLBACK_MAX_TEXT_LENGTH,
    PROVIDER_MAX_TEXT_LENGTH,
    _resolve_max_text_length,
)


class TestResolveMaxTextLength:
    def test_edge_default(self):
        assert _resolve_max_text_length("edge", {}) == PROVIDER_MAX_TEXT_LENGTH["edge"]

    def test_openai_default_is_4096(self):
        assert _resolve_max_text_length("openai", {}) == 4096

    def test_xai_default_is_15000(self):
        assert _resolve_max_text_length("xai", {}) == 15000

    def test_minimax_default_is_10000(self):
        assert _resolve_max_text_length("minimax", {}) == 10000

    def test_mistral_default(self):
        assert _resolve_max_text_length("mistral", {}) == PROVIDER_MAX_TEXT_LENGTH["mistral"]

    def test_gemini_default(self):
        assert _resolve_max_text_length("gemini", {}) == PROVIDER_MAX_TEXT_LENGTH["gemini"]

    def test_unknown_provider_falls_back(self):
        assert _resolve_max_text_length("does-not-exist", {}) == FALLBACK_MAX_TEXT_LENGTH

    def test_empty_provider_falls_back(self):
        assert _resolve_max_text_length("", {}) == FALLBACK_MAX_TEXT_LENGTH
        assert _resolve_max_text_length(None, {}) == FALLBACK_MAX_TEXT_LENGTH

    def test_case_insensitive(self):
        assert _resolve_max_text_length("OpenAI", {}) == 4096
        assert _resolve_max_text_length(" XAI ", {}) == 15000

    # --- Overrides ---

    def test_override_wins(self):
        cfg = {"openai": {"max_text_length": 9999}}
        assert _resolve_max_text_length("openai", cfg) == 9999

    def test_override_zero_falls_through(self):
        # A broken/zero override must not disable truncation
        cfg = {"openai": {"max_text_length": 0}}
        assert _resolve_max_text_length("openai", cfg) == 4096

    def test_override_negative_falls_through(self):
        cfg = {"xai": {"max_text_length": -1}}
        assert _resolve_max_text_length("xai", cfg) == 15000

    def test_override_non_int_falls_through(self):
        cfg = {"minimax": {"max_text_length": "lots"}}
        assert _resolve_max_text_length("minimax", cfg) == 10000

    def test_override_bool_falls_through(self):
        # bool is technically an int; make sure we don't treat True as 1 char
        cfg = {"openai": {"max_text_length": True}}
        assert _resolve_max_text_length("openai", cfg) == 4096

    def test_missing_provider_section_uses_default(self):
        cfg = {"provider": "openai"}  # no "openai" key
        assert _resolve_max_text_length("openai", cfg) == 4096

    # --- ElevenLabs model-aware ---

    def test_elevenlabs_default_model_multilingual_v2(self):
        cfg = {"elevenlabs": {"model_id": "eleven_multilingual_v2"}}
        assert _resolve_max_text_length("elevenlabs", cfg) == 10000

    def test_elevenlabs_flash_v2_5_gets_40k(self):
        cfg = {"elevenlabs": {"model_id": "eleven_flash_v2_5"}}
        assert _resolve_max_text_length("elevenlabs", cfg) == 40000

    def test_elevenlabs_flash_v2_gets_30k(self):
        cfg = {"elevenlabs": {"model_id": "eleven_flash_v2"}}
        assert _resolve_max_text_length("elevenlabs", cfg) == 30000

    def test_elevenlabs_v3_gets_5k(self):
        cfg = {"elevenlabs": {"model_id": "eleven_v3"}}
        assert _resolve_max_text_length("elevenlabs", cfg) == 5000

    def test_elevenlabs_unknown_model_falls_back_to_provider_default(self):
        cfg = {"elevenlabs": {"model_id": "eleven_experimental_xyz"}}
        assert _resolve_max_text_length("elevenlabs", cfg) == PROVIDER_MAX_TEXT_LENGTH["elevenlabs"]

    def test_elevenlabs_override_beats_model_lookup(self):
        cfg = {"elevenlabs": {"model_id": "eleven_flash_v2_5", "max_text_length": 1000}}
        assert _resolve_max_text_length("elevenlabs", cfg) == 1000

    def test_elevenlabs_no_model_id_uses_default_model_mapping(self):
        # Falls back to DEFAULT_ELEVENLABS_MODEL_ID = eleven_multilingual_v2 -> 10000
        assert _resolve_max_text_length("elevenlabs", {}) == 10000

    def test_provider_config_not_a_dict(self):
        cfg = {"openai": "not-a-dict"}
        assert _resolve_max_text_length("openai", cfg) == 4096

    # --- Sanity: the table covers every provider listed in the schema ---

    def test_all_documented_providers_have_defaults(self):
        expected = {"edge", "openai", "xai", "minimax", "mistral",
                    "gemini", "elevenlabs", "neutts", "kittentts"}
        assert expected.issubset(PROVIDER_MAX_TEXT_LENGTH.keys())


class TestTextToSpeechToolTruncation:
    """End-to-end: verify the resolver actually drives the text_to_speech_tool
    truncation path rather than the old 4000-char global."""

    def test_openai_truncates_at_4096_not_4000(self, tmp_path, monkeypatch, caplog):
        import logging
        caplog.set_level(logging.WARNING, logger="tools.tts_tool")

        # 5000 chars -- over OpenAI's 4096 limit but under xAI's 15k
        text = "A" * 5000
        captured_text = {}

        def fake_openai(t, out, cfg):
            captured_text["text"] = t
            with open(out, "wb") as f:
                f.write(b"\x00")
            return out

        monkeypatch.setattr("tools.tts_tool._generate_openai_tts", fake_openai)
        monkeypatch.setattr("tools.tts_tool._load_tts_config",
                            lambda: {"provider": "openai"})

        from tools.tts_tool import text_to_speech_tool
        out = str(tmp_path / "out.mp3")
        result = json.loads(text_to_speech_tool(text=text, output_path=out))

        assert result["success"] is True
        # Should be truncated to 4096, not the old 4000
        assert len(captured_text["text"]) == 4096
        # And the warning should mention the provider
        assert any("openai" in rec.message.lower() for rec in caplog.records)

    def test_xai_accepts_much_longer_input(self, tmp_path, monkeypatch):
        # 12000 chars -- over old global 4000, under xAI's 15000
        text = "B" * 12000
        captured_text = {}

        def fake_xai(t, out, cfg):
            captured_text["text"] = t
            with open(out, "wb") as f:
                f.write(b"\x00")
            return out

        monkeypatch.setattr("tools.tts_tool._generate_xai_tts", fake_xai)
        monkeypatch.setattr("tools.tts_tool._load_tts_config",
                            lambda: {"provider": "xai"})

        from tools.tts_tool import text_to_speech_tool
        out = str(tmp_path / "out.mp3")
        result = json.loads(text_to_speech_tool(text=text, output_path=out))

        assert result["success"] is True
        # xAI should accept the full 12000 chars
        assert len(captured_text["text"]) == 12000

    def test_user_override_is_respected(self, tmp_path, monkeypatch):
        # User says "cap openai at 100 chars" -- we must honor it
        text = "C" * 500
        captured_text = {}

        def fake_openai(t, out, cfg):
            captured_text["text"] = t
            with open(out, "wb") as f:
                f.write(b"\x00")
            return out

        monkeypatch.setattr("tools.tts_tool._generate_openai_tts", fake_openai)
        monkeypatch.setattr("tools.tts_tool._load_tts_config",
                            lambda: {"provider": "openai",
                                     "openai": {"max_text_length": 100}})

        from tools.tts_tool import text_to_speech_tool
        out = str(tmp_path / "out.mp3")
        result = json.loads(text_to_speech_tool(text=text, output_path=out))

        assert result["success"] is True
        assert len(captured_text["text"]) == 100
tools/tts_tool.py
@@ -121,7 +121,80 @@ def _get_default_output_dir() -> str:
     return str(get_hermes_dir("cache/audio", "audio_cache"))
 
 
 DEFAULT_OUTPUT_DIR = _get_default_output_dir()
-MAX_TEXT_LENGTH = 4000
+
+# ---------------------------------------------------------------------------
+# Per-provider input-character limits (from official provider docs).
+# A single global cap was wrong: OpenAI is 4096, xAI is 15k, MiniMax is 10k,
+# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini caps at ~8k
+# input tokens. Users can override any of these via
+# ``tts.<provider>.max_text_length`` in config.yaml.
+# ---------------------------------------------------------------------------
+PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
+    "edge": 5000,         # edge-tts practical sync limit
+    "openai": 4096,       # https://platform.openai.com/docs/guides/text-to-speech
+    "xai": 15000,         # https://docs.x.ai/developers/model-capabilities/audio/text-to-speech
+    "minimax": 10000,     # https://platform.minimax.io/docs/api-reference/speech-t2a-http (sync)
+    "mistral": 4000,      # conservative; no published per-request cap
+    "gemini": 5000,       # Gemini TTS caps at ~8k input tokens / ~655s audio
+    "elevenlabs": 10000,  # fallback when model-aware lookup can't resolve (multilingual_v2)
+    "neutts": 2000,       # local model, quality falls off on long text
+    "kittentts": 2000,    # local 25MB model
+}
+
+# ElevenLabs caps vary by model_id. https://elevenlabs.io/docs/overview/models
+ELEVENLABS_MODEL_MAX_TEXT_LENGTH: Dict[str, int] = {
+    "eleven_v3": 5000,
+    "eleven_ttv_v3": 5000,
+    "eleven_multilingual_v2": 10000,
+    "eleven_multilingual_v1": 10000,
+    "eleven_english_sts_v2": 10000,
+    "eleven_english_sts_v1": 10000,
+    "eleven_flash_v2": 30000,
+    "eleven_flash_v2_5": 40000,
+}
+
+# Final fallback when provider isn't recognised at all.
+FALLBACK_MAX_TEXT_LENGTH = 4000
+
+# Back-compat alias. Prefer ``_resolve_max_text_length()`` for new code.
+MAX_TEXT_LENGTH = FALLBACK_MAX_TEXT_LENGTH
+
+
+def _resolve_max_text_length(
+    provider: Optional[str],
+    tts_config: Optional[Dict[str, Any]] = None,
+) -> int:
+    """Return the input-character cap for *provider*.
+
+    Resolution order:
+    1. ``tts.<provider>.max_text_length`` (user override in config.yaml)
+    2. ElevenLabs model-aware table (keyed on configured ``model_id``)
+    3. ``PROVIDER_MAX_TEXT_LENGTH`` default
+    4. ``FALLBACK_MAX_TEXT_LENGTH`` (4000)
+
+    Non-positive or non-integer overrides fall through to the default so a
+    broken config can't accidentally disable truncation entirely.
+    """
+    if not provider:
+        return FALLBACK_MAX_TEXT_LENGTH
+    key = provider.lower().strip()
+    cfg = tts_config or {}
+    prov_cfg = cfg.get(key) if isinstance(cfg.get(key), dict) else {}
+
+    override = prov_cfg.get("max_text_length") if prov_cfg else None
+    if isinstance(override, bool):
+        # bool is an int subclass; treat explicit booleans as "not set"
+        override = None
+    if isinstance(override, int) and override > 0:
+        return override
+
+    if key == "elevenlabs":
+        model_id = (prov_cfg or {}).get("model_id") or DEFAULT_ELEVENLABS_MODEL_ID
+        mapped = ELEVENLABS_MODEL_MAX_TEXT_LENGTH.get(str(model_id).strip())
+        if mapped:
+            return mapped
+
+    return PROVIDER_MAX_TEXT_LENGTH.get(key, FALLBACK_MAX_TEXT_LENGTH)
+
+
 # ===========================================================================
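
To make the resolution order concrete, here is a short sketch exercising the ElevenLabs model-aware step (model IDs and caps are the ones in ELEVENLABS_MODEL_MAX_TEXT_LENGTH; the 1000-char override is illustrative):

    from tools.tts_tool import _resolve_max_text_length

    # Step 2: with no override, the cap follows the configured model_id.
    assert _resolve_max_text_length(
        "elevenlabs", {"elevenlabs": {"model_id": "eleven_v3"}}) == 5000
    assert _resolve_max_text_length(
        "elevenlabs", {"elevenlabs": {"model_id": "eleven_flash_v2_5"}}) == 40000

    # Step 1 beats step 2: an explicit override wins over the model table.
    assert _resolve_max_text_length(
        "elevenlabs",
        {"elevenlabs": {"model_id": "eleven_flash_v2_5", "max_text_length": 1000}},
    ) == 1000

    # An unrecognised model id falls back to the provider default (10000).
    assert _resolve_max_text_length(
        "elevenlabs", {"elevenlabs": {"model_id": "eleven_experimental_xyz"}}) == 10000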
@@ -865,14 +938,19 @@ def text_to_speech_tool(
     if not text or not text.strip():
         return tool_error("Text is required", success=False)
 
-    # Truncate very long text with a warning
-    if len(text) > MAX_TEXT_LENGTH:
-        logger.warning("TTS text too long (%d chars), truncating to %d", len(text), MAX_TEXT_LENGTH)
-        text = text[:MAX_TEXT_LENGTH]
-
     tts_config = _load_tts_config()
     provider = _get_provider(tts_config)
 
+    # Truncate very long text with a warning. The cap is per-provider
+    # (OpenAI 4096, xAI 15k, MiniMax 10k, ElevenLabs model-aware, etc.).
+    max_len = _resolve_max_text_length(provider, tts_config)
+    if len(text) > max_len:
+        logger.warning(
+            "TTS text too long for provider %s (%d chars), truncating to %d",
+            provider, len(text), max_len,
+        )
+        text = text[:max_len]
+
     # Detect platform from gateway env var to choose the best output format.
     # Telegram voice bubbles require Opus (.ogg); OpenAI and ElevenLabs can
     # produce Opus natively (no ffmpeg needed). Edge TTS always outputs MP3
@@ -1191,6 +1269,14 @@ def stream_tts_to_speaker(
     voice_id = el_config.get("voice_id", voice_id)
     model_id = el_config.get("streaming_model_id",
                              el_config.get("model_id", model_id))
+    # Per-sentence cap for the streaming path. Look up the cap against
+    # the *streaming* model_id (defaults to eleven_flash_v2_5 = 40k chars),
+    # not the sync model_id. A user override
+    # (tts.elevenlabs.max_text_length) still wins.
+    stream_max_len = _resolve_max_text_length(
+        "elevenlabs",
+        {**tts_config, "elevenlabs": {**el_config, "model_id": model_id}},
+    )
 
     api_key = os.getenv("ELEVENLABS_API_KEY", "")
     if not api_key:
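
A sketch of why the streaming path above folds the streaming model_id into the config before the lookup (the el_config values are assumed for illustration; the resulting caps match the model table):

    from tools.tts_tool import _resolve_max_text_length

    # Assumed ElevenLabs section: sync model multilingual_v2, streaming flash_v2_5.
    el_config = {"model_id": "eleven_multilingual_v2",
                 "streaming_model_id": "eleven_flash_v2_5"}
    tts_config = {"provider": "elevenlabs", "elevenlabs": el_config}

    # Sync path: the cap follows the configured model_id (10000).
    assert _resolve_max_text_length("elevenlabs", tts_config) == 10000

    # Streaming path: substitute the streaming model_id first, as the code above
    # does, and the per-sentence cap becomes 40000.
    model_id = el_config.get("streaming_model_id", el_config.get("model_id"))
    streaming_cfg = {**tts_config, "elevenlabs": {**el_config, "model_id": model_id}}
    assert _resolve_max_text_length("elevenlabs", streaming_cfg) == 40000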
@@ -1246,9 +1332,9 @@ def stream_tts_to_speaker(
             # Skip audio generation if no TTS client available
             if client is None:
                 return
-            # Truncate very long sentences
-            if len(cleaned) > MAX_TEXT_LENGTH:
-                cleaned = cleaned[:MAX_TEXT_LENGTH]
+            # Truncate very long sentences (ElevenLabs streaming path)
+            if len(cleaned) > stream_max_len:
+                cleaned = cleaned[:stream_max_len]
             try:
                 audio_iter = client.text_to_speech.convert(
                     text=cleaned,
@@ -1406,7 +1492,7 @@ TTS_SCHEMA = {
         "properties": {
             "text": {
                 "type": "string",
-                "description": "The text to convert to speech. Keep under 4000 characters."
+                "description": "The text to convert to speech. Provider-specific character caps apply and are enforced automatically (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k depending on model); over-long input is truncated."
             },
             "output_path": {
                 "type": "string",