diff --git a/tests/tools/test_tts_xai_speech_tags.py b/tests/tools/test_tts_xai_speech_tags.py new file mode 100644 index 00000000000..6ab72452ac7 --- /dev/null +++ b/tests/tools/test_tts_xai_speech_tags.py @@ -0,0 +1,81 @@ +"""Tests for xAI TTS speech-tag handling.""" + +from unittest.mock import Mock + +from tools.tts_tool import _apply_xai_auto_speech_tags, _generate_xai_tts + + +def test_apply_xai_auto_speech_tags_adds_light_pause_after_first_sentence(): + text = "Bonjour Monsieur Talbot. Ceci est un test de réponse vocale." + + assert _apply_xai_auto_speech_tags(text) == ( + "Bonjour Monsieur Talbot. [pause] Ceci est un test de réponse vocale." + ) + + +def test_apply_xai_auto_speech_tags_preserves_explicit_tags(): + text = "Bonjour. [pause] Déjà balisé." + + assert _apply_xai_auto_speech_tags(text) == text + + +def test_apply_xai_auto_speech_tags_preserves_all_documented_xai_tags(): + text = "Bonjour Monsieur Talbot. [sigh] Je parle lentement. Important." + + assert _apply_xai_auto_speech_tags(text) == text + + +def test_generate_xai_tts_sends_auto_speech_tags_when_enabled(tmp_path, monkeypatch): + captured = {} + + class FakeResponse: + content = b"mp3" + + def raise_for_status(self): + pass + + def fake_post(url, headers, json, timeout): + captured["url"] = url + captured["headers"] = headers + captured["json"] = json + captured["timeout"] = timeout + return FakeResponse() + + monkeypatch.setenv("XAI_API_KEY", "test-xai-key") + monkeypatch.setattr("requests.post", fake_post) + + out = tmp_path / "out.mp3" + _generate_xai_tts( + "Bonjour Monsieur Talbot. Ceci est un test.", + str(out), + {"xai": {"voice_id": "ara", "language": "fr", "auto_speech_tags": True}}, + ) + + assert out.read_bytes() == b"mp3" + assert captured["url"] == "https://api.x.ai/v1/tts" + assert captured["json"]["voice_id"] == "ara" + assert captured["json"]["language"] == "fr" + assert captured["json"]["text"] == "Bonjour Monsieur Talbot. [pause] Ceci est un test." + + +def test_generate_xai_tts_leaves_text_plain_by_default(tmp_path, monkeypatch): + captured = {} + + fake_response = Mock() + fake_response.content = b"mp3" + fake_response.raise_for_status.return_value = None + + def fake_post(url, headers, json, timeout): + captured["json"] = json + return fake_response + + monkeypatch.setenv("XAI_API_KEY", "test-xai-key") + monkeypatch.setattr("requests.post", fake_post) + + _generate_xai_tts( + "Bonjour Monsieur Talbot. Ceci est un test.", + str(tmp_path / "out.mp3"), + {"xai": {"voice_id": "ara", "language": "fr"}}, + ) + + assert captured["json"]["text"] == "Bonjour Monsieur Talbot. Ceci est un test." diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 469cb6608d4..71535aed827 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -167,6 +167,7 @@ DEFAULT_XAI_VOICE_ID = "eve" DEFAULT_XAI_LANGUAGE = "en" DEFAULT_XAI_SAMPLE_RATE = 24000 DEFAULT_XAI_BIT_RATE = 128000 +DEFAULT_XAI_AUTO_SPEECH_TAGS = False DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" DEFAULT_GEMINI_TTS_VOICE = "Kore" @@ -892,6 +893,79 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any] # =========================================================================== # Provider: xAI TTS # =========================================================================== +_XAI_INLINE_SPEECH_TAGS = ( + "pause", + "long-pause", + "hum-tune", + "laugh", + "chuckle", + "giggle", + "cry", + "tsk", + "tongue-click", + "lip-smack", + "breath", + "inhale", + "exhale", + "sigh", +) +_XAI_WRAPPING_SPEECH_TAGS = ( + "soft", + "whisper", + "loud", + "build-intensity", + "decrease-intensity", + "higher-pitch", + "lower-pitch", + "slow", + "fast", + "sing-song", + "singing", + "laugh-speak", + "emphasis", +) +_XAI_SPEECH_TAG_RE = re.compile( + r"(\[(?:" + "|".join(_XAI_INLINE_SPEECH_TAGS) + r")\]|)", + flags=re.IGNORECASE, +) +_XAI_FIRST_SENTENCE_RE = re.compile(r"^(.{12,120}?[.!?…])\s+(?=\S)", flags=re.DOTALL) + + +def _xai_bool_config(value: Any, default: bool = False) -> bool: + """Coerce common YAML/env bool spellings without treating random strings as true.""" + if isinstance(value, bool): + return value + if value is None: + return default + if isinstance(value, (int, float)): + return bool(value) + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"1", "true", "yes", "on", "enabled"}: + return True + if normalized in {"0", "false", "no", "off", "disabled"}: + return False + return default + + +def _apply_xai_auto_speech_tags(text: str) -> str: + """Add light xAI speech tags for more natural voice-mode replies. + + The transform is intentionally conservative: it only inserts pauses. It + never fabricates laughter or whispering, and it leaves explicit user/model + speech tags untouched. + """ + clean = text.strip() + if not clean or _XAI_SPEECH_TAG_RE.search(clean): + return text + + clean = re.sub(r"\n\s*\n+", " [pause] ", clean) + clean = re.sub(r"\s*\n\s*", " ", clean) + clean = _XAI_FIRST_SENTENCE_RE.sub(r"\1 [pause] ", clean, count=1) + clean = re.sub(r"\s{2,}", " ", clean).strip() + return clean + + def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: """ Generate audio using xAI TTS. @@ -913,6 +987,12 @@ def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) - language = str(xai_config.get("language", DEFAULT_XAI_LANGUAGE)).strip() or DEFAULT_XAI_LANGUAGE sample_rate = int(xai_config.get("sample_rate", DEFAULT_XAI_SAMPLE_RATE)) bit_rate = int(xai_config.get("bit_rate", DEFAULT_XAI_BIT_RATE)) + auto_speech_tags = _xai_bool_config( + xai_config.get("auto_speech_tags", xai_config.get("speech_tags")), + DEFAULT_XAI_AUTO_SPEECH_TAGS, + ) + if auto_speech_tags: + text = _apply_xai_auto_speech_tags(text) base_url = str( xai_config.get("base_url") or creds.get("base_url")