feat(tts): add auxiliary-model auto speech tags for xAI

Mirrors the existing Gemini TTS audio-tag rewrite path. When the input has no explicit user/model speech tags, asks the configured auxiliary model to insert a richer set of xAI-supported tags (laughs, sighs, whispers, soft/loud, slow/fast, etc.) so voice-mode replies sound more expressive. Falls back to the local conservative [pause]-only transform if the auxiliary model fails or returns empty text.
2026-06-21 10:22:18 +00:00 · 2026-06-17 22:36:44 -06:00 · 2026-06-17 22:36:44 -06:00 · 5a506da3d8
commit 5a506da3d8
parent fad4b40d9d
1 changed files with 60 additions and 11 deletions
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -1092,22 +1092,71 @@ def _xai_bool_config(value: Any, default: bool = False) -> bool:


 def _apply_xai_auto_speech_tags(text: str) -> str:
-    """Add light xAI speech tags for more natural voice-mode replies.
+    """Add xAI speech tags for more natural voice-mode replies.

-    The transform is intentionally conservative: it only inserts pauses. It
-    never fabricates laughter or whispering, and it leaves explicit user/model
-    speech tags untouched.
+    First applies a conservative local transform (inserts [pause] between
+    paragraphs and, when no speech tag is present yet, after the first
+    sentence). Then, if the input contains no explicit user/model speech
+    tags, asks the configured auxiliary model to rewrite the transcript
+    with a richer set of xAI-supported tags (laughs, sighs, whispers,
+    soft/loud, slow/fast, etc.) so the voice output sounds more expressive.
+    Falls back to the local result on auxiliary-model failure or empty output.
    """
    clean = text.strip()
-    if not clean or _XAI_SPEECH_TAG_RE.search(clean):
+    if not clean:
        return text

-    clean = re.sub(r"\n\s*\n+", " [pause] ", clean)
-    clean = re.sub(r"\s*\n\s*", " ", clean)
-    if not _XAI_SPEECH_TAG_RE.search(clean):
-        clean = _XAI_FIRST_SENTENCE_RE.sub(r"\1 [pause] ", clean, count=1)
-    clean = re.sub(r"\s{2,}", " ", clean).strip()
-    return clean
+    # Local conservative pass: pauses only.
+    local = clean
+    local = re.sub(r"\n\s*\n+", " [pause] ", local)
+    local = re.sub(r"\s*\n\s*", " ", local)
+    if not _XAI_SPEECH_TAG_RE.search(local):
+        local = _XAI_FIRST_SENTENCE_RE.sub(r"\1 [pause] ", local, count=1)
+    local = re.sub(r"\s{2,}", " ", local).strip()
+
+    # If the user/model already supplied explicit speech tags, trust them
+    # and don't re-rewrite.
+    if _XAI_SPEECH_TAG_RE.search(clean):
+        return local
+
+    # Auxiliary rewrite for richer emotion tags (mirrors the Gemini path).
+    inline = ", ".join(_XAI_INLINE_SPEECH_TAGS)
+    wrapping = ", ".join(_XAI_WRAPPING_SPEECH_TAGS)
+    system_prompt = (
+        "You rewrite transcripts for the xAI /v1/tts endpoint by inserting "
+        "expressive speech tags.\n\n"
+        "Valid inline tags (use as `[tag]`): " + inline + ".\n"
+        "Valid wrapping tags (use as `[tag]...[/tag]`): " + wrapping + ".\n\n"
+        "Rules:\n"
+        "- Preserve the spoken words, order, and meaning.\n"
+        "- Do not add new spoken sentences or remove existing spoken words.\n"
+        "- Use inline `[tag]` for short modifiers (laughs, sighs, pause, etc.).\n"
+        "- Use wrapping `[tag]...[/tag]` for sustained effects (whisper, soft, slow, fast, loud, etc.).\n"
+        "- Do not use angle-bracket tags like `<tag>...</tag>` — xAI uses BBCode-style closing tags with `[/tag]`.\n"
+        "- Do not use SSML.\n"
+        "- Do not explain or comment.\n"
+        "- Return only the tagged TTS script."
+    )
+    try:
+        from agent.auxiliary_client import call_llm
+
+        response = call_llm(
+            task="tts_audio_tags",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": f"TRANSCRIPT TO TAG:\n{local}"},
+            ],
+            temperature=0.7,
+        )
+        tagged = _extract_auxiliary_message_content(response).strip()
+        # Strip markdown fences if the LLM wrapped the response.
+        fence = re.fullmatch(r"```(?:[A-Za-z0-9_-]+)?\s*(.*?)\s*```", tagged, flags=re.DOTALL)
+        if fence:
+            tagged = fence.group(1).strip()
+        return tagged or local
+    except Exception as exc:
+        logger.debug("xAI TTS audio tag rewrite failed; using locally-tagged text: %s", exc)
+        return local


 def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str: