feat(tts): add Gemini audio tag rewrite

2026-07-29 18:46:59 +00:00 · 2026-06-08 21:04:45 -07:00 · 2026-06-08 21:04:45 -07:00 · 2c19208224
commit 2c19208224
parent 5718811de0
7 changed files with 297 additions and 20 deletions
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@ -415,7 +415,8 @@ prompt_caching:
 # Auxiliary Models (Advanced — Experimental)
 # =============================================================================
 # Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
-# browser screenshot analysis, web page summarization, and context compression.
+# browser screenshot analysis, web page summarization, TTS audio-tag insertion,
+# and context compression.
 #
 # By default these use Gemini Flash via OpenRouter or Nous Portal and are
 # auto-detected from your credentials.  You do NOT need to change anything
@ -460,6 +461,12 @@ prompt_caching:
 #     provider: "auto"
 #     model: ""
 #
+#   # Gemini 3.1 TTS hidden audio-tag insertion
+#   tts_audio_tags:
+#     provider: "auto"
+#     model: ""              # empty = your main chat model
+#     timeout: 30
+#
 #   # Session search — summarizes matching past sessions
 #   session_search:
 #     provider: "auto"
@ -835,6 +842,22 @@ platform_toolsets:
 #       max_tool_rounds: 5      # tool loop limit (0 = disable)
 #       log_level: "info"       # audit verbosity

+# =============================================================================
+# Text-to-Speech
+# =============================================================================
+# TTS defaults to Edge TTS unless changed in ~/.hermes/config.yaml.
+# Gemini TTS supports persona/director prompt files, and Gemini 3.1 Flash TTS
+# can use a hidden auxiliary rewrite pass to insert expressive square-bracket
+# audio tags into the TTS script without showing tags in chat.
+#
+# tts:
+#   provider: "gemini"
+#   gemini:
+#     model: "gemini-3.1-flash-tts-preview"
+#     voice: "Kore"
+#     audio_tags: false        # true = hidden audio-tag rewrite (Gemini 3.1 TTS models only)
+#     persona_prompt_file: ""  # e.g. ~/.hermes/tts/radio-host.md
+
 # =============================================================================
 # Voice Transcription (Speech-to-Text)
 # =============================================================================
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -1290,6 +1290,14 @@ DEFAULT_CONFIG = {
            "timeout": 30,
            "extra_body": {},
        },
+        "tts_audio_tags": {
+            "provider": "auto",
+            "model": "",
+            "base_url": "",
+            "api_key": "",
+            "timeout": 30,
+            "extra_body": {},
+        },
        # Triage specifier — flesh out a rough one-liner in the Kanban
        # Triage column into a concrete spec, then promote it to ``todo``.
        # Invoked by ``hermes kanban specify`` (single id or --all). Set a
@ -1575,6 +1583,10 @@ DEFAULT_CONFIG = {
        "gemini": {
            "model": "gemini-2.5-flash-preview-tts",
            "voice": "Kore",
+            # When true, Gemini 3.1 TTS uses a hidden auxiliary-model rewrite
+            # pass to insert freeform square-bracket audio tags into the TTS
+            # script. Visible chat replies are unchanged.
+            "audio_tags": False,
            # Optional local Markdown/text file with Gemini TTS performance
            # direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES,
            # SAMPLE CONTEXT, and either a `{transcript}` placeholder or no
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -2980,6 +2980,7 @@ _AUX_TASKS: list[tuple[str, str, str]] = [
    ("approval", "Approval", "smart command approval"),
    ("mcp", "MCP", "MCP tool reasoning"),
    ("title_generation", "Title generation", "session titles"),
+    ("tts_audio_tags", "TTS audio tags", "Gemini TTS tag insertion"),
    ("skills_hub", "Skills hub", "skills search/install"),
    ("triage_specifier", "Triage specifier", "kanban spec fleshing"),
    ("kanban_decomposer", "Kanban decomposer", "task decomposition"),
--- a/tests/tools/test_tts_gemini.py
+++ b/tests/tools/test_tts_gemini.py
@ -2,6 +2,7 @@

 import base64
 import struct
+from types import SimpleNamespace
 from unittest.mock import MagicMock, patch

 import pytest
@ -312,6 +313,112 @@ class TestGenerateGeminiTts:
        assert prompt_text == "Hi"
        assert "persona prompt file unavailable" in caplog.text

+    def test_audio_tags_disabled_does_not_call_rewriter(
+        self, tmp_path, monkeypatch, mock_gemini_response
+    ):
+        from tools.tts_tool import _generate_gemini_tts
+
+        config = {
+            "gemini": {
+                "model": "gemini-3.1-flash-tts-preview",
+                "audio_tags": False,
+            }
+        }
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+
+        with patch("agent.auxiliary_client.call_llm") as mock_call_llm, \
+             patch("requests.post", return_value=mock_gemini_response) as mock_post:
+            _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config)
+
+        mock_call_llm.assert_not_called()
+        prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"]
+        assert prompt_text == "Hi there."
+
+    def test_audio_tags_enabled_rewrites_hidden_tts_script(
+        self, tmp_path, monkeypatch, mock_gemini_response
+    ):
+        from tools.tts_tool import _generate_gemini_tts
+
+        persona_file = tmp_path / "voice-persona.md"
+        persona_file.write_text(
+            "### DIRECTOR'S NOTES\nStyle: Warm and amused.",
+            encoding="utf-8",
+        )
+        response = SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    message=SimpleNamespace(content="[warmly] Hi there. [soft laugh]")
+                )
+            ]
+        )
+        config = {
+            "gemini": {
+                "model": "gemini-3.1-flash-tts-preview",
+                "audio_tags": True,
+                "persona_prompt_file": str(persona_file),
+            }
+        }
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+
+        with patch("agent.auxiliary_client.call_llm", return_value=response) as mock_call_llm, \
+             patch("requests.post", return_value=mock_gemini_response) as mock_post:
+            _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config)
+
+        mock_call_llm.assert_called_once()
+        call_kwargs = mock_call_llm.call_args.kwargs
+        assert call_kwargs["task"] == "tts_audio_tags"
+        assert "Audio tags are inline square-bracket modifiers" in call_kwargs["messages"][0]["content"]
+        assert "Style: Warm and amused." in call_kwargs["messages"][1]["content"]
+        assert "Hi there." in call_kwargs["messages"][1]["content"]
+
+        prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"]
+        assert "Synthesize speech from the TRANSCRIPT only" in prompt_text
+        assert "### DIRECTOR'S NOTES\nStyle: Warm and amused." in prompt_text
+        assert "#### TRANSCRIPT\n[warmly] Hi there. [soft laugh]" in prompt_text
+
+    def test_audio_tags_enabled_skips_non_tag_capable_model(
+        self, tmp_path, monkeypatch, mock_gemini_response, caplog
+    ):
+        from tools.tts_tool import _generate_gemini_tts
+
+        config = {
+            "gemini": {
+                "model": "gemini-2.5-flash-preview-tts",
+                "audio_tags": True,
+            }
+        }
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+
+        with patch("agent.auxiliary_client.call_llm") as mock_call_llm, \
+             patch("requests.post", return_value=mock_gemini_response) as mock_post:
+            _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config)
+
+        mock_call_llm.assert_not_called()
+        prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"]
+        assert prompt_text == "Hi there."
+        assert "not known to support Gemini audio tags" in caplog.text
+
+    def test_audio_tag_rewrite_failure_falls_back_to_original_text(
+        self, tmp_path, monkeypatch, mock_gemini_response, caplog
+    ):
+        from tools.tts_tool import _generate_gemini_tts
+
+        config = {
+            "gemini": {
+                "model": "gemini-3.1-flash-tts-preview",
+                "audio_tags": True,
+            }
+        }
+        monkeypatch.setenv("GEMINI_API_KEY", "test-key")
+
+        with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("boom")), \
+             patch("requests.post", return_value=mock_gemini_response) as mock_post:
+            _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config)
+
+        prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"]
+        assert prompt_text == "Hi there."
+        assert "audio tag rewrite failed" in caplog.text
+

 class TestGeminiInCheckRequirements:
    def test_gemini_api_key_satisfies_requirements(self, monkeypatch):
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -190,6 +190,8 @@ DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
 DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
 DEFAULT_GEMINI_TTS_VOICE = "Kore"
 DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
+DEFAULT_GEMINI_AUDIO_TAGS = False
+GEMINI_AUDIO_TAG_REWRITE_TASK = "tts_audio_tags"
 # PCM output specs for Gemini TTS (fixed by the API)
 GEMINI_TTS_SAMPLE_RATE = 24000
 GEMINI_TTS_CHANNELS = 1
@ -233,6 +235,23 @@ ELEVENLABS_MODEL_MAX_TEXT_LENGTH: Dict[str, int] = {
    "eleven_flash_v2_5": 40000,
 }

+
+def _config_bool(value: Any, default: bool = False) -> bool:
+    """Coerce common YAML/env bool spellings without treating random strings as true."""
+    if isinstance(value, bool):
+        return value
+    if value is None:
+        return default
+    if isinstance(value, (int, float)):
+        return bool(value)
+    if isinstance(value, str):
+        normalized = value.strip().lower()
+        if normalized in {"1", "true", "yes", "on", "enabled"}:
+            return True
+        if normalized in {"0", "false", "no", "off", "disabled"}:
+            return False
+    return default
+
 # Final fallback when provider isn't recognised at all.
 FALLBACK_MAX_TEXT_LENGTH = 4000

@ -1069,20 +1088,7 @@ _XAI_FIRST_SENTENCE_RE = re.compile(r"^(.{12,120}?[.!?…])\s+(?=\S)", flags=re.


 def _xai_bool_config(value: Any, default: bool = False) -> bool:
-    """Coerce common YAML/env bool spellings without treating random strings as true."""
-    if isinstance(value, bool):
-        return value
-    if value is None:
-        return default
-    if isinstance(value, (int, float)):
-        return bool(value)
-    if isinstance(value, str):
-        normalized = value.strip().lower()
-        if normalized in {"1", "true", "yes", "on", "enabled"}:
-            return True
-        if normalized in {"0", "false", "no", "off", "disabled"}:
-            return False
-    return default
+    return _config_bool(value, default=default)


 def _apply_xai_auto_speech_tags(text: str) -> str:
@ -1427,10 +1433,105 @@ def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str:
        return ""


-def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str:
+def _gemini_model_supports_audio_tags(model: str) -> bool:
+    """Return True for Gemini TTS models known to support expressive audio tags."""
+    normalized = (model or "").strip().lower().rsplit("/", 1)[-1]
+    return "gemini-3.1" in normalized and "tts" in normalized
+
+
+def _gemini_audio_tags_enabled(gemini_config: Dict[str, Any], model: str) -> bool:
+    raw = gemini_config.get("audio_tags")
+    if isinstance(raw, dict):
+        raw = raw.get("enabled")
+    enabled = _config_bool(raw, default=DEFAULT_GEMINI_AUDIO_TAGS)
+    if not enabled:
+        return False
+    if not _gemini_model_supports_audio_tags(model):
+        logger.warning(
+            "Gemini TTS audio_tags enabled, but model %s is not known to support "
+            "Gemini audio tags; skipping hidden tag rewrite",
+            model,
+        )
+        return False
+    return True
+
+
+def _clean_gemini_audio_tag_rewrite(content: str) -> str:
+    clean = (content or "").strip()
+    fence = re.fullmatch(r"```(?:[A-Za-z0-9_-]+)?\s*(.*?)\s*```", clean, flags=re.DOTALL)
+    if fence:
+        clean = fence.group(1).strip()
+    return clean
+
+
+def _extract_auxiliary_message_content(response: Any) -> str:
+    try:
+        choice = response.choices[0]
+        message = getattr(choice, "message", None)
+        if isinstance(message, dict):
+            return str(message.get("content") or "")
+        return str(getattr(message, "content", "") or "")
+    except Exception:
+        return ""
+
+
+def _rewrite_gemini_tts_audio_tags(text: str, persona_prompt: str = "") -> str:
+    """Use the configured auxiliary model to insert Gemini audio tags."""
+    transcript = text.strip()
+    if not transcript:
+        return text
+
+    system_prompt = (
+        "You rewrite transcripts for Gemini 3.1 Flash TTS by inserting expressive "
+        "audio tags.\n\n"
+        "Audio tags are inline square-bracket modifiers such as [whispers], "
+        "[excitedly], [very slow], [sarcastically], [laughs], [sighs], or [gasp]. "
+        "There is no fixed allowlist. Use creative freeform tags generously but "
+        "naturally to control tone, pace, emotional vibe, emphasis, section-level "
+        "delivery, and non-verbal sounds. Use English audio tags even when the "
+        "spoken transcript is not English.\n\n"
+        "Rules:\n"
+        "- Preserve the spoken words, order, and meaning.\n"
+        "- Do not add new spoken sentences or remove existing spoken words.\n"
+        "- Use square brackets for every audio tag.\n"
+        "- Do not use SSML or XML tags.\n"
+        "- Do not explain or comment.\n"
+        "- Return only the tagged TTS script."
+    )
+    context = persona_prompt.strip() or "(none)"
+    user_prompt = (
+        "PERSONA AND DIRECTOR CONTEXT:\n"
+        f"{context}\n\n"
+        "TRANSCRIPT TO TAG:\n"
+        f"{transcript}"
+    )
+    try:
+        from agent.auxiliary_client import call_llm
+
+        response = call_llm(
+            task=GEMINI_AUDIO_TAG_REWRITE_TASK,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            temperature=0.7,
+        )
+        tagged = _clean_gemini_audio_tag_rewrite(_extract_auxiliary_message_content(response))
+        return tagged or text
+    except Exception as exc:
+        logger.warning("Gemini TTS audio tag rewrite failed; using untagged text: %s", exc)
+        return text
+
+
+def _compose_gemini_tts_prompt(
+    text: str,
+    gemini_config: Dict[str, Any],
+    persona_prompt: Optional[str] = None,
+) -> str:
    """Build the Gemini prompt from persona direction plus the live transcript."""
    transcript = text.strip()
-    persona_prompt = _read_gemini_persona_prompt(gemini_config)
+    if persona_prompt is None:
+        persona_prompt = _read_gemini_persona_prompt(gemini_config)
    if not persona_prompt:
        return transcript

@ -1487,7 +1588,15 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
        or get_env_value("GEMINI_BASE_URL")
        or DEFAULT_GEMINI_TTS_BASE_URL
    ).strip().rstrip("/")
-    prompt_text = _compose_gemini_tts_prompt(text, gemini_config)
+    persona_prompt = _read_gemini_persona_prompt(gemini_config)
+    tts_script = text
+    if _gemini_audio_tags_enabled(gemini_config, model):
+        tts_script = _rewrite_gemini_tts_audio_tags(text, persona_prompt=persona_prompt)
+    prompt_text = _compose_gemini_tts_prompt(
+        tts_script,
+        gemini_config,
+        persona_prompt=persona_prompt,
+    )
    max_len = _resolve_max_text_length("gemini", tts_config)
    if len(prompt_text) > max_len:
        logger.warning(
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@ -835,6 +835,7 @@ $ hermes model
 [ ] vision               currently: auto / main model
 [ ] web_extract          currently: auto / main model
 [ ] title_generation     currently: openrouter / google/gemini-3-flash-preview
+[ ] tts_audio_tags       currently: auto / main model
 [ ] compression          currently: auto / main model
 [ ] approval             currently: auto / main model
 [ ] triage_specifier     currently: auto / main model
@ -911,6 +912,14 @@ auxiliary:
    api_key: ""
    timeout: 30                # seconds

+  # Gemini 3.1 TTS hidden audio-tag insertion
+  tts_audio_tags:
+    provider: "auto"
+    model: ""                  # empty = main chat model
+    base_url: ""
+    api_key: ""
+    timeout: 30
+
  # Context compression timeout (separate from compression.* config)
  compression:
    timeout: 120               # seconds — compression summarizes long conversations, needs more time
@ -1197,8 +1206,9 @@ tts:
    model: "voxtral-mini-tts-2603"
    voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8"  # Paul - Neutral (default)
  gemini:
-    model: "gemini-2.5-flash-preview-tts"   # or gemini-2.5-pro-preview-tts
+    model: "gemini-2.5-flash-preview-tts"   # or gemini-3.1-flash-tts-preview
    voice: "Kore"               # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc.
+    audio_tags: false           # Hidden Gemini 3.1 TTS audio-tag insertion
    persona_prompt_file: ""      # Optional Markdown/text file with Gemini voice direction
  xai:
    voice_id: "eve"             # xAI TTS voice
--- a/website/docs/user-guide/features/tts.md
+++ b/website/docs/user-guide/features/tts.md
@ -66,8 +66,9 @@ tts:
    model: "voxtral-mini-tts-2603"
    voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8"  # Paul - Neutral (default)
  gemini:
-    model: "gemini-2.5-flash-preview-tts"  # or gemini-2.5-pro-preview-tts
+    model: "gemini-2.5-flash-preview-tts"  # or gemini-3.1-flash-tts-preview
    voice: "Kore"               # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
+    audio_tags: false           # Enable hidden Gemini 3.1 TTS audio-tag insertion
    persona_prompt_file: ""      # Optional Markdown/text file with Gemini voice direction
  xai:
    voice_id: "eve"             # or a custom voice ID — see docs below
@ -112,6 +113,20 @@ tts:
    persona_prompt_file: ~/.hermes/tts/butler-voice.md
 ```

+### Gemini Audio Tags
+
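+Gemini 3.1 Flash TTS supports freeform square-bracket audio tags such as `[whispers]`, `[excitedly]`, `[very slow]`, and `[laughs]`, along with other expressive delivery notes. Enable `tts.gemini.audio_tags` to have Hermes run a hidden rewrite pass before Gemini TTS. The rewrite inserts inline tags into the TTS script only; the visible chat reply stays unchanged. If the configured Gemini model is not a Gemini 3.1 TTS model, Hermes logs a warning and skips the rewrite; if the rewrite call itself fails, Hermes sends the original untagged text to Gemini TTS.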
+
+```yaml
+tts:
+  provider: gemini
+  gemini:
+    model: gemini-3.1-flash-tts-preview
+    audio_tags: true
+```
+
+The rewrite uses `auxiliary.tts_audio_tags` and defaults to your main chat model. Override that auxiliary task if you want tag insertion handled by a cheaper or faster model.
+

 ### Input length limits