diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 41a424a7d18..8ce9ad8e19a 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -415,7 +415,8 @@ prompt_caching: # Auxiliary Models (Advanced — Experimental) # ============================================================================= # Hermes uses lightweight "auxiliary" models for side tasks: image analysis, -# browser screenshot analysis, web page summarization, and context compression. +# browser screenshot analysis, web page summarization, TTS audio-tag insertion, +# and context compression. # # By default these use Gemini Flash via OpenRouter or Nous Portal and are # auto-detected from your credentials. You do NOT need to change anything @@ -460,6 +461,12 @@ prompt_caching: # provider: "auto" # model: "" # +# # Gemini 3.1 TTS hidden audio-tag insertion +# tts_audio_tags: +# provider: "auto" # empty model = your main chat model +# model: "" +# timeout: 30 +# # # Session search — summarizes matching past sessions # session_search: # provider: "auto" @@ -835,6 +842,22 @@ platform_toolsets: # max_tool_rounds: 5 # tool loop limit (0 = disable) # log_level: "info" # audit verbosity +# ============================================================================= +# Text-to-Speech +# ============================================================================= +# TTS defaults to Edge TTS unless changed in ~/.hermes/config.yaml. +# Gemini TTS supports persona/director prompt files, and Gemini 3.1 Flash TTS +# can use a hidden auxiliary rewrite pass to insert expressive square-bracket +# audio tags into the TTS script without showing tags in chat. +# +# tts: +# provider: "gemini" +# gemini: +# model: "gemini-3.1-flash-tts-preview" +# voice: "Kore" +# audio_tags: false +# persona_prompt_file: "" # e.g. ~/.hermes/tts/radio-host.md + # ============================================================================= # Voice Transcription (Speech-to-Text) # ============================================================================= diff --git a/hermes_cli/config.py b/hermes_cli/config.py index c4017373681..494c5ddfe3a 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1290,6 +1290,14 @@ DEFAULT_CONFIG = { "timeout": 30, "extra_body": {}, }, + "tts_audio_tags": { + "provider": "auto", + "model": "", + "base_url": "", + "api_key": "", + "timeout": 30, + "extra_body": {}, + }, # Triage specifier — flesh out a rough one-liner in the Kanban # Triage column into a concrete spec, then promote it to ``todo``. # Invoked by ``hermes kanban specify`` (single id or --all). Set a @@ -1575,6 +1583,10 @@ DEFAULT_CONFIG = { "gemini": { "model": "gemini-2.5-flash-preview-tts", "voice": "Kore", + # When true, Gemini 3.1 TTS uses a hidden auxiliary-model rewrite + # pass to insert freeform square-bracket audio tags into the TTS + # script. Visible chat replies are unchanged. + "audio_tags": False, # Optional local Markdown/text file with Gemini TTS performance # direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES, # SAMPLE CONTEXT, and either a `{transcript}` placeholder or no diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 34d563a6696..20728c4f336 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -2980,6 +2980,7 @@ _AUX_TASKS: list[tuple[str, str, str]] = [ ("approval", "Approval", "smart command approval"), ("mcp", "MCP", "MCP tool reasoning"), ("title_generation", "Title generation", "session titles"), + ("tts_audio_tags", "TTS audio tags", "Gemini TTS tag insertion"), ("skills_hub", "Skills hub", "skills search/install"), ("triage_specifier", "Triage specifier", "kanban spec fleshing"), ("kanban_decomposer", "Kanban decomposer", "task decomposition"), diff --git a/tests/tools/test_tts_gemini.py b/tests/tools/test_tts_gemini.py index 6a52a48c02c..85254649d53 100644 --- a/tests/tools/test_tts_gemini.py +++ b/tests/tools/test_tts_gemini.py @@ -2,6 +2,7 @@ import base64 import struct +from types import SimpleNamespace from unittest.mock import MagicMock, patch import pytest @@ -312,6 +313,112 @@ class TestGenerateGeminiTts: assert prompt_text == "Hi" assert "persona prompt file unavailable" in caplog.text + def test_audio_tags_disabled_does_not_call_rewriter( + self, tmp_path, monkeypatch, mock_gemini_response + ): + from tools.tts_tool import _generate_gemini_tts + + config = { + "gemini": { + "model": "gemini-3.1-flash-tts-preview", + "audio_tags": False, + } + } + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("agent.auxiliary_client.call_llm") as mock_call_llm, \ + patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config) + + mock_call_llm.assert_not_called() + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert prompt_text == "Hi there." + + def test_audio_tags_enabled_rewrites_hidden_tts_script( + self, tmp_path, monkeypatch, mock_gemini_response + ): + from tools.tts_tool import _generate_gemini_tts + + persona_file = tmp_path / "voice-persona.md" + persona_file.write_text( + "### DIRECTOR'S NOTES\nStyle: Warm and amused.", + encoding="utf-8", + ) + response = SimpleNamespace( + choices=[ + SimpleNamespace( + message=SimpleNamespace(content="[warmly] Hi there. [soft laugh]") + ) + ] + ) + config = { + "gemini": { + "model": "gemini-3.1-flash-tts-preview", + "audio_tags": True, + "persona_prompt_file": str(persona_file), + } + } + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("agent.auxiliary_client.call_llm", return_value=response) as mock_call_llm, \ + patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config) + + mock_call_llm.assert_called_once() + call_kwargs = mock_call_llm.call_args.kwargs + assert call_kwargs["task"] == "tts_audio_tags" + assert "Audio tags are inline square-bracket modifiers" in call_kwargs["messages"][0]["content"] + assert "Style: Warm and amused." in call_kwargs["messages"][1]["content"] + assert "Hi there." in call_kwargs["messages"][1]["content"] + + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert "Synthesize speech from the TRANSCRIPT only" in prompt_text + assert "### DIRECTOR'S NOTES\nStyle: Warm and amused." in prompt_text + assert "#### TRANSCRIPT\n[warmly] Hi there. [soft laugh]" in prompt_text + + def test_audio_tags_enabled_skips_non_tag_capable_model( + self, tmp_path, monkeypatch, mock_gemini_response, caplog + ): + from tools.tts_tool import _generate_gemini_tts + + config = { + "gemini": { + "model": "gemini-2.5-flash-preview-tts", + "audio_tags": True, + } + } + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("agent.auxiliary_client.call_llm") as mock_call_llm, \ + patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config) + + mock_call_llm.assert_not_called() + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert prompt_text == "Hi there." + assert "not known to support Gemini audio tags" in caplog.text + + def test_audio_tag_rewrite_failure_falls_back_to_original_text( + self, tmp_path, monkeypatch, mock_gemini_response, caplog + ): + from tools.tts_tool import _generate_gemini_tts + + config = { + "gemini": { + "model": "gemini-3.1-flash-tts-preview", + "audio_tags": True, + } + } + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("boom")), \ + patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), config) + + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert prompt_text == "Hi there." + assert "audio tag rewrite failed" in caplog.text + class TestGeminiInCheckRequirements: def test_gemini_api_key_satisfies_requirements(self, monkeypatch): diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 8b223da60bd..c6e7c22de0f 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -190,6 +190,8 @@ DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1" DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" DEFAULT_GEMINI_TTS_VOICE = "Kore" DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta" +DEFAULT_GEMINI_AUDIO_TAGS = False +GEMINI_AUDIO_TAG_REWRITE_TASK = "tts_audio_tags" # PCM output specs for Gemini TTS (fixed by the API) GEMINI_TTS_SAMPLE_RATE = 24000 GEMINI_TTS_CHANNELS = 1 @@ -233,6 +235,23 @@ ELEVENLABS_MODEL_MAX_TEXT_LENGTH: Dict[str, int] = { "eleven_flash_v2_5": 40000, } + +def _config_bool(value: Any, default: bool = False) -> bool: + """Coerce common YAML/env bool spellings without treating random strings as true.""" + if isinstance(value, bool): + return value + if value is None: + return default + if isinstance(value, (int, float)): + return bool(value) + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"1", "true", "yes", "on", "enabled"}: + return True + if normalized in {"0", "false", "no", "off", "disabled"}: + return False + return default + # Final fallback when provider isn't recognised at all. FALLBACK_MAX_TEXT_LENGTH = 4000 @@ -1069,20 +1088,7 @@ _XAI_FIRST_SENTENCE_RE = re.compile(r"^(.{12,120}?[.!?…])\s+(?=\S)", flags=re. def _xai_bool_config(value: Any, default: bool = False) -> bool: - """Coerce common YAML/env bool spellings without treating random strings as true.""" - if isinstance(value, bool): - return value - if value is None: - return default - if isinstance(value, (int, float)): - return bool(value) - if isinstance(value, str): - normalized = value.strip().lower() - if normalized in {"1", "true", "yes", "on", "enabled"}: - return True - if normalized in {"0", "false", "no", "off", "disabled"}: - return False - return default + return _config_bool(value, default=default) def _apply_xai_auto_speech_tags(text: str) -> str: @@ -1427,10 +1433,105 @@ def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str: return "" -def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str: +def _gemini_model_supports_audio_tags(model: str) -> bool: + """Return True for Gemini TTS models known to support expressive audio tags.""" + normalized = (model or "").strip().lower().rsplit("/", 1)[-1] + return "gemini-3.1" in normalized and "tts" in normalized + + +def _gemini_audio_tags_enabled(gemini_config: Dict[str, Any], model: str) -> bool: + raw = gemini_config.get("audio_tags") + if isinstance(raw, dict): + raw = raw.get("enabled") + enabled = _config_bool(raw, default=DEFAULT_GEMINI_AUDIO_TAGS) + if not enabled: + return False + if not _gemini_model_supports_audio_tags(model): + logger.warning( + "Gemini TTS audio_tags enabled, but model %s is not known to support " + "Gemini audio tags; skipping hidden tag rewrite", + model, + ) + return False + return True + + +def _clean_gemini_audio_tag_rewrite(content: str) -> str: + clean = (content or "").strip() + fence = re.fullmatch(r"```(?:[A-Za-z0-9_-]+)?\s*(.*?)\s*```", clean, flags=re.DOTALL) + if fence: + clean = fence.group(1).strip() + return clean + + +def _extract_auxiliary_message_content(response: Any) -> str: + try: + choice = response.choices[0] + message = getattr(choice, "message", None) + if isinstance(message, dict): + return str(message.get("content") or "") + return str(getattr(message, "content", "") or "") + except Exception: + return "" + + +def _rewrite_gemini_tts_audio_tags(text: str, persona_prompt: str = "") -> str: + """Use the configured auxiliary model to insert Gemini audio tags.""" + transcript = text.strip() + if not transcript: + return text + + system_prompt = ( + "You rewrite transcripts for Gemini 3.1 Flash TTS by inserting expressive " + "audio tags.\n\n" + "Audio tags are inline square-bracket modifiers such as [whispers], " + "[excitedly], [very slow], [sarcastically], [laughs], [sighs], or [gasp]. " + "There is no fixed allowlist. Use creative freeform tags generously but " + "naturally to control tone, pace, emotional vibe, emphasis, section-level " + "delivery, and non-verbal sounds. Use English audio tags even when the " + "spoken transcript is not English.\n\n" + "Rules:\n" + "- Preserve the spoken words, order, and meaning.\n" + "- Do not add new spoken sentences or remove existing spoken words.\n" + "- Use square brackets for every audio tag.\n" + "- Do not use SSML or XML tags.\n" + "- Do not explain or comment.\n" + "- Return only the tagged TTS script." + ) + context = persona_prompt.strip() or "(none)" + user_prompt = ( + "PERSONA AND DIRECTOR CONTEXT:\n" + f"{context}\n\n" + "TRANSCRIPT TO TAG:\n" + f"{transcript}" + ) + try: + from agent.auxiliary_client import call_llm + + response = call_llm( + task=GEMINI_AUDIO_TAG_REWRITE_TASK, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + temperature=0.7, + ) + tagged = _clean_gemini_audio_tag_rewrite(_extract_auxiliary_message_content(response)) + return tagged or text + except Exception as exc: + logger.warning("Gemini TTS audio tag rewrite failed; using untagged text: %s", exc) + return text + + +def _compose_gemini_tts_prompt( + text: str, + gemini_config: Dict[str, Any], + persona_prompt: Optional[str] = None, +) -> str: """Build the Gemini prompt from persona direction plus the live transcript.""" transcript = text.strip() - persona_prompt = _read_gemini_persona_prompt(gemini_config) + if persona_prompt is None: + persona_prompt = _read_gemini_persona_prompt(gemini_config) if not persona_prompt: return transcript @@ -1487,7 +1588,15 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any] or get_env_value("GEMINI_BASE_URL") or DEFAULT_GEMINI_TTS_BASE_URL ).strip().rstrip("/") - prompt_text = _compose_gemini_tts_prompt(text, gemini_config) + persona_prompt = _read_gemini_persona_prompt(gemini_config) + tts_script = text + if _gemini_audio_tags_enabled(gemini_config, model): + tts_script = _rewrite_gemini_tts_audio_tags(text, persona_prompt=persona_prompt) + prompt_text = _compose_gemini_tts_prompt( + tts_script, + gemini_config, + persona_prompt=persona_prompt, + ) max_len = _resolve_max_text_length("gemini", tts_config) if len(prompt_text) > max_len: logger.warning( diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index bf91953f6a3..4b2d2c40e93 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -835,6 +835,7 @@ $ hermes model [ ] vision currently: auto / main model [ ] web_extract currently: auto / main model [ ] title_generation currently: openrouter / google/gemini-3-flash-preview +[ ] tts_audio_tags currently: auto / main model [ ] compression currently: auto / main model [ ] approval currently: auto / main model [ ] triage_specifier currently: auto / main model @@ -911,6 +912,14 @@ auxiliary: api_key: "" timeout: 30 # seconds + # Gemini 3.1 TTS hidden audio-tag insertion + tts_audio_tags: + provider: "auto" + model: "" # empty = main chat model + base_url: "" + api_key: "" + timeout: 30 + # Context compression timeout (separate from compression.* config) compression: timeout: 120 # seconds — compression summarizes long conversations, needs more time @@ -1197,8 +1206,9 @@ tts: model: "voxtral-mini-tts-2603" voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default) gemini: - model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts + model: "gemini-2.5-flash-preview-tts" # or gemini-3.1-flash-tts-preview voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc. + audio_tags: false # Hidden Gemini 3.1 TTS audio-tag insertion persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction xai: voice_id: "eve" # xAI TTS voice diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index d67efc3e2c7..9912d834972 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -66,8 +66,9 @@ tts: model: "voxtral-mini-tts-2603" voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default) gemini: - model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts + model: "gemini-2.5-flash-preview-tts" # or gemini-3.1-flash-tts-preview voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc. + audio_tags: false # Enable hidden Gemini 3.1 TTS audio-tag insertion persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction xai: voice_id: "eve" # or a custom voice ID — see docs below @@ -112,6 +113,20 @@ tts: persona_prompt_file: ~/.hermes/tts/butler-voice.md ``` +### Gemini Audio Tags + +Gemini 3.1 Flash TTS supports freeform square-bracket audio tags such as `[whispers]`, `[excitedly]`, `[very slow]`, `[laughs]`, and other expressive delivery notes. Enable `tts.gemini.audio_tags` to have Hermes run a hidden rewrite pass before Gemini TTS. The rewrite inserts inline tags into the TTS script only; the visible chat reply stays unchanged. + +```yaml +tts: + provider: gemini + gemini: + model: gemini-3.1-flash-tts-preview + audio_tags: true +``` + +The rewrite uses `auxiliary.tts_audio_tags` and defaults to your main chat model. Override that auxiliary task if you want tag insertion handled by a cheaper or faster model. + ### Input length limits