feat(tts): add Gemini audio tag rewrite

This commit is contained in:
Barron Roth 2026-06-08 21:04:45 -07:00 committed by Teknium
parent 5718811de0
commit 2c19208224
7 changed files with 297 additions and 20 deletions

View file

@ -415,7 +415,8 @@ prompt_caching:
# Auxiliary Models (Advanced — Experimental)
# =============================================================================
# Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
# browser screenshot analysis, web page summarization, and context compression.
# browser screenshot analysis, web page summarization, TTS audio-tag insertion,
# and context compression.
#
# By default these use Gemini Flash via OpenRouter or Nous Portal and are
# auto-detected from your credentials. You do NOT need to change anything
@ -460,6 +461,12 @@ prompt_caching:
# provider: "auto"
# model: ""
#
# # Gemini 3.1 TTS hidden audio-tag insertion
# tts_audio_tags:
# provider: "auto"
# model: "" # empty = your main chat model
# timeout: 30
#
# # Session search — summarizes matching past sessions
# session_search:
# provider: "auto"
@ -835,6 +842,22 @@ platform_toolsets:
# max_tool_rounds: 5 # tool loop limit (0 = disable)
# log_level: "info" # audit verbosity
# =============================================================================
# Text-to-Speech
# =============================================================================
# TTS defaults to Edge TTS unless changed in ~/.hermes/config.yaml.
# Gemini TTS supports persona/director prompt files, and Gemini 3.1 Flash TTS
# can use a hidden auxiliary rewrite pass to insert expressive square-bracket
# audio tags into the TTS script without showing tags in chat.
#
# tts:
# provider: "gemini"
# gemini:
# model: "gemini-3.1-flash-tts-preview"
# voice: "Kore"
# audio_tags: false
# persona_prompt_file: "" # e.g. ~/.hermes/tts/radio-host.md
# =============================================================================
# Voice Transcription (Speech-to-Text)
# =============================================================================

View file

@ -1290,6 +1290,14 @@ DEFAULT_CONFIG = {
"timeout": 30,
"extra_body": {},
},
"tts_audio_tags": {
"provider": "auto",
"model": "",
"base_url": "",
"api_key": "",
"timeout": 30,
"extra_body": {},
},
# Triage specifier — flesh out a rough one-liner in the Kanban
# Triage column into a concrete spec, then promote it to ``todo``.
# Invoked by ``hermes kanban specify`` (single id or --all). Set a
@ -1575,6 +1583,10 @@ DEFAULT_CONFIG = {
"gemini": {
"model": "gemini-2.5-flash-preview-tts",
"voice": "Kore",
# When true, Gemini 3.1 TTS uses a hidden auxiliary-model rewrite
# pass to insert freeform square-bracket audio tags into the TTS
# script. Visible chat replies are unchanged.
"audio_tags": False,
# Optional local Markdown/text file with Gemini TTS performance
# direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES,
# SAMPLE CONTEXT, and either a `{transcript}` placeholder or no

View file

@ -2980,6 +2980,7 @@ _AUX_TASKS: list[tuple[str, str, str]] = [
("approval", "Approval", "smart command approval"),
("mcp", "MCP", "MCP tool reasoning"),
("title_generation", "Title generation", "session titles"),
("tts_audio_tags", "TTS audio tags", "Gemini TTS tag insertion"),
("skills_hub", "Skills hub", "skills search/install"),
("triage_specifier", "Triage specifier", "kanban spec fleshing"),
("kanban_decomposer", "Kanban decomposer", "task decomposition"),

View file

@ -2,6 +2,7 @@
import base64
import struct
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import pytest
@ -312,6 +313,112 @@ class TestGenerateGeminiTts:
assert prompt_text == "Hi"
assert "persona prompt file unavailable" in caplog.text
def test_audio_tags_disabled_does_not_call_rewriter(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """With ``audio_tags`` off, no rewrite call is made and the text is posted as-is."""
    from tools.tts_tool import _generate_gemini_tts

    gemini_settings = {
        "model": "gemini-3.1-flash-tts-preview",
        "audio_tags": False,
    }
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    wav_path = str(tmp_path / "test.wav")

    with patch("agent.auxiliary_client.call_llm") as rewriter, patch(
        "requests.post", return_value=mock_gemini_response
    ) as http_post:
        _generate_gemini_tts("Hi there.", wav_path, {"gemini": gemini_settings})

    # The auxiliary model must stay untouched when the feature flag is off.
    rewriter.assert_not_called()
    posted_json = http_post.call_args[1]["json"]
    assert posted_json["contents"][0]["parts"][0]["text"] == "Hi there."
def test_audio_tags_enabled_rewrites_hidden_tts_script(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """With ``audio_tags`` on, the tagged rewrite becomes the transcript sent to Gemini."""
    from tools.tts_tool import _generate_gemini_tts

    persona_path = tmp_path / "voice-persona.md"
    persona_path.write_text(
        "### DIRECTOR'S NOTES\nStyle: Warm and amused.", encoding="utf-8"
    )

    # Fake auxiliary-model reply shaped like a chat-completion response.
    tagged_message = SimpleNamespace(content="[warmly] Hi there. [soft laugh]")
    llm_reply = SimpleNamespace(choices=[SimpleNamespace(message=tagged_message)])

    tts_config = {
        "gemini": {
            "model": "gemini-3.1-flash-tts-preview",
            "audio_tags": True,
            "persona_prompt_file": str(persona_path),
        }
    }
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")

    with patch(
        "agent.auxiliary_client.call_llm", return_value=llm_reply
    ) as rewriter, patch(
        "requests.post", return_value=mock_gemini_response
    ) as http_post:
        _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), tts_config)

    # The rewrite request: routed to the dedicated task, carrying both the
    # tagging instructions and the persona/transcript context.
    rewriter.assert_called_once()
    rewrite_kwargs = rewriter.call_args.kwargs
    assert rewrite_kwargs["task"] == "tts_audio_tags"
    system_text = rewrite_kwargs["messages"][0]["content"]
    assert "Audio tags are inline square-bracket modifiers" in system_text
    user_text = rewrite_kwargs["messages"][1]["content"]
    assert "Style: Warm and amused." in user_text
    assert "Hi there." in user_text

    # The TTS request: persona direction plus the tagged transcript.
    sent_prompt = http_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"]
    for expected_fragment in (
        "Synthesize speech from the TRANSCRIPT only",
        "### DIRECTOR'S NOTES\nStyle: Warm and amused.",
        "#### TRANSCRIPT\n[warmly] Hi there. [soft laugh]",
    ):
        assert expected_fragment in sent_prompt
def test_audio_tags_enabled_skips_non_tag_capable_model(
    self, tmp_path, monkeypatch, mock_gemini_response, caplog
):
    """A 2.5 model with ``audio_tags`` on skips the rewrite and logs a warning."""
    from tools.tts_tool import _generate_gemini_tts

    gemini_settings = {
        "model": "gemini-2.5-flash-preview-tts",
        "audio_tags": True,
    }
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    wav_path = str(tmp_path / "test.wav")

    with patch("agent.auxiliary_client.call_llm") as rewriter, patch(
        "requests.post", return_value=mock_gemini_response
    ) as http_post:
        _generate_gemini_tts("Hi there.", wav_path, {"gemini": gemini_settings})

    rewriter.assert_not_called()
    posted_json = http_post.call_args[1]["json"]
    assert posted_json["contents"][0]["parts"][0]["text"] == "Hi there."
    assert "not known to support Gemini audio tags" in caplog.text
def test_audio_tag_rewrite_failure_falls_back_to_original_text(
    self, tmp_path, monkeypatch, mock_gemini_response, caplog
):
    """If the auxiliary call raises, the untagged text is synthesized and a warning logged."""
    from tools.tts_tool import _generate_gemini_tts

    gemini_settings = {
        "model": "gemini-3.1-flash-tts-preview",
        "audio_tags": True,
    }
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    wav_path = str(tmp_path / "test.wav")

    with patch(
        "agent.auxiliary_client.call_llm", side_effect=RuntimeError("boom")
    ), patch("requests.post", return_value=mock_gemini_response) as http_post:
        _generate_gemini_tts("Hi there.", wav_path, {"gemini": gemini_settings})

    posted_json = http_post.call_args[1]["json"]
    assert posted_json["contents"][0]["parts"][0]["text"] == "Hi there."
    assert "audio tag rewrite failed" in caplog.text
class TestGeminiInCheckRequirements:
def test_gemini_api_key_satisfies_requirements(self, monkeypatch):

View file

@ -190,6 +190,8 @@ DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_GEMINI_TTS_VOICE = "Kore"
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
DEFAULT_GEMINI_AUDIO_TAGS = False
GEMINI_AUDIO_TAG_REWRITE_TASK = "tts_audio_tags"
# PCM output specs for Gemini TTS (fixed by the API)
GEMINI_TTS_SAMPLE_RATE = 24000
GEMINI_TTS_CHANNELS = 1
@ -233,6 +235,23 @@ ELEVENLABS_MODEL_MAX_TEXT_LENGTH: Dict[str, int] = {
"eleven_flash_v2_5": 40000,
}
def _config_bool(value: Any, default: bool = False) -> bool:
"""Coerce common YAML/env bool spellings without treating random strings as true."""
if isinstance(value, bool):
return value
if value is None:
return default
if isinstance(value, (int, float)):
return bool(value)
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in {"1", "true", "yes", "on", "enabled"}:
return True
if normalized in {"0", "false", "no", "off", "disabled"}:
return False
return default
# Final fallback when provider isn't recognised at all.
FALLBACK_MAX_TEXT_LENGTH = 4000
@ -1069,20 +1088,7 @@ _XAI_FIRST_SENTENCE_RE = re.compile(r"^(.{12,120}?[.!?…])\s+(?=\S)", flags=re.
def _xai_bool_config(value: Any, default: bool = False) -> bool:
    """Coerce an xAI boolean setting using the shared ``_config_bool`` rules.

    Thin wrapper kept under its original name. The coercion logic (bool and
    numeric truthiness, recognised on/off string spellings, *default* for
    everything else) lives in ``_config_bool`` so it is defined only once.

    Args:
        value: Raw config or environment value of any type.
        default: Result when *value* is ``None`` or not recognisable.

    Returns:
        The coerced boolean.
    """
    return _config_bool(value, default=default)
def _apply_xai_auto_speech_tags(text: str) -> str:
@ -1427,10 +1433,105 @@ def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str:
return ""
def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str:
def _gemini_model_supports_audio_tags(model: str) -> bool:
"""Return True for Gemini TTS models known to support expressive audio tags."""
normalized = (model or "").strip().lower().rsplit("/", 1)[-1]
return "gemini-3.1" in normalized and "tts" in normalized
def _gemini_audio_tags_enabled(gemini_config: Dict[str, Any], model: str) -> bool:
    """Decide whether the hidden audio-tag rewrite should run for *model*.

    The ``audio_tags`` setting may be a plain value or a mapping with an
    ``enabled`` key. The rewrite runs only when that setting coerces to true
    and the model is recognised as tag-capable; an enabled setting paired
    with an unrecognised model logs a warning and returns ``False``.
    """
    setting = gemini_config.get("audio_tags")
    if isinstance(setting, dict):
        setting = setting.get("enabled")

    if not _config_bool(setting, default=DEFAULT_GEMINI_AUDIO_TAGS):
        return False
    if _gemini_model_supports_audio_tags(model):
        return True

    logger.warning(
        "Gemini TTS audio_tags enabled, but model %s is not known to support "
        "Gemini audio tags; skipping hidden tag rewrite",
        model,
    )
    return False
def _clean_gemini_audio_tag_rewrite(content: str) -> str:
clean = (content or "").strip()
fence = re.fullmatch(r"```(?:[A-Za-z0-9_-]+)?\s*(.*?)\s*```", clean, flags=re.DOTALL)
if fence:
clean = fence.group(1).strip()
return clean
def _extract_auxiliary_message_content(response: Any) -> str:
try:
choice = response.choices[0]
message = getattr(choice, "message", None)
if isinstance(message, dict):
return str(message.get("content") or "")
return str(getattr(message, "content", "") or "")
except Exception:
return ""
def _rewrite_gemini_tts_audio_tags(text: str, persona_prompt: str = "") -> str:
"""Use the configured auxiliary model to insert Gemini audio tags."""
transcript = text.strip()
if not transcript:
return text
system_prompt = (
"You rewrite transcripts for Gemini 3.1 Flash TTS by inserting expressive "
"audio tags.\n\n"
"Audio tags are inline square-bracket modifiers such as [whispers], "
"[excitedly], [very slow], [sarcastically], [laughs], [sighs], or [gasp]. "
"There is no fixed allowlist. Use creative freeform tags generously but "
"naturally to control tone, pace, emotional vibe, emphasis, section-level "
"delivery, and non-verbal sounds. Use English audio tags even when the "
"spoken transcript is not English.\n\n"
"Rules:\n"
"- Preserve the spoken words, order, and meaning.\n"
"- Do not add new spoken sentences or remove existing spoken words.\n"
"- Use square brackets for every audio tag.\n"
"- Do not use SSML or XML tags.\n"
"- Do not explain or comment.\n"
"- Return only the tagged TTS script."
)
context = persona_prompt.strip() or "(none)"
user_prompt = (
"PERSONA AND DIRECTOR CONTEXT:\n"
f"{context}\n\n"
"TRANSCRIPT TO TAG:\n"
f"{transcript}"
)
try:
from agent.auxiliary_client import call_llm
response = call_llm(
task=GEMINI_AUDIO_TAG_REWRITE_TASK,
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
temperature=0.7,
)
tagged = _clean_gemini_audio_tag_rewrite(_extract_auxiliary_message_content(response))
return tagged or text
except Exception as exc:
logger.warning("Gemini TTS audio tag rewrite failed; using untagged text: %s", exc)
return text
def _compose_gemini_tts_prompt(
text: str,
gemini_config: Dict[str, Any],
persona_prompt: Optional[str] = None,
) -> str:
"""Build the Gemini prompt from persona direction plus the live transcript."""
transcript = text.strip()
persona_prompt = _read_gemini_persona_prompt(gemini_config)
if persona_prompt is None:
persona_prompt = _read_gemini_persona_prompt(gemini_config)
if not persona_prompt:
return transcript
@ -1487,7 +1588,15 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
or get_env_value("GEMINI_BASE_URL")
or DEFAULT_GEMINI_TTS_BASE_URL
).strip().rstrip("/")
prompt_text = _compose_gemini_tts_prompt(text, gemini_config)
persona_prompt = _read_gemini_persona_prompt(gemini_config)
tts_script = text
if _gemini_audio_tags_enabled(gemini_config, model):
tts_script = _rewrite_gemini_tts_audio_tags(text, persona_prompt=persona_prompt)
prompt_text = _compose_gemini_tts_prompt(
tts_script,
gemini_config,
persona_prompt=persona_prompt,
)
max_len = _resolve_max_text_length("gemini", tts_config)
if len(prompt_text) > max_len:
logger.warning(

View file

@ -835,6 +835,7 @@ $ hermes model
[ ] vision currently: auto / main model
[ ] web_extract currently: auto / main model
[ ] title_generation currently: openrouter / google/gemini-3-flash-preview
[ ] tts_audio_tags currently: auto / main model
[ ] compression currently: auto / main model
[ ] approval currently: auto / main model
[ ] triage_specifier currently: auto / main model
@ -911,6 +912,14 @@ auxiliary:
api_key: ""
timeout: 30 # seconds
# Gemini 3.1 TTS hidden audio-tag insertion
tts_audio_tags:
provider: "auto"
model: "" # empty = main chat model
base_url: ""
api_key: ""
timeout: 30
# Context compression timeout (separate from compression.* config)
compression:
timeout: 120 # seconds — compression summarizes long conversations, needs more time
@ -1197,8 +1206,9 @@ tts:
model: "voxtral-mini-tts-2603"
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
gemini:
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
model: "gemini-2.5-flash-preview-tts" # or gemini-3.1-flash-tts-preview
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc.
audio_tags: false # Hidden Gemini 3.1 TTS audio-tag insertion
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
xai:
voice_id: "eve" # xAI TTS voice

View file

@ -66,8 +66,9 @@ tts:
model: "voxtral-mini-tts-2603"
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
gemini:
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
model: "gemini-2.5-flash-preview-tts" # or gemini-3.1-flash-tts-preview
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
audio_tags: false # Enable hidden Gemini 3.1 TTS audio-tag insertion
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
xai:
voice_id: "eve" # or a custom voice ID — see docs below
@ -112,6 +113,20 @@ tts:
persona_prompt_file: ~/.hermes/tts/butler-voice.md
```
### Gemini Audio Tags
Gemini 3.1 Flash TTS supports freeform square-bracket audio tags such as `[whispers]`, `[excitedly]`, `[very slow]`, `[laughs]`, and other expressive delivery notes. Enable `tts.gemini.audio_tags` to have Hermes run a hidden rewrite pass before Gemini TTS. The rewrite inserts inline tags into the TTS script only; the visible chat reply stays unchanged.
```yaml
tts:
provider: gemini
gemini:
model: gemini-3.1-flash-tts-preview
audio_tags: true
```
The rewrite uses `auxiliary.tts_audio_tags` and defaults to your main chat model. Override that auxiliary task if you want tag insertion handled by a cheaper or faster model.
### Input length limits