mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
feat(tts): add Gemini audio tag rewrite
This commit is contained in:
parent
5718811de0
commit
2c19208224
7 changed files with 297 additions and 20 deletions
|
|
@ -415,7 +415,8 @@ prompt_caching:
|
|||
# Auxiliary Models (Advanced — Experimental)
|
||||
# =============================================================================
|
||||
# Hermes uses lightweight "auxiliary" models for side tasks: image analysis,
|
||||
# browser screenshot analysis, web page summarization, and context compression.
|
||||
# browser screenshot analysis, web page summarization, TTS audio-tag insertion,
|
||||
# and context compression.
|
||||
#
|
||||
# By default these use Gemini Flash via OpenRouter or Nous Portal and are
|
||||
# auto-detected from your credentials. You do NOT need to change anything
|
||||
|
|
@ -460,6 +461,12 @@ prompt_caching:
|
|||
# provider: "auto"
|
||||
# model: ""
|
||||
#
|
||||
# # Gemini 3.1 TTS hidden audio-tag insertion
|
||||
# tts_audio_tags:
|
||||
# provider: "auto"
|
||||
# model: "" # empty = your main chat model
|
||||
# timeout: 30
|
||||
#
|
||||
# # Session search — summarizes matching past sessions
|
||||
# session_search:
|
||||
# provider: "auto"
|
||||
|
|
@ -835,6 +842,22 @@ platform_toolsets:
|
|||
# max_tool_rounds: 5 # tool loop limit (0 = disable)
|
||||
# log_level: "info" # audit verbosity
|
||||
|
||||
# =============================================================================
|
||||
# Text-to-Speech
|
||||
# =============================================================================
|
||||
# TTS defaults to Edge TTS unless changed in ~/.hermes/config.yaml.
|
||||
# Gemini TTS supports persona/director prompt files, and Gemini 3.1 Flash TTS
|
||||
# can use a hidden auxiliary rewrite pass to insert expressive square-bracket
|
||||
# audio tags into the TTS script without showing tags in chat.
|
||||
#
|
||||
# tts:
|
||||
# provider: "gemini"
|
||||
# gemini:
|
||||
# model: "gemini-3.1-flash-tts-preview"
|
||||
# voice: "Kore"
|
||||
# audio_tags: false
|
||||
# persona_prompt_file: "" # e.g. ~/.hermes/tts/radio-host.md
|
||||
|
||||
# =============================================================================
|
||||
# Voice Transcription (Speech-to-Text)
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -1290,6 +1290,14 @@ DEFAULT_CONFIG = {
|
|||
"timeout": 30,
|
||||
"extra_body": {},
|
||||
},
|
||||
"tts_audio_tags": {
|
||||
"provider": "auto",
|
||||
"model": "",
|
||||
"base_url": "",
|
||||
"api_key": "",
|
||||
"timeout": 30,
|
||||
"extra_body": {},
|
||||
},
|
||||
# Triage specifier — flesh out a rough one-liner in the Kanban
|
||||
# Triage column into a concrete spec, then promote it to ``todo``.
|
||||
# Invoked by ``hermes kanban specify`` (single id or --all). Set a
|
||||
|
|
@ -1575,6 +1583,10 @@ DEFAULT_CONFIG = {
|
|||
"gemini": {
|
||||
"model": "gemini-2.5-flash-preview-tts",
|
||||
"voice": "Kore",
|
||||
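# When true, Gemini 3.1 TTS uses a hidden auxiliary-model rewrite
|
||||
# pass to insert freeform square-bracket audio tags into the TTS
|
||||
# script. Visible chat replies are unchanged. Ignored (with a logged
|
||||
# warning) for models that are not Gemini 3.1 TTS, including the default.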
|
||||
"audio_tags": False,
|
||||
# Optional local Markdown/text file with Gemini TTS performance
|
||||
# direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES,
|
||||
# SAMPLE CONTEXT, and either a `{transcript}` placeholder or no
|
||||
|
|
|
|||
|
|
@ -2980,6 +2980,7 @@ _AUX_TASKS: list[tuple[str, str, str]] = [
|
|||
("approval", "Approval", "smart command approval"),
|
||||
("mcp", "MCP", "MCP tool reasoning"),
|
||||
("title_generation", "Title generation", "session titles"),
|
||||
("tts_audio_tags", "TTS audio tags", "Gemini TTS tag insertion"),
|
||||
("skills_hub", "Skills hub", "skills search/install"),
|
||||
("triage_specifier", "Triage specifier", "kanban spec fleshing"),
|
||||
("kanban_decomposer", "Kanban decomposer", "task decomposition"),
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
import base64
|
||||
import struct
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
|
@ -312,6 +313,112 @@ class TestGenerateGeminiTts:
|
|||
assert prompt_text == "Hi"
|
||||
assert "persona prompt file unavailable" in caplog.text
|
||||
|
||||
def test_audio_tags_disabled_does_not_call_rewriter(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """With audio_tags off, the raw text is posted and no auxiliary call is made."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    gemini_settings = {
        "model": "gemini-3.1-flash-tts-preview",
        "audio_tags": False,
    }
    wav_path = str(tmp_path / "test.wav")

    with patch("agent.auxiliary_client.call_llm") as rewriter:
        with patch("requests.post", return_value=mock_gemini_response) as post:
            _generate_gemini_tts("Hi there.", wav_path, {"gemini": gemini_settings})

    rewriter.assert_not_called()
    payload = post.call_args.kwargs["json"]
    assert payload["contents"][0]["parts"][0]["text"] == "Hi there."
|
||||
|
||||
def test_audio_tags_enabled_rewrites_hidden_tts_script(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """The tagged script from the auxiliary model becomes the transcript sent to Gemini."""
    from tools.tts_tool import _generate_gemini_tts

    persona_file = tmp_path / "voice-persona.md"
    persona_file.write_text(
        "### DIRECTOR'S NOTES\nStyle: Warm and amused.", encoding="utf-8"
    )

    # Minimal stand-in for a chat-completion reply: choices[0].message.content.
    aux_message = SimpleNamespace(content="[warmly] Hi there. [soft laugh]")
    aux_reply = SimpleNamespace(choices=[SimpleNamespace(message=aux_message)])

    tts_config = {
        "gemini": {
            "model": "gemini-3.1-flash-tts-preview",
            "audio_tags": True,
            "persona_prompt_file": str(persona_file),
        }
    }
    monkeypatch.setenv("GEMINI_API_KEY", "test-key")

    with patch("agent.auxiliary_client.call_llm", return_value=aux_reply) as rewriter:
        with patch("requests.post", return_value=mock_gemini_response) as post:
            _generate_gemini_tts("Hi there.", str(tmp_path / "test.wav"), tts_config)

    # The rewrite request: right task, tagging instructions, persona + transcript.
    rewriter.assert_called_once()
    aux_kwargs = rewriter.call_args.kwargs
    assert aux_kwargs["task"] == "tts_audio_tags"
    system_msg = aux_kwargs["messages"][0]
    user_msg = aux_kwargs["messages"][1]
    assert "Audio tags are inline square-bracket modifiers" in system_msg["content"]
    assert "Style: Warm and amused." in user_msg["content"]
    assert "Hi there." in user_msg["content"]

    # The Gemini request: persona direction plus the tagged transcript.
    sent_prompt = post.call_args.kwargs["json"]["contents"][0]["parts"][0]["text"]
    assert "Synthesize speech from the TRANSCRIPT only" in sent_prompt
    assert "### DIRECTOR'S NOTES\nStyle: Warm and amused." in sent_prompt
    assert "#### TRANSCRIPT\n[warmly] Hi there. [soft laugh]" in sent_prompt
|
||||
|
||||
def test_audio_tags_enabled_skips_non_tag_capable_model(
    self, tmp_path, monkeypatch, mock_gemini_response, caplog
):
    """audio_tags on a non-3.1 model: no rewrite call, plain text sent, warning logged."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    gemini_settings = {
        "model": "gemini-2.5-flash-preview-tts",
        "audio_tags": True,
    }
    wav_path = str(tmp_path / "test.wav")

    with patch("agent.auxiliary_client.call_llm") as rewriter:
        with patch("requests.post", return_value=mock_gemini_response) as post:
            _generate_gemini_tts("Hi there.", wav_path, {"gemini": gemini_settings})

    rewriter.assert_not_called()
    payload = post.call_args.kwargs["json"]
    assert payload["contents"][0]["parts"][0]["text"] == "Hi there."
    assert "not known to support Gemini audio tags" in caplog.text
|
||||
|
||||
def test_audio_tag_rewrite_failure_falls_back_to_original_text(
    self, tmp_path, monkeypatch, mock_gemini_response, caplog
):
    """A raising auxiliary call is logged and the untagged text is synthesized."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    gemini_settings = {
        "model": "gemini-3.1-flash-tts-preview",
        "audio_tags": True,
    }
    wav_path = str(tmp_path / "test.wav")
    aux_failure = RuntimeError("boom")

    with patch("agent.auxiliary_client.call_llm", side_effect=aux_failure):
        with patch("requests.post", return_value=mock_gemini_response) as post:
            _generate_gemini_tts("Hi there.", wav_path, {"gemini": gemini_settings})

    payload = post.call_args.kwargs["json"]
    assert payload["contents"][0]["parts"][0]["text"] == "Hi there."
    assert "audio tag rewrite failed" in caplog.text
|
||||
|
||||
|
||||
class TestGeminiInCheckRequirements:
|
||||
def test_gemini_api_key_satisfies_requirements(self, monkeypatch):
|
||||
|
|
|
|||
|
|
@ -190,6 +190,8 @@ DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
|
|||
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
||||
DEFAULT_GEMINI_TTS_VOICE = "Kore"
|
||||
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
|
||||
DEFAULT_GEMINI_AUDIO_TAGS = False
|
||||
GEMINI_AUDIO_TAG_REWRITE_TASK = "tts_audio_tags"
|
||||
# PCM output specs for Gemini TTS (fixed by the API)
|
||||
GEMINI_TTS_SAMPLE_RATE = 24000
|
||||
GEMINI_TTS_CHANNELS = 1
|
||||
|
|
@ -233,6 +235,23 @@ ELEVENLABS_MODEL_MAX_TEXT_LENGTH: Dict[str, int] = {
|
|||
"eleven_flash_v2_5": 40000,
|
||||
}
|
||||
|
||||
|
||||
def _config_bool(value: Any, default: bool = False) -> bool:
|
||||
"""Coerce common YAML/env bool spellings without treating random strings as true."""
|
||||
if isinstance(value, bool):
|
||||
return value
|
||||
if value is None:
|
||||
return default
|
||||
if isinstance(value, (int, float)):
|
||||
return bool(value)
|
||||
if isinstance(value, str):
|
||||
normalized = value.strip().lower()
|
||||
if normalized in {"1", "true", "yes", "on", "enabled"}:
|
||||
return True
|
||||
if normalized in {"0", "false", "no", "off", "disabled"}:
|
||||
return False
|
||||
return default
|
||||
|
||||
# Final fallback when provider isn't recognised at all.
|
||||
FALLBACK_MAX_TEXT_LENGTH = 4000
|
||||
|
||||
|
|
@ -1069,20 +1088,7 @@ _XAI_FIRST_SENTENCE_RE = re.compile(r"^(.{12,120}?[.!?…])\s+(?=\S)", flags=re.
|
|||
|
||||
|
||||
def _xai_bool_config(value: Any, default: bool = False) -> bool:
    """Coerce common YAML/env bool spellings without treating random strings as true.

    Delegates to ``_config_bool`` so the accepted spellings are defined in
    exactly one place.

    Args:
        value: Raw config or environment value.
        default: Result for ``None`` and for values that cannot be recognised.

    Returns:
        The coerced boolean.
    """
    return _config_bool(value, default=default)
|
||||
|
||||
|
||||
def _apply_xai_auto_speech_tags(text: str) -> str:
|
||||
|
|
@ -1427,10 +1433,105 @@ def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str:
|
|||
return ""
|
||||
|
||||
|
||||
def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str:
|
||||
def _gemini_model_supports_audio_tags(model: str) -> bool:
|
||||
"""Return True for Gemini TTS models known to support expressive audio tags."""
|
||||
normalized = (model or "").strip().lower().rsplit("/", 1)[-1]
|
||||
return "gemini-3.1" in normalized and "tts" in normalized
|
||||
|
||||
|
||||
def _gemini_audio_tags_enabled(gemini_config: Dict[str, Any], model: str) -> bool:
    """Decide whether the hidden audio-tag rewrite should run for *model*.

    ``audio_tags`` may be a scalar flag or a mapping carrying an ``enabled``
    key. When the flag is on but the model is not recognised as tag-capable,
    a warning is logged and the rewrite is skipped.
    """
    setting = gemini_config.get("audio_tags")
    flag = setting.get("enabled") if isinstance(setting, dict) else setting
    if not _config_bool(flag, default=DEFAULT_GEMINI_AUDIO_TAGS):
        return False
    if _gemini_model_supports_audio_tags(model):
        return True

    logger.warning(
        "Gemini TTS audio_tags enabled, but model %s is not known to support "
        "Gemini audio tags; skipping hidden tag rewrite",
        model,
    )
    return False
|
||||
|
||||
|
||||
def _clean_gemini_audio_tag_rewrite(content: str) -> str:
|
||||
clean = (content or "").strip()
|
||||
fence = re.fullmatch(r"```(?:[A-Za-z0-9_-]+)?\s*(.*?)\s*```", clean, flags=re.DOTALL)
|
||||
if fence:
|
||||
clean = fence.group(1).strip()
|
||||
return clean
|
||||
|
||||
|
||||
def _extract_auxiliary_message_content(response: Any) -> str:
|
||||
try:
|
||||
choice = response.choices[0]
|
||||
message = getattr(choice, "message", None)
|
||||
if isinstance(message, dict):
|
||||
return str(message.get("content") or "")
|
||||
return str(getattr(message, "content", "") or "")
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def _rewrite_gemini_tts_audio_tags(text: str, persona_prompt: str = "") -> str:
    """Ask the auxiliary model to weave Gemini audio tags into *text*.

    The trimmed transcript and any persona/director context are sent to the
    auxiliary task named by ``GEMINI_AUDIO_TAG_REWRITE_TASK``. The original
    ``text`` is returned unchanged when the transcript is blank, when the
    reply is empty, or when any step raises (logged as a warning).
    """
    spoken = text.strip()
    if not spoken:
        return text

    instructions = (
        "You rewrite transcripts for Gemini 3.1 Flash TTS by inserting expressive "
        "audio tags.\n\n"
        "Audio tags are inline square-bracket modifiers such as [whispers], "
        "[excitedly], [very slow], [sarcastically], [laughs], [sighs], or [gasp]. "
        "There is no fixed allowlist. Use creative freeform tags generously but "
        "naturally to control tone, pace, emotional vibe, emphasis, section-level "
        "delivery, and non-verbal sounds. Use English audio tags even when the "
        "spoken transcript is not English.\n\n"
        "Rules:\n"
        "- Preserve the spoken words, order, and meaning.\n"
        "- Do not add new spoken sentences or remove existing spoken words.\n"
        "- Use square brackets for every audio tag.\n"
        "- Do not use SSML or XML tags.\n"
        "- Do not explain or comment.\n"
        "- Return only the tagged TTS script."
    )
    persona_context = persona_prompt.strip() or "(none)"
    request = (
        f"PERSONA AND DIRECTOR CONTEXT:\n{persona_context}\n\n"
        f"TRANSCRIPT TO TAG:\n{spoken}"
    )
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": request},
    ]

    try:
        # Imported inside the try so an import failure also degrades to
        # untagged text instead of breaking speech synthesis.
        from agent.auxiliary_client import call_llm

        reply = call_llm(
            task=GEMINI_AUDIO_TAG_REWRITE_TASK,
            messages=conversation,
            temperature=0.7,
        )
        tagged = _clean_gemini_audio_tag_rewrite(
            _extract_auxiliary_message_content(reply)
        )
    except Exception as exc:
        logger.warning("Gemini TTS audio tag rewrite failed; using untagged text: %s", exc)
        return text

    # An empty reply is treated like a failure: speak the original text.
    return tagged if tagged else text
|
||||
|
||||
|
||||
def _compose_gemini_tts_prompt(
|
||||
text: str,
|
||||
gemini_config: Dict[str, Any],
|
||||
persona_prompt: Optional[str] = None,
|
||||
) -> str:
|
||||
"""Build the Gemini prompt from persona direction plus the live transcript."""
|
||||
transcript = text.strip()
|
||||
persona_prompt = _read_gemini_persona_prompt(gemini_config)
|
||||
if persona_prompt is None:
|
||||
persona_prompt = _read_gemini_persona_prompt(gemini_config)
|
||||
if not persona_prompt:
|
||||
return transcript
|
||||
|
||||
|
|
@ -1487,7 +1588,15 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
|||
or get_env_value("GEMINI_BASE_URL")
|
||||
or DEFAULT_GEMINI_TTS_BASE_URL
|
||||
).strip().rstrip("/")
|
||||
prompt_text = _compose_gemini_tts_prompt(text, gemini_config)
|
||||
persona_prompt = _read_gemini_persona_prompt(gemini_config)
|
||||
tts_script = text
|
||||
if _gemini_audio_tags_enabled(gemini_config, model):
|
||||
tts_script = _rewrite_gemini_tts_audio_tags(text, persona_prompt=persona_prompt)
|
||||
prompt_text = _compose_gemini_tts_prompt(
|
||||
tts_script,
|
||||
gemini_config,
|
||||
persona_prompt=persona_prompt,
|
||||
)
|
||||
max_len = _resolve_max_text_length("gemini", tts_config)
|
||||
if len(prompt_text) > max_len:
|
||||
logger.warning(
|
||||
|
|
|
|||
|
|
@ -835,6 +835,7 @@ $ hermes model
|
|||
[ ] vision currently: auto / main model
|
||||
[ ] web_extract currently: auto / main model
|
||||
[ ] title_generation currently: openrouter / google/gemini-3-flash-preview
|
||||
[ ] tts_audio_tags currently: auto / main model
|
||||
[ ] compression currently: auto / main model
|
||||
[ ] approval currently: auto / main model
|
||||
[ ] triage_specifier currently: auto / main model
|
||||
|
|
@ -911,6 +912,14 @@ auxiliary:
|
|||
api_key: ""
|
||||
timeout: 30 # seconds
|
||||
|
||||
# Gemini 3.1 TTS hidden audio-tag insertion
|
||||
tts_audio_tags:
|
||||
provider: "auto"
|
||||
model: "" # empty = main chat model
|
||||
base_url: ""
|
||||
api_key: ""
|
||||
timeout: 30
|
||||
|
||||
# Context compression timeout (separate from compression.* config)
|
||||
compression:
|
||||
timeout: 120 # seconds — compression summarizes long conversations, needs more time
|
||||
|
|
@ -1197,8 +1206,9 @@ tts:
|
|||
model: "voxtral-mini-tts-2603"
|
||||
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
|
||||
gemini:
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-3.1-flash-tts-preview
|
||||
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc.
|
||||
audio_tags: false # Hidden Gemini 3.1 TTS audio-tag insertion
|
||||
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
|
||||
xai:
|
||||
voice_id: "eve" # xAI TTS voice
|
||||
|
|
|
|||
|
|
@ -66,8 +66,9 @@ tts:
|
|||
model: "voxtral-mini-tts-2603"
|
||||
voice_id: "c69964a6-ab8b-4f8a-9465-ec0925096ec8" # Paul - Neutral (default)
|
||||
gemini:
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-3.1-flash-tts-preview
|
||||
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
|
||||
audio_tags: false # Enable hidden Gemini 3.1 TTS audio-tag insertion
|
||||
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
|
||||
xai:
|
||||
voice_id: "eve" # or a custom voice ID — see docs below
|
||||
|
|
@ -112,6 +113,20 @@ tts:
|
|||
persona_prompt_file: ~/.hermes/tts/butler-voice.md
|
||||
```
|
||||
|
||||
### Gemini Audio Tags
|
||||
|
||||
Gemini 3.1 Flash TTS supports freeform square-bracket audio tags such as `[whispers]`, `[excitedly]`, `[very slow]`, `[laughs]`, and other expressive delivery notes. Enable `tts.gemini.audio_tags` to have Hermes run a hidden rewrite pass before Gemini TTS. The rewrite inserts inline tags into the TTS script only; the visible chat reply stays unchanged. If the configured model is not a Gemini 3.1 TTS model, the setting is ignored and a warning is logged.
|
||||
|
||||
```yaml
|
||||
tts:
|
||||
provider: gemini
|
||||
gemini:
|
||||
model: gemini-3.1-flash-tts-preview
|
||||
audio_tags: true
|
||||
```
|
||||
|
||||
The rewrite uses `auxiliary.tts_audio_tags` and defaults to your main chat model. Override that auxiliary task if you want tag insertion handled by a cheaper or faster model.
|
||||
|
||||
|
||||
### Input length limits
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue