mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
feat(tts): add auxiliary-model auto speech tags for xAI
Mirrors the existing Gemini TTS audio-tag rewrite path. When the input has no explicit user/model speech tags, ask the configured auxiliary model to insert a richer set of xAI-supported tags (laughs, sighs, whispers, soft/loud, slow/fast, etc.) so voice-mode replies sound more expressive. Falls back to the local conservative [pause]-only transform on any auxiliary-model failure.
This commit is contained in:
parent
fad4b40d9d
commit
5a506da3d8
1 changed file with 60 additions and 11 deletions
|
|
@@ -1092,22 +1092,71 @@ def _xai_bool_config(value: Any, default: bool = False) -> bool:
|
|||
|
||||
|
||||
def _apply_xai_auto_speech_tags(text: str) -> str:
    """Add xAI speech tags for more natural voice-mode replies.

    The work happens in two passes:

    1. A conservative local pass that only inserts pauses: blank-line
       paragraph breaks become ``[pause]``, remaining line breaks collapse
       to a single space, and -- only if the text carries no speech tag at
       that point -- a ``[pause]`` is added after the first sentence.
    2. An auxiliary-model pass that asks the configured auxiliary model to
       rewrite the locally-tagged transcript with a richer set of
       xAI-supported tags (laughs, sighs, whispers, soft/loud, slow/fast,
       etc.). This pass is skipped when the *input* already contains
       explicit user/model speech tags, so those are trusted as written.

    Args:
        text: The transcript about to be sent to xAI TTS.

    Returns:
        ``text`` unchanged when it is empty or whitespace-only; the result
        of the local pass when the input already had speech tags, when the
        auxiliary model fails in any way, or when it returns nothing;
        otherwise the auxiliary model's tagged transcript.
    """
    clean = text.strip()
    if not clean:
        return text

    # Local conservative pass: pauses only.
    local = re.sub(r"\n\s*\n+", " [pause] ", clean)
    local = re.sub(r"\s*\n\s*", " ", local)
    if not _XAI_SPEECH_TAG_RE.search(local):
        local = _XAI_FIRST_SENTENCE_RE.sub(r"\1 [pause] ", local, count=1)
    local = re.sub(r"\s{2,}", " ", local).strip()

    # If the user/model already supplied explicit speech tags, trust them
    # and don't re-rewrite. The check is on the stripped input, not on
    # ``local``, because the local pass may itself have added [pause].
    if _XAI_SPEECH_TAG_RE.search(clean):
        return local

    # Auxiliary rewrite for richer emotion tags (mirrors the Gemini path).
    inline = ", ".join(_XAI_INLINE_SPEECH_TAGS)
    wrapping = ", ".join(_XAI_WRAPPING_SPEECH_TAGS)
    system_prompt = (
        "You rewrite transcripts for the xAI /v1/tts endpoint by inserting "
        "expressive speech tags.\n\n"
        "Valid inline tags (use as `[tag]`): " + inline + ".\n"
        "Valid wrapping tags (use as `[tag]...[/tag]`): " + wrapping + ".\n\n"
        "Rules:\n"
        "- Preserve the spoken words, order, and meaning.\n"
        "- Do not add new spoken sentences or remove existing spoken words.\n"
        "- Use inline `[tag]` for short modifiers (laughs, sighs, pause, etc.).\n"
        "- Use wrapping `[tag]...[/tag]` for sustained effects (whisper, soft, slow, fast, loud, etc.).\n"
        "- Do not use angle-bracket tags like `<tag>...</tag>` — xAI uses BBCode-style closing tags with `[/tag]`.\n"
        "- Do not use SSML.\n"
        "- Do not explain or comment.\n"
        "- Return only the tagged TTS script."
    )
    try:
        # Imported lazily inside the try so that an unavailable auxiliary
        # client degrades to the local result instead of raising.
        from agent.auxiliary_client import call_llm

        response = call_llm(
            task="tts_audio_tags",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"TRANSCRIPT TO TAG:\n{local}"},
            ],
            temperature=0.7,
        )
        tagged = _extract_auxiliary_message_content(response).strip()
        # Strip markdown fences if the LLM wrapped the response. The
        # optional info string is only consumed when a newline follows it,
        # so a one-line fence such as ```Hello there``` does not lose its
        # first spoken word to the "language tag" group.
        fence = re.fullmatch(
            r"```(?:[A-Za-z0-9_-]*[ \t]*\r?\n)?(.*?)```",
            tagged,
            flags=re.DOTALL,
        )
        if fence:
            tagged = fence.group(1).strip()
        return tagged or local
    except Exception as exc:
        # Deliberate best-effort: any failure (import, network, malformed
        # response) falls back to the pause-only text.
        logger.debug("xAI TTS audio tag rewrite failed; using locally-tagged text: %s", exc)
        return local
|
||||
|
||||
|
||||
def _generate_xai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue