mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(tts): add speed support for Edge TTS and OpenAI TTS
Read tts.speed (global) or tts.<provider>.speed (provider-specific) from config. The provider-specific value takes precedence over the global one.

- Edge TTS: converts speed float to SSML prosody rate string
- OpenAI TTS: passes speed param clamped to 0.25-4.0
- MiniMax: wired into global tts.speed fallback for consistency

Co-authored-by: 0xbyt4 <0xbyt4@users.noreply.github.com>
This commit is contained in:
parent
651419b014
commit
8ec0656f53
1 changed file with 13 additions and 3 deletions
|
|
@@ -188,8 +188,14 @@ async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str,
|
|||
_edge_tts = _import_edge_tts()
|
||||
edge_config = tts_config.get("edge", {})
|
||||
voice = edge_config.get("voice", DEFAULT_EDGE_VOICE)
|
||||
speed = float(edge_config.get("speed", tts_config.get("speed", 1.0)))
|
||||
|
||||
communicate = _edge_tts.Communicate(text, voice)
|
||||
kwargs = {"voice": voice}
|
||||
if speed != 1.0:
|
||||
pct = round((speed - 1.0) * 100)
|
||||
kwargs["rate"] = f"{pct:+d}%"
|
||||
|
||||
communicate = _edge_tts.Communicate(text, **kwargs)
|
||||
await communicate.save(output_path)
|
||||
return output_path
|
||||
|
||||
|
|
@@ -261,6 +267,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
|||
model = oai_config.get("model", DEFAULT_OPENAI_MODEL)
|
||||
voice = oai_config.get("voice", DEFAULT_OPENAI_VOICE)
|
||||
base_url = oai_config.get("base_url", base_url)
|
||||
speed = float(oai_config.get("speed", tts_config.get("speed", 1.0)))
|
||||
|
||||
# Determine response format from extension
|
||||
if output_path.endswith(".ogg"):
|
||||
|
|
@@ -271,13 +278,16 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
|||
OpenAIClient = _import_openai_client()
|
||||
client = OpenAIClient(api_key=api_key, base_url=base_url)
|
||||
try:
|
||||
response = client.audio.speech.create(
|
||||
create_kwargs = dict(
|
||||
model=model,
|
||||
voice=voice,
|
||||
input=text,
|
||||
response_format=response_format,
|
||||
extra_headers={"x-idempotency-key": str(uuid.uuid4())},
|
||||
)
|
||||
if speed != 1.0:
|
||||
create_kwargs["speed"] = max(0.25, min(4.0, speed))
|
||||
response = client.audio.speech.create(**create_kwargs)
|
||||
|
||||
response.stream_to_file(output_path)
|
||||
return output_path
|
||||
|
|
@@ -314,7 +324,7 @@ def _generate_minimax_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
|||
mm_config = tts_config.get("minimax", {})
|
||||
model = mm_config.get("model", DEFAULT_MINIMAX_MODEL)
|
||||
voice_id = mm_config.get("voice_id", DEFAULT_MINIMAX_VOICE_ID)
|
||||
speed = mm_config.get("speed", 1)
|
||||
speed = mm_config.get("speed", tts_config.get("speed", 1))
|
||||
vol = mm_config.get("vol", 1)
|
||||
pitch = mm_config.get("pitch", 0)
|
||||
base_url = mm_config.get("base_url", DEFAULT_MINIMAX_BASE_URL)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue