mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor
This commit is contained in:
commit
3746c60439
18 changed files with 1723 additions and 377 deletions
|
|
@ -14,6 +14,7 @@ import os
|
|||
import re
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import unicodedata
|
||||
from typing import Optional
|
||||
|
||||
|
|
@ -834,13 +835,43 @@ def check_all_command_guards(command: str, env_type: str,
|
|||
"description": combined_desc,
|
||||
}
|
||||
|
||||
# Block until the user responds or timeout (default 5 min)
|
||||
# Block until the user responds or timeout (default 5 min).
|
||||
# Poll in short slices so we can fire activity heartbeats every
|
||||
# ~10s to the agent's inactivity tracker. Without this, the
|
||||
# blocking event.wait() never touches activity, and the
|
||||
# gateway's inactivity watchdog (agent.gateway_timeout, default
|
||||
# 1800s) kills the agent while the user is still responding to
|
||||
# the approval prompt. Mirrors the _wait_for_process() cadence
|
||||
# in tools/environments/base.py.
|
||||
timeout = _get_approval_config().get("gateway_timeout", 300)
|
||||
try:
|
||||
timeout = int(timeout)
|
||||
except (ValueError, TypeError):
|
||||
timeout = 300
|
||||
resolved = entry.event.wait(timeout=timeout)
|
||||
|
||||
try:
|
||||
from tools.environments.base import touch_activity_if_due
|
||||
except Exception: # pragma: no cover
|
||||
touch_activity_if_due = None
|
||||
|
||||
_now = time.monotonic()
|
||||
_deadline = _now + max(timeout, 0)
|
||||
_activity_state = {"last_touch": _now, "start": _now}
|
||||
resolved = False
|
||||
while True:
|
||||
_remaining = _deadline - time.monotonic()
|
||||
if _remaining <= 0:
|
||||
break
|
||||
# 1s poll slice — the event is set immediately when the
|
||||
# user responds, so slice length only controls heartbeat
|
||||
# cadence, not user-visible responsiveness.
|
||||
if entry.event.wait(timeout=min(1.0, _remaining)):
|
||||
resolved = True
|
||||
break
|
||||
if touch_activity_if_due is not None:
|
||||
touch_activity_if_due(
|
||||
_activity_state, "waiting for user approval"
|
||||
)
|
||||
|
||||
# Clean up this entry from the queue
|
||||
with _lock:
|
||||
|
|
|
|||
|
|
@ -126,7 +126,22 @@ def _shadow_repo_path(working_dir: str) -> Path:
|
|||
|
||||
|
||||
def _git_env(shadow_repo: Path, working_dir: str) -> dict:
|
||||
"""Build env dict that redirects git to the shadow repo."""
|
||||
"""Build env dict that redirects git to the shadow repo.
|
||||
|
||||
The shadow repo is internal Hermes infrastructure — it must NOT inherit
|
||||
the user's global or system git config. User-level settings like
|
||||
``commit.gpgsign = true``, signing hooks, or credential helpers would
|
||||
either break background snapshots or, worse, spawn interactive prompts
|
||||
(pinentry GUI windows) mid-session every time a file is written.
|
||||
|
||||
Isolation strategy:
|
||||
* ``GIT_CONFIG_GLOBAL=<os.devnull>`` — ignore ``~/.gitconfig`` (git 2.32+).
|
||||
* ``GIT_CONFIG_SYSTEM=<os.devnull>`` — ignore ``/etc/gitconfig`` (git 2.32+).
|
||||
* ``GIT_CONFIG_NOSYSTEM=1`` — legacy belt-and-suspenders for older git.
|
||||
|
||||
The shadow repo still has its own per-repo config (user.email, user.name,
|
||||
commit.gpgsign=false) set in ``_init_shadow_repo``.
|
||||
"""
|
||||
normalized_working_dir = _normalize_path(working_dir)
|
||||
env = os.environ.copy()
|
||||
env["GIT_DIR"] = str(shadow_repo)
|
||||
|
|
@ -134,6 +149,13 @@ def _git_env(shadow_repo: Path, working_dir: str) -> dict:
|
|||
env.pop("GIT_INDEX_FILE", None)
|
||||
env.pop("GIT_NAMESPACE", None)
|
||||
env.pop("GIT_ALTERNATE_OBJECT_DIRECTORIES", None)
|
||||
# Isolate the shadow repo from the user's global/system git config.
|
||||
# Prevents commit.gpgsign, hooks, aliases, credential helpers, etc. from
|
||||
# leaking into background snapshots. Uses os.devnull for cross-platform
|
||||
# support (``/dev/null`` on POSIX, ``nul`` on Windows).
|
||||
env["GIT_CONFIG_GLOBAL"] = os.devnull
|
||||
env["GIT_CONFIG_SYSTEM"] = os.devnull
|
||||
env["GIT_CONFIG_NOSYSTEM"] = "1"
|
||||
return env
|
||||
|
||||
|
||||
|
|
@ -211,6 +233,13 @@ def _init_shadow_repo(shadow_repo: Path, working_dir: str) -> Optional[str]:
|
|||
|
||||
_run_git(["config", "user.email", "hermes@local"], shadow_repo, working_dir)
|
||||
_run_git(["config", "user.name", "Hermes Checkpoint"], shadow_repo, working_dir)
|
||||
# Explicitly disable commit/tag signing in the shadow repo. _git_env
|
||||
# already isolates from the user's global config, but writing these into
|
||||
# the shadow's own config is belt-and-suspenders — it guarantees the
|
||||
# shadow repo is correct even if someone inspects or runs git against it
|
||||
# directly (without the GIT_CONFIG_* env vars).
|
||||
_run_git(["config", "commit.gpgsign", "false"], shadow_repo, working_dir)
|
||||
_run_git(["config", "tag.gpgSign", "false"], shadow_repo, working_dir)
|
||||
|
||||
info_dir = shadow_repo / "info"
|
||||
info_dir.mkdir(exist_ok=True)
|
||||
|
|
@ -552,9 +581,11 @@ class CheckpointManager:
|
|||
logger.debug("Checkpoint skipped: no changes in %s", working_dir)
|
||||
return False
|
||||
|
||||
# Commit
|
||||
# Commit. ``--no-gpg-sign`` inline covers shadow repos created before
|
||||
# the commit.gpgsign=false config was added to _init_shadow_repo — so
|
||||
# users with existing checkpoints never hit a GPG pinentry popup.
|
||||
ok, _, err = _run_git(
|
||||
["commit", "-m", reason, "--allow-empty-message"],
|
||||
["commit", "-m", reason, "--allow-empty-message", "--no-gpg-sign"],
|
||||
shadow, working_dir, timeout=_GIT_TIMEOUT * 2,
|
||||
)
|
||||
if not ok:
|
||||
|
|
|
|||
|
|
@ -2,12 +2,13 @@
|
|||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports six TTS providers:
|
||||
Supports seven TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
|
||||
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
|
||||
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
|
|
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
|
|||
DEFAULT_XAI_SAMPLE_RATE = 24000
|
||||
DEFAULT_XAI_BIT_RATE = 128000
|
||||
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
|
||||
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
||||
DEFAULT_GEMINI_TTS_VOICE = "Kore"
|
||||
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
|
||||
# PCM output specs for Gemini TTS (fixed by the API)
|
||||
GEMINI_TTS_SAMPLE_RATE = 24000
|
||||
GEMINI_TTS_CHANNELS = 1
|
||||
GEMINI_TTS_SAMPLE_WIDTH = 2 # 16-bit PCM (L16)
|
||||
|
||||
def _get_default_output_dir() -> str:
|
||||
from hermes_constants import get_hermes_dir
|
||||
|
|
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
|
|||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: Google Gemini TTS
|
||||
# ===========================================================================
|
||||
def _wrap_pcm_as_wav(
    pcm_bytes: bytes,
    sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
    channels: int = GEMINI_TTS_CHANNELS,
    sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
) -> bytes:
    """Prefix raw little-endian PCM samples with a canonical 44-byte WAV header.

    Gemini TTS hands back ``audio/L16;codec=pcm;rate=24000`` -- bare PCM
    samples without any container. Prepending a minimal RIFF/WAVE header
    makes the bytes playable as-is and lets ffmpeg transcode them to
    MP3/Opus later on.

    Args:
        pcm_bytes: Raw PCM sample data.
        sample_rate: Samples per second, per channel.
        channels: Number of interleaved channels.
        sample_width: Bytes per sample (2 means 16-bit).

    Returns:
        The WAV header followed by ``pcm_bytes``, unchanged.
    """
    import struct

    payload_len = len(pcm_bytes)
    frame_bytes = channels * sample_width  # block align: bytes per sample frame

    # One pack call lays out all three pieces back to back:
    #   RIFF descriptor | "fmt " chunk (16-byte PCM body) | "data" chunk header
    # The RIFF size counts everything after its own 8-byte preamble:
    #   "WAVE" (4) + fmt chunk (8 + 16) + data chunk header (8) + payload.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + payload_len,
        b"WAVE",
        b"fmt ",
        16,  # fmt chunk size (PCM)
        1,  # audio format (PCM)
        channels,
        sample_rate,
        sample_rate * frame_bytes,  # byte rate
        frame_bytes,
        sample_width * 8,  # bits per sample
        b"data",
        payload_len,
    )
    return header + pcm_bytes
|
||||
|
||||
|
||||
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate audio using Google Gemini TTS.

    Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
    raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
    header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
    the caller requested those formats (same pattern as NeuTTS).

    Args:
        text: Text to convert (prompt-style; supports inline direction like
            "Say cheerfully:" and audio tags like [whispers]).
        output_path: Where to save the audio file (.wav, .mp3, or .ogg).
        tts_config: TTS config dict. The optional ``gemini`` section may set
            ``model``, ``voice`` and ``base_url``.

    Returns:
        Path to the saved audio file.

    Raises:
        ValueError: Neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
        RuntimeError: The API returned a non-200 status, the response was
            malformed or carried no decodable audio, or ffmpeg failed.
    """
    import requests

    api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
    if not api_key:
        raise ValueError(
            "GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
        )

    # ``or {}`` / ``or DEFAULT`` so an explicit null in the config (a bare
    # ``gemini:`` section, ``voice: null``) falls back to the defaults instead
    # of crashing or being stringified to "None".
    gemini_config = tts_config.get("gemini") or {}
    model = str(gemini_config.get("model") or DEFAULT_GEMINI_TTS_MODEL).strip() or DEFAULT_GEMINI_TTS_MODEL
    voice = str(gemini_config.get("voice") or DEFAULT_GEMINI_TTS_VOICE).strip() or DEFAULT_GEMINI_TTS_VOICE
    base_url = str(
        gemini_config.get("base_url")
        or os.getenv("GEMINI_BASE_URL")
        or DEFAULT_GEMINI_TTS_BASE_URL
    ).strip().rstrip("/")

    payload: Dict[str, Any] = {
        "contents": [{"parts": [{"text": text}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {"voiceName": voice},
                },
            },
        },
    }

    endpoint = f"{base_url}/models/{model}:generateContent"
    # Send the key in the ``x-goog-api-key`` header rather than as a ``?key=``
    # query parameter: requests embeds the full URL in connection/timeout
    # exception messages, which would otherwise leak the key into logs and
    # tool error output.
    response = requests.post(
        endpoint,
        headers={
            "Content-Type": "application/json",
            "x-goog-api-key": api_key,
        },
        json=payload,
        timeout=60,
    )
    if response.status_code != 200:
        # Surface the API error message when present
        try:
            err = response.json().get("error", {})
            detail = err.get("message") or response.text[:300]
        except Exception:
            detail = response.text[:300]
        raise RuntimeError(
            f"Gemini TTS API error (HTTP {response.status_code}): {detail}"
        )

    try:
        data = response.json()
        parts = data["candidates"][0]["content"]["parts"]
        # The REST API uses camelCase; accept snake_case too for proxies/SDK
        # shims that re-serialize the response.
        audio_part = next((p for p in parts if "inlineData" in p or "inline_data" in p), None)
        if audio_part is None:
            raise RuntimeError("Gemini TTS response contained no audio data")
        inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
        audio_b64 = inline.get("data", "")
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
        # ValueError covers a 200 response whose body is not JSON;
        # AttributeError covers parts that are not JSON objects.
        raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e

    if not audio_b64:
        raise RuntimeError("Gemini TTS returned empty audio data")

    try:
        pcm_bytes = base64.b64decode(audio_b64)
    except (ValueError, TypeError) as e:
        # binascii.Error (bad padding / alphabet) is a ValueError subclass.
        raise RuntimeError(f"Gemini TTS returned undecodable audio data: {e}") from e
    wav_bytes = _wrap_pcm_as_wav(pcm_bytes)

    target = output_path.lower()

    # Fast path: caller wants WAV directly, just write.
    if target.endswith(".wav"):
        with open(output_path, "wb") as f:
            f.write(wav_bytes)
        return output_path

    # Otherwise write WAV to a temp file and ffmpeg-convert to the target
    # format (.mp3 or .ogg). If ffmpeg is missing, fall back to copying the
    # WAV -- this matches the NeuTTS behavior and keeps the tool usable on
    # systems without ffmpeg (audio still plays, just with a misleading
    # extension).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(wav_bytes)
        wav_path = tmp.name

    try:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            # For .ogg output, force libopus encoding (Telegram voice bubbles
            # require Opus specifically; ffmpeg's default for .ogg is Vorbis).
            if target.endswith(".ogg"):
                cmd = [
                    ffmpeg, "-i", wav_path,
                    "-acodec", "libopus", "-ac", "1",
                    "-b:a", "64k", "-vbr", "off",
                    "-y", "-loglevel", "error",
                    output_path,
                ]
            else:
                cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            result = subprocess.run(cmd, capture_output=True, timeout=30)
            if result.returncode != 0:
                stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
                raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
        else:
            logger.warning(
                "ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
                output_path,
            )
            shutil.copyfile(wav_path, output_path)
    finally:
        # Best-effort cleanup of the intermediate WAV; a leftover temp file
        # must not mask the real result or error.
        try:
            os.remove(wav_path)
        except OSError:
            pass

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
|
|
@ -634,7 +810,7 @@ def text_to_speech_tool(
|
|||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Use .ogg for Telegram with providers that support native Opus output,
|
||||
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
|
||||
if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
|
||||
file_path = out_dir / f"tts_{timestamp}.ogg"
|
||||
else:
|
||||
file_path = out_dir / f"tts_{timestamp}.mp3"
|
||||
|
|
@ -687,6 +863,10 @@ def text_to_speech_tool(
|
|||
logger.info("Generating speech with Mistral Voxtral TTS...")
|
||||
_generate_mistral_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "gemini":
|
||||
logger.info("Generating speech with Google Gemini TTS...")
|
||||
_generate_gemini_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "neutts":
|
||||
if not _check_neutts_available():
|
||||
return json.dumps({
|
||||
|
|
@ -741,7 +921,7 @@ def text_to_speech_tool(
|
|||
if opus_path:
|
||||
file_str = opus_path
|
||||
voice_compatible = True
|
||||
elif provider in ("elevenlabs", "openai", "mistral"):
|
||||
elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
|
||||
voice_compatible = file_str.endswith(".ogg")
|
||||
|
||||
file_size = os.path.getsize(file_str)
|
||||
|
|
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
|
|||
return True
|
||||
if os.getenv("XAI_API_KEY"):
|
||||
return True
|
||||
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
|
||||
return True
|
||||
try:
|
||||
_import_mistral_client()
|
||||
if os.getenv("MISTRAL_API_KEY"):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue