Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor

This commit is contained in:
Brooklyn Nicholson 2026-04-16 18:25:49 -05:00
commit 3746c60439
18 changed files with 1723 additions and 377 deletions

View file

@ -14,6 +14,7 @@ import os
import re
import sys
import threading
import time
import unicodedata
from typing import Optional
@ -834,13 +835,43 @@ def check_all_command_guards(command: str, env_type: str,
"description": combined_desc,
}
# Block until the user responds or timeout (default 5 min)
# Block until the user responds or timeout (default 5 min).
# Poll in short slices so we can fire activity heartbeats every
# ~10s to the agent's inactivity tracker. Without this, the
# blocking event.wait() never touches activity, and the
# gateway's inactivity watchdog (agent.gateway_timeout, default
# 1800s) kills the agent while the user is still responding to
# the approval prompt. Mirrors the _wait_for_process() cadence
# in tools/environments/base.py.
timeout = _get_approval_config().get("gateway_timeout", 300)
try:
timeout = int(timeout)
except (ValueError, TypeError):
timeout = 300
resolved = entry.event.wait(timeout=timeout)
try:
from tools.environments.base import touch_activity_if_due
except Exception: # pragma: no cover
touch_activity_if_due = None
_now = time.monotonic()
_deadline = _now + max(timeout, 0)
_activity_state = {"last_touch": _now, "start": _now}
resolved = False
while True:
_remaining = _deadline - time.monotonic()
if _remaining <= 0:
break
# 1s poll slice — the event is set immediately when the
# user responds, so slice length only controls heartbeat
# cadence, not user-visible responsiveness.
if entry.event.wait(timeout=min(1.0, _remaining)):
resolved = True
break
if touch_activity_if_due is not None:
touch_activity_if_due(
_activity_state, "waiting for user approval"
)
# Clean up this entry from the queue
with _lock:

View file

@ -126,7 +126,22 @@ def _shadow_repo_path(working_dir: str) -> Path:
def _git_env(shadow_repo: Path, working_dir: str) -> dict:
"""Build env dict that redirects git to the shadow repo."""
"""Build env dict that redirects git to the shadow repo.
The shadow repo is internal Hermes infrastructure it must NOT inherit
the user's global or system git config. User-level settings like
``commit.gpgsign = true``, signing hooks, or credential helpers would
either break background snapshots or, worse, spawn interactive prompts
(pinentry GUI windows) mid-session every time a file is written.
Isolation strategy:
* ``GIT_CONFIG_GLOBAL=<os.devnull>`` ignore ``~/.gitconfig`` (git 2.32+).
* ``GIT_CONFIG_SYSTEM=<os.devnull>`` ignore ``/etc/gitconfig`` (git 2.32+).
* ``GIT_CONFIG_NOSYSTEM=1`` legacy belt-and-suspenders for older git.
The shadow repo still has its own per-repo config (user.email, user.name,
commit.gpgsign=false) set in ``_init_shadow_repo``.
"""
normalized_working_dir = _normalize_path(working_dir)
env = os.environ.copy()
env["GIT_DIR"] = str(shadow_repo)
@ -134,6 +149,13 @@ def _git_env(shadow_repo: Path, working_dir: str) -> dict:
env.pop("GIT_INDEX_FILE", None)
env.pop("GIT_NAMESPACE", None)
env.pop("GIT_ALTERNATE_OBJECT_DIRECTORIES", None)
# Isolate the shadow repo from the user's global/system git config.
# Prevents commit.gpgsign, hooks, aliases, credential helpers, etc. from
# leaking into background snapshots. Uses os.devnull for cross-platform
# support (``/dev/null`` on POSIX, ``nul`` on Windows).
env["GIT_CONFIG_GLOBAL"] = os.devnull
env["GIT_CONFIG_SYSTEM"] = os.devnull
env["GIT_CONFIG_NOSYSTEM"] = "1"
return env
@ -211,6 +233,13 @@ def _init_shadow_repo(shadow_repo: Path, working_dir: str) -> Optional[str]:
_run_git(["config", "user.email", "hermes@local"], shadow_repo, working_dir)
_run_git(["config", "user.name", "Hermes Checkpoint"], shadow_repo, working_dir)
# Explicitly disable commit/tag signing in the shadow repo. _git_env
# already isolates from the user's global config, but writing these into
# the shadow's own config is belt-and-suspenders — it guarantees the
# shadow repo is correct even if someone inspects or runs git against it
# directly (without the GIT_CONFIG_* env vars).
_run_git(["config", "commit.gpgsign", "false"], shadow_repo, working_dir)
_run_git(["config", "tag.gpgSign", "false"], shadow_repo, working_dir)
info_dir = shadow_repo / "info"
info_dir.mkdir(exist_ok=True)
@ -552,9 +581,11 @@ class CheckpointManager:
logger.debug("Checkpoint skipped: no changes in %s", working_dir)
return False
# Commit
# Commit. ``--no-gpg-sign`` inline covers shadow repos created before
# the commit.gpgsign=false config was added to _init_shadow_repo — so
# users with existing checkpoints never hit a GPG pinentry popup.
ok, _, err = _run_git(
["commit", "-m", reason, "--allow-empty-message"],
["commit", "-m", reason, "--allow-empty-message", "--no-gpg-sign"],
shadow, working_dir, timeout=_GIT_TIMEOUT * 2,
)
if not ok:

View file

@ -2,12 +2,13 @@
"""
Text-to-Speech Tool Module
Supports six TTS providers:
Supports seven TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
- Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats:
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
DEFAULT_XAI_SAMPLE_RATE = 24000
DEFAULT_XAI_BIT_RATE = 128000
DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
DEFAULT_GEMINI_TTS_VOICE = "Kore"
DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
# PCM output specs for Gemini TTS (fixed by the API)
GEMINI_TTS_SAMPLE_RATE = 24000
GEMINI_TTS_CHANNELS = 1
GEMINI_TTS_SAMPLE_WIDTH = 2 # 16-bit PCM (L16)
def _get_default_output_dir() -> str:
from hermes_constants import get_hermes_dir
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
return output_path
# ===========================================================================
# Provider: Google Gemini TTS
# ===========================================================================
def _wrap_pcm_as_wav(
pcm_bytes: bytes,
sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
channels: int = GEMINI_TTS_CHANNELS,
sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
) -> bytes:
"""Wrap raw signed-little-endian PCM with a standard WAV RIFF header.
Gemini TTS returns audio/L16;codec=pcm;rate=24000 -- raw PCM samples with
no container. We add a minimal WAV header so the file is playable and
ffmpeg can re-encode it to MP3/Opus downstream.
"""
import struct
byte_rate = sample_rate * channels * sample_width
block_align = channels * sample_width
data_size = len(pcm_bytes)
fmt_chunk = struct.pack(
"<4sIHHIIHH",
b"fmt ",
16, # fmt chunk size (PCM)
1, # audio format (PCM)
channels,
sample_rate,
byte_rate,
block_align,
sample_width * 8,
)
data_chunk_header = struct.pack("<4sI", b"data", data_size)
riff_size = 4 + len(fmt_chunk) + len(data_chunk_header) + data_size
riff_header = struct.pack("<4sI4s", b"RIFF", riff_size, b"WAVE")
return riff_header + fmt_chunk + data_chunk_header + pcm_bytes
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate audio using Google Gemini TTS.
Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
the caller requested those formats (same pattern as NeuTTS).
Args:
text: Text to convert (prompt-style; supports inline direction like
"Say cheerfully:" and audio tags like [whispers]).
output_path: Where to save the audio file (.wav, .mp3, or .ogg).
tts_config: TTS config dict.
Returns:
Path to the saved audio file.
"""
import requests
api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
if not api_key:
raise ValueError(
"GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
)
gemini_config = tts_config.get("gemini", {})
model = str(gemini_config.get("model", DEFAULT_GEMINI_TTS_MODEL)).strip() or DEFAULT_GEMINI_TTS_MODEL
voice = str(gemini_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)).strip() or DEFAULT_GEMINI_TTS_VOICE
base_url = str(
gemini_config.get("base_url")
or os.getenv("GEMINI_BASE_URL")
or DEFAULT_GEMINI_TTS_BASE_URL
).strip().rstrip("/")
payload: Dict[str, Any] = {
"contents": [{"parts": [{"text": text}]}],
"generationConfig": {
"responseModalities": ["AUDIO"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {"voiceName": voice},
},
},
},
}
endpoint = f"{base_url}/models/{model}:generateContent"
response = requests.post(
endpoint,
params={"key": api_key},
headers={"Content-Type": "application/json"},
json=payload,
timeout=60,
)
if response.status_code != 200:
# Surface the API error message when present
try:
err = response.json().get("error", {})
detail = err.get("message") or response.text[:300]
except Exception:
detail = response.text[:300]
raise RuntimeError(
f"Gemini TTS API error (HTTP {response.status_code}): {detail}"
)
try:
data = response.json()
parts = data["candidates"][0]["content"]["parts"]
audio_part = next((p for p in parts if "inlineData" in p or "inline_data" in p), None)
if audio_part is None:
raise RuntimeError("Gemini TTS response contained no audio data")
inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
audio_b64 = inline.get("data", "")
except (KeyError, IndexError, TypeError) as e:
raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e
if not audio_b64:
raise RuntimeError("Gemini TTS returned empty audio data")
pcm_bytes = base64.b64decode(audio_b64)
wav_bytes = _wrap_pcm_as_wav(pcm_bytes)
# Fast path: caller wants WAV directly, just write.
if output_path.lower().endswith(".wav"):
with open(output_path, "wb") as f:
f.write(wav_bytes)
return output_path
# Otherwise write WAV to a temp file and ffmpeg-convert to the target
# format (.mp3 or .ogg). If ffmpeg is missing, fall back to renaming the
# WAV -- this matches the NeuTTS behavior and keeps the tool usable on
# systems without ffmpeg (audio still plays, just with a misleading
# extension).
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
tmp.write(wav_bytes)
wav_path = tmp.name
try:
ffmpeg = shutil.which("ffmpeg")
if ffmpeg:
# For .ogg output, force libopus encoding (Telegram voice bubbles
# require Opus specifically; ffmpeg's default for .ogg is Vorbis).
if output_path.lower().endswith(".ogg"):
cmd = [
ffmpeg, "-i", wav_path,
"-acodec", "libopus", "-ac", "1",
"-b:a", "64k", "-vbr", "off",
"-y", "-loglevel", "error",
output_path,
]
else:
cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
result = subprocess.run(cmd, capture_output=True, timeout=30)
if result.returncode != 0:
stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
else:
logger.warning(
"ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
output_path,
)
shutil.copyfile(wav_path, output_path)
finally:
try:
os.remove(wav_path)
except OSError:
pass
return output_path
# ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli)
# ===========================================================================
@ -634,7 +810,7 @@ def text_to_speech_tool(
out_dir.mkdir(parents=True, exist_ok=True)
# Use .ogg for Telegram with providers that support native Opus output,
# otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
if want_opus and provider in ("openai", "elevenlabs", "mistral"):
if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
file_path = out_dir / f"tts_{timestamp}.ogg"
else:
file_path = out_dir / f"tts_{timestamp}.mp3"
@ -687,6 +863,10 @@ def text_to_speech_tool(
logger.info("Generating speech with Mistral Voxtral TTS...")
_generate_mistral_tts(text, file_str, tts_config)
elif provider == "gemini":
logger.info("Generating speech with Google Gemini TTS...")
_generate_gemini_tts(text, file_str, tts_config)
elif provider == "neutts":
if not _check_neutts_available():
return json.dumps({
@ -741,7 +921,7 @@ def text_to_speech_tool(
if opus_path:
file_str = opus_path
voice_compatible = True
elif provider in ("elevenlabs", "openai", "mistral"):
elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
voice_compatible = file_str.endswith(".ogg")
file_size = os.path.getsize(file_str)
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
return True
if os.getenv("XAI_API_KEY"):
return True
if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
return True
try:
_import_mistral_client()
if os.getenv("MISTRAL_API_KEY"):