Merge branch 'main' of github.com:NousResearch/hermes-agent into feat/ink-refactor

2026-04-25 00:51:20 +00:00 · 2026-04-16 18:25:49 -05:00 · 2026-04-16 18:25:49 -05:00 · 3746c60439
commit 3746c60439
parent 727f0eaf74 edefec4e68
18 changed files with 1723 additions and 377 deletions
--- a/tools/approval.py
+++ b/tools/approval.py
@ -14,6 +14,7 @@ import os
 import re
 import sys
 import threading
+import time
 import unicodedata
 from typing import Optional

@ -834,13 +835,43 @@ def check_all_command_guards(command: str, env_type: str,
                    "description": combined_desc,
                }

-            # Block until the user responds or timeout (default 5 min)
+            # Block until the user responds or timeout (default 5 min).
+            # Poll in short slices so we can fire activity heartbeats every
+            # ~10s to the agent's inactivity tracker.  Without this, the
+            # blocking event.wait() never touches activity, and the
+            # gateway's inactivity watchdog (agent.gateway_timeout, default
+            # 1800s) kills the agent while the user is still responding to
+            # the approval prompt.  Mirrors the _wait_for_process() cadence
+            # in tools/environments/base.py.
            timeout = _get_approval_config().get("gateway_timeout", 300)
            try:
                timeout = int(timeout)
            except (ValueError, TypeError):
                timeout = 300
-            resolved = entry.event.wait(timeout=timeout)
+
+            try:
+                from tools.environments.base import touch_activity_if_due
+            except Exception:  # pragma: no cover
+                touch_activity_if_due = None
+
+            _now = time.monotonic()
+            _deadline = _now + max(timeout, 0)
+            _activity_state = {"last_touch": _now, "start": _now}
+            resolved = False
+            while True:
+                _remaining = _deadline - time.monotonic()
+                if _remaining <= 0:
+                    break
+                # 1s poll slice — the event is set immediately when the
+                # user responds, so slice length only controls heartbeat
+                # cadence, not user-visible responsiveness.
+                if entry.event.wait(timeout=min(1.0, _remaining)):
+                    resolved = True
+                    break
+                if touch_activity_if_due is not None:
+                    touch_activity_if_due(
+                        _activity_state, "waiting for user approval"
+                    )

            # Clean up this entry from the queue
            with _lock:
--- a/tools/checkpoint_manager.py
+++ b/tools/checkpoint_manager.py
@ -126,7 +126,22 @@ def _shadow_repo_path(working_dir: str) -> Path:


 def _git_env(shadow_repo: Path, working_dir: str) -> dict:
-    """Build env dict that redirects git to the shadow repo."""
+    """Build env dict that redirects git to the shadow repo.
+
+    The shadow repo is internal Hermes infrastructure — it must NOT inherit
+    the user's global or system git config.  User-level settings like
+    ``commit.gpgsign = true``, signing hooks, or credential helpers would
+    either break background snapshots or, worse, spawn interactive prompts
+    (pinentry GUI windows) mid-session every time a file is written.
+
+    Isolation strategy:
+    * ``GIT_CONFIG_GLOBAL=<os.devnull>`` — ignore ``~/.gitconfig`` (git 2.32+).
+    * ``GIT_CONFIG_SYSTEM=<os.devnull>`` — ignore ``/etc/gitconfig`` (git 2.32+).
+    * ``GIT_CONFIG_NOSYSTEM=1`` — legacy belt-and-suspenders for older git.
+
+    The shadow repo still has its own per-repo config (user.email, user.name,
+    commit.gpgsign=false) set in ``_init_shadow_repo``.
+    """
    normalized_working_dir = _normalize_path(working_dir)
    env = os.environ.copy()
    env["GIT_DIR"] = str(shadow_repo)
@ -134,6 +149,13 @@ def _git_env(shadow_repo: Path, working_dir: str) -> dict:
    env.pop("GIT_INDEX_FILE", None)
    env.pop("GIT_NAMESPACE", None)
    env.pop("GIT_ALTERNATE_OBJECT_DIRECTORIES", None)
+    # Isolate the shadow repo from the user's global/system git config.
+    # Prevents commit.gpgsign, hooks, aliases, credential helpers, etc. from
+    # leaking into background snapshots.  Uses os.devnull for cross-platform
+    # support (``/dev/null`` on POSIX, ``nul`` on Windows).
+    env["GIT_CONFIG_GLOBAL"] = os.devnull
+    env["GIT_CONFIG_SYSTEM"] = os.devnull
+    env["GIT_CONFIG_NOSYSTEM"] = "1"
    return env


@ -211,6 +233,13 @@ def _init_shadow_repo(shadow_repo: Path, working_dir: str) -> Optional[str]:

    _run_git(["config", "user.email", "hermes@local"], shadow_repo, working_dir)
    _run_git(["config", "user.name", "Hermes Checkpoint"], shadow_repo, working_dir)
+    # Explicitly disable commit/tag signing in the shadow repo.  _git_env
+    # already isolates from the user's global config, but writing these into
+    # the shadow's own config is belt-and-suspenders — it guarantees the
+    # shadow repo is correct even if someone inspects or runs git against it
+    # directly (without the GIT_CONFIG_* env vars).
+    _run_git(["config", "commit.gpgsign", "false"], shadow_repo, working_dir)
+    _run_git(["config", "tag.gpgSign", "false"], shadow_repo, working_dir)

    info_dir = shadow_repo / "info"
    info_dir.mkdir(exist_ok=True)
@ -552,9 +581,11 @@ class CheckpointManager:
            logger.debug("Checkpoint skipped: no changes in %s", working_dir)
            return False

-        # Commit
+        # Commit.  ``--no-gpg-sign`` inline covers shadow repos created before
+        # the commit.gpgsign=false config was added to _init_shadow_repo — so
+        # users with existing checkpoints never hit a GPG pinentry popup.
        ok, _, err = _run_git(
-            ["commit", "-m", reason, "--allow-empty-message"],
+            ["commit", "-m", reason, "--allow-empty-message", "--no-gpg-sign"],
            shadow, working_dir, timeout=_GIT_TIMEOUT * 2,
        )
        if not ok:
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -2,12 +2,13 @@
 """
 Text-to-Speech Tool Module

-Supports six TTS providers:
+Supports seven TTS providers:
 - Edge TTS (default, free, no API key): Microsoft Edge neural voices
 - ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
 - OpenAI TTS: Good quality, needs OPENAI_API_KEY
 - MiniMax TTS: High-quality with voice cloning, needs MINIMAX_API_KEY
 - Mistral (Voxtral TTS): Multilingual, native Opus, needs MISTRAL_API_KEY
+- Google Gemini TTS: Controllable, 30 prebuilt voices, needs GEMINI_API_KEY
 - NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed

 Output formats:
@ -99,6 +100,13 @@ DEFAULT_XAI_LANGUAGE = "en"
 DEFAULT_XAI_SAMPLE_RATE = 24000
 DEFAULT_XAI_BIT_RATE = 128000
 DEFAULT_XAI_BASE_URL = "https://api.x.ai/v1"
+DEFAULT_GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"
+DEFAULT_GEMINI_TTS_VOICE = "Kore"
+DEFAULT_GEMINI_TTS_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"
+# PCM output specs for Gemini TTS (fixed by the API)
+GEMINI_TTS_SAMPLE_RATE = 24000
+GEMINI_TTS_CHANNELS = 1
+GEMINI_TTS_SAMPLE_WIDTH = 2  # 16-bit PCM (L16)

 def _get_default_output_dir() -> str:
    from hermes_constants import get_hermes_dir
@ -506,6 +514,174 @@ def _generate_mistral_tts(text: str, output_path: str, tts_config: Dict[str, Any
    return output_path


+# ===========================================================================
+# Provider: Google Gemini TTS
+# ===========================================================================
+def _wrap_pcm_as_wav(
+    pcm_bytes: bytes,
+    sample_rate: int = GEMINI_TTS_SAMPLE_RATE,
+    channels: int = GEMINI_TTS_CHANNELS,
+    sample_width: int = GEMINI_TTS_SAMPLE_WIDTH,
+) -> bytes:
+    """Wrap raw signed-little-endian PCM with a standard WAV RIFF header.
+
+    Gemini TTS returns audio/L16;codec=pcm;rate=24000 (raw PCM, no container);
+    the header makes it playable and lets ffmpeg re-encode it downstream.
+
+    Args:
+        pcm_bytes: Raw PCM sample data.
+        sample_rate: Samples per second, per channel.
+        channels: Number of interleaved channels.
+        sample_width: Bytes per sample (2 means 16-bit).
+
+    Returns:
+        The WAV bytes; an odd-length payload gets the pad byte RIFF requires.
+    """
+    import struct
+
+    data_size = len(pcm_bytes)
+    pad = b"\x00" * (data_size % 2)  # counted in the RIFF size, not data_size
+    block_align = channels * sample_width
+    return struct.pack(
+        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF", 36 + data_size + len(pad), b"WAVE",  # 36 = header after size
+        b"fmt ", 16, 1, channels, sample_rate,  # 16-byte fmt chunk, 1 = PCM
+        sample_rate * block_align, block_align, sample_width * 8,
+        b"data", data_size,
+    ) + pcm_bytes + pad
+
+
+def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
+    """Generate audio using Google Gemini TTS.
+
+    Gemini's generateContent endpoint with responseModalities=["AUDIO"] returns
+    raw 24kHz mono 16-bit PCM (L16) as base64. We wrap it with a WAV RIFF
+    header to produce a playable file, then ffmpeg-convert to MP3 / Opus if
+    the caller requested those formats (same pattern as NeuTTS).
+
+    Args:
+        text: Text to convert (prompt-style; supports inline direction like
+              "Say cheerfully:" and audio tags like [whispers]).
+        output_path: Where to save the audio file (.wav, .mp3, or .ogg).
+        tts_config: TTS config dict; its optional ``gemini`` section may set
+            ``model``, ``voice`` and ``base_url``.
+
+    Returns:
+        Path to the saved audio file.
+
+    Raises:
+        ValueError: If neither GEMINI_API_KEY nor GOOGLE_API_KEY is set.
+        RuntimeError: If the API returns an error, the response carries no
+            decodable audio, or ffmpeg exits with a non-zero status.
+    """
+    import requests
+
+    api_key = (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY") or "").strip()
+    if not api_key:
+        raise ValueError(
+            "GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
+        )
+
+    # ``or`` also covers keys that are present but null/empty in the config.
+    gemini_config = tts_config.get("gemini") or {}
+    model = str(gemini_config.get("model") or DEFAULT_GEMINI_TTS_MODEL).strip() or DEFAULT_GEMINI_TTS_MODEL
+    voice = str(gemini_config.get("voice") or DEFAULT_GEMINI_TTS_VOICE).strip() or DEFAULT_GEMINI_TTS_VOICE
+    base_url = str(
+        gemini_config.get("base_url")
+        or os.getenv("GEMINI_BASE_URL")
+        or DEFAULT_GEMINI_TTS_BASE_URL
+    ).strip().rstrip("/")
+
+    payload: Dict[str, Any] = {
+        "contents": [{"parts": [{"text": text}]}],
+        "generationConfig": {
+            "responseModalities": ["AUDIO"],
+            "speechConfig": {
+                "voiceConfig": {
+                    "prebuiltVoiceConfig": {"voiceName": voice},
+                },
+            },
+        },
+    }
+
+    # The key goes in a header, not ``?key=``: requests puts the URL in its
+    # connection/timeout error messages, which would leak the key into logs.
+    endpoint = f"{base_url}/models/{model}:generateContent"
+    response = requests.post(
+        endpoint,
+        headers={"Content-Type": "application/json", "x-goog-api-key": api_key},
+        json=payload,
+        timeout=60,
+    )
+    if response.status_code != 200:
+        # Surface the API error message when present
+        try:
+            err = response.json().get("error", {})
+            detail = err.get("message") or response.text[:300]
+        except Exception:
+            detail = response.text[:300]
+        raise RuntimeError(f"Gemini TTS API error (HTTP {response.status_code}): {detail}")
+
+    try:
+        data = response.json()
+        parts = data["candidates"][0]["content"]["parts"]
+        audio_part = next((p for p in parts if "inlineData" in p or "inline_data" in p), None)
+        if audio_part is None:
+            raise RuntimeError("Gemini TTS response contained no audio data")
+        inline = audio_part.get("inlineData") or audio_part.get("inline_data") or {}
+        audio_b64 = inline.get("data", "")
+        if not audio_b64:
+            raise RuntimeError("Gemini TTS returned empty audio data")
+        pcm_bytes = base64.b64decode(audio_b64)
+    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
+        # ValueError covers non-JSON bodies and invalid base64 (binascii.Error).
+        raise RuntimeError(f"Gemini TTS response was malformed: {e}") from e
+
+    # Write the WAV directly when WAV was requested or ffmpeg is missing; the
+    # latter keeps the tool usable (audio plays, extension is misleading).
+    wav_bytes = _wrap_pcm_as_wav(pcm_bytes)
+    wants_wav = output_path.lower().endswith(".wav")
+    ffmpeg = None if wants_wav else shutil.which("ffmpeg")
+    if not ffmpeg:
+        if not wants_wav:
+            logger.warning(
+                "ffmpeg not found; writing raw WAV to %s (extension may be misleading)",
+                output_path,
+            )
+        with open(output_path, "wb") as f:
+            f.write(wav_bytes)
+        return output_path
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        tmp.write(wav_bytes)
+        wav_path = tmp.name
+
+    try:
+        # For .ogg output, force libopus encoding (Telegram voice bubbles
+        # require Opus specifically; ffmpeg's default for .ogg is Vorbis).
+        if output_path.lower().endswith(".ogg"):
+            cmd = [
+                ffmpeg, "-i", wav_path,
+                "-acodec", "libopus", "-ac", "1",
+                "-b:a", "64k", "-vbr", "off",
+                "-y", "-loglevel", "error",
+                output_path,
+            ]
+        else:
+            cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
+        result = subprocess.run(cmd, capture_output=True, timeout=30)
+        if result.returncode != 0:
+            stderr = result.stderr.decode("utf-8", errors="ignore")[:300]
+            raise RuntimeError(f"ffmpeg conversion failed: {stderr}")
+    finally:
+        try:
+            os.remove(wav_path)
+        except OSError:
+            pass
+
+    return output_path
+
+
 # ===========================================================================
 # NeuTTS (local, on-device TTS via neutts_cli)
 # ===========================================================================
@ -634,7 +810,7 @@ def text_to_speech_tool(
        out_dir.mkdir(parents=True, exist_ok=True)
        # Use .ogg for Telegram with providers that support native Opus output,
        # otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
-        if want_opus and provider in ("openai", "elevenlabs", "mistral"):
+        if want_opus and provider in ("openai", "elevenlabs", "mistral", "gemini"):
            file_path = out_dir / f"tts_{timestamp}.ogg"
        else:
            file_path = out_dir / f"tts_{timestamp}.mp3"
@ -687,6 +863,10 @@ def text_to_speech_tool(
            logger.info("Generating speech with Mistral Voxtral TTS...")
            _generate_mistral_tts(text, file_str, tts_config)

+        elif provider == "gemini":
+            logger.info("Generating speech with Google Gemini TTS...")
+            _generate_gemini_tts(text, file_str, tts_config)
+
        elif provider == "neutts":
            if not _check_neutts_available():
                return json.dumps({
@ -741,7 +921,7 @@ def text_to_speech_tool(
            if opus_path:
                file_str = opus_path
                voice_compatible = True
-        elif provider in ("elevenlabs", "openai", "mistral"):
+        elif provider in ("elevenlabs", "openai", "mistral", "gemini"):
            voice_compatible = file_str.endswith(".ogg")

        file_size = os.path.getsize(file_str)
@ -811,6 +991,8 @@ def check_tts_requirements() -> bool:
        return True
    if os.getenv("XAI_API_KEY"):
        return True
+    if os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY"):
+        return True
    try:
        _import_mistral_client()
        if os.getenv("MISTRAL_API_KEY"):