From 09a491464c5fa10da01d33cd810e3ec2cc4241be Mon Sep 17 00:00:00 2001 From: Austin Pickett Date: Wed, 6 May 2026 11:58:53 -0400 Subject: [PATCH 001/230] feat(tui): add /sessions slash command for browsing and resuming previous sessions --- hermes_cli/commands.py | 3 +++ ui-tui/src/app/slash/commands/session.ts | 13 +++++++++++++ 2 files changed, 16 insertions(+) diff --git a/hermes_cli/commands.py b/hermes_cli/commands.py index 2cf2c3e9f40..b82cc2b4fc9 100644 --- a/hermes_cli/commands.py +++ b/hermes_cli/commands.py @@ -109,6 +109,9 @@ COMMAND_REGISTRY: list[CommandDef] = [ CommandDef("resume", "Resume a previously-named session", "Session", args_hint="[name]"), + # Configuration + CommandDef("sessions", "Browse and resume previous sessions", "Session"), + # Configuration CommandDef("config", "Show current configuration", "Configuration", cli_only=True), diff --git a/ui-tui/src/app/slash/commands/session.ts b/ui-tui/src/app/slash/commands/session.ts index 9dddd853726..ce9315ddb48 100644 --- a/ui-tui/src/app/slash/commands/session.ts +++ b/ui-tui/src/app/slash/commands/session.ts @@ -92,6 +92,19 @@ export const sessionCommands: SlashCommand[] = [ } }, + { + help: 'browse and resume previous sessions', + name: 'sessions', + run: (arg, ctx) => { + if (ctx.session.guardBusySessionSwitch('switch sessions')) { + return + } + if (!arg.trim()) { + return patchOverlayState({ picker: true }) + } + } + }, + { help: 'attach an image', name: 'image', From f4031df05dd457ad6ae17aff6a89848384447013 Mon Sep 17 00:00:00 2001 From: ethernet Date: Wed, 6 May 2026 15:53:47 -0400 Subject: [PATCH 002/230] ci(docker): don't cancel overlapping builds, guard :latest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch top-level concurrency to cancel-in-progress=false so every push to main gets its own SHA-tagged image published — no more discarded builds when commits land back-to-back. Guard the :latest tag with a second job that has its own concurrency group with cancel-in-progress=true plus a git-ancestor check against the revision label on the current :latest. Together these guarantee :latest only ever moves forward in history: a slower run whose commit isn't a descendant of the current :latest refuses to clobber it, and a newer push mid-way through the move-latest job preempts the older one before it can retag. - Every main push publishes nousresearch/hermes-agent:sha- with an org.opencontainers.image.revision label embedded. - move-latest job reads that label off :latest, runs merge-base --is-ancestor, and only retags (via buildx imagetools create, registry-side, no rebuild) if our commit strictly descends. - fetch-depth bumped to 1000 so merge-base has the history it needs. - Release tag flow unchanged (unique tag, no race). --- .github/workflows/docker-publish.yml | 145 ++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 228ee339646..7fb10b3dfbf 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -16,9 +16,13 @@ on: permissions: contents: read +# Top-level concurrency: do NOT cancel in-flight builds when a new push lands. +# Every commit deserves its own SHA-tagged image in the registry, and we guard +# the :latest tag in a separate job below (with its own concurrency group) so +# a slow run can't clobber :latest with older bits. concurrency: group: docker-${{ github.ref }} - cancel-in-progress: true + cancel-in-progress: false jobs: build-and-push: @@ -26,11 +30,18 @@ jobs: if: github.repository == 'NousResearch/hermes-agent' runs-on: ubuntu-latest timeout-minutes: 60 + outputs: + pushed_sha_tag: ${{ steps.mark_pushed.outputs.pushed }} steps: - name: Checkout code uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 with: submodules: recursive + # Fetch enough history to run `git merge-base --is-ancestor` in the + # move-latest job. That job reuses this checkout via its own + # actions/checkout call, but commits reachable from main up to ~1000 + # back are plenty for any realistic race window. + fetch-depth: 1000 - name: Set up QEMU uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3 @@ -74,7 +85,12 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Push multi-arch image (main branch) + # Always push a per-commit SHA tag on main. This is race-free because + # every commit has a unique SHA — concurrent runs can't clobber each + # other here. We also embed the git SHA as an OCI label so the + # move-latest job (below) can read it back off the registry's `:latest`. + - name: Push multi-arch image with SHA tag (main branch) + id: push_sha if: github.event_name == 'push' && github.ref == 'refs/heads/main' uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6 with: @@ -82,10 +98,17 @@ jobs: file: Dockerfile push: true platforms: linux/amd64,linux/arm64 - tags: nousresearch/hermes-agent:latest + tags: nousresearch/hermes-agent:sha-${{ github.sha }} + labels: | + org.opencontainers.image.revision=${{ github.sha }} cache-from: type=gha cache-to: type=gha,mode=max + - name: Mark SHA tag pushed + id: mark_pushed + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + run: echo "pushed=true" >> "$GITHUB_OUTPUT" + - name: Push multi-arch image (release) if: github.event_name == 'release' uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6 @@ -97,3 +120,119 @@ jobs: tags: nousresearch/hermes-agent:${{ github.event.release.tag_name }} cache-from: type=gha cache-to: type=gha,mode=max + + # Second job: moves `:latest` to point at the SHA tag the first job pushed. + # + # Has its own concurrency group with `cancel-in-progress: true`, which + # gives us the serialization we need: if a newer push arrives while an + # older run is mid-way through this job, the older run is cancelled + # before it can clobber `:latest`. Combined with the ancestor check + # below, this means `:latest` only ever moves forward in git history. + move-latest: + if: | + github.repository == 'NousResearch/hermes-agent' + && github.event_name == 'push' + && github.ref == 'refs/heads/main' + && needs.build-and-push.outputs.pushed_sha_tag == 'true' + needs: build-and-push + runs-on: ubuntu-latest + timeout-minutes: 10 + concurrency: + group: docker-move-latest-${{ github.ref }} + cancel-in-progress: true + steps: + - name: Checkout code + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + with: + fetch-depth: 1000 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3 + + - name: Log in to Docker Hub + uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + # Read the git revision label off the current `:latest` manifest, then + # use `git merge-base --is-ancestor` to check whether our commit is a + # descendant of it. If `:latest` doesn't exist yet, or its label is + # missing, we treat that as "safe to publish". If another run already + # advanced `:latest` past us (or diverged), we skip and leave it alone. + - name: Decide whether to move :latest + id: latest_check + run: | + set -euo pipefail + image=nousresearch/hermes-agent + + # Pull the JSON for the linux/amd64 sub-manifest's config and extract + # the OCI revision label with jq — Go template field access can't + # handle dots in map keys, so using json+jq is the robust route. + image_json=$( + docker buildx imagetools inspect "${image}:latest" \ + --format '{{ json (index .Image "linux/amd64") }}' \ + 2>/dev/null || true + ) + + if [ -z "${image_json}" ]; then + echo "No existing :latest (or inspect failed) — safe to publish." + echo "push_latest=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + current_sha=$( + printf '%s' "${image_json}" \ + | jq -r '.config.Labels."org.opencontainers.image.revision" // ""' + ) + + if [ -z "${current_sha}" ]; then + echo "Registry :latest has no revision label — safe to publish." + echo "push_latest=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + + echo "Registry :latest is at ${current_sha}" + echo "This run is at ${GITHUB_SHA}" + + if [ "${current_sha}" = "${GITHUB_SHA}" ]; then + echo ":latest already points at our SHA — nothing to do." + echo "push_latest=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Make sure we have the :latest commit locally for merge-base. + if ! git cat-file -e "${current_sha}^{commit}" 2>/dev/null; then + git fetch --no-tags --prune origin \ + "+refs/heads/main:refs/remotes/origin/main" \ + || true + fi + + if ! git cat-file -e "${current_sha}^{commit}" 2>/dev/null; then + echo "Registry :latest points at an unknown commit (${current_sha}); refusing to overwrite." + echo "push_latest=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Our SHA must be a descendant of the current :latest to be safe. + if git merge-base --is-ancestor "${current_sha}" "${GITHUB_SHA}"; then + echo "Our commit is a descendant of :latest — safe to advance." + echo "push_latest=true" >> "$GITHUB_OUTPUT" + else + echo "Another run advanced :latest past us (or diverged) — leaving it alone." + echo "push_latest=false" >> "$GITHUB_OUTPUT" + fi + + # Retag the already-pushed SHA manifest as :latest. This is a registry- + # side operation — no rebuild, no layer re-push — so it's quick and + # atomic per-tag. The ancestor check above plus the cancel-in-progress + # concurrency on this job together guarantee we only ever move :latest + # forward in git history. + - name: Move :latest to this SHA + if: steps.latest_check.outputs.push_latest == 'true' + run: | + set -euo pipefail + image=nousresearch/hermes-agent + docker buildx imagetools create \ + --tag "${image}:latest" \ + "${image}:sha-${GITHUB_SHA}" From d514dd40552c6747eb465a539d5991376125c709 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 6 May 2026 13:20:09 -0700 Subject: [PATCH 003/230] docs(tool-gateway): rewrite as pitch-first marketing page (#20827) Previous version read like internal API docs \u2014 leading with env var tables, config YAML, and 'precedence' rules before ever explaining the product. Complete rewrite inverts the structure so readers see value first, mechanics last. Structure now: - Lede: 'One subscription. Every tool built in.' + pitch paragraph - CTA: subscribe/manage button styled as a real call-to-action - What's included: emoji-led table with expanded descriptions per tool. Image gen lists all 9 models by name (FLUX 2 Klein/Pro, Z-Image Turbo, Nano Banana Pro, GPT Image 1.5/2, Ideogram V3, Recraft V4 Pro, Qwen) - Why it's here: value bullets \u2014 one bill, one signup, one key, same quality, bring-your-own anytime - Get started: two-command flow (hermes model \u2192 hermes status) - Eligibility: paid-tier note with upgrade link - Mix and match: three realistic usage patterns - Using individual image models: ID reference table for power users - --- separator --- - Configuration reference (demoted): use_gateway flag, disabling, self-hosted gateway env vars moved below the fold where they belong - FAQ: streamlined, removed redundant content Fact-checked against code: - 9 FAL models confirmed from tools/image_generation_tool.py FAL_MODELS - Status section output verified against hermes_cli/status.py - Portal subscription URL preserved - Self-hosted env vars (TOOL_GATEWAY_DOMAIN etc.) kept accurate Verified: docusaurus build SUCCESS, page renders, no new broken links. --- .../docs/user-guide/features/tool-gateway.md | 215 +++++++++--------- 1 file changed, 102 insertions(+), 113 deletions(-) diff --git a/website/docs/user-guide/features/tool-gateway.md b/website/docs/user-guide/features/tool-gateway.md index 5d702e6f9f7..91a560b92e6 100644 --- a/website/docs/user-guide/features/tool-gateway.md +++ b/website/docs/user-guide/features/tool-gateway.md @@ -1,80 +1,116 @@ --- title: "Nous Tool Gateway" -description: "Route web search, image generation, text-to-speech, and browser automation through your Nous subscription — no extra API keys needed" +description: "One subscription, every tool. Web search, image generation, TTS, and cloud browsers — all routed through Nous Portal with no extra API keys." sidebar_label: "Tool Gateway" sidebar_position: 2 --- # Nous Tool Gateway -:::tip Get Started -The Tool Gateway is included with paid Nous Portal subscriptions. **[Manage your subscription →](https://portal.nousresearch.com/manage-subscription)** -::: +**One subscription. Every tool built in.** -The **Tool Gateway** lets paid [Nous Portal](https://portal.nousresearch.com) subscribers use web search, image generation, text-to-speech, and browser automation through their existing subscription — no need to sign up for separate API keys from Firecrawl, FAL, OpenAI, or Browser Use. +The Tool Gateway is included with every paid [Nous Portal](https://portal.nousresearch.com) subscription. It routes Hermes' tool calls — web search, image generation, text-to-speech, and cloud browser automation — through infrastructure Nous already runs, so you don't have to sign up with Firecrawl, FAL, OpenAI, Browser Use, or anyone else just to make your agent useful. -## What's Included +
+ Start or manage subscription → +
-| Tool | What It Does | Direct Alternative | -|------|--------------|--------------------| -| **Web search & extract** | Search the web and extract page content via Firecrawl | `FIRECRAWL_API_KEY`, `EXA_API_KEY`, `PARALLEL_API_KEY`, `TAVILY_API_KEY` | -| **Image generation** | Generate images via FAL (9 models: FLUX 2 Klein/Pro, GPT-Image 1.5/2, Nano Banana Pro, Ideogram V3, Recraft V4 Pro, Qwen, Z-Image Turbo) | `FAL_KEY` | -| **Text-to-speech** | Convert text to speech via OpenAI TTS | `VOICE_TOOLS_OPENAI_KEY`, `ELEVENLABS_API_KEY` | -| **Browser automation** | Control cloud browsers via Browser Use | `BROWSER_USE_API_KEY`, `BROWSERBASE_API_KEY` | +## What's included -All four tools bill to your Nous subscription. You can enable any combination — for example, use the gateway for web and image generation while keeping your own ElevenLabs key for TTS. +| | Tool | What you get | +|---|---|---| +| 🔍 | **Web search & extract** | Agent-grade web search and full-page extraction via Firecrawl. No rate limits to worry about — the gateway handles scaling. | +| 🎨 | **Image generation** | Nine models under one endpoint: **FLUX 2 Klein 9B**, **FLUX 2 Pro**, **Z-Image Turbo**, **Nano Banana Pro** (Gemini 3 Pro Image), **GPT Image 1.5**, **GPT Image 2**, **Ideogram V3**, **Recraft V4 Pro**, **Qwen Image**. Pick per-generation with a flag, or let Hermes default to FLUX 2 Klein. | +| 🔊 | **Text-to-speech** | OpenAI TTS voices wired into the `text_to_speech` tool. Drop voice notes into Telegram, generate audio for pipelines, narrate anything. | +| 🌐 | **Cloud browser automation** | Headless Chromium sessions via Browser Use. `browser_navigate`, `browser_click`, `browser_type`, `browser_vision` — all the agent-driving primitives, no Browserbase account required. | -## Eligibility +All four are pay-as-you-use billed against your Nous subscription. Use any combination — run the gateway for web and images while keeping your own ElevenLabs key for TTS, or route everything through Nous. -The Tool Gateway is available to **paid** [Nous Portal](https://portal.nousresearch.com/manage-subscription) subscribers. Free-tier accounts do not have access — [upgrade your subscription](https://portal.nousresearch.com/manage-subscription) to unlock it. +## Why it's here -To check your status: +Building an agent that can actually *do things* means stitching together 5+ API subscriptions — each with their own signup, rate limits, billing, and quirks. The gateway collapses that into one account: + +- **One bill.** Pay Nous; we handle the rest. +- **One signup.** No Firecrawl, FAL, Browser Use, or OpenAI audio accounts to manage. +- **One key.** Your Nous Portal OAuth covers every tool. +- **Same quality.** Same backends the direct-key route uses — just fronted by us. + +Bring your own keys anytime — per-tool, whenever you want to. The gateway isn't a lock-in, it's a shortcut. + +## Get started + +```bash +hermes model # Pick Nous Portal as your provider +``` + +When you select Nous Portal, Hermes offers to turn on the Tool Gateway. Accept, and you're done — every supported tool is live on the next run. + +Check what's active at any time: ```bash hermes status ``` -Look for the **Nous Tool Gateway** section. It shows which tools are active via the gateway, which use direct keys, and which aren't configured. - -## Enabling the Tool Gateway - -### During model setup - -When you run `hermes model` and select Nous Portal as your provider, Hermes automatically offers to enable the Tool Gateway: +You'll see a section like: ``` -Your Nous subscription includes the Tool Gateway. - - The Tool Gateway gives you access to web search, image generation, - text-to-speech, and browser automation through your Nous subscription. - No need to sign up for separate API keys — just pick the tools you want. - - ○ Web search & extract (Firecrawl) — not configured - ○ Image generation (FAL) — not configured - ○ Text-to-speech (OpenAI TTS) — not configured - ○ Browser automation (Browser Use) — not configured - - ● Enable Tool Gateway - ○ Skip +◆ Nous Tool Gateway + Nous Portal ✓ managed tools available + Web tools ✓ active via Nous subscription + Image gen ✓ active via Nous subscription + TTS ✓ active via Nous subscription + Browser ○ active via Browser Use key ``` -Select **Enable Tool Gateway** and you're done. +Tools marked "active via Nous subscription" are going through the gateway. Anything else is using your own keys. -If you already have direct API keys for some tools, the prompt adapts — you can enable the gateway for all tools (your existing keys are kept in `.env` but not used at runtime), enable only for unconfigured tools, or skip entirely. +## Eligibility -### Via `hermes tools` +The Tool Gateway is a **paid-subscription** feature. Free-tier Nous accounts can use Portal for inference but don't include managed tools — [upgrade your plan](https://portal.nousresearch.com/manage-subscription) to unlock the gateway. -You can also enable the gateway tool-by-tool through the interactive tool configuration: +## Mix and match + +The gateway is per-tool. Turn it on for just what you want: + +- **All tools through Nous** — easiest; one subscription, done. +- **Gateway for web + images, bring your own TTS** — keep your ElevenLabs voice, let Nous handle the rest. +- **Gateway only for things you don't have keys for** — "I already pay for Browserbase, but I don't want a Firecrawl account" works fine. + +Switch any tool at any time via: ```bash -hermes tools +hermes tools # Interactive picker for each tool category ``` -Select a tool category (Web, Browser, Image Generation, or TTS), then choose **Nous Subscription** as the provider. This sets `use_gateway: true` for that tool in your config. +Select the tool, pick **Nous Subscription** as the provider (or any direct provider you prefer). No config editing required. -### Manual configuration +## Using individual image models -Set the `use_gateway` flag directly in `~/.hermes/config.yaml`: +Image generation defaults to FLUX 2 Klein 9B for speed. Override per-call by passing the model ID to the `image_generate` tool: + +| Model | ID | Best for | +|---|---|---| +| FLUX 2 Klein 9B | `fal-ai/flux-2/klein/9b` | Fast, good default | +| FLUX 2 Pro | `fal-ai/flux-2/pro` | Higher fidelity FLUX | +| Z-Image Turbo | `fal-ai/z-image/turbo` | Stylized, fast | +| Nano Banana Pro | `fal-ai/gemini-3-pro-image` | Google Gemini 3 Pro Image | +| GPT Image 1.5 | `fal-ai/gpt-image-1/5` | OpenAI image gen, text+image | +| GPT Image 2 | `fal-ai/gpt-image-2` | OpenAI latest | +| Ideogram V3 | `fal-ai/ideogram/v3` | Strong prompt adherence + typography | +| Recraft V4 Pro | `fal-ai/recraft/v4/pro` | Vector-style, graphic design | +| Qwen Image | `fal-ai/qwen-image` | Alibaba multimodal | + +The set evolves — `hermes tools` → Image Generation shows the current live list. + +--- + +## Configuration reference + +Most users never need to touch this — `hermes model` and `hermes tools` cover every workflow interactively. This section is for writing config.yaml directly or scripting setups. + +### Per-tool `use_gateway` flag + +Each tool's config block takes a `use_gateway` boolean: ```yaml web: @@ -93,95 +129,48 @@ browser: use_gateway: true ``` -## How It Works +Precedence: `use_gateway: true` routes through Nous regardless of any direct keys in `.env`. `use_gateway: false` (or absent) uses direct keys if available and only falls back to the gateway when none exist. -When `use_gateway: true` is set for a tool, the runtime routes API calls through the Nous Tool Gateway instead of using direct API keys: - -1. **Web tools** — `web_search` and `web_extract` use the gateway's Firecrawl endpoint -2. **Image generation** — `image_generate` uses the gateway's FAL endpoint -3. **TTS** — `text_to_speech` uses the gateway's OpenAI Audio endpoint -4. **Browser** — `browser_navigate` and other browser tools use the gateway's Browser Use endpoint - -The gateway authenticates using your Nous Portal credentials (stored in `~/.hermes/auth.json` after `hermes model`). - -### Precedence - -Each tool checks `use_gateway` first: - -- **`use_gateway: true`** → route through the gateway, even if direct API keys exist in `.env` -- **`use_gateway: false`** (or absent) → use direct API keys if available, fall back to gateway only when no direct keys exist - -This means you can switch between gateway and direct keys at any time without deleting your `.env` credentials. - -## Switching Back to Direct Keys - -To stop using the gateway for a specific tool: - -```bash -hermes tools # Select the tool → choose a direct provider -``` - -Or set `use_gateway: false` in config: +### Disabling the gateway ```yaml web: - backend: firecrawl - use_gateway: false # Now uses FIRECRAWL_API_KEY from .env + use_gateway: false # Hermes now uses FIRECRAWL_API_KEY from .env ``` -When you select a non-gateway provider in `hermes tools`, the `use_gateway` flag is automatically set to `false` to prevent contradictory config. +`hermes tools` automatically clears the flag when you pick a non-gateway provider, so this usually happens for you. -## Checking Status +### Self-hosted gateway (advanced) + +Running your own Nous-compatible gateway? Override endpoints in `~/.hermes/.env`: ```bash -hermes status +TOOL_GATEWAY_DOMAIN=your-domain.example.com +TOOL_GATEWAY_SCHEME=https +TOOL_GATEWAY_USER_TOKEN=your-token # normally auto-populated from Portal login +FIRECRAWL_GATEWAY_URL=https://... # override one endpoint specifically ``` -The **Nous Tool Gateway** section shows: - -``` -◆ Nous Tool Gateway - Nous Portal ✓ managed tools available - Web tools ✓ active via Nous subscription - Image gen ✓ active via Nous subscription - TTS ✓ active via Nous subscription - Browser ○ active via Browser Use key - Modal ○ available via subscription (optional) -``` - -Tools marked "active via Nous subscription" are routed through the gateway. Tools with their own keys show which provider is active. - -## Advanced: Self-Hosted Gateway - -For self-hosted or custom gateway deployments, you can override the gateway endpoints via environment variables in `~/.hermes/.env`: - -```bash -TOOL_GATEWAY_DOMAIN=nousresearch.com # Base domain for gateway routing -TOOL_GATEWAY_SCHEME=https # HTTP or HTTPS (default: https) -TOOL_GATEWAY_USER_TOKEN=your-token # Auth token (normally auto-populated) -FIRECRAWL_GATEWAY_URL=https://... # Override for the Firecrawl endpoint specifically -``` - -These env vars are always visible in the configuration regardless of subscription status — they're useful for custom infrastructure setups. +These knobs exist for custom infrastructure setups (enterprise deployments, dev environments). Regular subscribers never set them. ## FAQ -### Do I need to delete my existing API keys? +### Does it work with Telegram / Discord / the other messaging gateways? -No. When `use_gateway: true` is set, the runtime skips direct API keys and routes through the gateway. Your keys stay in `.env` untouched. If you later disable the gateway, they'll be used again automatically. +Yes. Tool Gateway operates at the tool-execution layer, not the CLI. Every interface that can call a tool — CLI, Telegram, Discord, Slack, IRC, Teams, the API server, anything — benefits from it transparently. -### Can I use the gateway for some tools and direct keys for others? +### What happens if my subscription expires? -Yes. The `use_gateway` flag is per-tool. You can mix and match — for example, gateway for web and image generation, your own ElevenLabs key for TTS, and Browserbase for browser automation. +Tools routed through the gateway stop working until you renew or swap in direct API keys via `hermes tools`. Hermes shows a clear error pointing at the portal. -### What if my subscription expires? +### Can I see usage or costs per tool? -Tools that were routed through the gateway will stop working until you [renew your subscription](https://portal.nousresearch.com/manage-subscription) or switch to direct API keys via `hermes tools`. +Yes — the [Nous Portal dashboard](https://portal.nousresearch.com) breaks usage down by tool so you can see what's driving your bill. -### Does the gateway work with the messaging gateway? +### Is Modal (serverless terminal) included? -Yes. The Tool Gateway routes tool API calls regardless of whether you're using the CLI, Telegram, Discord, or any other messaging platform. It operates at the tool runtime level, not the entry point level. +Modal is available as an **optional add-on** through the Nous subscription, not part of the default Tool Gateway bundle. Configure it via `hermes setup terminal` or directly in `config.yaml` when you want a remote sandbox for shell execution. -### Is Modal included? +### Do I need to delete my existing API keys when I enable the gateway? -Modal (serverless terminal backend) is available as an optional add-on through the Nous subscription. It's not enabled by the Tool Gateway prompt — configure it separately via `hermes setup terminal` or in `config.yaml`. +No — keep them in `.env`. When `use_gateway: true`, Hermes skips direct keys and uses the gateway. Flip the flag back to `false` and your keys become the source again. The gateway isn't a lock-in. From 33bf5f6292f49f109f11fb9c035afae6dcd356e3 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 6 May 2026 09:07:32 -0700 Subject: [PATCH 004/230] fix(auth): fall back to global-root auth.json for providers missing in profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profile processes (kanban workers, cron subprocesses, delegated subagents) read the profile's auth.json only. If a provider was authenticated at the global root but not inside the profile, the profile's credential_pool comes back empty and the process fails with 'No LLM provider configured' — even though the credentials are sitting in ~/.hermes/auth.json. #18594 propagated HERMES_HOME correctly, which is what surfaced this: workers now land in the right profile, and the profile turns out to shadow global with no fallback. Semantics (read-only, per-provider shadowing): * Profile has any entries for provider X → use profile only (global ignored). * Profile has zero entries for provider X → fall back to global. * Writes (write_credential_pool, _save_auth_store) still target the profile. * Classic mode (HERMES_HOME == global root) skips the fallback entirely — _global_auth_file_path() returns None. Also mirrors the fallback in get_provider_auth_state so OAuth singletons (nous, minimax-oauth, openai-codex, spotify) inherit cleanly — the Nous shared-token store (PR #19712) remains the authoritative path for Nous OAuth rotation, this just makes the read side consistent with it. Seat belt: _load_global_auth_store() refuses to read the real user's ~/.hermes/auth.json under PYTEST_CURRENT_TEST even when HERMES_HOME points to a profile-shaped path. Guard uses $HOME (stable across fixtures) rather than Path.home() (which fixtures often monkeypatch to a tmp root). Reported by @SeedsForbidden on Twitter as the credential_pool shadowing follow-up to the #18594 fix. --- hermes_cli/auth.py | 128 ++++++- .../hermes_cli/test_auth_profile_fallback.py | 360 ++++++++++++++++++ 2 files changed, 483 insertions(+), 5 deletions(-) create mode 100644 tests/hermes_cli/test_auth_profile_fallback.py diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index 48abb1fa12f..5ff5638b91e 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -780,6 +780,73 @@ def _auth_file_path() -> Path: return path +def _global_auth_file_path() -> Optional[Path]: + """Return the global-root auth.json when the process is in profile mode. + + Returns ``None`` when the profile and global root resolve to the same + directory (classic mode, or custom HERMES_HOME that is not a profile). + Used by read-only fallback paths so providers authed at the root are + visible to profile processes that haven't configured them locally. + + See issue #18594 follow-up (credential_pool shadowing). + """ + try: + from hermes_constants import get_default_hermes_root + global_root = get_default_hermes_root() + except Exception: + return None + profile_home = get_hermes_home() + try: + if profile_home.resolve(strict=False) == global_root.resolve(strict=False): + return None + except Exception: + if profile_home == global_root: + return None + # No pytest seat belt here: this is a pure read-only path, and + # ``_load_global_auth_store()`` wraps the read in a try/except so an + # unreadable global file can never break the profile process. The + # write-side seat belt still lives on ``_auth_file_path()`` where it + # belongs (that's what protects the real user's auth store from being + # corrupted by a mis-configured test). + return global_root / "auth.json" + + +def _load_global_auth_store() -> Dict[str, Any]: + """Load the global-root auth store (read-only fallback). + + Returns an empty dict when no global fallback exists (classic mode, + or the global auth.json is absent). Never raises on missing file. + + Seat belt: under pytest, refuses to read the real user's + ``~/.hermes/auth.json`` even when HERMES_HOME is set to a profile + path. The hermetic conftest does not redirect ``HOME``, so + ``get_default_hermes_root()`` for a profile-shaped HERMES_HOME can + still resolve to the real user's home on a dev machine. That would + leak real credentials into tests. This guard uses the unmodified + ``HOME`` env var (what ``os.path.expanduser('~')`` would resolve to), + not ``Path.home()``, because ``Path.home`` is sometimes monkeypatched + by fixtures that want to relocate the global root to a tmp path. + """ + global_path = _global_auth_file_path() + if global_path is None or not global_path.exists(): + return {} + if os.environ.get("PYTEST_CURRENT_TEST"): + real_home_env = os.environ.get("HOME", "") + if real_home_env: + real_root = Path(real_home_env) / ".hermes" / "auth.json" + try: + if global_path.resolve(strict=False) == real_root.resolve(strict=False): + return {} + except Exception: + pass + try: + return _load_auth_store(global_path) + except Exception: + # A malformed global store must not break profile reads. The + # profile's own auth store is still authoritative. + return {} + + def _auth_lock_path() -> Path: return _auth_file_path().with_suffix(".lock") @@ -966,15 +1033,50 @@ def get_auth_provider_display_name(provider_id: str) -> str: def read_credential_pool(provider_id: Optional[str] = None) -> Dict[str, Any]: - """Return the persisted credential pool, or one provider slice.""" + """Return the persisted credential pool, or one provider slice. + + In profile mode, the profile's credential pool is authoritative. If a + provider has no entries in the profile, entries from the global-root + ``auth.json`` are used as a read-only fallback — so workers spawned in a + profile can see providers that were only authenticated at global scope. + + Profile entries always win: the global fallback only applies per-provider + when the profile has zero entries for that provider. Once the user runs + ``hermes auth add `` inside the profile, profile entries + fully shadow global for that provider on the next read. + + Writes always go to the profile (``write_credential_pool`` is unchanged). + See issue #18594 follow-up. + """ auth_store = _load_auth_store() pool = auth_store.get("credential_pool") if not isinstance(pool, dict): pool = {} + + global_pool: Dict[str, Any] = {} + global_store = _load_global_auth_store() + maybe_global_pool = global_store.get("credential_pool") if global_store else None + if isinstance(maybe_global_pool, dict): + global_pool = maybe_global_pool + if provider_id is None: - return dict(pool) + merged = dict(pool) + for gp_key, gp_entries in global_pool.items(): + if not isinstance(gp_entries, list) or not gp_entries: + continue + # Per-provider shadowing: profile wins whenever it has ANY entries. + existing = merged.get(gp_key) + if isinstance(existing, list) and existing: + continue + merged[gp_key] = list(gp_entries) + return merged + provider_entries = pool.get(provider_id) - return list(provider_entries) if isinstance(provider_entries, list) else [] + if isinstance(provider_entries, list) and provider_entries: + return list(provider_entries) + # Profile has no entries for this provider — fall back to global. + global_entries = global_pool.get(provider_id) + return list(global_entries) if isinstance(global_entries, list) else [] def write_credential_pool(provider_id: str, entries: List[Dict[str, Any]]) -> Path: @@ -1033,9 +1135,25 @@ def unsuppress_credential_source(provider_id: str, source: str) -> bool: def get_provider_auth_state(provider_id: str) -> Optional[Dict[str, Any]]: - """Return persisted auth state for a provider, or None.""" + """Return persisted auth state for a provider, or None. + + In profile mode, falls back to the global-root ``auth.json`` when the + profile has no state for this provider. Profile state always wins when + present. Writes (``_save_auth_store`` / ``persist_*_credentials``) are + unchanged — they still target the profile only. This mirrors + ``read_credential_pool``'s per-provider shadowing semantics so that + ``_seed_from_singletons`` can reseed a profile's credential pool from + global-scope provider state (e.g. a globally-authenticated Anthropic + OAuth or Nous device-code session). See issue #18594 follow-up. + """ auth_store = _load_auth_store() - return _load_provider_state(auth_store, provider_id) + state = _load_provider_state(auth_store, provider_id) + if state is not None: + return state + global_store = _load_global_auth_store() + if not global_store: + return None + return _load_provider_state(global_store, provider_id) def get_active_provider() -> Optional[str]: diff --git a/tests/hermes_cli/test_auth_profile_fallback.py b/tests/hermes_cli/test_auth_profile_fallback.py new file mode 100644 index 00000000000..2063517d28c --- /dev/null +++ b/tests/hermes_cli/test_auth_profile_fallback.py @@ -0,0 +1,360 @@ +"""Tests for cross-profile auth fallback. + +When ``HERMES_HOME`` points to a named profile, ``read_credential_pool()`` +and ``get_provider_auth_state()`` fall back to the global-root +``auth.json`` per-provider when the profile has no entries for that +provider. Writes still target the profile only. + +See the #18594 follow-up report: profile workers couldn't see providers +authenticated only at the global root. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + + +def _make_auth_store(pool: dict | None = None, providers: dict | None = None) -> dict: + store: dict = {"version": 1} + if pool is not None: + store["credential_pool"] = pool + if providers is not None: + store["providers"] = providers + return store + + +@pytest.fixture() +def profile_env(tmp_path, monkeypatch): + """Set up a global root + an active profile under Path.home()/.hermes/profiles/coder. + + * Path.home() -> tmp_path + * Global root -> tmp_path/.hermes (has its own auth.json fixture) + * Profile -> tmp_path/.hermes/profiles/coder (active, HERMES_HOME points here) + + This mirrors the real "named profile mounted under the default root" + layout that profile users actually have on disk. + """ + monkeypatch.setattr(Path, "home", lambda: tmp_path) + global_root = tmp_path / ".hermes" + global_root.mkdir() + profile_dir = global_root / "profiles" / "coder" + profile_dir.mkdir(parents=True) + monkeypatch.setenv("HERMES_HOME", str(profile_dir)) + return {"global": global_root, "profile": profile_dir} + + +def _write(path: Path, payload: dict) -> None: + path.write_text(json.dumps(payload, indent=2)) + + +# --------------------------------------------------------------------------- +# read_credential_pool — provider-slice reads +# --------------------------------------------------------------------------- + + +def test_profile_with_zero_entries_falls_back_to_global(profile_env): + """Empty profile pool inherits the global-root entries for that provider.""" + from hermes_cli.auth import read_credential_pool + + _write(profile_env["global"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "glob-1", + "label": "global-key", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-global", + }], + })) + # Profile auth.json: exists but has no openrouter entries. + _write(profile_env["profile"] / "auth.json", _make_auth_store(pool={})) + + entries = read_credential_pool("openrouter") + assert len(entries) == 1 + assert entries[0]["id"] == "glob-1" + assert entries[0]["access_token"] == "sk-or-global" + + +def test_profile_with_entries_fully_shadows_global(profile_env): + """Once the profile has any entries for a provider, global is ignored.""" + from hermes_cli.auth import read_credential_pool + + _write(profile_env["global"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "glob-1", + "label": "global-key", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-global", + }], + })) + _write(profile_env["profile"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "prof-1", + "label": "profile-key", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-profile", + }], + })) + + entries = read_credential_pool("openrouter") + assert len(entries) == 1 + assert entries[0]["id"] == "prof-1" + assert entries[0]["access_token"] == "sk-or-profile" + + +def test_per_provider_shadowing_is_independent(profile_env): + """Profile can override one provider while inheriting another from global.""" + from hermes_cli.auth import read_credential_pool + + _write(profile_env["global"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "glob-or", + "label": "global-or", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-global", + }], + "anthropic": [{ + "id": "glob-ant", + "label": "global-ant", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-ant-global", + }], + })) + _write(profile_env["profile"] / "auth.json", _make_auth_store(pool={ + # Profile has openrouter only — anthropic should still fall back. + "openrouter": [{ + "id": "prof-or", + "label": "profile-or", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-profile", + }], + })) + + or_entries = read_credential_pool("openrouter") + ant_entries = read_credential_pool("anthropic") + assert [e["id"] for e in or_entries] == ["prof-or"] + assert [e["id"] for e in ant_entries] == ["glob-ant"] + + +def test_missing_global_auth_file_is_safe(profile_env): + """Profile processes that never had a global auth.json still work.""" + from hermes_cli.auth import read_credential_pool + + # No global auth.json written at all. + _write(profile_env["profile"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "prof-1", + "label": "profile", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-profile", + }], + })) + + assert read_credential_pool("openrouter")[0]["id"] == "prof-1" + assert read_credential_pool("anthropic") == [] + + +def test_malformed_global_auth_file_does_not_break_profile_read(profile_env): + (profile_env["global"] / "auth.json").write_text("{not valid json") + _write(profile_env["profile"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "prof-1", + "label": "profile", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-profile", + }], + })) + + from hermes_cli.auth import read_credential_pool + + # Profile reads still work; malformed global is silently ignored. + assert read_credential_pool("openrouter")[0]["id"] == "prof-1" + # And no fallback for anthropic since global is unreadable. + assert read_credential_pool("anthropic") == [] + + +# --------------------------------------------------------------------------- +# read_credential_pool — whole-pool reads (provider_id=None) +# --------------------------------------------------------------------------- + + +def test_whole_pool_merges_global_providers_when_missing_locally(profile_env): + from hermes_cli.auth import read_credential_pool + + _write(profile_env["global"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "glob-or", + "label": "global-or", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-global", + }], + "anthropic": [{ + "id": "glob-ant", + "label": "global-ant", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-ant-global", + }], + })) + _write(profile_env["profile"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "prof-or", + "label": "profile-or", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-or-profile", + }], + })) + + pool = read_credential_pool(None) + # Profile wins for openrouter, global fills in anthropic. + assert [e["id"] for e in pool["openrouter"]] == ["prof-or"] + assert [e["id"] for e in pool["anthropic"]] == ["glob-ant"] + + +# --------------------------------------------------------------------------- +# get_provider_auth_state — singleton fallback +# --------------------------------------------------------------------------- + + +def test_provider_auth_state_falls_back_to_global_when_profile_has_none(profile_env): + from hermes_cli.auth import get_provider_auth_state + + _write(profile_env["global"] / "auth.json", _make_auth_store(providers={ + "nous": {"access_token": "nous-global", "refresh_token": "rt-global"}, + })) + _write(profile_env["profile"] / "auth.json", _make_auth_store(providers={})) + + state = get_provider_auth_state("nous") + assert state is not None + assert state["access_token"] == "nous-global" + + +def test_provider_auth_state_profile_wins_when_present(profile_env): + from hermes_cli.auth import get_provider_auth_state + + _write(profile_env["global"] / "auth.json", _make_auth_store(providers={ + "nous": {"access_token": "nous-global"}, + })) + _write(profile_env["profile"] / "auth.json", _make_auth_store(providers={ + "nous": {"access_token": "nous-profile"}, + })) + + state = get_provider_auth_state("nous") + assert state is not None + assert state["access_token"] == "nous-profile" + + +def test_provider_auth_state_returns_none_when_neither_has_it(profile_env): + from hermes_cli.auth import get_provider_auth_state + + _write(profile_env["global"] / "auth.json", _make_auth_store(providers={})) + _write(profile_env["profile"] / "auth.json", _make_auth_store(providers={})) + + assert get_provider_auth_state("nous") is None + + +# --------------------------------------------------------------------------- +# Classic mode — no fallback path should ever trigger +# --------------------------------------------------------------------------- + + +def test_classic_mode_does_not_double_read_same_file(tmp_path, monkeypatch): + """In classic mode (HERMES_HOME == global root), no fallback path runs. + + This guards against the merge accidentally duplicating entries when the + profile and global resolve to the same directory. + """ + # Put Path.home() under a subdir so the seat belt in _auth_file_path() + # sees tmp_path/home/.hermes as the "real home" — which is NOT equal + # to the HERMES_HOME we set (tmp_path/classic), so the guard passes. + fake_home = tmp_path / "home" + fake_home.mkdir() + monkeypatch.setattr(Path, "home", lambda: fake_home) + hermes_home = tmp_path / "classic" + hermes_home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(hermes_home)) + + _write(hermes_home / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "only", + "label": "classic", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-classic", + }], + })) + + from hermes_cli.auth import read_credential_pool, _global_auth_file_path + + # Classic mode: HERMES_HOME is set to a custom path that is NOT under + # ~/.hermes/profiles/ — get_default_hermes_root() returns HERMES_HOME + # itself, so the profile root and global root are the same directory, + # and the helper correctly returns None (no fallback). + assert _global_auth_file_path() is None + # And the read should return exactly one entry (not two). + entries = read_credential_pool("openrouter") + assert len(entries) == 1 + assert entries[0]["id"] == "only" + + +# --------------------------------------------------------------------------- +# Writes stay scoped to the profile +# --------------------------------------------------------------------------- + + +def test_write_credential_pool_targets_profile_not_global(profile_env): + from hermes_cli.auth import read_credential_pool, write_credential_pool + + _write(profile_env["global"] / "auth.json", _make_auth_store(pool={ + "openrouter": [{ + "id": "glob-1", + "label": "global", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-global", + }], + })) + + write_credential_pool("openrouter", [{ + "id": "prof-new", + "label": "profile-new", + "auth_type": "api_key", + "priority": 0, + "source": "manual", + "access_token": "sk-profile-new", + }]) + + # Global auth.json unchanged. + global_data = json.loads((profile_env["global"] / "auth.json").read_text()) + assert global_data["credential_pool"]["openrouter"][0]["id"] == "glob-1" + + # Profile auth.json holds the new entry. + profile_data = json.loads((profile_env["profile"] / "auth.json").read_text()) + assert profile_data["credential_pool"]["openrouter"][0]["id"] == "prof-new" + + # Subsequent read returns profile (shadows global). + assert [e["id"] for e in read_credential_pool("openrouter")] == ["prof-new"] From b71f80e6ce2af7a75e319170340dec9d64461576 Mon Sep 17 00:00:00 2001 From: Guillaume Meyer Date: Wed, 6 May 2026 15:37:04 +0000 Subject: [PATCH 005/230] feat(gateway): per-platform gateway_restart_notification flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an opt-out toggle on PlatformConfig that gates both restart lifecycle pings: the "♻ Gateway restarted" message sent to the chat that issued /restart, and the "♻️ Gateway online" home-channel startup notification. Defaults to True so existing deployments are unaffected. The motivating split is operator vs. end-user surfaces: a back-channel like Telegram should keep these pings, while a Slack workspace shared with end users should not surface gateway lifecycle noise. Co-Authored-By: Claude Opus 4.7 (1M context) --- gateway/config.py | 19 ++++-- gateway/run.py | 16 +++++ tests/gateway/test_config.py | 13 ++++ tests/gateway/test_restart_notification.py | 76 ++++++++++++++++++++++ 4 files changed, 120 insertions(+), 4 deletions(-) diff --git a/gateway/config.py b/gateway/config.py index 2e0e3276b7b..8eb39ba54a3 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -271,15 +271,23 @@ class PlatformConfig: # - "first": Only first chunk threads to user's message (default) # - "all": All chunks in multi-part replies thread to user's message reply_to_mode: str = "first" - + + # Whether the gateway is allowed to send "♻️ Gateway online" / + # "♻ Gateway restarted" lifecycle notifications on this platform. + # Default True preserves prior behavior. Set False on platforms used + # by end users (e.g. Slack) where operator-flavored restart pings are + # noise; keep True for back-channels where the operator wants them. + gateway_restart_notification: bool = True + # Platform-specific settings extra: Dict[str, Any] = field(default_factory=dict) - + def to_dict(self) -> Dict[str, Any]: result = { "enabled": self.enabled, "extra": self.extra, "reply_to_mode": self.reply_to_mode, + "gateway_restart_notification": self.gateway_restart_notification, } if self.token: result["token"] = self.token @@ -288,19 +296,22 @@ class PlatformConfig: if self.home_channel: result["home_channel"] = self.home_channel.to_dict() return result - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> "PlatformConfig": home_channel = None if "home_channel" in data: home_channel = HomeChannel.from_dict(data["home_channel"]) - + return cls( enabled=_coerce_bool(data.get("enabled"), False), token=data.get("token"), api_key=data.get("api_key"), home_channel=home_channel, reply_to_mode=data.get("reply_to_mode", "first"), + gateway_restart_notification=_coerce_bool( + data.get("gateway_restart_notification"), True + ), extra=data.get("extra", {}), ) diff --git a/gateway/run.py b/gateway/run.py index 1c125d9aff2..77f20178d19 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -11386,6 +11386,14 @@ class GatewayRunner: ) return None + platform_cfg = self.config.platforms.get(platform) + if platform_cfg is not None and not platform_cfg.gateway_restart_notification: + logger.info( + "Restart notification suppressed: %s has gateway_restart_notification=false", + platform_str, + ) + return None + metadata = {"thread_id": thread_id} if thread_id else None result = await adapter.send( str(chat_id), @@ -11437,6 +11445,14 @@ class GatewayRunner: if not home or not home.chat_id: continue + platform_cfg = self.config.platforms.get(platform) + if platform_cfg is not None and not platform_cfg.gateway_restart_notification: + logger.info( + "Home-channel startup notification suppressed: %s has gateway_restart_notification=false", + platform.value, + ) + continue + target = (platform.value, str(home.chat_id), str(home.thread_id) if home.thread_id else None) if target in skipped or target in delivered: continue diff --git a/tests/gateway/test_config.py b/tests/gateway/test_config.py index 3df2a7d50b9..c53e34b757e 100644 --- a/tests/gateway/test_config.py +++ b/tests/gateway/test_config.py @@ -57,6 +57,19 @@ class TestPlatformConfigRoundtrip: restored = PlatformConfig.from_dict({"enabled": "false"}) assert restored.enabled is False + def test_gateway_restart_notification_defaults_true(self): + assert PlatformConfig().gateway_restart_notification is True + assert PlatformConfig.from_dict({}).gateway_restart_notification is True + + def test_gateway_restart_notification_roundtrip_false(self): + pc = PlatformConfig(enabled=True, gateway_restart_notification=False) + restored = PlatformConfig.from_dict(pc.to_dict()) + assert restored.gateway_restart_notification is False + + def test_gateway_restart_notification_coerces_quoted_false(self): + restored = PlatformConfig.from_dict({"gateway_restart_notification": "false"}) + assert restored.gateway_restart_notification is False + class TestGetConnectedPlatforms: def test_returns_enabled_with_token(self): diff --git a/tests/gateway/test_restart_notification.py b/tests/gateway/test_restart_notification.py index e97216072a4..d48ced6bb7f 100644 --- a/tests/gateway/test_restart_notification.py +++ b/tests/gateway/test_restart_notification.py @@ -496,6 +496,82 @@ async def test_send_restart_notification_logs_warning_on_sendresult_failure( assert not notify_path.exists() +@pytest.mark.asyncio +async def test_send_home_channel_startup_notification_skipped_when_flag_disabled( + tmp_path, monkeypatch +): + """Per-platform opt-out: gateway_restart_notification=False mutes the home-channel ping.""" + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + + runner, adapter = make_restart_runner() + runner.config.platforms[Platform.TELEGRAM].home_channel = HomeChannel( + platform=Platform.TELEGRAM, + chat_id="home-42", + name="Ops Home", + ) + runner.config.platforms[Platform.TELEGRAM].gateway_restart_notification = False + adapter.send = AsyncMock() + + delivered = await runner._send_home_channel_startup_notifications() + + assert delivered == set() + adapter.send.assert_not_called() + + +@pytest.mark.asyncio +async def test_send_home_channel_startup_notification_default_flag_true( + tmp_path, monkeypatch +): + """Default behavior is unchanged: missing flag means notifications still fire.""" + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + + runner, adapter = make_restart_runner() + # Sanity-check the dataclass default — guards against future refactors + # silently flipping the default to False. + assert runner.config.platforms[Platform.TELEGRAM].gateway_restart_notification is True + + runner.config.platforms[Platform.TELEGRAM].home_channel = HomeChannel( + platform=Platform.TELEGRAM, + chat_id="home-42", + name="Ops Home", + ) + adapter.send = AsyncMock(return_value=SendResult(success=True, message_id="home")) + + delivered = await runner._send_home_channel_startup_notifications() + + assert delivered == {("telegram", "home-42", None)} + adapter.send.assert_called_once() + + +@pytest.mark.asyncio +async def test_send_restart_notification_skipped_when_flag_disabled( + tmp_path, monkeypatch +): + """The /restart originator's notification also honors the per-platform flag. + + Slack used by end users → flag off → no "Gateway restarted" message even + when an end user accidentally triggers /restart. The marker file is still + cleaned up so the notification doesn't leak into the next boot. + """ + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + + notify_path = tmp_path / ".restart_notify.json" + notify_path.write_text(json.dumps({ + "platform": "telegram", + "chat_id": "42", + })) + + runner, adapter = make_restart_runner() + runner.config.platforms[Platform.TELEGRAM].gateway_restart_notification = False + adapter.send = AsyncMock() + + delivered_target = await runner._send_restart_notification() + + assert delivered_target is None + adapter.send.assert_not_called() + assert not notify_path.exists() + + @pytest.mark.asyncio async def test_send_restart_notification_logs_info_on_sendresult_success( tmp_path, monkeypatch, caplog From 7df6115199278f415bd3d3dacf439e467341245c Mon Sep 17 00:00:00 2001 From: Guillaume Meyer Date: Wed, 6 May 2026 15:55:01 +0000 Subject: [PATCH 006/230] feat(gateway): also gate pre-restart "Gateway restarting" notification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the gateway_restart_notification flag to cover _notify_active_sessions_of_shutdown — the message that fires just before drain ("⚠️ Gateway restarting — Your current task will be interrupted. Send any message after restart and I'll try to resume where you left off.") sent to active sessions and home channels. Same operator/end-user reasoning: on a Slack workspace shared with end users, "Gateway restarting" reads as "the bot is broken" — the operator should be able to suppress it consistently with the other two lifecycle pings rather than having a partial opt-out. Co-Authored-By: Claude Opus 4.7 (1M context) --- gateway/run.py | 16 ++++++++++++++ tests/gateway/test_restart_drain.py | 34 +++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/gateway/run.py b/gateway/run.py index 77f20178d19..15ce3ab08ce 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2458,6 +2458,14 @@ class GatewayRunner: if not adapter: continue + platform_cfg = self.config.platforms.get(platform) + if platform_cfg is not None and not platform_cfg.gateway_restart_notification: + logger.info( + "Shutdown notification suppressed for active session: %s has gateway_restart_notification=false", + platform_str, + ) + continue + # Include thread_id if present so the message lands in the # correct forum topic / thread. metadata = {"thread_id": thread_id} if thread_id else None @@ -2488,6 +2496,14 @@ class GatewayRunner: if not home or not home.chat_id: continue + platform_cfg = self.config.platforms.get(platform) + if platform_cfg is not None and not platform_cfg.gateway_restart_notification: + logger.info( + "Shutdown notification suppressed for home channel: %s has gateway_restart_notification=false", + platform.value, + ) + continue + dedup_key = (platform.value, str(home.chat_id), str(home.thread_id) if home.thread_id else None) if dedup_key in notified: continue diff --git a/tests/gateway/test_restart_drain.py b/tests/gateway/test_restart_drain.py index 3aca6d64057..55de5a45544 100644 --- a/tests/gateway/test_restart_drain.py +++ b/tests/gateway/test_restart_drain.py @@ -257,6 +257,40 @@ async def test_shutdown_notification_send_failure_does_not_block(): await runner._notify_active_sessions_of_shutdown() +@pytest.mark.asyncio +async def test_shutdown_notification_suppressed_when_flag_disabled(): + """Active-session ping is muted when gateway_restart_notification=False on the platform.""" + from gateway.config import Platform + + runner, adapter = make_restart_runner() + runner._restart_requested = True + runner.config.platforms[Platform.TELEGRAM].gateway_restart_notification = False + session_key = "agent:main:telegram:dm:999" + runner._running_agents[session_key] = MagicMock() + + await runner._notify_active_sessions_of_shutdown() + + assert adapter.sent == [] + + +@pytest.mark.asyncio +async def test_shutdown_notification_home_channel_suppressed_when_flag_disabled(): + """Home-channel ping during shutdown is muted when the flag is False.""" + from gateway.config import HomeChannel, Platform + + runner, adapter = make_restart_runner() + runner.config.platforms[Platform.TELEGRAM].home_channel = HomeChannel( + platform=Platform.TELEGRAM, + chat_id="home-42", + name="Ops Home", + ) + runner.config.platforms[Platform.TELEGRAM].gateway_restart_notification = False + + await runner._notify_active_sessions_of_shutdown() + + assert adapter.sent == [] + + @pytest.mark.asyncio async def test_shutdown_notification_uses_persisted_origin_for_colon_ids(): """Shutdown notifications should route from persisted origin, not reparsed keys.""" From d8b85bfd1c9dd207acdf0b23d181343ab396d974 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 6 May 2026 13:30:34 -0700 Subject: [PATCH 007/230] chore: add guillaumemeyer to AUTHOR_MAP For cherry-picked commits in PR #20801. --- scripts/release.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/release.py b/scripts/release.py index 09ac83ca76b..8249484e446 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -87,6 +87,7 @@ AUTHOR_MAP = { "happy5318@users.noreply.github.com": "happy5318", "chengoak@users.noreply.github.com": "chengoak", "mrhanoi@outlook.com": "qxxaa", + "guillaume.meyer@outlook.com": "guillaumemeyer", "emelyanenko.kirill@gmail.com": "EmelyanenkoK", "lazycat.manatee@gmail.com": "manateelazycat", "bzarnitz13@gmail.com": "Beandon13", From 5044e1cbf135af1a999935c6d141e137d60d5d1b Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Wed, 6 May 2026 13:51:13 -0700 Subject: [PATCH 008/230] fix(cli): submit LF enter in thin PTYs (#20896) --- cli.py | 25 ++++++++++++++++++------- tests/cli/test_cli_init.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/cli.py b/cli.py index 31ba863f9f6..1b2a81dfc49 100644 --- a/cli.py +++ b/cli.py @@ -1774,6 +1774,20 @@ _TERMINAL_INPUT_MODE_RESET_SEQ = ( ) +def _bind_prompt_submit_keys(kb, handler) -> None: + """Bind both CR and LF terminal Enter forms to the submit handler.""" + for key in ("enter", "c-j"): + kb.add(key)(handler) + + +def _disable_prompt_toolkit_cpr_warning(app) -> None: + """Let prompt_toolkit fall back from CPR without printing into the prompt.""" + try: + app.renderer.cpr_not_supported_callback = None + except Exception: + pass + + def _strip_leaked_terminal_responses_with_meta(text: str) -> tuple[str, bool]: """Strip leaked terminal control-response sequences from user input. @@ -10338,7 +10352,6 @@ class HermesCLI: # Key bindings for the input area kb = KeyBindings() - @kb.add('enter') def handle_enter(event): """Handle Enter key - submit input. @@ -10497,17 +10510,14 @@ class HermesCLI: else: self._pending_input.put(payload) event.app.current_buffer.reset(append_to_history=True) + + _bind_prompt_submit_keys(kb, handle_enter) @kb.add('escape', 'enter') def handle_alt_enter(event): """Alt+Enter inserts a newline for multi-line input.""" event.current_buffer.insert_text('\n') - @kb.add('c-j') - def handle_ctrl_enter(event): - """Ctrl+Enter (c-j) inserts a newline. Most terminals send c-j for Ctrl+Enter.""" - event.current_buffer.insert_text('\n') - # VSCode/Cursor bind Ctrl+G to "Find Next" at the editor level, so # the keystroke never reaches the embedded terminal. Alt+G is unbound # in those IDEs and arrives here as ('escape', 'g') — register it as @@ -11106,7 +11116,7 @@ class HermesCLI: def get_prompt(): return cli_ref._get_tui_prompt_fragments() - # Create the input area with multiline (shift+enter), autocomplete, and paste handling + # Create the input area with multiline (Alt+Enter), autocomplete, and paste handling from prompt_toolkit.auto_suggest import AutoSuggestFromHistory @@ -11848,6 +11858,7 @@ class HermesCLI: mouse_support=False, **({'cursor': _STEADY_CURSOR} if _STEADY_CURSOR is not None else {}), ) + _disable_prompt_toolkit_cpr_warning(app) self._app = app # Store reference for clarify_callback # ── Fix ghost status-bar lines on terminal resize ────────────── diff --git a/tests/cli/test_cli_init.py b/tests/cli/test_cli_init.py index bf1f347e500..c9ecf2c7df5 100644 --- a/tests/cli/test_cli_init.py +++ b/tests/cli/test_cli_init.py @@ -3,6 +3,7 @@ that only manifest at runtime (not in mocked unit tests).""" import os import sys +from types import SimpleNamespace from unittest.mock import MagicMock, patch sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) @@ -161,6 +162,35 @@ class TestBusyInputMode: assert cli._pending_input.empty() +class TestPromptToolkitTerminalCompatibility: + def test_lf_enter_binds_to_submit_handler(self): + """Some thin PTYs deliver Enter as LF/c-j instead of CR/enter.""" + from prompt_toolkit.key_binding import KeyBindings + + from cli import _bind_prompt_submit_keys + + kb = KeyBindings() + + def submit_handler(event): + return None + + _bind_prompt_submit_keys(kb, submit_handler) + + bindings = {tuple(key.value for key in binding.keys): binding.handler for binding in kb.bindings} + assert bindings[("c-m",)] is submit_handler + assert bindings[("c-j",)] is submit_handler + + def test_cpr_warning_callback_is_disabled(self): + from cli import _disable_prompt_toolkit_cpr_warning + + renderer = SimpleNamespace(cpr_not_supported_callback=lambda: None) + app = SimpleNamespace(renderer=renderer) + + _disable_prompt_toolkit_cpr_warning(app) + + assert renderer.cpr_not_supported_callback is None + + class TestSingleQueryState: def test_voice_and_interrupt_state_initialized_before_run(self): """Single-query mode calls chat() without going through run().""" From da6019820a916ff7b6b89fa0fba2cccf700554d6 Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Wed, 6 May 2026 13:54:46 -0700 Subject: [PATCH 009/230] fix(tui): refresh virtual offsets after row resize (#20898) --- .../virtualHistoryOffsetCache.test.ts | 40 ++++++++++++++++++- ui-tui/src/hooks/useVirtualHistory.ts | 11 ++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/ui-tui/src/__tests__/virtualHistoryOffsetCache.test.ts b/ui-tui/src/__tests__/virtualHistoryOffsetCache.test.ts index b4a5e7cd624..5a3e8cd0976 100644 --- a/ui-tui/src/__tests__/virtualHistoryOffsetCache.test.ts +++ b/ui-tui/src/__tests__/virtualHistoryOffsetCache.test.ts @@ -1,6 +1,7 @@ -import { Box, renderSync, ScrollBox, Text, type ScrollBoxHandle } from '@hermes/ink' -import React, { useLayoutEffect, useRef } from 'react' import { PassThrough } from 'stream' + +import { Box, renderSync, ScrollBox, type ScrollBoxHandle, Text } from '@hermes/ink' +import React, { useLayoutEffect, useRef } from 'react' import { describe, expect, it } from 'vitest' import { useVirtualHistory } from '../hooks/useVirtualHistory.js' @@ -50,6 +51,7 @@ const viewportIsMounted = (items: readonly Item[], virtualHistory: ReturnType; items: readonly Item[] }) { const scrollRef = useRef(null) + const virtualHistory = useVirtualHistory(scrollRef, items, 80, { coldStartCount: 16, estimateHeight: index => items[index]?.height ?? 1, @@ -83,11 +85,45 @@ function Harness({ expose, items }: { expose: React.MutableRefObject { + it('recomputes offsets after a mounted row height changes', async () => { + const tall = [ + { height: 6, key: 'a' }, + { height: 6, key: 'b' }, + { height: 6, key: 'c' } + ] + + const short = tall.map(item => ({ ...item, height: 2 })) + const expose = { current: null as Exposed | null } + const streams = makeStreams() + + const instance = renderSync(React.createElement(Harness, { expose, items: tall }), { + patchConsole: false, + stderr: streams.stderr as NodeJS.WriteStream, + stdin: streams.stdin as NodeJS.ReadStream, + stdout: streams.stdout as NodeJS.WriteStream + }) + + try { + await delay(20) + expect(expose.current!.virtualHistory.offsets[tall.length]).toBe(18) + + instance.rerender(React.createElement(Harness, { expose, items: short })) + await delay(40) + + expect(expose.current!.virtualHistory.offsets[short.length]).toBe(6) + expect(expose.current!.virtualHistory.bottomSpacer).toBe(0) + } finally { + instance.unmount() + instance.cleanup() + } + }) + it('ignores stale reused offset-array entries after the item count shrinks', async () => { const beforeShrink = Array.from({ length: 1400 }, (_, index) => ({ height: 1, key: `old${index}` })) const afterShrink = Array.from({ length: 800 }, (_, index) => ({ height: 7, key: `new${index}` })) const expose = { current: null as Exposed | null } const streams = makeStreams() + const instance = renderSync(React.createElement(Harness, { expose, items: beforeShrink }), { patchConsole: false, stderr: streams.stderr as NodeJS.WriteStream, diff --git a/ui-tui/src/hooks/useVirtualHistory.ts b/ui-tui/src/hooks/useVirtualHistory.ts index dbd3a2f6663..ef96ae1078c 100644 --- a/ui-tui/src/hooks/useVirtualHistory.ts +++ b/ui-tui/src/hooks/useVirtualHistory.ts @@ -130,6 +130,9 @@ export function useVirtualHistory( }) const [hasScrollRef, setHasScrollRef] = useState(false) + // Height cache writes happen in layout effects; bump once so offsets and + // clamp bounds rebuild without waiting for the next scroll/input event. + const [measuredHeightVersion, bumpMeasuredHeightVersion] = useState(0) const metrics = useRef({ sticky: true, top: 0, vp: 0 }) const lastScrollTopRef = useRef(0) @@ -434,6 +437,7 @@ export function useVirtualHistory( useLayoutEffect(() => { const s = scrollRef.current let dirty = false + let heightDirty = false // Give the renderer the mounted-row coverage for passive scroll clamping. // Clamp MUST use the EFFECTIVE (deferred) range, not the immediate one. @@ -474,6 +478,7 @@ export function useVirtualHistory( if (h > 0 && heights.current.get(k) !== h) { heights.current.set(k, h) dirty = true + heightDirty = true } } } @@ -499,7 +504,11 @@ export function useVirtualHistory( offsetVersion.current++ onHeightsChangeRef.current?.(heights.current) } - }) + + if (heightDirty) { + bumpMeasuredHeightVersion(n => n + 1) + } + }, [effEnd, effStart, items, liveTailActive, measuredHeightVersion, n, offsets, scrollRef, sticky, total, vp]) return { bottomSpacer: Math.max(0, total - (offsets[effEnd] ?? total)), From f1a8e99942e6150d5785bdd734c4d9ff63dfa7f7 Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Wed, 6 May 2026 14:01:56 -0700 Subject: [PATCH 010/230] fix(tui): honor skin highlight colors (#20895) --- hermes_cli/skin_engine.py | 1 + ui-tui/src/__tests__/theme.test.ts | 28 +++++++++++++++++++++++ ui-tui/src/components/appOverlays.tsx | 12 ++++++++-- ui-tui/src/theme.ts | 24 +++++++++++++++---- website/docs/user-guide/features/skins.md | 2 ++ 5 files changed, 61 insertions(+), 6 deletions(-) diff --git a/hermes_cli/skin_engine.py b/hermes_cli/skin_engine.py index 6ca6f8adf3d..0acb41d6878 100644 --- a/hermes_cli/skin_engine.py +++ b/hermes_cli/skin_engine.py @@ -42,6 +42,7 @@ All fields are optional. Missing values inherit from the ``default`` skin. session_border: "#8B8682" # Session ID dim color status_bar_bg: "#1a1a2e" # TUI status/usage bar background voice_status_bg: "#1a1a2e" # TUI voice status background + selection_bg: "#333355" # TUI mouse-selection highlight background completion_menu_bg: "#1a1a2e" # Completion menu background completion_menu_current_bg: "#333355" # Active completion row background completion_menu_meta_bg: "#1a1a2e" # Completion meta column background diff --git a/ui-tui/src/__tests__/theme.test.ts b/ui-tui/src/__tests__/theme.test.ts index 30a047df661..d45576698dd 100644 --- a/ui-tui/src/__tests__/theme.test.ts +++ b/ui-tui/src/__tests__/theme.test.ts @@ -209,6 +209,34 @@ describe('fromSkin', () => { expect(theme.color.completionCurrentBg).toBe('#bfbfbf') }) + it('uses active completion color as the selection highlight fallback', async () => { + const { fromSkin } = await importThemeWithCleanEnv() + + const theme = fromSkin({ completion_menu_current_bg: '#123456' }, {}) + + expect(theme.color.selectionBg).toBe('#123456') + }) + + it('maps completion meta background colors from skins', async () => { + const { fromSkin } = await importThemeWithCleanEnv() + + const theme = fromSkin({ + completion_menu_meta_bg: '#111111', + completion_menu_meta_current_bg: '#222222' + }, {}) + + expect(theme.color.completionMetaBg).toBe('#111111') + expect(theme.color.completionMetaCurrentBg).toBe('#222222') + }) + + it('lets selection_bg override completion highlight colors', async () => { + const { fromSkin } = await importThemeWithCleanEnv() + + const theme = fromSkin({ completion_menu_current_bg: '#123456', selection_bg: '#654321' }, {}) + + expect(theme.color.selectionBg).toBe('#654321') + }) + it('overrides branding', async () => { const { fromSkin } = await importThemeWithCleanEnv() const { brand } = fromSkin({}, { agent_name: 'TestBot', prompt_symbol: '$' }) diff --git a/ui-tui/src/components/appOverlays.tsx b/ui-tui/src/components/appOverlays.tsx index e4a80ba816d..c12624a4bf8 100644 --- a/ui-tui/src/components/appOverlays.tsx +++ b/ui-tui/src/components/appOverlays.tsx @@ -182,7 +182,7 @@ export function FloatingOverlays({ return ( - {item.meta ? {item.meta} : null} + {item.meta ? ( + + {' '} + {item.meta} + + ) : null} ) })} diff --git a/ui-tui/src/theme.ts b/ui-tui/src/theme.ts index 2a557090366..6d7426caed4 100644 --- a/ui-tui/src/theme.ts +++ b/ui-tui/src/theme.ts @@ -6,6 +6,8 @@ export interface ThemeColors { muted: string completionBg: string completionCurrentBg: string + completionMetaBg: string + completionMetaCurrentBg: string label: string ok: string @@ -264,8 +266,10 @@ export const DARK_THEME: Theme = { // new value sits ~60% luminance — readable without losing the "muted / // secondary" semantic. Field labels still use `label` (65%) which // stays brighter so hierarchy holds. - completionBg: '#FFFFFF', - completionCurrentBg: mix('#FFFFFF', '#FFBF00', 0.25), + completionBg: '#1a1a2e', + completionCurrentBg: '#333355', + completionMetaBg: '#1a1a2e', + completionMetaCurrentBg: '#333355', label: '#DAA520', ok: '#4caf50', @@ -312,6 +316,8 @@ export const LIGHT_THEME: Theme = { muted: '#7A5A0F', completionBg: '#F5F5F5', completionCurrentBg: mix('#F5F5F5', '#A0651C', 0.25), + completionMetaBg: '#F5F5F5', + completionMetaCurrentBg: mix('#F5F5F5', '#A0651C', 0.25), label: '#7A5A0F', ok: '#2E7D32', @@ -517,12 +523,20 @@ export function fromSkin( ): Theme { const d = DEFAULT_THEME const c = (k: string) => colors[k] + const hasSkinColors = Object.keys(colors).length > 0 const accent = c('ui_accent') ?? c('banner_accent') ?? d.color.accent const bannerAccent = c('banner_accent') ?? c('banner_title') ?? d.color.accent const muted = c('banner_dim') ?? d.color.muted const completionBg = c('completion_menu_bg') ?? d.color.completionBg + const completionCurrentBg = + c('completion_menu_current_bg') ?? + (hasSkinColors ? mix(completionBg, bannerAccent, 0.25) : d.color.completionCurrentBg) + + const completionMetaBg = c('completion_menu_meta_bg') ?? completionBg + const completionMetaCurrentBg = c('completion_menu_meta_current_bg') ?? completionCurrentBg + return normalizeThemeForAnsiLightTerminal({ color: { primary: c('ui_primary') ?? c('banner_title') ?? d.color.primary, @@ -531,7 +545,9 @@ export function fromSkin( text: c('ui_text') ?? c('banner_text') ?? d.color.text, muted, completionBg, - completionCurrentBg: c('completion_menu_current_bg') ?? mix(completionBg, bannerAccent, 0.25), + completionCurrentBg, + completionMetaBg, + completionMetaCurrentBg, label: c('ui_label') ?? d.color.label, ok: c('ui_ok') ?? d.color.ok, @@ -548,7 +564,7 @@ export function fromSkin( statusWarn: c('ui_warn') ?? d.color.statusWarn, statusBad: d.color.statusBad, statusCritical: d.color.statusCritical, - selectionBg: c('selection_bg') ?? d.color.selectionBg, + selectionBg: c('selection_bg') ?? c('completion_menu_current_bg') ?? (hasSkinColors ? completionCurrentBg : d.color.selectionBg), diffAdded: d.color.diffAdded, diffRemoved: d.color.diffRemoved, diff --git a/website/docs/user-guide/features/skins.md b/website/docs/user-guide/features/skins.md index 5648c46e032..def81d0e7b3 100644 --- a/website/docs/user-guide/features/skins.md +++ b/website/docs/user-guide/features/skins.md @@ -67,6 +67,7 @@ Controls all color values throughout the CLI. Values are hex color strings. | `session_border` | Session ID dim border color | `#8B8682` | | `status_bar_bg` | Background color for the TUI status / usage bar | `#1a1a2e` | | `voice_status_bg` | Background color for the voice-mode status badge | `#1a1a2e` | +| `selection_bg` | Background color for the TUI mouse-selection highlighter. Falls back to `completion_menu_current_bg` when unset. | `#333355` | | `completion_menu_bg` | Background color for the completion menu list | `#1a1a2e` | | `completion_menu_current_bg` | Background color for the active completion row | `#333355` | | `completion_menu_meta_bg` | Background color for the completion meta column | `#1a1a2e` | @@ -139,6 +140,7 @@ colors: session_border: "#8B8682" status_bar_bg: "#1a1a2e" voice_status_bg: "#1a1a2e" + selection_bg: "#333355" completion_menu_bg: "#1a1a2e" completion_menu_current_bg: "#333355" completion_menu_meta_bg: "#1a1a2e" From 5ccab51fa851d258da69ab12912657ec14bf3bc8 Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Wed, 6 May 2026 14:50:31 -0700 Subject: [PATCH 011/230] fix(tui): steady transcript scrollbar (#20917) * fix(tui): steady transcript scrollbar Keep the visible scrollbar tied to committed viewport position while virtual history can still prefetch against pending scroll targets, and preserve drag grab offset synchronously for native-feeling scrollbar drags. * fix(tui): smooth precision wheel scroll Replace the opt-scroll throttle with frame-sized coalescing so modifier wheel gestures stay line-precise without stepping. --- ui-tui/src/__tests__/precisionWheel.test.ts | 44 ++++++++++++++++++ ui-tui/src/__tests__/viewportStore.test.ts | 33 +++++++++++++- ui-tui/src/app/useInputHandlers.ts | 39 +++++----------- ui-tui/src/components/appChrome.tsx | 16 ++++--- ui-tui/src/lib/precisionWheel.ts | 48 ++++++++++++++++++++ ui-tui/src/lib/viewportStore.ts | 50 +++++++++++++++++++++ 6 files changed, 196 insertions(+), 34 deletions(-) create mode 100644 ui-tui/src/__tests__/precisionWheel.test.ts create mode 100644 ui-tui/src/lib/precisionWheel.ts diff --git a/ui-tui/src/__tests__/precisionWheel.test.ts b/ui-tui/src/__tests__/precisionWheel.test.ts new file mode 100644 index 00000000000..13567521799 --- /dev/null +++ b/ui-tui/src/__tests__/precisionWheel.test.ts @@ -0,0 +1,44 @@ +import { describe, expect, it } from 'vitest' + +import { computePrecisionWheelStep, initPrecisionWheel } from '../lib/precisionWheel.js' + +describe('precisionWheel', () => { + it('passes the first modifier-held wheel event', () => { + const s = initPrecisionWheel() + + expect(computePrecisionWheelStep(s, 1, true, 1000)).toEqual({ active: true, entered: true, rows: 1 }) + }) + + it('coalesces same-frame events without throttling line-by-line scroll', () => { + const s = initPrecisionWheel() + + computePrecisionWheelStep(s, 1, true, 1000) + + expect(computePrecisionWheelStep(s, 1, true, 1008).rows).toBe(0) + expect(computePrecisionWheelStep(s, 1, true, 1016).rows).toBe(1) + }) + + it('keeps queued momentum in precision mode briefly after modifier release', () => { + const s = initPrecisionWheel() + + computePrecisionWheelStep(s, 1, true, 1000) + + expect(computePrecisionWheelStep(s, 1, false, 1050)).toMatchObject({ active: true, rows: 1 }) + }) + + it('leaves precision mode once modifier-free momentum goes idle', () => { + const s = initPrecisionWheel() + + computePrecisionWheelStep(s, 1, true, 1000) + + expect(computePrecisionWheelStep(s, 1, false, 1100)).toEqual({ active: false, entered: false, rows: 0 }) + }) + + it('does not coalesce immediate reversals', () => { + const s = initPrecisionWheel() + + computePrecisionWheelStep(s, 1, true, 1000) + + expect(computePrecisionWheelStep(s, -1, true, 1008).rows).toBe(1) + }) +}) diff --git a/ui-tui/src/__tests__/viewportStore.test.ts b/ui-tui/src/__tests__/viewportStore.test.ts index 7889b65cdea..2d37127e546 100644 --- a/ui-tui/src/__tests__/viewportStore.test.ts +++ b/ui-tui/src/__tests__/viewportStore.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from 'vitest' -import { getViewportSnapshot, viewportSnapshotKey } from '../lib/viewportStore.js' +import { getScrollbarSnapshot, getViewportSnapshot, scrollbarSnapshotKey, viewportSnapshotKey } from '../lib/viewportStore.js' describe('viewportStore', () => { it('normalizes absent scroll handles', () => { @@ -51,4 +51,35 @@ describe('viewportStore', () => { expect(snap.atBottom).toBe(true) expect(snap.scrollHeight).toBe(20) }) + + it('keeps scrollbar position tied to committed scrollTop, not pending target', () => { + const handle = { + getPendingDelta: () => 24, + getScrollHeight: () => 100, + getScrollTop: () => 10, + getViewportHeight: () => 20, + isSticky: () => false + } + + const viewport = getViewportSnapshot(handle as any) + const scrollbar = getScrollbarSnapshot(handle as any) + + expect(viewport.top).toBe(34) + expect(scrollbar).toEqual({ + scrollHeight: 100, + top: 10, + viewportHeight: 20 + }) + expect(scrollbarSnapshotKey(scrollbar)).toBe('10:20:100') + }) + + it('clamps scrollbar position to committed scroll bounds', () => { + const handle = { + getScrollHeight: () => 30, + getScrollTop: () => 50, + getViewportHeight: () => 20 + } + + expect(getScrollbarSnapshot(handle as any).top).toBe(10) + }) }) diff --git a/ui-tui/src/app/useInputHandlers.ts b/ui-tui/src/app/useInputHandlers.ts index 20e9b087a4b..3d85a500d8b 100644 --- a/ui-tui/src/app/useInputHandlers.ts +++ b/ui-tui/src/app/useInputHandlers.ts @@ -11,6 +11,7 @@ import type { VoiceRecordResponse } from '../gatewayTypes.js' import { isAction, isCopyShortcut, isMac, isVoiceToggleKey } from '../lib/platform.js' +import { computePrecisionWheelStep, initPrecisionWheel } from '../lib/precisionWheel.js' import { computeWheelStep, initWheelAccelForHost } from '../lib/wheelAccel.js' import { getInputSelection } from './inputSelectionStore.js' @@ -21,8 +22,6 @@ import { patchTurnState } from './turnStore.js' import { getUiState } from './uiStore.js' const isCtrl = (key: { ctrl: boolean }, ch: string, target: string) => key.ctrl && ch.toLowerCase() === target -const PRECISION_WHEEL_MIN_GAP_MS = 80 -const PRECISION_WHEEL_STICKY_MS = 80 export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { const { actions, composer, gateway, terminal, voice, wheelStep } = ctx @@ -38,9 +37,7 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { // rows = wheelStep × accelMult. State mutates in place across renders. const wheelAccelRef = useRef(initWheelAccelForHost()) - const precisionWheelRef = useRef<{ active: boolean; dir: 0 | -1 | 1; lastEventAtMs: number; lastScrollAtMs: number }>( - { active: false, dir: 0, lastEventAtMs: 0, lastScrollAtMs: 0 } - ) + const precisionWheelRef = useRef(initPrecisionWheel()) useEffect(() => () => clearTimeout(scrollIdleTimer.current ?? undefined), []) @@ -291,40 +288,26 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { if (key.wheelUp || key.wheelDown) { const dir: -1 | 1 = key.wheelUp ? -1 : 1 const now = Date.now() - // Modifier-held wheel = precision mode: at most one wheelStep per short - // interval. Smooth mice / trackpads emit many raw wheel events for one - // intended line step, so raw 1:1 still moves too far. + // Modifier-held wheel = precision mode: one row per frame, no accel. + // Smooth mice / trackpads emit tiny same-frame bursts; coalesce those + // without the old 80ms throttle that made opt-scroll feel stepped. // SGR/X10 mouse encoding only carries shift/meta/ctrl bits; Cmd on // macOS is intercepted by the terminal, so we honor Option (meta) on // Mac / Alt (meta) on Win+Linux / Ctrl as a portable fallback. Shift // is reserved for selection extension. const hasModifier = key.meta || key.ctrl - const precision = precisionWheelRef.current - // Keep precision active through the current wheel burst after the - // modifier is released. Otherwise a stream of queued/momentum wheel - // events can hand off mid-burst into the accelerated path and jump. - const precisionSticky = now - precision.lastEventAtMs < PRECISION_WHEEL_STICKY_MS + const precision = computePrecisionWheelStep(precisionWheelRef.current, dir, hasModifier, now) - if (hasModifier || precisionSticky) { - if (!precision.active) { - precision.active = true + if (precision.active) { + // Entering precision mode must discard any accelerated wheel state; + // otherwise the next normal wheel event inherits stale momentum. + if (precision.entered) { wheelAccelRef.current = initWheelAccelForHost() } - precision.lastEventAtMs = now - - if (dir === precision.dir && now - precision.lastScrollAtMs < PRECISION_WHEEL_MIN_GAP_MS) { - return - } - - precision.lastScrollAtMs = now - precision.dir = dir - - return scrollTranscript(dir * wheelStep) + return precision.rows ? scrollTranscript(dir * wheelStep) : undefined } - precision.active = false - // 0 = direction-flip bounce deferred; skip the no-op scroll. const rows = computeWheelStep(wheelAccelRef.current, dir, now) diff --git a/ui-tui/src/components/appChrome.tsx b/ui-tui/src/components/appChrome.tsx index 29e663a47fe..c2e08b3698e 100644 --- a/ui-tui/src/components/appChrome.tsx +++ b/ui-tui/src/components/appChrome.tsx @@ -1,6 +1,6 @@ import { Box, type ScrollBoxHandle, Text } from '@hermes/ink' import { useStore } from '@nanostores/react' -import { type ReactNode, type RefObject, useEffect, useMemo, useState } from 'react' +import { type ReactNode, type RefObject, useEffect, useMemo, useRef, useState } from 'react' import unicodeSpinners from 'unicode-animations' import { $delegationState } from '../app/delegationStore.js' @@ -13,7 +13,7 @@ import { fmtDuration } from '../domain/messages.js' import { stickyPromptFromViewport } from '../domain/viewport.js' import { buildSubagentTree, treeTotals, widthByDepth } from '../lib/subagentTree.js' import { fmtK } from '../lib/text.js' -import { useViewportSnapshot } from '../lib/viewportStore.js' +import { useScrollbarSnapshot, useViewportSnapshot } from '../lib/viewportStore.js' import type { Theme } from '../theme.js' import type { Msg, Usage } from '../types.js' @@ -377,7 +377,8 @@ export function StickyPromptTracker({ messages, offsets, scrollRef, onChange }: export function TranscriptScrollbar({ scrollRef, t }: TranscriptScrollbarProps) { const [hover, setHover] = useState(false) const [grab, setGrab] = useState(null) - const { scrollHeight: total, top: pos, viewportHeight: vp } = useViewportSnapshot(scrollRef) + const grabRef = useRef(null) + const { scrollHeight: total, top: pos, viewportHeight: vp } = useScrollbarSnapshot(scrollRef) if (!vp) { return @@ -405,15 +406,20 @@ export function TranscriptScrollbar({ scrollRef, t }: TranscriptScrollbarProps) onMouseDown={(e: { localRow?: number }) => { const row = Math.max(0, Math.min(vp - 1, e.localRow ?? 0)) const off = row >= thumbTop && row < thumbTop + thumb ? row - thumbTop : Math.floor(thumb / 2) + + grabRef.current = off setGrab(off) jump(row, off) }} onMouseDrag={(e: { localRow?: number }) => - jump(Math.max(0, Math.min(vp - 1, e.localRow ?? 0)), grab ?? Math.floor(thumb / 2)) + jump(Math.max(0, Math.min(vp - 1, e.localRow ?? 0)), grabRef.current ?? Math.floor(thumb / 2)) } onMouseEnter={() => setHover(true)} onMouseLeave={() => setHover(false)} - onMouseUp={() => setGrab(null)} + onMouseUp={() => { + grabRef.current = null + setGrab(null) + }} width={1} > {!scrollable ? ( diff --git a/ui-tui/src/lib/precisionWheel.ts b/ui-tui/src/lib/precisionWheel.ts new file mode 100644 index 00000000000..4ddb447abf0 --- /dev/null +++ b/ui-tui/src/lib/precisionWheel.ts @@ -0,0 +1,48 @@ +const PRECISION_WHEEL_FRAME_MS = 16 +const PRECISION_WHEEL_STICKY_MS = 80 + +export type PrecisionWheelState = { + active: boolean + dir: 0 | -1 | 1 + lastEventAtMs: number + lastScrollAtMs: number +} + +export type PrecisionWheelStep = { + active: boolean + entered: boolean + rows: 0 | 1 +} + +export function initPrecisionWheel(): PrecisionWheelState { + return { active: false, dir: 0, lastEventAtMs: 0, lastScrollAtMs: 0 } +} + +export function computePrecisionWheelStep( + state: PrecisionWheelState, + dir: -1 | 1, + hasModifier: boolean, + now: number +): PrecisionWheelStep { + const active = hasModifier || now - state.lastEventAtMs < PRECISION_WHEEL_STICKY_MS + + if (!active) { + state.active = false + + return { active: false, entered: false, rows: 0 } + } + + const entered = !state.active + + state.active = true + state.lastEventAtMs = now + + if (dir === state.dir && now - state.lastScrollAtMs < PRECISION_WHEEL_FRAME_MS) { + return { active: true, entered, rows: 0 } + } + + state.dir = dir + state.lastScrollAtMs = now + + return { active: true, entered, rows: 1 } +} diff --git a/ui-tui/src/lib/viewportStore.ts b/ui-tui/src/lib/viewportStore.ts index b25ef581f47..25acbd8bebc 100644 --- a/ui-tui/src/lib/viewportStore.ts +++ b/ui-tui/src/lib/viewportStore.ts @@ -11,6 +11,12 @@ export interface ViewportSnapshot { viewportHeight: number } +export interface ScrollbarSnapshot { + scrollHeight: number + top: number + viewportHeight: number +} + const EMPTY: ViewportSnapshot = { atBottom: true, bottom: 0, @@ -20,6 +26,12 @@ const EMPTY: ViewportSnapshot = { viewportHeight: 0 } +const EMPTY_SCROLLBAR: ScrollbarSnapshot = { + scrollHeight: 0, + top: 0, + viewportHeight: 0 +} + export function getViewportSnapshot(s?: ScrollBoxHandle | null): ViewportSnapshot { if (!s) { return EMPTY @@ -52,6 +64,26 @@ export function viewportSnapshotKey(v: ViewportSnapshot) { return `${v.atBottom ? 1 : 0}:${Math.ceil(v.top / 8) * 8}:${v.viewportHeight}:${Math.ceil(v.scrollHeight / 8) * 8}:${v.pending}` } +export function getScrollbarSnapshot(s?: ScrollBoxHandle | null): ScrollbarSnapshot { + if (!s) { + return EMPTY_SCROLLBAR + } + + const viewportHeight = Math.max(0, s.getViewportHeight()) + const scrollHeight = Math.max(viewportHeight, s.getScrollHeight()) + const maxTop = Math.max(0, scrollHeight - viewportHeight) + + return { + scrollHeight, + top: Math.max(0, Math.min(maxTop, s.getScrollTop())), + viewportHeight + } +} + +export function scrollbarSnapshotKey(v: ScrollbarSnapshot) { + return `${v.top}:${v.viewportHeight}:${v.scrollHeight}` +} + export function useViewportSnapshot(scrollRef: RefObject): ViewportSnapshot { const key = useSyncExternalStore( useCallback((cb: () => void) => scrollRef.current?.subscribe(cb) ?? (() => {}), [scrollRef]), @@ -72,3 +104,21 @@ export function useViewportSnapshot(scrollRef: RefObject } }, [key]) } + +export function useScrollbarSnapshot(scrollRef: RefObject): ScrollbarSnapshot { + const key = useSyncExternalStore( + useCallback((cb: () => void) => scrollRef.current?.subscribe(cb) ?? (() => {}), [scrollRef]), + () => scrollbarSnapshotKey(getScrollbarSnapshot(scrollRef.current)), + () => scrollbarSnapshotKey(EMPTY_SCROLLBAR) + ) + + return useMemo(() => { + const [top = '0', viewportHeight = '0', scrollHeight = '0'] = key.split(':') + + return { + scrollHeight: Number(scrollHeight), + top: Number(top), + viewportHeight: Number(viewportHeight) + } + }, [key]) +} From 04cf4788ccc05003785992682e3cb25205e509cc Mon Sep 17 00:00:00 2001 From: brooklyn! Date: Wed, 6 May 2026 15:49:59 -0700 Subject: [PATCH 012/230] fix(tui): restore voice push-to-talk parity (#20897) * fix(tui): restore classic CLI voice push-to-talk parity (cherry picked from commit 93b9ae301bb89f5b5e01b4b9f8ac91ffa74fbd9d) * fix(tui): harden voice push-to-talk stop flow Address review feedback from PR #16189 by stopping the active recorder before background transcription, documenting single-shot voice capture, and covering the TUI gateway flags with regression tests. * fix(tui): preserve silent voice strike tracking Keep single-shot voice recording's no-speech counter alive across starts so the TUI can still emit the three-strikes auto-disable event, and bind the auto-restart state at module scope for type checking. * fix(tui): clean up voice stop failure path Address follow-up review by naming the TUI flow as single-shot push-to-talk and cancelling the recorder when forced stop cannot produce a WAV. * fix(tui): report busy voice capture starts Return explicit start state from the voice wrapper so the TUI gateway does not report recording while forced-stop transcription is still cleaning up. * fix(tui): handle busy voice record responses Apply the gateway busy status immediately in the TUI and route forced-stop voice events to the session that sent the stop request. * fix(tui): clear voice recording on null response Treat a null voice.record RPC result as a failed optimistic start so the REC badge cannot stick after gateway-side errors. * fix(tui): count silent manual voice stops Preserve single-shot voice no-speech strikes through forced stop transcription so empty push-to-talk captures still trigger the three-strikes guard. --------- Co-authored-by: Montbra --- hermes_cli/voice.py | 184 ++++++++++++--- tests/hermes_cli/test_voice_wrapper.py | 219 +++++++++++++++++- tests/test_tui_gateway_server.py | 75 ++++++ tui_gateway/server.py | 23 +- ui-tui/src/__tests__/useInputHandlers.test.ts | 37 +++ ui-tui/src/app/useInputHandlers.ts | 44 +++- ui-tui/src/gatewayTypes.ts | 2 +- 7 files changed, 527 insertions(+), 57 deletions(-) create mode 100644 ui-tui/src/__tests__/useInputHandlers.test.ts diff --git a/hermes_cli/voice.py b/hermes_cli/voice.py index f85f30c7bf4..a4ee6a0842d 100644 --- a/hermes_cli/voice.py +++ b/hermes_cli/voice.py @@ -281,6 +281,8 @@ _recorder_lock = threading.Lock() # ── Continuous (VAD) state ─────────────────────────────────────────── _continuous_lock = threading.Lock() _continuous_active = False +_continuous_stopping = False +_continuous_auto_restart: bool = True _continuous_recorder: Any = None # ── TTS-vs-STT feedback guard ──────────────────────────────────────── @@ -370,32 +372,43 @@ def start_continuous( on_silent_limit: Optional[Callable[[], None]] = None, silence_threshold: int = 200, silence_duration: float = 3.0, -) -> None: + auto_restart: bool = True, +) -> bool: """Start a VAD-driven continuous recording loop. The loop calls ``on_transcript(text)`` each time speech is detected and - transcribed successfully, then auto-restarts. After - ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech - picked up at all) the loop stops itself and calls ``on_silent_limit`` - so the UI can reflect "voice off". Idempotent — calling while already - active is a no-op. + transcribed successfully. If ``auto_restart`` is True, it auto-restarts + for the next turn and resets the no-speech counter for that loop. If + ``auto_restart`` is False, the first silence-triggered transcription ends + the loop and reports ``"idle"``; no-speech counts are retained across + starts so a push-to-talk caller can still enforce the three-strikes guard. + After ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech + picked up at all) the loop stops itself and calls ``on_silent_limit`` so the + UI can reflect "voice off". Returns False if a previous stop is still + transcribing/cleaning up; otherwise returns True. Idempotent — calling while + already active is a successful no-op. ``on_status`` is called with ``"listening"`` / ``"transcribing"`` / ``"idle"`` so the UI can show a live indicator. """ - global _continuous_active, _continuous_recorder + global _continuous_active, _continuous_recorder, _continuous_auto_restart global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit global _continuous_no_speech_count with _continuous_lock: if _continuous_active: _debug("start_continuous: already active — no-op") - return + return True + if _continuous_stopping: + _debug("start_continuous: stop/transcribe in progress — busy") + return False _continuous_active = True + _continuous_auto_restart = auto_restart _continuous_on_transcript = on_transcript _continuous_on_status = on_status _continuous_on_silent_limit = on_silent_limit - _continuous_no_speech_count = 0 + if auto_restart: + _continuous_no_speech_count = 0 if _continuous_recorder is None: _continuous_recorder = create_audio_recorder() @@ -428,15 +441,18 @@ def start_continuous( except Exception: pass + return True -def stop_continuous() -> None: + +def stop_continuous(force_transcribe: bool = False) -> None: """Stop the active continuous loop and release the microphone. - Idempotent — calling while not active is a no-op. Any in-flight - transcription completes but its result is discarded (the callback - checks ``_continuous_active`` before firing). + Idempotent — calling while not active is a no-op. If ``force_transcribe`` is + True, the recorder stops synchronously, then transcription/cleanup runs on a + background thread before reporting ``"idle"``. Otherwise the buffer is + discarded. """ - global _continuous_active, _continuous_on_transcript + global _continuous_active, _continuous_on_transcript, _continuous_stopping global _continuous_on_status, _continuous_on_silent_limit global _continuous_recorder, _continuous_no_speech_count @@ -446,18 +462,98 @@ def stop_continuous() -> None: _continuous_active = False rec = _continuous_recorder on_status = _continuous_on_status + on_transcript = _continuous_on_transcript + on_silent_limit = _continuous_on_silent_limit + auto_restart = _continuous_auto_restart + track_no_speech = force_transcribe and not auto_restart + _continuous_stopping = rec is not None _continuous_on_transcript = None _continuous_on_status = None _continuous_on_silent_limit = None - _continuous_no_speech_count = 0 + if not track_no_speech: + _continuous_no_speech_count = 0 if rec is not None: - try: - # cancel() (not stop()) discards buffered frames — the loop - # is over, we don't want to transcribe a half-captured turn. - rec.cancel() - except Exception as e: - logger.warning("failed to cancel recorder: %s", e) + if force_transcribe and on_transcript: + if on_status: + try: + on_status("transcribing") + except Exception: + pass + try: + wav_path = rec.stop() + except Exception as e: + logger.warning("failed to stop recorder: %s", e) + try: + rec.cancel() + except Exception as cancel_error: + logger.warning("failed to cancel recorder: %s", cancel_error) + wav_path = None + + def _transcribe_and_cleanup(): + global _continuous_no_speech_count, _continuous_stopping + transcript: Optional[str] = None + should_halt = False + + try: + if wav_path: + try: + result = transcribe_recording(wav_path) + if result.get("success"): + text = (result.get("transcript") or "").strip() + if text and not is_whisper_hallucination(text): + transcript = text + finally: + if os.path.isfile(wav_path): + os.unlink(wav_path) + except Exception as e: + logger.warning("failed to stop/transcribe recorder: %s", e) + finally: + if transcript: + try: + on_transcript(transcript) + except Exception as e: + logger.warning("on_transcript callback raised: %s", e) + + if track_no_speech: + with _continuous_lock: + if transcript: + _continuous_no_speech_count = 0 + else: + _continuous_no_speech_count += 1 + should_halt = ( + _continuous_no_speech_count + >= _CONTINUOUS_NO_SPEECH_LIMIT + ) + if should_halt: + _continuous_no_speech_count = 0 + if should_halt and on_silent_limit: + try: + on_silent_limit() + except Exception: + pass + + _play_beep(frequency=660, count=2) + with _continuous_lock: + _continuous_stopping = False + if on_status: + try: + on_status("idle") + except Exception: + pass + + threading.Thread(target=_transcribe_and_cleanup, daemon=True).start() + return + else: + try: + # cancel() (not stop()) discards buffered frames — the loop + # is over, we don't want to transcribe a half-captured turn. + rec.cancel() + except Exception as e: + logger.warning("failed to cancel recorder: %s", e) + + with _continuous_lock: + _continuous_stopping = False # Audible "recording stopped" cue (CLI parity: same 660 Hz × 2 the # silence-auto-stop path plays). @@ -603,23 +699,39 @@ def _continuous_on_silence() -> None: _debug("_continuous_on_silence: stopped while waiting for TTS") return - # Restart for the next turn. - _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})") - _play_beep(frequency=880, count=1) - try: - rec.start(on_silence_stop=_continuous_on_silence) - except Exception as e: - logger.error("failed to restart continuous recording: %s", e) - _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}") + if _continuous_auto_restart: + # Restart for the next turn. + _debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})") + _play_beep(frequency=880, count=1) + try: + rec.start(on_silence_stop=_continuous_on_silence) + except Exception as e: + logger.error("failed to restart continuous recording: %s", e) + _debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}") + with _continuous_lock: + _continuous_active = False + if on_status: + try: + on_status("idle") + except Exception: + pass + return + + if on_status: + try: + on_status("listening") + except Exception: + pass + else: + # Do not auto-restart. Clean up state and notify idle. + _debug("_continuous_on_silence: auto_restart=False, stopping loop") with _continuous_lock: _continuous_active = False - return - - if on_status: - try: - on_status("listening") - except Exception: - pass + if on_status: + try: + on_status("idle") + except Exception: + pass # ── TTS API ────────────────────────────────────────────────────────── diff --git a/tests/hermes_cli/test_voice_wrapper.py b/tests/hermes_cli/test_voice_wrapper.py index 3caacf4313c..c744c08d5b8 100644 --- a/tests/hermes_cli/test_voice_wrapper.py +++ b/tests/hermes_cli/test_voice_wrapper.py @@ -309,6 +309,7 @@ class TestContinuousAPI: # Isolate from any state left behind by other tests in the session. monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_stopping", False, raising=False) monkeypatch.setattr(voice, "_continuous_recorder", None) assert voice.is_continuous_active() is False @@ -343,11 +344,20 @@ class TestContinuousAPI: monkeypatch.setattr(voice, "_continuous_recorder", FakeRecorder()) - voice.start_continuous(on_transcript=lambda _t: None) + started = voice.start_continuous(on_transcript=lambda _t: None) # The guard inside start_continuous short-circuits before rec.start() + assert started is True assert called["n"] == 0 + def test_start_returns_false_while_stopping(self, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr(voice, "_continuous_active", False) + monkeypatch.setattr(voice, "_continuous_stopping", True, raising=False) + + assert voice.start_continuous(on_transcript=lambda _t: None) is False + class TestContinuousLoopSimulation: """End-to-end simulation of the VAD loop with a fake recorder. @@ -368,6 +378,8 @@ class TestContinuousLoopSimulation: monkeypatch.setattr(voice, "_continuous_on_transcript", None) monkeypatch.setattr(voice, "_continuous_on_status", None) monkeypatch.setattr(voice, "_continuous_on_silent_limit", None) + monkeypatch.setattr(voice, "_continuous_auto_restart", True, raising=False) + monkeypatch.setattr(voice, "_play_beep", lambda *_, **__: None) class FakeRecorder: _silence_threshold = 200 @@ -381,13 +393,20 @@ class TestContinuousLoopSimulation: self.cancelled = 0 # Preset WAV path returned by stop() self.next_stop_wav = "/tmp/fake.wav" + self.fail_stop = False + self.fail_next_start = False def start(self, on_silence_stop=None): + if self.fail_next_start: + self.fail_next_start = False + raise RuntimeError("boom") self.start_calls += 1 self.last_callback = on_silence_stop self.is_recording = True def stop(self): + if self.fail_stop: + raise RuntimeError("stop failed") self.stopped += 1 self.is_recording = False return self.next_stop_wav @@ -433,6 +452,204 @@ class TestContinuousLoopSimulation: voice.stop_continuous() + def test_auto_restart_false_stops_after_first_transcript(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "single shot"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + statuses = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_status=lambda s: statuses.append(s), + auto_restart=False, + ) + fake_recorder.last_callback() + + assert transcripts == ["single shot"] + assert fake_recorder.start_calls == 1 + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + + def test_auto_restart_false_retains_silent_strikes_across_starts( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": ""}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + silent_limit_fired = [] + + for _ in range(3): + voice.start_continuous( + on_transcript=lambda _t: None, + on_silent_limit=lambda: silent_limit_fired.append(True), + auto_restart=False, + ) + fake_recorder.last_callback() + + assert silent_limit_fired == [True] + assert voice.is_continuous_active() is False + assert fake_recorder.start_calls == 3 + + def test_force_transcribe_stop_delivers_current_buffer(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "manual stop"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + statuses = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_status=lambda s: statuses.append(s), + ) + voice.stop_continuous(force_transcribe=True) + + assert fake_recorder.stopped == 1 + assert transcripts == ["manual stop"] + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + + def test_force_transcribe_empty_single_shots_hit_silent_limit( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": ""}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + silent_limit_fired = [] + + for _ in range(3): + voice.start_continuous( + on_transcript=lambda _t: None, + on_silent_limit=lambda: silent_limit_fired.append(True), + auto_restart=False, + ) + voice.stop_continuous(force_transcribe=True) + + assert silent_limit_fired == [True] + assert fake_recorder.stopped == 3 + assert voice._continuous_no_speech_count == 0 + + def test_force_transcribe_valid_single_shot_resets_silent_strikes( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + monkeypatch.setattr(voice, "_continuous_no_speech_count", 2) + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "manual stop"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + transcripts = [] + silent_limit_fired = [] + + voice.start_continuous( + on_transcript=lambda t: transcripts.append(t), + on_silent_limit=lambda: silent_limit_fired.append(True), + auto_restart=False, + ) + voice.stop_continuous(force_transcribe=True) + + assert transcripts == ["manual stop"] + assert silent_limit_fired == [] + assert voice._continuous_no_speech_count == 0 + + def test_force_transcribe_stop_failure_cancels_and_clears_stopping( + self, fake_recorder, monkeypatch + ): + import hermes_cli.voice as voice + + class ImmediateThread: + def __init__(self, target, daemon=False): + self.target = target + + def start(self): + self.target() + + monkeypatch.setattr(voice.threading, "Thread", ImmediateThread) + fake_recorder.fail_stop = True + + statuses = [] + voice.start_continuous( + on_transcript=lambda _t: None, + on_status=lambda s: statuses.append(s), + ) + voice.stop_continuous(force_transcribe=True) + + assert fake_recorder.cancelled == 1 + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + assert voice._continuous_stopping is False + + def test_restart_failure_reports_idle(self, fake_recorder, monkeypatch): + import hermes_cli.voice as voice + + monkeypatch.setattr( + voice, + "transcribe_recording", + lambda _p: {"success": True, "transcript": "hello world"}, + ) + monkeypatch.setattr(voice, "is_whisper_hallucination", lambda _t: False) + + statuses = [] + voice.start_continuous(on_transcript=lambda _t: None, on_status=statuses.append) + + fake_recorder.fail_next_start = True + fake_recorder.last_callback() + + assert statuses == ["listening", "transcribing", "idle"] + assert voice.is_continuous_active() is False + def test_silent_limit_halts_loop_after_three_strikes(self, fake_recorder, monkeypatch): import hermes_cli.voice as voice diff --git a/tests/test_tui_gateway_server.py b/tests/test_tui_gateway_server.py index 5a25a306ba0..184f5606a8c 100644 --- a/tests/test_tui_gateway_server.py +++ b/tests/test_tui_gateway_server.py @@ -204,6 +204,7 @@ def test_voice_record_start_handles_non_dict_voice_cfg(monkeypatch): assert resp["result"]["status"] == "recording" assert captured["silence_threshold"] == 200 assert captured["silence_duration"] == 3.0 + assert captured["auto_restart"] is False # Round-12 Copilot review regression on #19835: ``bool`` is a subclass # of ``int``, so the naive ``isinstance(threshold, (int, float))`` @@ -232,6 +233,80 @@ def test_voice_record_start_handles_non_dict_voice_cfg(monkeypatch): assert ( captured["silence_duration"] == 3.0 ), f"bool silence_duration leaked through for {bad_bool_cfg!r}" + assert captured["auto_restart"] is False + + +def test_voice_record_stop_forces_transcription(monkeypatch): + captured: dict = {} + + def fake_stop_continuous(**kwargs): + captured.update(kwargs) + + monkeypatch.setitem( + sys.modules, + "hermes_cli.voice", + types.SimpleNamespace( + start_continuous=lambda **_kwargs: None, + stop_continuous=fake_stop_continuous, + ), + ) + + resp = server.dispatch( + { + "id": "voice-record-stop", + "method": "voice.record", + "params": {"action": "stop"}, + } + ) + + assert resp["result"]["status"] == "stopped" + assert captured["force_transcribe"] is True + + +def test_voice_record_stop_updates_event_session_id(monkeypatch): + monkeypatch.setitem( + sys.modules, + "hermes_cli.voice", + types.SimpleNamespace( + start_continuous=lambda **_kwargs: True, + stop_continuous=lambda **_kwargs: None, + ), + ) + monkeypatch.setattr(server, "_voice_event_sid", "old-session") + + resp = server.dispatch( + { + "id": "voice-record-stop-session", + "method": "voice.record", + "params": {"action": "stop", "session_id": "new-session"}, + } + ) + + assert resp["result"]["status"] == "stopped" + assert server._voice_event_sid == "new-session" + + +def test_voice_record_start_reports_busy_when_stop_is_in_progress(monkeypatch): + monkeypatch.setitem( + sys.modules, + "hermes_cli.voice", + types.SimpleNamespace( + start_continuous=lambda **_kwargs: False, + stop_continuous=lambda **_kwargs: None, + ), + ) + monkeypatch.setenv("HERMES_VOICE", "1") + monkeypatch.setattr(server, "_load_cfg", lambda: {"voice": {}}) + + resp = server.dispatch( + { + "id": "voice-record-busy", + "method": "voice.record", + "params": {"action": "start"}, + } + ) + + assert resp["result"]["status"] == "busy" def test_voice_toggle_tts_branch_also_carries_record_key(monkeypatch): diff --git a/tui_gateway/server.py b/tui_gateway/server.py index b618c5bd56d..4c36a561b1f 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -5619,14 +5619,13 @@ def _(rid, params: dict) -> dict: @method("voice.record") def _(rid, params: dict) -> dict: - """VAD-driven continuous record loop, CLI-parity. + """VAD-bounded push-to-talk capture, CLI-parity. - ``start`` turns on a VAD loop that emits ``voice.transcript`` events - for each detected utterance and auto-restarts for the next turn. - ``stop`` halts the loop (manual stop; matches cli.py's Ctrl+B-while- - recording branch clearing ``_voice_continuous``). Three consecutive - silent cycles stop the loop automatically and emit a - ``voice.transcript`` with ``no_speech_limit=True``. + ``start`` begins one VAD-bounded capture and emits ``voice.transcript`` + after silence stops the recorder. ``stop`` forces transcription of the + active buffer, matching classic CLI push-to-talk. The voice wrapper retains + no-speech counts across single-shot starts, so three consecutive silent + captures emit ``voice.transcript`` with ``no_speech_limit=True``. """ action = params.get("action", "start") @@ -5665,7 +5664,7 @@ def _(rid, params: dict) -> dict: if isinstance(duration, (int, float)) and not isinstance(duration, bool) else 3.0 ) - start_continuous( + started = start_continuous( on_transcript=lambda t: _voice_emit("voice.transcript", {"text": t}), on_status=lambda s: _voice_emit("voice.status", {"state": s}), on_silent_limit=lambda: _voice_emit( @@ -5673,13 +5672,19 @@ def _(rid, params: dict) -> dict: ), silence_threshold=safe_threshold, silence_duration=safe_duration, + auto_restart=False, ) + if started is False: + return _ok(rid, {"status": "busy"}) return _ok(rid, {"status": "recording"}) # action == "stop" + with _voice_sid_lock: + _voice_event_sid = params.get("session_id") or _voice_event_sid + from hermes_cli.voice import stop_continuous - stop_continuous() + stop_continuous(force_transcribe=True) return _ok(rid, {"status": "stopped"}) except ImportError: return _err( diff --git a/ui-tui/src/__tests__/useInputHandlers.test.ts b/ui-tui/src/__tests__/useInputHandlers.test.ts new file mode 100644 index 00000000000..066292abfa5 --- /dev/null +++ b/ui-tui/src/__tests__/useInputHandlers.test.ts @@ -0,0 +1,37 @@ +import { describe, expect, it, vi } from 'vitest' + +import { applyVoiceRecordResponse } from '../app/useInputHandlers.js' + +describe('applyVoiceRecordResponse', () => { + it('reverts optimistic REC state when the gateway reports voice busy', () => { + const setProcessing = vi.fn() + const setRecording = vi.fn() + const sys = vi.fn() + + applyVoiceRecordResponse({ status: 'busy' }, true, { setProcessing, setRecording }, sys) + + expect(setRecording).toHaveBeenCalledWith(false) + expect(setProcessing).toHaveBeenCalledWith(true) + expect(sys).toHaveBeenCalledWith('voice: still transcribing; try again shortly') + }) + + it('keeps optimistic REC state for successful recording starts', () => { + const setProcessing = vi.fn() + const setRecording = vi.fn() + + applyVoiceRecordResponse({ status: 'recording' }, true, { setProcessing, setRecording }, vi.fn()) + + expect(setRecording).not.toHaveBeenCalled() + expect(setProcessing).not.toHaveBeenCalled() + }) + + it('reverts optimistic REC state when the gateway returns null', () => { + const setProcessing = vi.fn() + const setRecording = vi.fn() + + applyVoiceRecordResponse(null, true, { setProcessing, setRecording }, vi.fn()) + + expect(setRecording).toHaveBeenCalledWith(false) + expect(setProcessing).toHaveBeenCalledWith(false) + }) +}) diff --git a/ui-tui/src/app/useInputHandlers.ts b/ui-tui/src/app/useInputHandlers.ts index 3d85a500d8b..ce25af70edd 100644 --- a/ui-tui/src/app/useInputHandlers.ts +++ b/ui-tui/src/app/useInputHandlers.ts @@ -23,6 +23,26 @@ import { getUiState } from './uiStore.js' const isCtrl = (key: { ctrl: boolean }, ch: string, target: string) => key.ctrl && ch.toLowerCase() === target +export function applyVoiceRecordResponse( + response: null | VoiceRecordResponse, + starting: boolean, + voice: Pick, + sys: (text: string) => void +) { + if (!starting || response?.status === 'recording') { + return + } + + voice.setRecording(false) + + if (response?.status === 'busy') { + voice.setProcessing(true) + sys('voice: still transcribing; try again shortly') + } else { + voice.setProcessing(false) + } +} + export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { const { actions, composer, gateway, terminal, voice, wheelStep } = ctx const { actions: cActions, refs: cRefs, state: cState } = composer @@ -157,11 +177,12 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { } } - // CLI parity: Ctrl+B toggles the VAD-driven continuous recording loop + // CLI parity: Ctrl+B toggles a VAD-bounded push-to-talk capture // (NOT the voice-mode umbrella bit). The mode is enabled via /voice on; // Ctrl+B while the mode is off sys-nudges the user. While the mode is - // on, the first press starts a continuous loop (gateway → start_continuous, - // VAD auto-stop → transcribe → auto-restart), a subsequent press stops it. + // on, the first press starts a single VAD-bounded capture + // (gateway -> start_continuous(auto_restart=false), VAD auto-stop -> + // transcribe -> idle), a subsequent press stops and transcribes it. // The gateway publishes voice.status + voice.transcript events that // createGatewayEventHandler turns into UI badges and composer injection. const voiceRecordToggle = () => { @@ -182,14 +203,17 @@ export function useInputHandlers(ctx: InputHandlerContext): InputHandlerResult { voice.setProcessing(false) } - gateway.rpc('voice.record', { action }).catch((e: Error) => { - // Revert optimistic UI on failure. - if (starting) { - voice.setRecording(false) - } + gateway + .rpc('voice.record', { action, session_id: getUiState().sid }) + .then(r => applyVoiceRecordResponse(r, starting, voice, actions.sys)) + .catch((e: Error) => { + // Revert optimistic UI on failure. + if (starting) { + voice.setRecording(false) + } - actions.sys(`voice error: ${e.message}`) - }) + actions.sys(`voice error: ${e.message}`) + }) } useInput((ch, key) => { diff --git a/ui-tui/src/gatewayTypes.ts b/ui-tui/src/gatewayTypes.ts index 0dacd790f06..8c5cb18b23d 100644 --- a/ui-tui/src/gatewayTypes.ts +++ b/ui-tui/src/gatewayTypes.ts @@ -295,7 +295,7 @@ export interface VoiceToggleResponse { } export interface VoiceRecordResponse { - status?: string + status?: 'busy' | 'recording' | 'stopped' text?: string } From 3cdbf334d5074aff0de857c0f94f278f06745e6b Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 6 May 2026 14:08:29 -0700 Subject: [PATCH 013/230] fix(gateway): don't dead-end setup wizard when only system-scope unit is installed The setup wizard dropped non-root users at a bare shell prompt when trying to start a system-scope gateway service. Previously _require_root_for_system_service called sys.exit(1), which the wizard's `except Exception` guards cannot catch (SystemExit is a BaseException). Users with a pre-existing /etc/systemd/system unit (e.g. from an earlier `sudo hermes setup` run) hit this whenever they re-ran `hermes setup` as a regular user. - Convert _require_root_for_system_service to raise a typed SystemScopeRequiresRootError (RuntimeError subclass) instead of sys.exit(1). The direct CLI path (`hermes gateway install|start|stop| restart|uninstall` without sudo) still exits 1 cleanly via a new catch at the top of gateway_command, matching the existing UserSystemdUnavailableError pattern. - Add _system_scope_wizard_would_need_root() pre-check and _print_system_scope_remediation() helper. Both setup wizards (hermes_cli/setup.py and hermes_cli/gateway.py::gateway_setup) now detect the dead-end before prompting and print actionable guidance: either `sudo systemctl start ` this time, or uninstall the system unit and install a per-user one. - Defense-in-depth: all 5 wizard prompt sites also catch SystemScopeRequiresRootError and fall back to the remediation helper if the pre-check is bypassed (race, etc.). Tests: 12 new tests in TestSystemScopeRequiresRootError, TestSystemScopeWizardPreCheck, TestSystemScopeRemediationOutput, and TestGatewayCommandCatchesSystemScopeError covering the exception contract, pre-check matrix (root vs non-root, system-only vs user-present vs none vs explicit system=True), remediation output for each action, and the direct-CLI exit-1 path. --- hermes_cli/gateway.py | 100 +++++++++++++- hermes_cli/setup.py | 24 +++- tests/hermes_cli/test_gateway_service.py | 168 +++++++++++++++++++++++ 3 files changed, 285 insertions(+), 7 deletions(-) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 846736a2cc6..547e8e03c08 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -967,6 +967,27 @@ class UserSystemdUnavailableError(RuntimeError): """ +class SystemScopeRequiresRootError(RuntimeError): + """Raised when a system-scope gateway operation is attempted as non-root. + + System-scope units live in ``/etc/systemd/system/`` and require root for + install / uninstall / start / stop / restart via ``systemctl``. The + previous behavior was ``sys.exit(1)`` which blew past the wizard's + ``except Exception`` guards and dumped the user at a bare shell prompt + with no guidance. Raising a typed exception lets callers that can + recover (the setup wizard) print actionable remediation instead, while + ``gateway_command`` still exits 1 with the same message for the direct + CLI path. + + ``args[0]`` carries the user-facing message, ``args[1]`` the action name. + ``str(e)`` returns only the message (not the tuple repr) so format + strings like ``f"Failed: {e}"`` render cleanly. + """ + + def __str__(self) -> str: + return self.args[0] if self.args else "" + + def _user_dbus_socket_path() -> Path: """Return the expected per-user D-Bus socket path (regardless of existence).""" xdg = os.environ.get("XDG_RUNTIME_DIR") or f"/run/user/{os.getuid()}" @@ -1382,8 +1403,10 @@ def print_systemd_scope_conflict_warning() -> None: def _require_root_for_system_service(action: str) -> None: if os.geteuid() != 0: - print(f"System gateway {action} requires root. Re-run with sudo.") - sys.exit(1) + raise SystemScopeRequiresRootError( + f"System gateway {action} requires root. Re-run with sudo.", + action, + ) def _system_service_identity(run_as_user: str | None = None) -> tuple[str, str, str]: @@ -1930,6 +1953,47 @@ def _select_systemd_scope(system: bool = False) -> bool: return get_systemd_unit_path(system=True).exists() and not get_systemd_unit_path(system=False).exists() +def _system_scope_wizard_would_need_root(system: bool = False) -> bool: + """True when the setup wizard is about to trigger a system-scope operation + as a non-root user. + + Replicates the decision ``_select_systemd_scope`` makes inside + ``systemd_start`` / ``systemd_restart`` / ``systemd_stop`` so the wizard + can detect the dead-end BEFORE prompting, rather than letting + ``SystemScopeRequiresRootError`` propagate out and leave the user + staring at a bare shell. + """ + if os.geteuid() == 0: + return False + return _select_systemd_scope(system=system) + + +def _print_system_scope_remediation(action: str) -> None: + """Print actionable remediation when the wizard skips a system-scope + prompt because the user isn't root. Keeps the wizard flowing instead of + aborting. + """ + svc = get_service_name() + print_warning( + f"Gateway is installed as a system-wide service — " + f"{action} requires root." + ) + print_info(" Options:") + print_info(f" 1. {action.capitalize()} it this time:") + if action == "start": + print_info(f" sudo systemctl start {svc}") + elif action == "stop": + print_info(f" sudo systemctl stop {svc}") + elif action == "restart": + print_info(f" sudo systemctl restart {svc}") + else: + print_info(f" sudo systemctl {action} {svc}") + print_info(" 2. Switch to a per-user service (recommended for personal use):") + print_info(" sudo hermes gateway uninstall --system") + print_info(" hermes gateway install") + print_info(" hermes gateway start") + + def _get_restart_drain_timeout() -> float: """Return the configured gateway restart drain timeout in seconds.""" raw = os.getenv("HERMES_RESTART_DRAIN_TIMEOUT", "").strip() @@ -4115,7 +4179,9 @@ def gateway_setup(): print_success("Gateway service is installed and running.") elif service_installed: print_warning("Gateway service is installed but not running.") - if prompt_yes_no(" Start it now?", True): + if supports_systemd_services() and _system_scope_wizard_would_need_root(): + _print_system_scope_remediation("start") + elif prompt_yes_no(" Start it now?", True): try: if supports_systemd_services(): systemd_start() @@ -4125,6 +4191,12 @@ def gateway_setup(): print_error(" Failed to start — user systemd not reachable:") for line in str(e).splitlines(): print(f" {line}") + except SystemScopeRequiresRootError as e: + # Defense in depth: the pre-check above should have caught + # this, but handle the race/edge case gracefully instead of + # letting the exception escape the wizard. + print_error(f" Failed to start: {e}") + _print_system_scope_remediation("start") except subprocess.CalledProcessError as e: print_error(f" Failed to start: {e}") else: @@ -4174,7 +4246,9 @@ def gateway_setup(): service_running = _is_service_running() if service_running: - if prompt_yes_no(" Restart the gateway to pick up changes?", True): + if supports_systemd_services() and _system_scope_wizard_would_need_root(): + _print_system_scope_remediation("restart") + elif prompt_yes_no(" Restart the gateway to pick up changes?", True): try: if supports_systemd_services(): systemd_restart() @@ -4187,10 +4261,15 @@ def gateway_setup(): print_error(" Restart failed — user systemd not reachable:") for line in str(e).splitlines(): print(f" {line}") + except SystemScopeRequiresRootError as e: + print_error(f" Restart failed: {e}") + _print_system_scope_remediation("restart") except subprocess.CalledProcessError as e: print_error(f" Restart failed: {e}") elif service_installed: - if prompt_yes_no(" Start the gateway service?", True): + if supports_systemd_services() and _system_scope_wizard_would_need_root(): + _print_system_scope_remediation("start") + elif prompt_yes_no(" Start the gateway service?", True): try: if supports_systemd_services(): systemd_start() @@ -4200,6 +4279,9 @@ def gateway_setup(): print_error(" Start failed — user systemd not reachable:") for line in str(e).splitlines(): print(f" {line}") + except SystemScopeRequiresRootError as e: + print_error(f" Start failed: {e}") + _print_system_scope_remediation("start") except subprocess.CalledProcessError as e: print_error(f" Start failed: {e}") else: @@ -4273,6 +4355,14 @@ def gateway_command(args): for line in str(e).splitlines(): print(f" {line}") sys.exit(1) + except SystemScopeRequiresRootError as e: + # The direct ``hermes gateway install|uninstall|start|stop|restart`` + # path lands here when the user typed a system-scope action without + # sudo. Same exit code as before — just gives the wizard a way to + # intercept the same condition with friendlier guidance before the + # error is raised. + print(str(e)) + sys.exit(1) def _gateway_command_inner(args): diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index e82bdafdfa5..f5b8b6c160f 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -2462,6 +2462,9 @@ def setup_gateway(config: dict): launchd_start, launchd_restart, UserSystemdUnavailableError, + SystemScopeRequiresRootError, + _system_scope_wizard_would_need_root, + _print_system_scope_remediation, ) service_installed = _is_service_installed() @@ -2479,7 +2482,9 @@ def setup_gateway(config: dict): print() if service_running: - if prompt_yes_no(" Restart the gateway to pick up changes?", True): + if supports_systemd and _system_scope_wizard_would_need_root(): + _print_system_scope_remediation("restart") + elif prompt_yes_no(" Restart the gateway to pick up changes?", True): try: if supports_systemd: systemd_restart() @@ -2489,10 +2494,19 @@ def setup_gateway(config: dict): print_error(" Restart failed — user systemd not reachable:") for line in str(e).splitlines(): print(f" {line}") + except SystemScopeRequiresRootError as e: + # Defense in depth: the pre-check above should have + # caught this, but a race (unit file appearing mid-run) + # could still land here. Previously this exited the + # whole wizard via sys.exit(1). + print_error(f" Restart failed: {e}") + _print_system_scope_remediation("restart") except Exception as e: print_error(f" Restart failed: {e}") elif service_installed: - if prompt_yes_no(" Start the gateway service?", True): + if supports_systemd and _system_scope_wizard_would_need_root(): + _print_system_scope_remediation("start") + elif prompt_yes_no(" Start the gateway service?", True): try: if supports_systemd: systemd_start() @@ -2502,6 +2516,9 @@ def setup_gateway(config: dict): print_error(" Start failed — user systemd not reachable:") for line in str(e).splitlines(): print(f" {line}") + except SystemScopeRequiresRootError as e: + print_error(f" Start failed: {e}") + _print_system_scope_remediation("start") except Exception as e: print_error(f" Start failed: {e}") elif supports_service_manager: @@ -2529,6 +2546,9 @@ def setup_gateway(config: dict): print_error(" Start failed — user systemd not reachable:") for line in str(e).splitlines(): print(f" {line}") + except SystemScopeRequiresRootError as e: + print_error(f" Start failed: {e}") + _print_system_scope_remediation("start") except Exception as e: print_error(f" Start failed: {e}") except Exception as e: diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index 994e8d02846..b3d90362073 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -2177,3 +2177,171 @@ class TestSystemdInstallOffersLegacyRemoval: assert prompt_called["count"] == 0 assert remove_called["invoked"] is False + + +class TestSystemScopeRequiresRootError: + """Tests for the SystemScopeRequiresRootError replacement of sys.exit(1). + + Before this change, ``_require_root_for_system_service`` called + ``sys.exit(1)`` when non-root code tried a system-scope systemd + operation. The wizard's ``except Exception`` guards don't catch + ``SystemExit`` (it's a ``BaseException`` subclass), so the user was + dumped at a bare shell prompt mid-setup. The fix raises a typed + exception instead, which the wizard intercepts and handles with + actionable remediation. + """ + + def test_require_root_raises_when_non_root(self, monkeypatch): + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000) + + with pytest.raises(gateway_cli.SystemScopeRequiresRootError) as excinfo: + gateway_cli._require_root_for_system_service("start") + + assert excinfo.value.args[0] == "System gateway start requires root. Re-run with sudo." + assert excinfo.value.args[1] == "start" + # str(e) renders only the message, not the tuple repr, so that + # wizard format strings like f"Failed: {e}" print cleanly. + assert str(excinfo.value) == "System gateway start requires root. Re-run with sudo." + assert f"Failed: {excinfo.value}" == "Failed: System gateway start requires root. Re-run with sudo." + + def test_require_root_noop_when_root(self, monkeypatch): + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 0) + + # Should not raise, should not exit + gateway_cli._require_root_for_system_service("start") + + def test_error_is_runtime_error_subclass(self): + """Wizards use ``except Exception`` guards — the error must be a + ``RuntimeError`` (catchable by ``Exception``), NOT a ``SystemExit`` + (``BaseException``), so the wizard can recover from it. + """ + err = gateway_cli.SystemScopeRequiresRootError("msg", "start") + assert isinstance(err, RuntimeError) + assert isinstance(err, Exception) + assert not isinstance(err, SystemExit) + + +class TestSystemScopeWizardPreCheck: + """Tests for _system_scope_wizard_would_need_root — the guard the + wizard uses to detect the dead-end BEFORE prompting the user to start + a service that will fail without sudo. + """ + + @staticmethod + def _setup_units(tmp_path, monkeypatch, system_present: bool, user_present: bool): + sys_dir = tmp_path / "sys" + usr_dir = tmp_path / "usr" + sys_dir.mkdir() + usr_dir.mkdir() + if system_present: + (sys_dir / "hermes-gateway.service").write_text("[Unit]\n") + if user_present: + (usr_dir / "hermes-gateway.service").write_text("[Unit]\n") + monkeypatch.setattr( + gateway_cli, + "get_systemd_unit_path", + lambda system=False: (sys_dir if system else usr_dir) / "hermes-gateway.service", + ) + + def test_non_root_with_only_system_unit_returns_true(self, tmp_path, monkeypatch): + self._setup_units(tmp_path, monkeypatch, system_present=True, user_present=False) + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000) + + assert gateway_cli._system_scope_wizard_would_need_root() is True + + def test_root_never_needs_root(self, tmp_path, monkeypatch): + self._setup_units(tmp_path, monkeypatch, system_present=True, user_present=False) + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 0) + + assert gateway_cli._system_scope_wizard_would_need_root() is False + + def test_non_root_with_user_unit_present_returns_false(self, tmp_path, monkeypatch): + # User-scope unit present — user can start it themselves, no sudo needed. + self._setup_units(tmp_path, monkeypatch, system_present=True, user_present=True) + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000) + + assert gateway_cli._system_scope_wizard_would_need_root() is False + + def test_non_root_with_no_units_returns_false(self, tmp_path, monkeypatch): + self._setup_units(tmp_path, monkeypatch, system_present=False, user_present=False) + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000) + + assert gateway_cli._system_scope_wizard_would_need_root() is False + + def test_non_root_with_explicit_system_arg_returns_true(self, tmp_path, monkeypatch): + # Caller passed system=True explicitly (e.g. ``hermes gateway start --system``). + self._setup_units(tmp_path, monkeypatch, system_present=False, user_present=False) + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000) + + assert gateway_cli._system_scope_wizard_would_need_root(system=True) is True + + +class TestSystemScopeRemediationOutput: + """Tests for _print_system_scope_remediation — the actionable guidance + shown when the wizard detects a system-scope-only setup as non-root. + """ + + def test_start_remediation_mentions_sudo_systemctl_and_uninstall(self, capsys, monkeypatch): + monkeypatch.setattr(gateway_cli, "get_service_name", lambda: "hermes-gateway") + + gateway_cli._print_system_scope_remediation("start") + out = capsys.readouterr().out + + assert "system-wide service" in out + assert "start requires root" in out + assert "sudo systemctl start hermes-gateway" in out + assert "sudo hermes gateway uninstall --system" in out + assert "hermes gateway install" in out + + def test_restart_remediation_uses_systemctl_restart(self, capsys, monkeypatch): + monkeypatch.setattr(gateway_cli, "get_service_name", lambda: "hermes-gateway") + + gateway_cli._print_system_scope_remediation("restart") + out = capsys.readouterr().out + + assert "restart requires root" in out + assert "sudo systemctl restart hermes-gateway" in out + + def test_stop_remediation_uses_systemctl_stop(self, capsys, monkeypatch): + monkeypatch.setattr(gateway_cli, "get_service_name", lambda: "hermes-gateway") + + gateway_cli._print_system_scope_remediation("stop") + out = capsys.readouterr().out + + assert "stop requires root" in out + assert "sudo systemctl stop hermes-gateway" in out + + +class TestGatewayCommandCatchesSystemScopeError: + """The direct CLI path (``hermes gateway start --system`` etc.) must + still exit 1 with a clean message when non-root. The top-level + ``gateway_command`` catches ``SystemScopeRequiresRootError`` and + converts it back to ``sys.exit(1)``, preserving existing CLI behavior. + """ + + def test_non_root_system_start_exits_one_with_clean_message(self, tmp_path, monkeypatch, capsys): + sys_dir = tmp_path / "sys" + usr_dir = tmp_path / "usr" + sys_dir.mkdir() + usr_dir.mkdir() + (sys_dir / "hermes-gateway.service").write_text("[Unit]\n") + monkeypatch.setattr( + gateway_cli, + "get_systemd_unit_path", + lambda system=False: (sys_dir if system else usr_dir) / "hermes-gateway.service", + ) + monkeypatch.setattr(gateway_cli.os, "geteuid", lambda: 1000) + monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True) + monkeypatch.setattr(gateway_cli, "is_termux", lambda: False) + monkeypatch.setattr(gateway_cli, "kill_gateway_processes", lambda **kw: 0) + + args = SimpleNamespace(gateway_command="start", system=True, all=False) + + with pytest.raises(SystemExit) as excinfo: + gateway_cli.gateway_command(args) + + assert excinfo.value.code == 1 + out = capsys.readouterr().out + # Renders the message, NOT the ``('msg', 'action')`` tuple repr + assert "System gateway start requires root. Re-run with sudo." in out + assert "('" not in out # no tuple repr leaking through From 65c762b2e83ea39f5cda56a6abf737c3c864b188 Mon Sep 17 00:00:00 2001 From: Austin Pickett Date: Wed, 6 May 2026 19:30:46 -0400 Subject: [PATCH 014/230] fix(tui): preserve session when switching personality MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, /personality in the TUI called _reset_session_agent() which destroyed the agent, cleared conversation history, and effectively started a new session. This made personality switching disruptive — users lost their entire conversation context. Now /personality updates the agent's ephemeral_system_prompt in-place and injects a pivot marker into the conversation history. The marker tells the model to adopt the new persona from that point forward, which is necessary because LLMs tend to pattern-match their prior responses and continue the established tone without an explicit signal. Changes: - tui_gateway/server.py: Rewrite _apply_personality_to_session to update the agent in-place instead of resetting. Inject a user-role pivot marker so the model actually switches style mid-conversation. - ui-tui/src/app/slash/commands/session.ts: Update help text (no longer mentions history reset). - tests/test_tui_gateway_server.py: Update test to verify history is preserved, pivot marker is injected, and ephemeral prompt is set. --- tests/test_tui_gateway_server.py | 28 ++++++++------ tui_gateway/server.py | 49 ++++++++++++++++++------ ui-tui/src/app/slash/commands/session.ts | 2 +- 3 files changed, 55 insertions(+), 24 deletions(-) diff --git a/tests/test_tui_gateway_server.py b/tests/test_tui_gateway_server.py index 41b5194da63..c81a92e65e5 100644 --- a/tests/test_tui_gateway_server.py +++ b/tests/test_tui_gateway_server.py @@ -1559,13 +1559,15 @@ def test_config_set_personality_rejects_unknown_name(monkeypatch): assert "Unknown personality" in resp["error"]["message"] -def test_config_set_personality_resets_history_and_returns_info(monkeypatch): +def test_config_set_personality_preserves_history_and_returns_info(monkeypatch): + agent = types.SimpleNamespace( + ephemeral_system_prompt=None, _cached_system_prompt="old" + ) session = _session( - agent=types.SimpleNamespace(), + agent=agent, history=[{"role": "user", "text": "hi"}], history_version=4, ) - new_agent = types.SimpleNamespace(model="x") emits = [] server._sessions["sid"] = session @@ -1574,13 +1576,9 @@ def test_config_set_personality_resets_history_and_returns_info(monkeypatch): "_available_personalities", lambda cfg=None: {"helpful": "You are helpful."}, ) - monkeypatch.setattr( - server, "_make_agent", lambda sid, key, session_id=None: new_agent - ) monkeypatch.setattr( server, "_session_info", lambda agent: {"model": getattr(agent, "model", "?")} ) - monkeypatch.setattr(server, "_restart_slash_worker", lambda session: None) monkeypatch.setattr(server, "_emit", lambda *args: emits.append(args)) monkeypatch.setattr(server, "_write_config_key", lambda path, value: None) @@ -1592,11 +1590,19 @@ def test_config_set_personality_resets_history_and_returns_info(monkeypatch): } ) - assert resp["result"]["history_reset"] is True - assert resp["result"]["info"] == {"model": "x"} - assert session["history"] == [] + assert resp["result"]["history_reset"] is False + assert resp["result"]["info"] == {"model": "?"} + # History is preserved with a pivot marker appended + assert len(session["history"]) == 2 + assert session["history"][0] == {"role": "user", "text": "hi"} + assert session["history"][1]["role"] == "user" + assert "personality" in session["history"][1]["content"].lower() + assert "You are helpful." in session["history"][1]["content"] assert session["history_version"] == 5 - assert ("session.info", "sid", {"model": "x"}) in emits + # Agent's system prompt was updated in-place; cached prompt untouched + assert agent.ephemeral_system_prompt == "You are helpful." + assert agent._cached_system_prompt == "old" + assert ("session.info", "sid", {"model": "?"}) in emits def test_session_compress_uses_compress_helper(monkeypatch): diff --git a/tui_gateway/server.py b/tui_gateway/server.py index 724fb542e67..690607cca3e 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -1680,21 +1680,46 @@ def _validate_personality(value: str, cfg: dict | None = None) -> tuple[str, str def _apply_personality_to_session( sid: str, session: dict, new_prompt: str ) -> tuple[bool, dict | None]: + """Apply a personality change to an existing session without resetting history. + + Updates the agent's ephemeral system prompt in-place so the new personality + takes effect on the next turn. The cached base system prompt is left intact + (ephemeral_system_prompt is appended at API-call time, not baked into the + cache), which preserves prompt-cache hits. + + Also injects a system-role marker into the conversation history so the model + knows to pivot its style from this point forward (without this, LLMs tend to + continue the tone established by earlier messages in the transcript). + + Returns (history_reset, info) — history_reset is always False since we + preserve the conversation. + """ if not session: return False, None - try: - info = _reset_session_agent(sid, session) - return True, info - except Exception: - if session.get("agent"): - agent = session["agent"] - agent.ephemeral_system_prompt = new_prompt or None - agent._cached_system_prompt = None - info = _session_info(agent) - _emit("session.info", sid, info) - return False, info - return False, None + agent = session.get("agent") + if agent: + agent.ephemeral_system_prompt = new_prompt or None + # Inject a pivot marker into history so the model sees the change point. + # This prevents it from pattern-matching its prior style. + if new_prompt: + marker = ( + "[System: The user has changed the assistant's personality. " + "From this point forward, adopt the following persona and respond " + f"accordingly: {new_prompt}]" + ) + else: + marker = ( + "[System: The user has cleared the personality overlay. " + "From this point forward, respond in your normal default style.]" + ) + with session["history_lock"]: + session["history"].append({"role": "user", "content": marker}) + session["history_version"] = int(session.get("history_version", 0)) + 1 + info = _session_info(agent) + _emit("session.info", sid, info) + return False, info + return False, None def _cfg_max_turns(cfg: dict, default: int) -> int: diff --git a/ui-tui/src/app/slash/commands/session.ts b/ui-tui/src/app/slash/commands/session.ts index 0a5324ef559..2ca77fc3d74 100644 --- a/ui-tui/src/app/slash/commands/session.ts +++ b/ui-tui/src/app/slash/commands/session.ts @@ -109,7 +109,7 @@ export const sessionCommands: SlashCommand[] = [ }, { - help: 'switch or reset personality (history reset on set)', + help: 'switch personality for this session', name: 'personality', run: (arg, ctx) => { if (!arg) { From d797755a1c17566b0aef4d77548a4b460142d26a Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Tue, 5 May 2026 21:55:58 -0600 Subject: [PATCH 015/230] fix(gateway): wait for systemd restart readiness --- gateway/platforms/discord.py | 191 +++++++++++++++++++++-- hermes_cli/gateway.py | 175 +++++++++++++++++---- tests/gateway/test_discord_connect.py | 140 +++++++++++++++++ tests/hermes_cli/test_gateway_service.py | 159 +++++++++++++++---- 4 files changed, 587 insertions(+), 78 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index e30c4478ef9..f0ee06f8ca7 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -10,6 +10,8 @@ Uses discord.py library for: """ import asyncio +import hashlib +import json import logging import os import struct @@ -24,6 +26,9 @@ logger = logging.getLogger(__name__) VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080} _DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"} +_DISCORD_COMMAND_SYNC_STATE_FILE = "discord_command_sync_state.json" +_DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5 +_DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0 try: import discord @@ -45,6 +50,7 @@ from gateway.config import Platform, PlatformConfig import re from gateway.platforms.helpers import MessageDeduplicator, ThreadParticipationTracker +from utils import atomic_json_write from gateway.platforms.base import ( BasePlatformAdapter, MessageEvent, @@ -825,6 +831,128 @@ class DiscordAdapter(BasePlatformAdapter): logger.info("[%s] Disconnected", self.name) + def _command_sync_state_path(self) -> _Path: + from hermes_constants import get_hermes_home + + return get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_FILE + + def _read_command_sync_state(self) -> dict: + try: + path = self._command_sync_state_path() + if not path.exists(): + return {} + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + return {} + return data if isinstance(data, dict) else {} + + def _write_command_sync_state(self, state: dict) -> None: + atomic_json_write( + self._command_sync_state_path(), + state, + indent=None, + separators=(",", ":"), + ) + + def _command_sync_state_key(self, app_id: Any) -> str: + return str(app_id or "unknown") + + def _desired_command_sync_fingerprint(self) -> str: + tree = self._client.tree if self._client else None + desired = [] + if tree is not None: + desired = [ + self._canonicalize_app_command_payload(command.to_dict(tree)) + for command in tree.get_commands() + ] + desired.sort(key=lambda item: (item.get("type", 1), item.get("name", ""))) + payload = json.dumps(desired, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + def _command_sync_skip_reason(self, app_id: Any, fingerprint: str) -> Optional[str]: + entry = self._read_command_sync_state().get(self._command_sync_state_key(app_id)) + if not isinstance(entry, dict): + return None + now = time.time() + retry_after_until = float(entry.get("retry_after_until") or 0) + if retry_after_until > now: + remaining = max(1, int(retry_after_until - now)) + return f"Discord asked us to wait before syncing slash commands; retry in {remaining}s" + if entry.get("fingerprint") == fingerprint and entry.get("last_success_at"): + return "same slash-command fingerprint already synced" + return None + + def _record_command_sync_attempt(self, app_id: Any, fingerprint: str) -> None: + state = self._read_command_sync_state() + state[self._command_sync_state_key(app_id)] = { + **( + state.get(self._command_sync_state_key(app_id)) + if isinstance(state.get(self._command_sync_state_key(app_id)), dict) + else {} + ), + "fingerprint": fingerprint, + "last_attempt_at": time.time(), + } + self._write_command_sync_state(state) + + def _record_command_sync_rate_limit(self, app_id: Any, fingerprint: str, retry_after: float) -> None: + retry_after = max(1.0, float(retry_after)) + state = self._read_command_sync_state() + state[self._command_sync_state_key(app_id)] = { + **( + state.get(self._command_sync_state_key(app_id)) + if isinstance(state.get(self._command_sync_state_key(app_id)), dict) + else {} + ), + "fingerprint": fingerprint, + "last_attempt_at": time.time(), + "retry_after_until": time.time() + retry_after, + "retry_after": retry_after, + } + self._write_command_sync_state(state) + + def _record_command_sync_success(self, app_id: Any, fingerprint: str, summary: dict) -> None: + state = self._read_command_sync_state() + state[self._command_sync_state_key(app_id)] = { + "fingerprint": fingerprint, + "last_attempt_at": time.time(), + "last_success_at": time.time(), + "summary": summary, + } + self._write_command_sync_state(state) + + @staticmethod + def _extract_discord_retry_after(exc: BaseException) -> Optional[float]: + value = getattr(exc, "retry_after", None) + if value is not None: + try: + return max(1.0, float(value)) + except (TypeError, ValueError): + return None + response = getattr(exc, "response", None) + headers = getattr(response, "headers", None) + if headers: + for key in ("Retry-After", "X-RateLimit-Reset-After"): + try: + raw = headers.get(key) + except Exception: + raw = None + if raw is None: + continue + try: + return max(1.0, float(raw)) + except (TypeError, ValueError): + continue + return None + + def _command_sync_mutation_interval_seconds(self) -> float: + return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS + + async def _sleep_between_command_sync_mutations(self) -> None: + interval = self._command_sync_mutation_interval_seconds() + if interval > 0: + await asyncio.sleep(interval) + async def _run_post_connect_initialization(self) -> None: """Finish non-critical startup work after Discord is connected.""" if not self._client: @@ -840,14 +968,42 @@ class DiscordAdapter(BasePlatformAdapter): logger.info("[%s] Synced %d slash command(s) via bulk tree sync", self.name, len(synced)) return - # Discord's per-app command-management bucket is ~5 writes / 20 s, - # so a mass-prune-plus-upsert reconcile (e.g. 77 orphans + 30 - # desired = 107 writes) takes several minutes of forced waits. - # A flat 30 s budget blew up reliably under bucket pressure and - # left slash commands broken for ~60 min until the bucket fully - # recovered. Use a wide ceiling; the cap still guards against a - # true hang. (#16713) - summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600) + app_id = getattr(self._client, "application_id", None) or getattr(getattr(self._client, "user", None), "id", None) + fingerprint = self._desired_command_sync_fingerprint() + skip_reason = self._command_sync_skip_reason(app_id, fingerprint) + if skip_reason: + logger.info("[%s] Skipping Discord slash command sync: %s", self.name, skip_reason) + return + self._record_command_sync_attempt(app_id, fingerprint) + + http = getattr(self._client, "http", None) + has_ratelimit_timeout = http is not None and hasattr(http, "max_ratelimit_timeout") + previous_ratelimit_timeout = getattr(http, "max_ratelimit_timeout", None) if has_ratelimit_timeout else None + if has_ratelimit_timeout: + http.max_ratelimit_timeout = _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS + + try: + # Discord's per-app command-management bucket is small, and + # discord.py can otherwise sit inside one long retry sleep + # before surfacing the 429. Keep the whole sync bounded and + # persist Discord's retry-after when it refuses the batch. + summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600) + except Exception as e: + retry_after = self._extract_discord_retry_after(e) + if retry_after is not None: + self._record_command_sync_rate_limit(app_id, fingerprint, retry_after) + logger.warning( + "[%s] Discord rate-limited slash command sync; retrying after %.0fs", + self.name, + retry_after, + ) + return + raise + finally: + if has_ratelimit_timeout: + http.max_ratelimit_timeout = previous_ratelimit_timeout + + self._record_command_sync_success(app_id, fingerprint, summary) logger.info( "[%s] Safely reconciled %d slash command(s): unchanged=%d updated=%d recreated=%d created=%d deleted=%d", self.name, @@ -1009,11 +1165,20 @@ class DiscordAdapter(BasePlatformAdapter): created = 0 deleted = 0 http = self._client.http + mutation_count = 0 + + async def mutate(call, *args): + nonlocal mutation_count + if mutation_count: + await self._sleep_between_command_sync_mutations() + result = await call(*args) + mutation_count += 1 + return result for key, desired in desired_by_key.items(): current = existing_by_key.pop(key, None) if current is None: - await http.upsert_global_command(app_id, desired) + await mutate(http.upsert_global_command, app_id, desired) created += 1 continue @@ -1025,16 +1190,16 @@ class DiscordAdapter(BasePlatformAdapter): continue if self._patchable_app_command_payload(current_existing_payload) == self._patchable_app_command_payload(desired): - await http.delete_global_command(app_id, current.id) - await http.upsert_global_command(app_id, desired) + await mutate(http.delete_global_command, app_id, current.id) + await mutate(http.upsert_global_command, app_id, desired) recreated += 1 continue - await http.edit_global_command(app_id, current.id, desired) + await mutate(http.edit_global_command, app_id, current.id, desired) updated += 1 for current in existing_by_key.values(): - await http.delete_global_command(app_id, current.id) + await mutate(http.delete_global_command, app_id, current.id) deleted += 1 return { diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 547e8e03c08..232f8dac804 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -505,6 +505,7 @@ def _read_systemd_unit_properties( "SubState", "Result", "ExecMainStatus", + "MainPID", ), ) -> dict[str, str]: """Return selected ``systemctl show`` properties for the gateway unit.""" @@ -538,6 +539,41 @@ def _read_systemd_unit_properties( return parsed +def _systemd_main_pid_from_props(props: dict[str, str]) -> int | None: + try: + pid = int(props.get("MainPID", "0") or "0") + except (TypeError, ValueError): + return None + return pid if pid > 0 else None + + +def _systemd_main_pid(system: bool = False) -> int | None: + return _systemd_main_pid_from_props(_read_systemd_unit_properties(system=system)) + + +def _read_gateway_runtime_status() -> dict | None: + try: + from gateway.status import read_runtime_status + + state = read_runtime_status() + except Exception: + return None + return state if isinstance(state, dict) else None + + +def _gateway_runtime_status_for_pid(pid: int | None) -> dict | None: + if not pid: + return None + state = _read_gateway_runtime_status() + if not state: + return None + try: + state_pid = int(state.get("pid", 0) or 0) + except (TypeError, ValueError): + return None + return state if state_pid == pid else None + + def _wait_for_systemd_service_restart( *, system: bool = False, @@ -550,6 +586,7 @@ def _wait_for_systemd_service_restart( svc = get_service_name() scope_label = _service_scope_label(system).capitalize() deadline = time.time() + timeout + printed_runtime_wait = False while time.time() < deadline: props = _read_systemd_unit_properties(system=system) @@ -562,19 +599,32 @@ def _wait_for_systemd_service_restart( new_pid = get_running_pid() except Exception: new_pid = None + if not new_pid: + new_pid = _systemd_main_pid_from_props(props) if active_state == "active": if new_pid and (previous_pid is None or new_pid != previous_pid): - print(f"✓ {scope_label} service restarted (PID {new_pid})") - return True - if previous_pid is None: - print(f"✓ {scope_label} service restarted") - return True + runtime_state = _gateway_runtime_status_for_pid(new_pid) + gateway_state = (runtime_state or {}).get("gateway_state") + if gateway_state == "running": + print(f"✓ {scope_label} service restarted (PID {new_pid})") + return True + if gateway_state == "startup_failed": + reason = (runtime_state or {}).get("exit_reason") or "startup failed" + print(f"⚠ {scope_label} service process restarted (PID {new_pid}), but gateway startup failed: {reason}") + return False + if not printed_runtime_wait: + print(f"⏳ {scope_label} service process started (PID {new_pid}); waiting for gateway runtime...") + printed_runtime_wait = True if active_state == "activating" and sub_state == "auto-restart": time.sleep(1) continue + if _systemd_unit_is_start_limited(props): + _print_systemd_start_limit_wait(system=system) + return False + time.sleep(2) print( @@ -585,6 +635,46 @@ def _wait_for_systemd_service_restart( return False +def _systemd_unit_is_start_limited(props: dict[str, str]) -> bool: + result = props.get("Result", "").lower() + sub_state = props.get("SubState", "").lower() + return result == "start-limit-hit" or sub_state == "start-limit-hit" + + +def _systemd_error_indicates_start_limit(exc: subprocess.CalledProcessError) -> bool: + parts: list[str] = [] + for attr in ("stderr", "stdout", "output"): + value = getattr(exc, attr, None) + if not value: + continue + if isinstance(value, bytes): + value = value.decode(errors="replace") + parts.append(str(value)) + text = "\n".join(parts).lower() + return ( + "start-limit-hit" in text + or "start request repeated too quickly" in text + or "start-limit" in text + ) + + +def _systemd_service_is_start_limited(system: bool = False) -> bool: + return _systemd_unit_is_start_limited(_read_systemd_unit_properties(system=system)) + + +def _print_systemd_start_limit_wait(system: bool = False) -> None: + svc = get_service_name() + scope_label = _service_scope_label(system).capitalize() + scope_flag = " --system" if system else "" + systemctl_prefix = "systemctl " if system else "systemctl --user " + journal_prefix = "journalctl " if system else "journalctl --user " + print(f"⏳ {scope_label} service is temporarily rate-limited by systemd.") + print(" systemd is refusing another immediate start after repeated exits.") + print(f" Wait for the start-limit window to expire, then run: {'sudo ' if system else ''}hermes gateway restart{scope_flag}") + print(f" Or clear the failed state manually: {systemctl_prefix}reset-failed {svc}") + print(f" Check logs: {journal_prefix}-u {svc} -l --since '5 min ago'") + + def _recover_pending_systemd_restart(system: bool = False, previous_pid: int | None = None) -> bool: """Recover a planned service restart that is stuck in systemd state.""" props = _read_systemd_unit_properties(system=system) @@ -2135,41 +2225,52 @@ def systemd_restart(system: bool = False): refresh_systemd_unit_if_needed(system=system) from gateway.status import get_running_pid - pid = get_running_pid() - if pid is not None and _request_gateway_self_restart(pid): - import time + pid = get_running_pid() or _systemd_main_pid(system=system) + if pid is not None: scope_label = _service_scope_label(system).capitalize() svc = get_service_name() + drain_timeout = _get_restart_drain_timeout() - # Phase 1: wait for old process to exit (drain + shutdown) - print(f"⏳ {scope_label} service draining active work...") - deadline = time.time() + 90 - while time.time() < deadline: - try: - os.kill(pid, 0) - time.sleep(1) - except (ProcessLookupError, PermissionError): - break # old process is gone - else: - print(f"⚠ Old process (PID {pid}) still alive after 90s") + print(f"⏳ {scope_label} service restarting gracefully (PID {pid})...") + if _graceful_restart_via_sigusr1(pid, drain_timeout + 5): + # The gateway exits with code 75 for a planned service restart. + # RestartSec can otherwise delay the relaunch even though the + # operator asked for an immediate restart, so kick the unit once + # the old PID has exited and then wait for the replacement PID. + _run_systemctl( + ["reset-failed", svc], + system=system, + check=False, + timeout=30, + ) + _run_systemctl( + ["restart", svc], + system=system, + check=False, + timeout=90, + ) + if _wait_for_systemd_service_restart(system=system, previous_pid=pid): + return + if _systemd_service_is_start_limited(system=system): + return - # The gateway exits with code 75 for a planned service restart. - # systemd can sit in the RestartSec window or even wedge itself into a - # failed/rate-limited state if the operator asks for another restart in - # the middle of that handoff. Clear any stale failed state and kick the - # unit immediately so `hermes gateway restart` behaves idempotently. + print( + f"⚠ Graceful restart did not complete within {int(drain_timeout + 5)}s; " + "forcing a service restart..." + ) _run_systemctl( ["reset-failed", svc], system=system, check=False, timeout=30, ) - _run_systemctl( - ["start", svc], - system=system, - check=False, - timeout=90, - ) + try: + _run_systemctl(["restart", svc], system=system, check=True, timeout=90) + except subprocess.CalledProcessError as exc: + if _systemd_error_indicates_start_limit(exc) or _systemd_service_is_start_limited(system=system): + _print_systemd_start_limit_wait(system=system) + return + raise _wait_for_systemd_service_restart(system=system, previous_pid=pid) return @@ -2182,8 +2283,14 @@ def systemd_restart(system: bool = False): check=False, timeout=30, ) - _run_systemctl(["reload-or-restart", get_service_name()], system=system, check=True, timeout=90) - print(f"✓ {_service_scope_label(system).capitalize()} service restarted") + try: + _run_systemctl(["restart", get_service_name()], system=system, check=True, timeout=90) + except subprocess.CalledProcessError as exc: + if _systemd_error_indicates_start_limit(exc) or _systemd_service_is_start_limited(system=system): + _print_systemd_start_limit_wait(system=system) + return + raise + _wait_for_systemd_service_restart(system=system, previous_pid=pid) @@ -2255,6 +2362,10 @@ def systemd_status(deep: bool = False, system: bool = False, full: bool = False) result_code = unit_props.get("Result", "") if active_state == "activating" and sub_state == "auto-restart": print(" ⏳ Restart pending: systemd is waiting to relaunch the gateway") + elif _systemd_unit_is_start_limited(unit_props): + print(" ⏳ Restart pending: systemd is temporarily rate-limiting starts") + print(f" Run after the start-limit window expires: {'sudo ' if system else ''}hermes gateway restart{scope_flag}") + print(f" Or clear it manually: systemctl {'--user ' if not system else ''}reset-failed {get_service_name()}") elif active_state == "failed" and exec_main_status == str(GATEWAY_SERVICE_RESTART_EXIT_CODE): print(" ⚠ Planned restart is stuck in systemd failed state (exit 75)") print(f" Run: systemctl {'--user ' if not system else ''}reset-failed {get_service_name()} && {'sudo ' if system else ''}hermes gateway start{scope_flag}") diff --git a/tests/gateway/test_discord_connect.py b/tests/gateway/test_discord_connect.py index dd49e78e182..57b3791a058 100644 --- a/tests/gateway/test_discord_connect.py +++ b/tests/gateway/test_discord_connect.py @@ -1,4 +1,5 @@ import asyncio +import json import sys from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock @@ -70,6 +71,15 @@ import gateway.platforms.discord as discord_platform # noqa: E402 from gateway.platforms.discord import DiscordAdapter # noqa: E402 +@pytest.fixture(autouse=True) +def _speed_up_command_sync_mutation_pacing(monkeypatch): + monkeypatch.setattr( + DiscordAdapter, + "_command_sync_mutation_interval_seconds", + lambda self: 0.0, + ) + + class FakeTree: def __init__(self): self.sync = AsyncMock(return_value=[]) @@ -536,6 +546,136 @@ async def test_post_connect_initialization_skips_sync_when_policy_off(monkeypatc fake_tree.sync.assert_not_called() +@pytest.mark.asyncio +async def test_post_connect_initialization_skips_same_fingerprint_after_success(tmp_path, monkeypatch): + adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token")) + monkeypatch.setattr("hermes_constants.get_hermes_home", lambda: tmp_path) + + class _DesiredCommand: + def to_dict(self, tree): + return { + "name": "status", + "description": "Show Hermes status", + "type": 1, + "options": [], + } + + fake_tree = SimpleNamespace( + get_commands=lambda: [_DesiredCommand()], + fetch_commands=AsyncMock(return_value=[]), + ) + fake_http = SimpleNamespace( + upsert_global_command=AsyncMock(), + edit_global_command=AsyncMock(), + delete_global_command=AsyncMock(), + ) + adapter._client = SimpleNamespace( + tree=fake_tree, + http=fake_http, + application_id=999, + user=SimpleNamespace(id=999), + ) + + await adapter._run_post_connect_initialization() + await adapter._run_post_connect_initialization() + + fake_tree.fetch_commands.assert_awaited_once() + fake_http.upsert_global_command.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_post_connect_initialization_respects_discord_retry_after(tmp_path, monkeypatch): + adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token")) + monkeypatch.setattr("hermes_constants.get_hermes_home", lambda: tmp_path) + + class _DesiredCommand: + def to_dict(self, tree): + return { + "name": "status", + "description": "Show Hermes status", + "type": 1, + "options": [], + } + + adapter._client = SimpleNamespace( + tree=SimpleNamespace(get_commands=lambda: [_DesiredCommand()]), + application_id=999, + user=SimpleNamespace(id=999), + ) + class _DiscordRateLimit(RuntimeError): + retry_after = 123.0 + + sync = AsyncMock(side_effect=_DiscordRateLimit("discord rate limited")) + monkeypatch.setattr(adapter, "_safe_sync_slash_commands", sync) + + await adapter._run_post_connect_initialization() + await adapter._run_post_connect_initialization() + + sync.assert_awaited_once() + state = json.loads((tmp_path / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILE).read_text()) + entry = state["999"] + assert entry["retry_after"] == 123.0 + assert entry["retry_after_until"] > entry["last_attempt_at"] + + +@pytest.mark.asyncio +async def test_safe_sync_slash_commands_paces_mutation_writes(monkeypatch): + adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token")) + monkeypatch.setattr( + DiscordAdapter, + "_command_sync_mutation_interval_seconds", + lambda self: 1.25, + ) + sleeps = [] + + async def fake_sleep(delay): + sleeps.append(delay) + + monkeypatch.setattr(discord_platform.asyncio, "sleep", fake_sleep) + + class _DesiredCommand: + def __init__(self, payload): + self._payload = payload + + def to_dict(self, tree): + assert tree is not None + return dict(self._payload) + + desired_one = { + "name": "status", + "description": "Show Hermes status", + "type": 1, + "options": [], + } + desired_two = { + "name": "debug", + "description": "Generate a debug report", + "type": 1, + "options": [], + } + fake_tree = SimpleNamespace( + get_commands=lambda: [_DesiredCommand(desired_one), _DesiredCommand(desired_two)], + fetch_commands=AsyncMock(return_value=[]), + ) + fake_http = SimpleNamespace( + upsert_global_command=AsyncMock(), + edit_global_command=AsyncMock(), + delete_global_command=AsyncMock(), + ) + adapter._client = SimpleNamespace( + tree=fake_tree, + http=fake_http, + application_id=999, + user=SimpleNamespace(id=999), + ) + + summary = await adapter._safe_sync_slash_commands() + + assert summary["created"] == 2 + assert fake_http.upsert_global_command.await_count == 2 + assert sleeps == [1.25] + + @pytest.mark.asyncio async def test_safe_sync_reads_permission_attrs_from_existing_command(): """Regression: AppCommand.to_dict() in discord.py does NOT include diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index b3d90362073..15968f798ed 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -2,6 +2,7 @@ import os import pwd +import subprocess from pathlib import Path from types import SimpleNamespace @@ -90,6 +91,13 @@ class TestSystemdServiceRefresh: monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda system=False, run_as_user=None: "new unit\n") calls = [] + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + monkeypatch.setattr(gateway_cli, "_recover_pending_systemd_restart", lambda system=False, previous_pid=None: False) + monkeypatch.setattr( + gateway_cli, + "_wait_for_systemd_service_restart", + lambda system=False, previous_pid=None: calls.append(("wait", system, previous_pid)) or True, + ) def fake_run(cmd, check=True, **kwargs): calls.append(cmd) @@ -100,11 +108,12 @@ class TestSystemdServiceRefresh: gateway_cli.systemd_restart() assert unit_path.read_text(encoding="utf-8") == "new unit\n" - assert calls[:4] == [ + assert calls[:5] == [ ["systemctl", "--user", "daemon-reload"], - ["systemctl", "--user", "show", gateway_cli.get_service_name(), "--no-pager", "--property", "ActiveState,SubState,Result,ExecMainStatus"], + ["systemctl", "--user", "show", gateway_cli.get_service_name(), "--no-pager", "--property", "ActiveState,SubState,Result,ExecMainStatus,MainPID"], ["systemctl", "--user", "reset-failed", gateway_cli.get_service_name()], - ["systemctl", "--user", "reload-or-restart", gateway_cli.get_service_name()], + ["systemctl", "--user", "restart", gateway_cli.get_service_name()], + ("wait", False, None), ] def test_systemd_stop_marks_running_gateway_as_planned_stop(self, monkeypatch): @@ -611,62 +620,141 @@ class TestGatewayServiceDetection: assert gateway_cli._is_service_running() is False class TestGatewaySystemServiceRouting: - def test_systemd_restart_self_requests_graceful_restart_and_waits(self, monkeypatch, capsys): + def test_systemd_restart_gracefully_restarts_running_service_and_waits(self, monkeypatch, capsys): calls = [] monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) monkeypatch.setattr(gateway_cli, "_require_service_installed", lambda action, system=False: None) monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: calls.append(("refresh", system))) + monkeypatch.setattr(gateway_cli, "_get_restart_drain_timeout", lambda: 12.0) monkeypatch.setattr( "gateway.status.get_running_pid", lambda: 654, ) monkeypatch.setattr( gateway_cli, - "_request_gateway_self_restart", - lambda pid: calls.append(("self", pid)) or True, + "_graceful_restart_via_sigusr1", + lambda pid, timeout: calls.append(("graceful", pid, timeout)) or True, ) - # Simulate: old process dies immediately, new process becomes active - kill_call_count = [0] - def fake_kill(pid, sig): - kill_call_count[0] += 1 - if kill_call_count[0] >= 2: # first call checks, second = dead - raise ProcessLookupError() - monkeypatch.setattr(os, "kill", fake_kill) - - # Simulate systemctl reset-failed/start followed by an active unit - new_pid = [None] + # Simulate systemctl reset-failed/restart followed by an active unit. + # A plain start does not break systemd's auto-restart timer once the + # old gateway has exited with the planned restart code. def fake_subprocess_run(cmd, **kwargs): if "reset-failed" in cmd: calls.append(("reset-failed", cmd)) return SimpleNamespace(stdout="", returncode=0) - if "start" in cmd: - calls.append(("start", cmd)) + if "restart" in cmd: + calls.append(("restart", cmd)) return SimpleNamespace(stdout="", returncode=0) - if "show" in cmd: - new_pid[0] = 999 - return SimpleNamespace( - stdout="ActiveState=active\nSubState=running\nResult=success\nExecMainStatus=0\n", - returncode=0, - ) raise AssertionError(f"Unexpected systemctl call: {cmd}") monkeypatch.setattr(gateway_cli.subprocess, "run", fake_subprocess_run) - # get_running_pid returns new PID after restart - pid_calls = [0] - def fake_get_pid(): - pid_calls[0] += 1 - return 999 if pid_calls[0] > 1 else 654 - monkeypatch.setattr("gateway.status.get_running_pid", fake_get_pid) + monkeypatch.setattr( + gateway_cli, + "_wait_for_systemd_service_restart", + lambda system=False, previous_pid=None: calls.append(("wait", system, previous_pid)) or True, + ) gateway_cli.systemd_restart() - assert ("self", 654) in calls + assert ("graceful", 654, 17.0) in calls assert any(call[0] == "reset-failed" for call in calls) - assert any(call[0] == "start" for call in calls) + assert any(call[0] == "restart" for call in calls) + assert ("wait", False, 654) in calls out = capsys.readouterr().out.lower() - assert "restarted" in out + assert "restarting gracefully" in out + + def test_systemd_restart_uses_systemd_main_pid_when_pid_file_is_missing(self, monkeypatch, capsys): + calls = [] + + monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) + monkeypatch.setattr(gateway_cli, "_require_service_installed", lambda action, system=False: None) + monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None) + monkeypatch.setattr(gateway_cli, "_get_restart_drain_timeout", lambda: 10.0) + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + monkeypatch.setattr( + gateway_cli, + "_read_systemd_unit_properties", + lambda system=False: { + "ActiveState": "active", + "SubState": "running", + "Result": "success", + "ExecMainStatus": "0", + "MainPID": "777", + }, + ) + monkeypatch.setattr( + gateway_cli, + "_graceful_restart_via_sigusr1", + lambda pid, timeout: calls.append(("graceful", pid, timeout)) or True, + ) + monkeypatch.setattr(gateway_cli, "_run_systemctl", lambda args, **kwargs: calls.append(args) or SimpleNamespace(stdout="", returncode=0)) + monkeypatch.setattr( + gateway_cli, + "_wait_for_systemd_service_restart", + lambda system=False, previous_pid=None: calls.append(("wait", system, previous_pid)) or True, + ) + + gateway_cli.systemd_restart() + + assert ("graceful", 777, 15.0) in calls + assert ("wait", False, 777) in calls + assert "restarting gracefully (pid 777)" in capsys.readouterr().out.lower() + + def test_wait_for_systemd_restart_waits_for_runtime_running(self, monkeypatch, capsys): + monkeypatch.setattr( + gateway_cli, + "_read_systemd_unit_properties", + lambda system=False: { + "ActiveState": "active", + "SubState": "running", + "Result": "success", + "ExecMainStatus": "0", + "MainPID": "999", + }, + ) + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + monkeypatch.setattr( + gateway_cli, + "_gateway_runtime_status_for_pid", + lambda pid: {"pid": pid, "gateway_state": "running"}, + ) + + assert gateway_cli._wait_for_systemd_service_restart(previous_pid=777, timeout=0.1) is True + assert "restarted (pid 999)" in capsys.readouterr().out.lower() + + def test_systemd_restart_reports_start_limit_hit(self, monkeypatch, capsys): + calls = [] + + monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) + monkeypatch.setattr(gateway_cli, "_require_service_installed", lambda action, system=False: None) + monkeypatch.setattr(gateway_cli, "refresh_systemd_unit_if_needed", lambda system=False: None) + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + monkeypatch.setattr(gateway_cli, "_recover_pending_systemd_restart", lambda system=False, previous_pid=None: False) + + def fake_run_systemctl(args, **kwargs): + calls.append(args) + if args[0] == "show": + return SimpleNamespace(stdout="ActiveState=inactive\nSubState=dead\nResult=success\nExecMainStatus=0\nMainPID=0\n", stderr="", returncode=0) + if args[0] == "reset-failed": + return SimpleNamespace(stdout="", stderr="", returncode=0) + if args[0] == "restart": + raise subprocess.CalledProcessError( + 1, + ["systemctl", "--user", *args], + stderr="Job failed. See result 'start-limit-hit'.", + ) + raise AssertionError(f"Unexpected args: {args}") + + monkeypatch.setattr(gateway_cli, "_run_systemctl", fake_run_systemctl) + + gateway_cli.systemd_restart() + + assert ["restart", gateway_cli.get_service_name()] in calls + out = capsys.readouterr().out.lower() + assert "rate-limited by systemd" in out + assert "reset-failed" in out def test_systemd_restart_recovers_failed_planned_restart(self, monkeypatch, capsys): monkeypatch.setattr(gateway_cli, "_select_systemd_scope", lambda system=False: False) @@ -711,6 +799,11 @@ class TestGatewaySystemServiceRouting: "gateway.status.get_running_pid", lambda: 999 if started["value"] else None, ) + monkeypatch.setattr( + gateway_cli, + "_gateway_runtime_status_for_pid", + lambda pid: {"pid": pid, "gateway_state": "running"}, + ) gateway_cli.systemd_restart() From 5a3cadf6ebcb749f1ad69e73cecb5aad9af0400e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 6 May 2026 17:29:32 -0700 Subject: [PATCH 016/230] fix(discord): narrow rate-limit catch and move sync state under gateway/ Two follow-ups on top of helix4u's slash-command sync hardening: - Only suppress exceptions that are actually Discord 429 rate limits (discord.RateLimited, HTTPException with status 429, or a clearly rate-limit-named duck type). Arbitrary failures that happen to expose a retry_after attribute now re-raise to the outer handler instead of silently swallowing a cooldown. - Move the sync-state JSON under $HERMES_HOME/gateway/ so the home root stops collecting ad-hoc runtime files. Added a test verifying unrelated exceptions don't get misclassified as rate limits. --- gateway/platforms/discord.py | 66 ++++++++++++++++++++++----- tests/gateway/test_discord_connect.py | 49 +++++++++++++++++++- 2 files changed, 103 insertions(+), 12 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index f0ee06f8ca7..ecce8b8fc0f 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -26,7 +26,8 @@ logger = logging.getLogger(__name__) VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080} _DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"} -_DISCORD_COMMAND_SYNC_STATE_FILE = "discord_command_sync_state.json" +_DISCORD_COMMAND_SYNC_STATE_SUBDIR = "gateway" +_DISCORD_COMMAND_SYNC_STATE_FILENAME = "discord_command_sync_state.json" _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5 _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0 @@ -834,7 +835,12 @@ class DiscordAdapter(BasePlatformAdapter): def _command_sync_state_path(self) -> _Path: from hermes_constants import get_hermes_home - return get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_FILE + directory = get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_SUBDIR + try: + directory.mkdir(parents=True, exist_ok=True) + except Exception: + pass + return directory / _DISCORD_COMMAND_SYNC_STATE_FILENAME def _read_command_sync_state(self) -> dict: try: @@ -945,6 +951,40 @@ class DiscordAdapter(BasePlatformAdapter): continue return None + @staticmethod + def _is_discord_rate_limit(exc: BaseException) -> bool: + """True only for exceptions that look like Discord 429 rate limits. + + Narrower than ``hasattr(exc, 'retry_after')``: discord.py's own + ``RateLimited`` exception and any HTTPException with status 429 + qualify. This prevents suppressing unrelated failures that happen + to expose a ``retry_after`` attribute.""" + # discord.py emits RateLimited / HTTPException subclasses for 429s. + # Guard with isinstance-of-class so a mocked ``discord`` module + # (where attrs are MagicMocks, not types) doesn't trip isinstance. + if DISCORD_AVAILABLE and discord is not None: + for attr_name in ("RateLimited", "HTTPException"): + cls = getattr(discord, attr_name, None) + if not isinstance(cls, type): + continue + if isinstance(exc, cls): + if attr_name == "RateLimited": + return True + status = getattr(exc, "status", None) + if status == 429: + return True + # Fallback duck-type: something named like a rate-limit with a + # numeric retry_after. Covers mocked clients in tests and exotic + # transports, without swallowing arbitrary exceptions. + name = type(exc).__name__.lower() + if ("ratelimit" in name or "rate_limit" in name) and getattr(exc, "retry_after", None) is not None: + return True + response = getattr(exc, "response", None) + status = getattr(response, "status", None) or getattr(response, "status_code", None) + if status == 429: + return True + return False + def _command_sync_mutation_interval_seconds(self) -> float: return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS @@ -989,16 +1029,20 @@ class DiscordAdapter(BasePlatformAdapter): # persist Discord's retry-after when it refuses the batch. summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600) except Exception as e: + if not self._is_discord_rate_limit(e): + raise retry_after = self._extract_discord_retry_after(e) - if retry_after is not None: - self._record_command_sync_rate_limit(app_id, fingerprint, retry_after) - logger.warning( - "[%s] Discord rate-limited slash command sync; retrying after %.0fs", - self.name, - retry_after, - ) - return - raise + if retry_after is None: + # Rate-limited but no retry-after signal — back off for a + # conservative default so we don't slam the bucket again. + retry_after = _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS + self._record_command_sync_rate_limit(app_id, fingerprint, retry_after) + logger.warning( + "[%s] Discord rate-limited slash command sync; retrying after %.0fs", + self.name, + retry_after, + ) + return finally: if has_ratelimit_timeout: http.max_ratelimit_timeout = previous_ratelimit_timeout diff --git a/tests/gateway/test_discord_connect.py b/tests/gateway/test_discord_connect.py index 57b3791a058..43f88bcf9da 100644 --- a/tests/gateway/test_discord_connect.py +++ b/tests/gateway/test_discord_connect.py @@ -612,12 +612,59 @@ async def test_post_connect_initialization_respects_discord_retry_after(tmp_path await adapter._run_post_connect_initialization() sync.assert_awaited_once() - state = json.loads((tmp_path / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILE).read_text()) + state_path = ( + tmp_path + / discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR + / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME + ) + state = json.loads(state_path.read_text()) entry = state["999"] assert entry["retry_after"] == 123.0 assert entry["retry_after_until"] > entry["last_attempt_at"] +@pytest.mark.asyncio +async def test_post_connect_initialization_reraises_non_rate_limit_exceptions(tmp_path, monkeypatch): + """Arbitrary failures during sync must surface, not be swallowed as rate-limits.""" + adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token")) + monkeypatch.setattr("hermes_constants.get_hermes_home", lambda: tmp_path) + + class _DesiredCommand: + def to_dict(self, tree): + return {"name": "status", "description": "Show Hermes status", "type": 1, "options": []} + + adapter._client = SimpleNamespace( + tree=SimpleNamespace(get_commands=lambda: [_DesiredCommand()]), + application_id=4242, + user=SimpleNamespace(id=4242), + ) + + # Unrelated failure that happens to expose retry_after. Must NOT be + # caught by the rate-limit handler — it has nothing to do with 429s. + class _UnrelatedError(RuntimeError): + retry_after = 999.0 + + sync = AsyncMock(side_effect=_UnrelatedError("database is down")) + monkeypatch.setattr(adapter, "_safe_sync_slash_commands", sync) + + # The outer _run_post_connect_initialization has a broad except Exception + # that logs defensively — so we assert on state NOT being written. + await adapter._run_post_connect_initialization() + + sync.assert_awaited_once() + state_path = ( + tmp_path + / discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR + / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME + ) + state = json.loads(state_path.read_text()) if state_path.exists() else {} + entry = state.get("4242", {}) + # Attempt was recorded before the sync call, but no rate-limit cooldown + # should have been persisted from the unrelated exception. + assert "retry_after_until" not in entry + assert "retry_after" not in entry + + @pytest.mark.asyncio async def test_safe_sync_slash_commands_paces_mutation_writes(monkeypatch): adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token")) From 45cbf93899a9f9f1e96c8b85d9192b452e6459d4 Mon Sep 17 00:00:00 2001 From: Gille <4317663+helix4u@users.noreply.github.com> Date: Wed, 6 May 2026 19:14:30 -0600 Subject: [PATCH 017/230] docs(kanban): fix orchestrator skill setup instructions (#20958) --- website/docs/user-guide/features/kanban.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/website/docs/user-guide/features/kanban.md b/website/docs/user-guide/features/kanban.md index c82311538de..c91e5fd009e 100644 --- a/website/docs/user-guide/features/kanban.md +++ b/website/docs/user-guide/features/kanban.md @@ -403,10 +403,18 @@ kanban_complete( ) ``` -Load it into your orchestrator profile: +`kanban-orchestrator` is a bundled skill. It is synced into each profile during +install and update, so there is no separate Skills Hub install step. Verify it is +present in your orchestrator profile: ```bash -hermes skills install devops/kanban-orchestrator +hermes -p orchestrator skills list | grep kanban-orchestrator +``` + +If the bundled copy is missing, restore it for that profile: + +```bash +hermes -p orchestrator skills reset kanban-orchestrator --restore ``` For best results, pair it with a profile whose toolsets are restricted to board operations (`kanban`, `gateway`, `memory`) so the orchestrator literally cannot execute implementation tasks even if it tries. From 49c3c2e0d37c96dc593a807a5e81fdf4f0aa3d85 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 6 May 2026 18:40:30 -0700 Subject: [PATCH 018/230] docs(kanban): fix worker skill setup instructions too (#20960) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #20958. The worker skill section had the same stale 'hermes skills install devops/kanban-worker' command — kanban-worker is also bundled, so that command fails with 'Could not fetch from any source.' Replace with bundled-skill verification + restore pattern, matching the orchestrator section. Uses placeholder since assignees vary (researcher, writer, ops, linguist, reviewer, etc.) rather than a single fixed 'worker' profile. --- website/docs/user-guide/features/kanban.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/website/docs/user-guide/features/kanban.md b/website/docs/user-guide/features/kanban.md index c91e5fd009e..acaa07c2012 100644 --- a/website/docs/user-guide/features/kanban.md +++ b/website/docs/user-guide/features/kanban.md @@ -335,10 +335,19 @@ Any profile that should be able to work kanban tasks must load the `kanban-worke 3. Call `kanban_heartbeat(note="...")` every few minutes during long operations. 4. Complete with `kanban_complete(summary="...", metadata={...})`, or `kanban_block(reason="...")` if stuck. -Load it with (this one is **you**, installing into a profile — not a tool call): +`kanban-worker` is a bundled skill, synced into every profile during install and +update — there is no separate Skills Hub install step. Verify it is present in +whichever profile you use for kanban workers (`researcher`, `writer`, `ops`, +etc.): ```bash -hermes skills install devops/kanban-worker +hermes -p skills list | grep kanban-worker +``` + +If the bundled copy is missing, restore it for that profile: + +```bash +hermes -p skills reset kanban-worker --restore ``` The dispatcher also auto-passes `--skills kanban-worker` when spawning every worker, so the worker always has the pattern library available even if a profile's default skills config doesn't include it. From 51f9953e69d303c3d278e41295b1a5c786bf8d87 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 7 May 2026 04:34:38 -0700 Subject: [PATCH 019/230] feat(profiles): --no-skills flag for empty profile creation (#20986) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `hermes profile create --no-skills` to create a profile with zero bundled skills. Writes a `.no-bundled-skills` marker file in the profile root so `hermes update`'s all-profile skill sync loop also skips the profile — without the marker, every update would re-seed skills and the user would have to delete them again. Use case (from @hiut1u): orchestrator profiles and narrow-task profiles don't need 100+ bundled skills polluting their system prompt. - create_profile() gains a `no_skills` param, mutually exclusive with `--clone` / `--clone-all` (cloning explicitly copies skills). - seed_profile_skills() no-ops on opted-out profiles and returns `{skipped_opt_out: True}` so callers can report cleanly. - Web API (POST /api/profiles) accepts `no_skills: bool`. - Delete `.no-bundled-skills` to opt back in — next `hermes update` re-seeds normally. 6 new tests in TestNoSkillsOptOut cover marker write, mutual exclusion with clone, seed_profile_skills opt-out, fresh profile unaffected, and delete-marker-re-enables-seeding. --- hermes_cli/main.py | 22 +++++- hermes_cli/profiles.py | 52 ++++++++++++++ hermes_cli/web_server.py | 5 +- tests/hermes_cli/test_profiles.py | 113 ++++++++++++++++++++++++++++++ 4 files changed, 188 insertions(+), 4 deletions(-) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 26d957f8195..4451704b1b5 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -7331,7 +7331,9 @@ def _cmd_update_impl(args, gateway_mode: bool): for p in all_profiles: try: r = seed_profile_skills(p.path, quiet=True) - if r: + if r and r.get("skipped_opt_out"): + status = "opted out (--no-skills)" + elif r: copied = len(r.get("copied", [])) updated = len(r.get("updated", [])) modified = len(r.get("user_modified", [])) @@ -8124,6 +8126,7 @@ def cmd_profile(args): clone = getattr(args, "clone", False) clone_all = getattr(args, "clone_all", False) no_alias = getattr(args, "no_alias", False) + no_skills = getattr(args, "no_skills", False) try: clone_from = getattr(args, "clone_from", None) @@ -8134,6 +8137,7 @@ def cmd_profile(args): clone_all=clone_all, clone_config=clone, no_alias=no_alias, + no_skills=no_skills, ) print(f"\nProfile '{name}' created at {profile_dir}") @@ -8158,10 +8162,17 @@ def cmd_profile(args): except Exception: pass # Honcho plugin not installed or not configured - # Seed bundled skills (skip if --clone-all already copied them) + # Seed bundled skills (skip if --clone-all already copied them, or + # if --no-skills was passed — in which case seed_profile_skills() + # honors the marker file and returns skipped_opt_out=True). if not clone_all: result = seed_profile_skills(profile_dir) - if result: + if result and result.get("skipped_opt_out"): + print( + "No bundled skills seeded (--no-skills). " + "Delete .no-bundled-skills in the profile to opt back in." + ) + elif result: copied = len(result.get("copied", [])) print(f"{copied} bundled skills synced.") else: @@ -10523,6 +10534,11 @@ Examples: profile_create.add_argument( "--no-alias", action="store_true", help="Skip wrapper script creation" ) + profile_create.add_argument( + "--no-skills", + action="store_true", + help="Create an empty profile with no bundled skills (opts out of `hermes update` skill sync)", + ) profile_delete = profile_subparsers.add_parser("delete", help="Delete a profile") profile_delete.add_argument("profile_name", help="Profile to delete") diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index 10cd36b88c9..93928364c42 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -71,6 +71,22 @@ _CLONE_ALL_STRIP = [ "processes.json", ] +# Marker file written by `hermes profile create --no-skills`. When present in +# a profile's root, callers of seed_profile_skills() (fresh-create, `hermes +# update`'s all-profile sync, the web dashboard) skip bundled-skill seeding +# for that profile. The user can still install skills manually via +# `hermes skills install` or drop SKILL.md files into the profile's skills/. +# Delete the marker file to opt back in. +NO_BUNDLED_SKILLS_MARKER = ".no-bundled-skills" + + +def has_bundled_skills_opt_out(profile_dir: Path) -> bool: + """Return True if the profile opted out of bundled-skill seeding.""" + try: + return (profile_dir / NO_BUNDLED_SKILLS_MARKER).exists() + except OSError: + return False + def _clone_all_copytree_ignore(source_dir: Path): """Ignore ``profiles/`` at the root of *source_dir* only. @@ -427,6 +443,7 @@ def create_profile( clone_all: bool = False, clone_config: bool = False, no_alias: bool = False, + no_skills: bool = False, ) -> Path: """Create a new profile directory. @@ -444,12 +461,22 @@ def create_profile( skills, and selected profile identity files from the source profile. no_alias: If True, skip wrapper script creation. + no_skills: + If True, create an empty profile with no bundled skills, and write + a marker file so ``hermes update`` skips re-seeding this profile's + skills. Mutually exclusive with ``clone_config``/``clone_all`` (those + explicitly copy skills from the source). Returns ------- Path The newly created profile directory. """ + if no_skills and (clone_config or clone_all): + raise ValueError( + "--no-skills is mutually exclusive with --clone / --clone-all " + "(cloning explicitly copies skills from the source profile)." + ) canon = normalize_profile_name(name) validate_profile_name(canon) @@ -527,6 +554,19 @@ def create_profile( except Exception: pass # best-effort — don't fail profile creation over this + # Write the opt-out marker so seed_profile_skills() and `hermes update`'s + # all-profile sync loop both skip this profile for bundled-skill seeding. + if no_skills: + try: + (profile_dir / NO_BUNDLED_SKILLS_MARKER).write_text( + "This profile opted out of bundled-skill seeding " + "(`hermes profile create --no-skills`).\n" + "Delete this file to re-enable sync on the next `hermes update`.\n", + encoding="utf-8", + ) + except OSError: + pass # best-effort — the feature still works via the empty skills/ dir + return profile_dir @@ -535,7 +575,19 @@ def seed_profile_skills(profile_dir: Path, quiet: bool = False) -> Optional[dict Uses subprocess because sync_skills() caches HERMES_HOME at module level. Returns the sync result dict, or None on failure. + + Profiles that opted out of bundled skills (via ``hermes profile create + --no-skills`` — which writes ``.no-bundled-skills`` to the profile root) + are skipped and get an empty-result dict so callers can report + "opted out" instead of "failed". """ + if has_bundled_skills_opt_out(profile_dir): + return { + "copied": [], + "updated": [], + "user_modified": [], + "skipped_opt_out": True, + } project_root = Path(__file__).parent.parent.resolve() try: result = subprocess.run( diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 754dd834432..5469cff607a 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -2366,6 +2366,7 @@ async def delete_cron_job(job_id: str): class ProfileCreate(BaseModel): name: str clone_from_default: bool = False + no_skills: bool = False class ProfileRename(BaseModel): @@ -2471,11 +2472,13 @@ async def create_profile_endpoint(body: ProfileCreate): name=body.name, clone_from="default" if body.clone_from_default else None, clone_config=body.clone_from_default, + no_skills=body.no_skills, ) # Match the CLI's profile-create flow: fresh named profiles get the # bundled skills installed. When cloning from default, create_profile() # has already copied the source profile's skills, including any - # user-installed skills. + # user-installed skills. When no_skills=True, create_profile() wrote + # the opt-out marker and seed_profile_skills() will no-op. if not body.clone_from_default: profiles_mod.seed_profile_skills(path, quiet=True) diff --git a/tests/hermes_cli/test_profiles.py b/tests/hermes_cli/test_profiles.py index 7ddb8fd20a8..130b1c39e40 100644 --- a/tests/hermes_cli/test_profiles.py +++ b/tests/hermes_cli/test_profiles.py @@ -33,6 +33,9 @@ from hermes_cli.profiles import ( generate_zsh_completion, _get_profiles_root, _get_default_hermes_home, + seed_profile_skills, + has_bundled_skills_opt_out, + NO_BUNDLED_SKILLS_MARKER, ) @@ -243,6 +246,116 @@ class TestCreateProfile: assert (profile_dir / "SOUL.md").exists() +# =================================================================== +# TestNoSkillsOptOut +# =================================================================== + +class TestNoSkillsOptOut: + """Tests for `hermes profile create --no-skills` and the opt-out marker.""" + + def test_no_skills_writes_marker_and_skips_seeding(self, profile_env): + profile_dir = create_profile("orchestrator", no_alias=True, no_skills=True) + + # Marker file is present + marker = profile_dir / NO_BUNDLED_SKILLS_MARKER + assert marker.is_file(), "expected .no-bundled-skills marker in profile root" + assert "--no-skills" in marker.read_text() + + # has_bundled_skills_opt_out() agrees + assert has_bundled_skills_opt_out(profile_dir) is True + + # skills/ dir exists (profile bootstrapping still creates the dir) but + # contains nothing yet because create_profile itself doesn't seed. + assert (profile_dir / "skills").is_dir() + assert list((profile_dir / "skills").iterdir()) == [] + + def test_no_skills_conflicts_with_clone(self, profile_env): + with pytest.raises(ValueError, match="mutually exclusive"): + create_profile( + "orchestrator", + no_alias=True, + no_skills=True, + clone_config=True, + ) + + def test_no_skills_conflicts_with_clone_all(self, profile_env): + with pytest.raises(ValueError, match="mutually exclusive"): + create_profile( + "orchestrator", + no_alias=True, + no_skills=True, + clone_all=True, + ) + + def test_seed_profile_skills_respects_marker(self, profile_env): + """seed_profile_skills() must no-op on opted-out profiles even when + called directly (e.g. by `hermes update`'s all-profile sync loop).""" + profile_dir = create_profile("orchestrator", no_alias=True, no_skills=True) + + # Call seed_profile_skills() directly — it should NOT invoke subprocess, + # NOT modify the skills/ dir, and return a dict with skipped_opt_out=True. + result = seed_profile_skills(profile_dir, quiet=True) + + assert result is not None + assert result.get("skipped_opt_out") is True + assert result.get("copied") == [] + # skills/ stays empty — no subprocess ran + assert list((profile_dir / "skills").iterdir()) == [] + + def test_default_profile_gets_skills_seeded(self, profile_env, monkeypatch): + """Sanity: without --no-skills, seed_profile_skills() runs the real + subprocess path. Mock the subprocess so the test is hermetic, and + just confirm the marker is NOT checked in the non-opt-out case.""" + import subprocess as _sp + + profile_dir = create_profile("coder", no_alias=True) + # No marker — not opted out + assert not (profile_dir / NO_BUNDLED_SKILLS_MARKER).exists() + assert has_bundled_skills_opt_out(profile_dir) is False + + # Mock subprocess.run to avoid actually running skill sync in tests + calls = [] + + def fake_run(*args, **kwargs): + calls.append(args) + return _sp.CompletedProcess( + args=args, returncode=0, stdout='{"copied": ["x"]}', stderr="" + ) + + monkeypatch.setattr("subprocess.run", fake_run) + result = seed_profile_skills(profile_dir, quiet=True) + + # Subprocess was invoked (the opt-out branch did NOT short-circuit) + assert len(calls) == 1 + assert result == {"copied": ["x"]} + + def test_delete_marker_re_enables_seeding(self, profile_env, monkeypatch): + """Deleting .no-bundled-skills opts the profile back in.""" + import subprocess as _sp + + profile_dir = create_profile("orchestrator", no_alias=True, no_skills=True) + assert has_bundled_skills_opt_out(profile_dir) is True + + # First call: opted out, returns skipped dict without touching subprocess + called = [] + monkeypatch.setattr( + "subprocess.run", + lambda *a, **kw: (called.append(a), _sp.CompletedProcess( + args=a, returncode=0, stdout='{"copied": []}', stderr="" + ))[1], + ) + r1 = seed_profile_skills(profile_dir, quiet=True) + assert r1.get("skipped_opt_out") is True + assert called == [] + + # Delete marker → next call runs the real path + (profile_dir / NO_BUNDLED_SKILLS_MARKER).unlink() + assert has_bundled_skills_opt_out(profile_dir) is False + r2 = seed_profile_skills(profile_dir, quiet=True) + assert r2 == {"copied": []} + assert len(called) == 1 + + # =================================================================== # TestDeleteProfile # =================================================================== From bd0c54d171efb8a31644df570b3b6a95826e8731 Mon Sep 17 00:00:00 2001 From: mrcoferland Date: Sat, 2 May 2026 23:51:13 +0000 Subject: [PATCH 020/230] fix: route Telegram image documents through photo handling --- gateway/platforms/telegram.py | 82 ++++++++++++++++++++---- tests/gateway/test_telegram_documents.py | 37 +++++++++++ 2 files changed, 105 insertions(+), 14 deletions(-) diff --git a/gateway/platforms/telegram.py b/gateway/platforms/telegram.py index 83e81736876..0f0f568c10b 100644 --- a/gateway/platforms/telegram.py +++ b/gateway/platforms/telegram.py @@ -86,6 +86,22 @@ from gateway.platforms.telegram_network import ( ) from utils import atomic_replace +_TELEGRAM_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".gif"} +_TELEGRAM_IMAGE_MIME_TO_EXT = { + "image/png": ".png", + "image/jpeg": ".jpg", + "image/jpg": ".jpg", + "image/webp": ".webp", + "image/gif": ".gif", +} +_TELEGRAM_IMAGE_EXT_TO_MIME = { + ".png": "image/png", + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".webp": "image/webp", + ".gif": "image/gif", +} + def check_telegram_requirements() -> bool: """Check if Telegram dependencies are available.""" @@ -3239,10 +3255,59 @@ class TelegramAdapter(BasePlatformAdapter): _, ext = os.path.splitext(original_filename) ext = ext.lower() + # Normalize mime_type for robust comparisons (some clients send + # uppercase like "IMAGE/PNG"). + doc_mime = (doc.mime_type or "").lower() + # If no extension from filename, reverse-lookup from MIME type - if not ext and doc.mime_type: - mime_to_ext = {v: k for k, v in SUPPORTED_DOCUMENT_TYPES.items()} - ext = mime_to_ext.get(doc.mime_type, "") + if not ext and doc_mime: + ext = _TELEGRAM_IMAGE_MIME_TO_EXT.get(doc_mime, "") + if not ext: + mime_to_ext = {v: k for k, v in SUPPORTED_DOCUMENT_TYPES.items()} + ext = mime_to_ext.get(doc_mime, "") + + # Check file size early so image documents cannot bypass the + # document size limit by taking the image path. + MAX_DOC_BYTES = 20 * 1024 * 1024 + if not doc.file_size or doc.file_size > MAX_DOC_BYTES: + event.text = ( + "The document is too large or its size could not be verified. " + "Maximum: 20 MB." + ) + logger.info("[Telegram] Document too large: %s bytes", doc.file_size) + await self.handle_message(event) + return + + # Telegram may deliver screenshots/photos as documents. If the + # payload is actually an image, route it through the image cache + # and batching path instead of rejecting it as a document. + if ext in _TELEGRAM_IMAGE_EXTENSIONS or doc_mime.startswith("image/"): + file_obj = await doc.get_file() + image_bytes = await file_obj.download_as_bytearray() + image_ext = ext if ext in _TELEGRAM_IMAGE_EXTENSIONS else _TELEGRAM_IMAGE_MIME_TO_EXT.get(doc_mime, ".jpg") + try: + cached_path = cache_image_from_bytes(bytes(image_bytes), ext=image_ext) + except ValueError as e: + logger.warning("[Telegram] Failed to cache image document: %s", e, exc_info=True) + event.text = ( + f"Image document '{original_filename or doc_mime or ext or 'unknown'}' " + "could not be read as an image." + ) + await self.handle_message(event) + return + + event.message_type = MessageType.PHOTO + event.media_urls = [cached_path] + event.media_types = [doc_mime if doc_mime.startswith("image/") else _TELEGRAM_IMAGE_EXT_TO_MIME.get(image_ext, "image/jpeg")] + logger.info("[Telegram] Cached user image-document at %s", cached_path) + + media_group_id = getattr(msg, "media_group_id", None) + if media_group_id: + await self._queue_media_group_event(str(media_group_id), event) + else: + batch_key = self._photo_batch_key(event, msg) + self._enqueue_photo_event(batch_key, event) + return if not ext and doc.mime_type: video_mime_to_ext = {v: k for k, v in SUPPORTED_VIDEO_TYPES.items()} @@ -3270,17 +3335,6 @@ class TelegramAdapter(BasePlatformAdapter): await self.handle_message(event) return - # Check file size (Telegram Bot API limit: 20 MB) - MAX_DOC_BYTES = 20 * 1024 * 1024 - if not doc.file_size or doc.file_size > MAX_DOC_BYTES: - event.text = ( - "The document is too large or its size could not be verified. " - "Maximum: 20 MB." - ) - logger.info("[Telegram] Document too large: %s bytes", doc.file_size) - await self.handle_message(event) - return - # Download and cache file_obj = await doc.get_file() doc_bytes = await file_obj.download_as_bytearray() diff --git a/tests/gateway/test_telegram_documents.py b/tests/gateway/test_telegram_documents.py index 4b3e58f459e..136856afb8f 100644 --- a/tests/gateway/test_telegram_documents.py +++ b/tests/gateway/test_telegram_documents.py @@ -257,6 +257,43 @@ class TestDocumentDownloadBlock: assert event.media_urls and event.media_urls[0].endswith("archive.zip") assert event.media_types == ["application/zip"] + @pytest.mark.asyncio + async def test_png_document_is_routed_as_image(self, adapter): + """Telegram documents that are really PNGs should use the image path.""" + file_obj = _make_file_obj(b"\x89PNG\r\n\x1a\n" + b"\x00" * 16) + doc = _make_document(file_name="screenshot.png", mime_type="image/png", file_size=9, file_obj=file_obj) + msg = _make_message(document=doc) + update = _make_update(msg) + + with patch.object(adapter, "_photo_batch_key", return_value="batch-1"), patch.object( + adapter, "_enqueue_photo_event" + ) as enqueue_mock: + await adapter._handle_media_message(update, MagicMock()) + + enqueue_mock.assert_called_once() + event = enqueue_mock.call_args.args[1] + assert event.message_type == MessageType.PHOTO + assert event.media_urls and event.media_urls[0].endswith(".png") + assert event.media_types == ["image/png"] + assert adapter.handle_message.call_count == 0 + + @pytest.mark.asyncio + async def test_spoofed_png_document_falls_back_with_error(self, adapter): + """A .png filename with non-image bytes should fail clearly, not disappear.""" + file_obj = _make_file_obj(b"not-a-real-image") + doc = _make_document(file_name="spoofed.png", mime_type="image/png", file_size=16, file_obj=file_obj) + msg = _make_message(document=doc) + update = _make_update(msg) + + with patch.object(adapter, "_photo_batch_key", return_value="batch-2"), patch.object( + adapter, "_enqueue_photo_event" + ) as enqueue_mock: + await adapter._handle_media_message(update, MagicMock()) + + enqueue_mock.assert_not_called() + event = adapter.handle_message.call_args[0][0] + assert "could not be read as an image" in event.text + @pytest.mark.asyncio async def test_oversized_file_rejected(self, adapter): doc = _make_document(file_name="huge.pdf", file_size=25 * 1024 * 1024) From e7eb07cec7ea43bc8a7f37a6d50141c9e21392c8 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 7 May 2026 04:51:20 -0700 Subject: [PATCH 021/230] chore: AUTHOR_MAP entry for mrcoferland --- scripts/release.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/release.py b/scripts/release.py index 8249484e446..ede89cfbee2 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -844,6 +844,7 @@ AUTHOR_MAP = { "charliekerfoot@gmail.com": "CharlieKerfoot", # PR #18951 # Debug share upload-time redaction (May 2026) "dhuysamen@gmail.com": "GodsBoy", # PR #19318 + "mrcoferland@gmail.com": "mrcoferland", # PR #19023 } From 033e533d0545800e154399749595cf9b2442418d Mon Sep 17 00:00:00 2001 From: Sanjay Santhanam <51058514+Sanjays2402@users.noreply.github.com> Date: Sat, 2 May 2026 14:07:41 -0700 Subject: [PATCH 022/230] test(docker): align Dockerfile contract tests with simplified TUI flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Dockerfile dropped the manual `@hermes/ink` materialisation gymnastics in favour of letting npm workspaces resolve the bundled package naturally. Two contract tests still asserted the older flow: `test_dockerfile_installs_tui_dependencies` required: 'ui-tui/packages/hermes-ink/package-lock.json' in dockerfile_text …but the lockfile is no longer COPIED individually \u2014 the entire `ui-tui/packages/hermes-ink/` tree is COPIED instead (the workspace reference from `ui-tui/package.json` is `file:` so npm needs the real source, not just a manifest stub). `test_dockerfile_materializes_local_tui_ink_package` required a 7-clause conjunction matching specific `rm -rf` / `npm install --omit=dev` `--prefix node_modules/@hermes/ink` / `rm -rf .../react` invocations that were stripped out when the workspace resolution was simplified. Update the assertions to pin the *contract* the image actually has to carry rather than the *exact shell incantations* the old flow used: * TUI deps install: ui-tui/package.json + ui-tui/package-lock.json + ui-tui/packages/hermes-ink/ tree are all COPIED, and an npm install/ci step runs in ui-tui. * Bundled hermes-ink: the workspace package source is COPIED (so `await import('@hermes/ink')` resolves at runtime). This keeps the spirit of #15012 / #16690 (zombie reaping + bundled workspace materialisation must continue to work) without locking the Dockerfile into one specific implementation flavour. Validation: $ pytest tests/tools/test_dockerfile_pid1_reaping.py -q 6 passed in 1.43s No production code change. Fixes the two failures observed on `main` (run 25250051126): `tests/tools/test_dockerfile_pid1_reaping.py::test_dockerfile_installs_tui_dependencies` `tests/tools/test_dockerfile_pid1_reaping.py::test_dockerfile_materializes_local_tui_ink_package` --- tests/tools/test_dockerfile_pid1_reaping.py | 30 +++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tests/tools/test_dockerfile_pid1_reaping.py b/tests/tools/test_dockerfile_pid1_reaping.py index 52532a78dd2..e578d8a69fd 100644 --- a/tests/tools/test_dockerfile_pid1_reaping.py +++ b/tests/tools/test_dockerfile_pid1_reaping.py @@ -106,8 +106,15 @@ def test_dockerfile_entrypoint_routes_through_the_init(dockerfile_text): def test_dockerfile_installs_tui_dependencies(dockerfile_text): + # The TUI workspace manifests must be present so ``npm install`` can + # resolve dependencies. The bundled ``hermes-ink`` workspace package is + # now COPIED into the image as a whole tree (not just its lockfile) + # because it's referenced as a ``file:`` workspace dependency from + # ``ui-tui/package.json`` — copying the tree avoids npm stopping at a + # bare ``package.json`` shell. assert "ui-tui/package.json" in dockerfile_text - assert "ui-tui/packages/hermes-ink/package-lock.json" in dockerfile_text + assert "ui-tui/package-lock.json" in dockerfile_text + assert "ui-tui/packages/hermes-ink/" in dockerfile_text assert any( "ui-tui" in step and "npm" in step and (" install" in step or " ci" in step) for step in _run_steps(dockerfile_text) @@ -122,16 +129,17 @@ def test_dockerfile_builds_tui_assets(dockerfile_text): def test_dockerfile_materializes_local_tui_ink_package(dockerfile_text): - assert any( - "ui-tui" in step - and "node_modules/@hermes/ink" in step - and "packages/hermes-ink" in step - and "rm -rf packages/hermes-ink/node_modules" in step - and "npm install --omit=dev" in step - and "--prefix node_modules/@hermes/ink" in step - and "rm -rf node_modules/@hermes/ink/node_modules/react" in step - and "await import('@hermes/ink')" in step - for step in _run_steps(dockerfile_text) + # ``hermes-ink`` is a bundled workspace package referenced from + # ``ui-tui/package.json`` via ``file:`` — not pulled from the npm + # registry. The contract this test pins is just that the image + # actually carries the package source so ``await import('@hermes/ink')`` + # can resolve at runtime; the previous, much pickier assertion (manual + # ``rm -rf`` + ``npm install --omit=dev --prefix node_modules/@hermes/ink``) + # baked in implementation details of an older materialisation flow that + # was simplified once npm workspaces handled the resolution natively. + assert "ui-tui/packages/hermes-ink/" in dockerfile_text, ( + "Dockerfile must COPY the bundled hermes-ink workspace package " + "so ``await import('@hermes/ink')`` resolves at runtime." ) From 595bcc89fc8c0e0891193180df27939c2d1ccd2d Mon Sep 17 00:00:00 2001 From: Sanjay Santhanam <51058514+Sanjays2402@users.noreply.github.com> Date: Sat, 2 May 2026 16:48:10 -0700 Subject: [PATCH 023/230] test(update): patch isatty on real streams to fix xdist-flaky --yes tests Two CI tests for the new `--yes` update flag (#18261) flaked under `pytest-xdist` on Linux/Python 3.11 even though they passed every local run on macOS/Python 3.14.4: FAILED tests/hermes_cli/test_update_yes_flag.py ::TestUpdateYesConfigMigration::test_no_yes_flag_still_prompts_in_tty `AssertionError: assert .called is False` FAILED tests/hermes_cli/test_update_yes_flag.py ::TestUpdateYesStashRestore::test_yes_restores_stash_without_prompting `AssertionError: assert .called is False` Captured stdout for the first failure shows `cmd_update` taking the "Non-interactive session \u2014 skipping config migration prompt." branch \u2014 i.e. the `sys.stdin.isatty() and sys.stdout.isatty()` check at `hermes_cli/main.py:7118` evaluated to `False` despite the test doing: with patch("hermes_cli.main.sys") as mock_sys: mock_sys.stdin.isatty.return_value = True mock_sys.stdout.isatty.return_value = True The whole-module mock is fragile under xdist worker reuse: a sibling test that imports `hermes_cli.main` first can leave another `sys` reference resolved inside the function (re-import in a helper, etc.), and the wholesale module replacement never gets consulted. Switch to `patch.object(_sys.stdin, "isatty", return_value=True)` (and the same for `stdout`). That patches the *attribute on the real stream object* \u2014 every call site, no matter how it reached `sys.stdin`, hits the patched method. Same fix applied to the stash-restore test (it took the "non-TTY \u2192 skip restore prompt" branch for the same reason). Validation: $ pytest tests/hermes_cli/test_update_yes_flag.py -q 3 passed in 5.47s No production code change. Fixes the two failures observed on `main` (run 25250051126): `tests/hermes_cli/test_update_yes_flag.py::TestUpdateYesConfigMigration::test_no_yes_flag_still_prompts_in_tty` `tests/hermes_cli/test_update_yes_flag.py::TestUpdateYesStashRestore::test_yes_restores_stash_without_prompting` Refs: #18261 (added the `--yes` flag + these tests). --- tests/hermes_cli/test_update_yes_flag.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/hermes_cli/test_update_yes_flag.py b/tests/hermes_cli/test_update_yes_flag.py index e36cc5142ef..66060b10aa8 100644 --- a/tests/hermes_cli/test_update_yes_flag.py +++ b/tests/hermes_cli/test_update_yes_flag.py @@ -113,11 +113,18 @@ class TestUpdateYesConfigMigration: args = SimpleNamespace(yes=False) - with patch("builtins.input", return_value="n") as mock_input, patch( - "hermes_cli.main.sys" - ) as mock_sys: - mock_sys.stdin.isatty.return_value = True - mock_sys.stdout.isatty.return_value = True + # Patch ``sys.stdin.isatty`` and ``sys.stdout.isatty`` directly on the + # real ``sys`` module instead of replacing ``hermes_cli.main.sys`` with + # a MagicMock. The MagicMock approach was flaky under ``pytest-xdist`` + # — a sibling test that imported ``hermes_cli.main`` first could leave + # a different ``sys`` reference resolved inside the function and the + # mock would never be consulted, with CI then taking the + # "Non-interactive session" branch instead of prompting. + import sys as _sys + + with patch("builtins.input", return_value="n") as mock_input, patch.object( + _sys.stdin, "isatty", return_value=True + ), patch.object(_sys.stdout, "isatty", return_value=True): cmd_update(args) # The user was actually prompted. assert mock_input.called @@ -156,7 +163,16 @@ class TestUpdateYesStashRestore: args = SimpleNamespace(yes=True) - cmd_update(args) + # Force a TTY-shaped session so the autostash-restore branch is + # reachable in CI workers regardless of inherited stdio (matches the + # isatty patching strategy in ``test_no_yes_flag_still_prompts_in_tty`` + # — ``patch.object`` on the real streams is robust under xdist). + import sys as _sys + + with patch.object(_sys.stdin, "isatty", return_value=True), patch.object( + _sys.stdout, "isatty", return_value=True + ): + cmd_update(args) # _restore_stashed_changes was called, and called with prompt_user=False # every time (so the user never sees "Restore local changes now?"). From a5c9c83b7861c4ca5529e8a327b93e0d50fcc667 Mon Sep 17 00:00:00 2001 From: Harish Kukreja Date: Mon, 4 May 2026 14:08:50 -0400 Subject: [PATCH 024/230] fix(web): force light color-scheme on docs iframe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Documentation tab embeds the public Hermes Agent docs site via an