Merge pull request #35038 from NousResearch/bb/gui-mainmerge

Merge main into bb/gui (catch up 264 commits)
2026-06-13 09:01:54 +00:00 · 2026-05-29 22:34:02 -05:00 · 2026-05-29 22:34:02 -05:00 · 5335869de4
commit 5335869de4
parent de8fed32fd da6646a23b
1212 changed files with 39431 additions and 9142 deletions
--- a/.github/workflows/contributor-check.yml
+++ b/.github/workflows/contributor-check.yml
@ -3,11 +3,9 @@ name: Contributor Attribution Check
 on:
  pull_request:
    branches: [main]
-    paths:
-      # Only run when code files change (not docs-only PRs)
-      - '*.py'
-      - '**/*.py'
-      - '.github/workflows/contributor-check.yml'
+  # No paths filter — the job must always run so the required check
+  # reports a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).

 permissions:
  contents: read
@ -20,7 +18,21 @@ jobs:
        with:
          fetch-depth: 0  # Full history needed for git log

+      - name: Check if relevant files changed
+        id: filter
+        run: |
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+          CHANGED=$(git diff --name-only "$BASE"..."$HEAD" -- '*.py' '**/*.py' '.github/workflows/contributor-check.yml' || true)
+          if [ -n "$CHANGED" ]; then
+            echo "run=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "run=false" >> "$GITHUB_OUTPUT"
+            echo "No Python files changed, skipping attribution check."
+          fi
+
      - name: Check for unmapped contributor emails
+        if: steps.filter.outputs.run == 'true'
        run: |
          # Get the merge base between this PR and main
          MERGE_BASE=$(git merge-base origin/main HEAD)
--- a/.github/workflows/deploy-site.yml
+++ b/.github/workflows/deploy-site.yml
@ -22,7 +22,12 @@ concurrency:

 jobs:
  deploy-vercel:
-    if: github.event_name == 'release'
+    # Triggered automatically on release publish (production cuts) and
+    # manually via `gh workflow run deploy-site.yml` when an out-of-band
+    # main commit needs to ship live before the next release tag — e.g.
+    # a skills-index PR that doesn't touch website/** paths and so
+    # doesn't auto-deploy via the deploy-docs path.
+    if: github.event_name == 'release' || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    steps:
      - name: Trigger Vercel Deploy
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@ -71,6 +71,8 @@ jobs:
          load: true
          platforms: linux/amd64
          tags: ${{ env.IMAGE_NAME }}:test
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
          cache-from: type=gha,scope=docker-amd64
          cache-to: type=gha,mode=max,scope=docker-amd64

@ -149,6 +151,8 @@ jobs:
          platforms: linux/amd64
          labels: |
            org.opencontainers.image.revision=${{ github.sha }}
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
          outputs: type=image,name=${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true
          cache-from: type=gha,scope=docker-amd64
          cache-to: type=gha,mode=max,scope=docker-amd64
@ -192,10 +196,12 @@ jobs:
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

-      # Build once, load into the local daemon for smoke testing.  Cached
-      # to gha with a per-arch scope; the push step below reuses every
-      # layer from this build.
-      - name: Build image (arm64, smoke test)
+      # Build once, load into the local daemon for smoke testing. PR arm64
+      # builds deliberately avoid the gha cache: cold-cache arm64 builds can
+      # outlive GitHub's short-lived Azure cache SAS token, then fail while
+      # reading or writing cache blobs before the smoke test can run.
+      - name: Build image (arm64, smoke test, uncached PR)
+        if: github.event_name == 'pull_request'
        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
        with:
          context: .
@ -203,6 +209,22 @@ jobs:
          load: true
          platforms: linux/arm64
          tags: ${{ env.IMAGE_NAME }}:test
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
+
+      # Main/release builds still use the per-arch gha cache so the digest
+      # push below can reuse layers from this smoke-test build.
+      - name: Build image (arm64, smoke test, cached publish)
+        if: github.event_name != 'pull_request'
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f  # v7.1.0
+        with:
+          context: .
+          file: Dockerfile
+          load: true
+          platforms: linux/arm64
+          tags: ${{ env.IMAGE_NAME }}:test
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
          cache-from: type=gha,scope=docker-arm64
          cache-to: type=gha,mode=max,scope=docker-arm64

@ -228,6 +250,8 @@ jobs:
          platforms: linux/arm64
          labels: |
            org.opencontainers.image.revision=${{ github.sha }}
+          build-args: |
+            HERMES_GIT_SHA=${{ github.sha }}
          outputs: type=image,name=${{ env.IMAGE_NAME }},push-by-digest=true,name-canonical=true,push=true
          cache-from: type=gha,scope=docker-arm64
          cache-to: type=gha,mode=max,scope=docker-arm64
--- a/.github/workflows/supply-chain-audit.yml
+++ b/.github/workflows/supply-chain-audit.yml
@ -3,15 +3,9 @@ name: Supply Chain Audit
 on:
  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      - '**/*.py'
-      - '**/*.pth'
-      - '**/setup.py'
-      - '**/setup.cfg'
-      - '**/sitecustomize.py'
-      - '**/usercustomize.py'
-      - '**/__init__.pth'
-      - 'pyproject.toml'
+  # No paths filter — the jobs must always run so required checks
+  # report a status (path-gated workflows leave checks "pending" forever
+  # when no matching files change, which blocks merge).

 permissions:
  pull-requests: write
@ -27,8 +21,44 @@ permissions:
 # advisory-only workflow instead.

 jobs:
+  # ── Path filter (shared by both scan and dep-bounds) ───────────────
+  changes:
+    runs-on: ubuntu-latest
+    outputs:
+      # True when any file the scanner cares about changed in this PR
+      scan: ${{ steps.filter.outputs.scan }}
+      # True when pyproject.toml changed in this PR
+      deps: ${{ steps.filter.outputs.deps }}
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
+        with:
+          fetch-depth: 0
+      - name: Check for relevant file changes
+        id: filter
+        run: |
+          BASE="${{ github.event.pull_request.base.sha }}"
+          HEAD="${{ github.event.pull_request.head.sha }}"
+          SCAN_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- \
+            '*.py' '**/*.py' '*.pth' '**/*.pth' \
+            'setup.py' 'setup.cfg' \
+            'sitecustomize.py' 'usercustomize.py' '__init__.pth' \
+            'pyproject.toml' || true)
+          if [ -n "$SCAN_FILES" ]; then
+            echo "scan=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "scan=false" >> "$GITHUB_OUTPUT"
+          fi
+          DEPS_FILES=$(git diff --name-only "$BASE"..."$HEAD" -- 'pyproject.toml' || true)
+          if [ -n "$DEPS_FILES" ]; then
+            echo "deps=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "deps=false" >> "$GITHUB_OUTPUT"
+          fi
+
  scan:
    name: Scan PR for critical supply chain risks
+    needs: changes
+    if: needs.changes.outputs.scan == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
@ -147,10 +177,24 @@ jobs:
          echo "::error::CRITICAL supply chain risk patterns detected in this PR. See the PR comment for details."
          exit 1

+  # Gate: reports success when scan was skipped (no relevant files changed).
+  # This ensures the required check always gets a status.
+  scan-gate:
+    name: Scan PR for critical supply chain risks
+    needs: changes
+    # always() so the gate still reports SUCCESS even if `changes` fails/is
+    # skipped — without it, a failed dependency would leave the required
+    # check unreported (i.e. "pending"), the exact failure mode this fixes.
+    if: always() && needs.changes.outputs.scan != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No supply-chain-relevant files changed, skipping scan."
+
  dep-bounds:
    name: Check PyPI dependency upper bounds
+    needs: changes
+    if: needs.changes.outputs.deps == 'true'
    runs-on: ubuntu-latest
-    if: contains(github.event.pull_request.changed_files_url, 'pyproject.toml') || true
    steps:
      - name: Checkout
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
@ -211,3 +255,16 @@ jobs:
        run: |
          echo "::error::PyPI dependencies without upper bounds detected. Add <next_major ceiling per CONTRIBUTING.md policy."
          exit 1
+
+  # Gate: reports success when dep-bounds was skipped (no pyproject.toml changed).
+  # This ensures the required check always gets a status.
+  dep-bounds-gate:
+    name: Check PyPI dependency upper bounds
+    needs: changes
+    # always() so the gate still reports SUCCESS even if `changes` fails/is
+    # skipped — without it, a failed dependency would leave the required
+    # check unreported (i.e. "pending"), the exact failure mode this fixes.
+    if: always() && needs.changes.outputs.deps != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "No pyproject.toml changes, skipping dependency bounds check."
--- a/.gitignore
+++ b/.gitignore
@ -82,6 +82,12 @@ mini-swe-agent/
 .nix-stamps/
 result
 website/static/api/skills-index.json
+# skills.json + skills-meta.json are build artifacts emitted by
+# website/scripts/extract-skills.py during prebuild — keep them out of
+# git for the same reason as skills-index.json (large, generated, change
+# every build).
+website/static/api/skills.json
+website/static/api/skills-meta.json
 models-dev-upstream/

 # Local editor / agent tooling (machine-specific; keep in global config, not the repo)
@ -100,3 +106,7 @@ docs/superpowers/*
 # also created in-repo when an agent operates in this checkout). Plans, audit
 # logs, and per-session caches are never artifacts of the codebase.
 .hermes/
+
+# Tool Search live-test harness output — non-deterministic model transcripts,
+# regenerated by scripts/tool_search_livetest.py. Never an artifact of the repo.
+scripts/out/
--- a/AGENTS.md
+++ b/AGENTS.md
@ -274,7 +274,7 @@ npm test          # vitest

 The dashboard embeds the real `hermes --tui` — **not** a rewrite.  See `hermes_cli/pty_bridge.py` + the `@app.websocket("/api/pty")` endpoint in `hermes_cli/web_server.py`.

- Browser loads `apps/dashboard/src/pages/ChatPage.tsx`, which mounts xterm.js's `Terminal` with the WebGL renderer, `@xterm/addon-fit` for container-driven resize, and `@xterm/addon-unicode11` for modern wide-character widths.
+- Browser loads `web/src/pages/ChatPage.tsx`, which mounts xterm.js's `Terminal` with the WebGL renderer, `@xterm/addon-fit` for container-driven resize, and `@xterm/addon-unicode11` for modern wide-character widths.
 - `/api/pty?token=…` upgrades to a WebSocket; auth uses the same ephemeral `_SESSION_TOKEN` as REST, via query param (browsers can't set `Authorization` on WS upgrade).
 - The server spawns whatever `hermes --tui` would spawn, through `ptyprocess` (POSIX PTY — WSL works, native Windows does not).
 - Frames: raw PTY bytes each direction; resize via `\x1b[RESIZE:<cols>;<rows>]` intercepted on the server and applied with `TIOCSWINSZ`.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -43,7 +43,7 @@ Bundled skills (in `skills/`) ship with every Hermes install. They should be **b
 - Document handling, web research, common dev workflows, system administration
 - Used regularly by a wide range of people

-If your skill is official and useful but not universally needed (e.g., a paid service integration, a heavyweight dependency), put it in **`optional-skills/`** — it ships with the repo but isn't activated by default. Users can discover it via `hermes skills browse` (labeled "official") and install it with `hermes skills install` (no third-party warning, builtin trust).
+If your skill is official and useful but not universally needed (e.g., a paid service integration, a heavyweight dependency), put it in **`optional-skills/`** — it ships with the repo but isn't activated by default. Users can discover it via `hermes skills browse` (labeled "official") and install it with `hermes skills install` (no third-party warning, built-in trust).

 If your skill is specialized, community-contributed, or niche, it's better suited for a **Skills Hub** — upload it to a skills registry and share it in the [Nous Research Discord](https://discord.gg/NousResearch). Users can install it with `hermes skills install`.

--- a/46
+++ b/46
@ -25,7 +25,7 @@ ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright
 # hermes process, the dashboard, and per-profile gateways.
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-    ca-certificates curl python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli xz-utils && \
+    ca-certificates curl iputils-ping python3 python-is-python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git openssh-client docker-cli xz-utils && \
    rm -rf /var/lib/apt/lists/*

 # ---------- s6-overlay install ----------
@ -187,6 +187,29 @@ RUN chmod -R a+rX /opt/hermes && \
 # this a fast (~1s) egg-link creation with no resolution or downloads.
 RUN uv pip install --no-cache-dir --no-deps -e "."

+# ---------- Bake build-time git revision ----------
+# .dockerignore excludes .git, so `git rev-parse HEAD` from inside the
+# container always returns nothing — meaning `hermes dump` reports
+# "(unknown)" and the startup banner drops its `· upstream <sha>` suffix.
+# That makes support triage from container bug reports impossible:
+# we can't tell which commit the user is actually running.
+#
+# Fix: write the commit SHA passed via the HERMES_GIT_SHA build-arg to
+# /opt/hermes/.hermes_build_sha at build time, and have
+# hermes_cli/build_info.py read it at runtime.  Both `hermes dump` and
+# banner.get_git_banner_state() try the baked SHA first, then fall back
+# to live `git rev-parse` for source installs (unchanged behaviour).
+#
+# The arg is optional — local `docker build` without --build-arg simply
+# omits the file, and the runtime falls back to live-git lookup.  CI
+# (.github/workflows/docker-publish.yml) passes ${{ github.sha }} so
+# every published image has it.
+ARG HERMES_GIT_SHA=
+RUN if [ -n "${HERMES_GIT_SHA}" ]; then \
+        printf '%s\n' "${HERMES_GIT_SHA}" > /opt/hermes/.hermes_build_sha && \
+        chown hermes:hermes /opt/hermes/.hermes_build_sha; \
+    fi
+
 # ---------- s6-overlay service wiring ----------
 # Static services declared at build time: main-hermes + dashboard.
 # Per-profile gateway services are registered dynamically at runtime by
@ -213,13 +236,32 @@ COPY --chmod=0755 docker/cont-init.d/02-reconcile-profiles /etc/cont-init.d/02-r
 # ---------- Runtime ----------
 ENV HERMES_WEB_DIST=/opt/hermes/hermes_cli/web_dist
 ENV HERMES_HOME=/opt/data
+
+# `docker exec` privilege-drop shim. When operators run
+# `docker exec <c> hermes ...` they default to root, and any file the
+# command writes under $HERMES_HOME (auth.json, .env, config.yaml) ends
+# up root-owned and unreadable to the supervised gateway (UID 10000).
+# The shim lives at /opt/hermes/bin/hermes, sits earliest on PATH, and
+# transparently re-exec's the real venv binary via `s6-setuidgid hermes`
+# when invoked as root. Non-root callers (supervised processes,
+# `--user hermes`, etc.) hit the short-circuit path with no overhead.
+# Recursion is impossible because the shim exec's the venv binary by
+# absolute path (/opt/hermes/.venv/bin/hermes). See the shim source for
+# the opt-out env var (HERMES_DOCKER_EXEC_AS_ROOT=1).
+COPY --chmod=0755 docker/hermes-exec-shim.sh /opt/hermes/bin/hermes
+
 # Pre-s6 entrypoint.sh did `source .venv/bin/activate` which exported
 # the venv bin onto PATH; Architecture B's main-wrapper.sh does the
 # same for the container's main process, but `docker exec` and our
 # cont-init.d scripts don't pass through the wrapper. Expose the venv
 # bin globally so `docker exec <container> hermes ...` and any
 # subprocess that doesn't activate the venv first still find hermes.
-ENV PATH="/opt/hermes/.venv/bin:/opt/data/.local/bin:${PATH}"
+#
+# /opt/hermes/bin is prepended ahead of the venv so the privilege-drop
+# shim wins PATH resolution. The shim's last act is to exec the venv
+# binary by absolute path, so this PATH ordering is transparent to
+# every other consumer.
+ENV PATH="/opt/hermes/bin:/opt/hermes/.venv/bin:/opt/data/.local/bin:${PATH}"
 RUN mkdir -p /opt/data
 VOLUME [ "/opt/data" ]

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,4 +1,9 @@
 graft skills
 graft optional-skills
+# Bundled plugin manifests (plugin.yaml / plugin.yml). Without these the
+# PluginManager scan (hermes_cli/plugins.py) finds zero plugins on installs
+# built from the sdist (e.g. Homebrew, downstream packagers). package-data
+# below covers the wheel; this covers the sdist. See #34034 / #28149.
+recursive-include plugins plugin.yaml plugin.yml
 global-exclude __pycache__
 global-exclude *.py[cod]
--- a/RELEASE_v0.15.0.md
+++ b/RELEASE_v0.15.0.md
@ -0,0 +1,651 @@
+# Hermes Agent v0.15.0 (v2026.5.28)
+
+**Release Date:** May 28, 2026
+**Since v0.14.0:** 1,302 commits · 747 merged PRs · 1,746 files changed · 282,712 insertions · 36,699 deletions · 560+ issues closed (15 P0, 65 P1, 19 security-tagged) · 321 community contributors (including co-authors)
+
+> **The Velocity Release.** Hermes gets dramatically faster — to start, to run, to ship work, and to grow. The 16,083-line `run_agent.py` collapses to 3,821 (-76%) across 14 cohesive `agent/*` modules. Kanban grew into a real multi-agent platform across 104 PRs — orchestrator auto-decomposition, swarm topology, scheduled tasks, worktree-per-task, per-task model overrides. The cold-start perf wave keeps going: another second shaved off launch, 47% fewer per-conversation function calls, `hermes --version` flipping the head-to-head benchmark against Codex CLI. `session_search` is 4,500× faster and free now. Promptware defense lands against Brainworm-class attacks. Bitwarden Secrets Manager replaces N per-provider API keys with one bootstrap token. Skill bundles let one slash command load a whole workflow. The Ink TUI gets a multi-session orchestrator. Two new image_gen providers (Krea 2 Medium + Large, FAL ported to plugin), the Nous-approved MCP catalog with an interactive picker, an OpenHands orchestration skill, ntfy as the 23rd messaging platform, and a deep xAI integration round (Web Search plugin, xai-oauth `hermes proxy` upstream, retired-May-15 model detection + `hermes migrate xai`, natural TTS speech-tag pauses, base_url leak guard, OpenAI-style execution guidance for Grok). 15 P0 + 65 P1 closures alongside.
+
+---
+
+## ✨ Highlights
+
+- **The Big Refactor — `run_agent.py` is no longer 16,000 lines** — The file at the heart of Hermes — the agent conversation loop — has been reduced from 16,083 lines to 3,821 (-76%), with the extracted code redistributed across 14 cohesive modules under `agent/`. Behavior is unchanged: every extraction keeps a thin forwarder on `AIAgent`, every test patch path still works, every external caller is compatible. The reason you care: future Hermes development moves faster, plugin authors can finally grep the codebase, and the file that took 90 seconds to load in your editor opens in a blink. ([#27248](https://github.com/NousResearch/hermes-agent/pull/27248))
+
+- **Kanban grew into a real multi-agent platform — 104 PRs end to end** — Triage auto-decomposes one task into a tree of sub-tasks. `hermes kanban swarm` creates a full Swarm v1 graph in one command — root, parallel workers, gated verifier, gated synthesizer, shared blackboard. Tasks support per-task model overrides (cheap models for boilerplate, expensive ones for hard sub-tasks), board-level default workdirs, per-task worktree paths and branches, scheduled start times, configurable claim TTL, retry fingerprinting, stale-task detection, respawn guards, and a drag-to-delete trash zone. Workers report through `/workers/active`, `/runs/{id}`, and `/inspect` endpoints. ([#27572](https://github.com/NousResearch/hermes-agent/pull/27572), [#28443](https://github.com/NousResearch/hermes-agent/pull/28443), [#28364](https://github.com/NousResearch/hermes-agent/pull/28364), [#28394](https://github.com/NousResearch/hermes-agent/pull/28394), [#28462](https://github.com/NousResearch/hermes-agent/pull/28462), [#28384](https://github.com/NousResearch/hermes-agent/pull/28384), [#28467](https://github.com/NousResearch/hermes-agent/pull/28467), [#28455](https://github.com/NousResearch/hermes-agent/pull/28455), [#28452](https://github.com/NousResearch/hermes-agent/pull/28452), [#28432](https://github.com/NousResearch/hermes-agent/pull/28432), [#28468](https://github.com/NousResearch/hermes-agent/pull/28468), [#28420](https://github.com/NousResearch/hermes-agent/pull/28420))
+
+- **Cold-start perf wave keeps going — another second saved, 47% fewer per-turn function calls** — Three new optimization rounds: defer `openai._base_client` import (-240ms / -17MB on every CLI invocation), hot-path optimizations cut 47% of per-conversation function calls (399k → 213k for 31-turn chat), defer compression-feasibility check (-170 to -290ms on every agent construction), adaptive subprocess polling (-195ms per tool call, 1+ second per turn). Termux cold start drops from 2.9s to 0.8s. `hermes --version` cold drops 63% (701ms → 258ms), flipping the head-to-head benchmark against Codex CLI from 5/11 wins to 6/11. ([#28864](https://github.com/NousResearch/hermes-agent/pull/28864), [#28866](https://github.com/NousResearch/hermes-agent/pull/28866), [#28957](https://github.com/NousResearch/hermes-agent/pull/28957), [#29006](https://github.com/NousResearch/hermes-agent/pull/29006), [#29419](https://github.com/NousResearch/hermes-agent/pull/29419), [#30121](https://github.com/NousResearch/hermes-agent/pull/30121), [#30609](https://github.com/NousResearch/hermes-agent/pull/30609), [#31968](https://github.com/NousResearch/hermes-agent/pull/31968))
+
+- **`session_search` rebuilt — no LLM, no cost, 4,500× faster** — The old `session_search` was an aux-LLM-powered tool that cost ~$0.30/call and took ~30 seconds to summarize three sessions, sometimes confabulating when the right session wasn't even in the FTS5 hit list. The new shape is one tool with three modes (discovery, scroll, browse) inferred from which args are set — no `mode` parameter, no aux-LLM, no config knob, no companion skill. Discovery is ~20ms instead of ~90s; scroll is ~1ms. Searching your past sessions for context is now free and instant. ([#27590](https://github.com/NousResearch/hermes-agent/pull/27590))
+
+- **Promptware defense — Brainworm-class attacks blocked at three chokepoints** — Inspired by recent Brainworm / Promptware Kill Chain research (Origin HQ, arxiv 2601.09625), Hermes now defends the context window against prompt-injection attacks that try to hijack the agent via tool output, recalled memory, or stored skills. Single source of truth (`tools/threat_patterns.py`) with ~15 new Brainworm/C2 patterns; recalled memory is scanned at load time; tool results get delimiter markers so a malicious file or remote service can't impersonate Hermes' own system content. Paired with a new `security-guidance` plugin that pattern-matches dangerous code writes. ([#32269](https://github.com/NousResearch/hermes-agent/pull/32269), [#33131](https://github.com/NousResearch/hermes-agent/pull/33131), [#9151](https://github.com/NousResearch/hermes-agent/pull/9151))
+
+- **Bitwarden Secrets Manager — one bootstrap token replaces every per-provider API key** — Stop keeping plaintext API keys in `~/.hermes/.env`. Install Bitwarden Secrets Manager (`bws` auto-installs lazily on first use), point Hermes at it with one bootstrap token (`BWS_ACCESS_TOKEN`), and every credential you need comes from Bitwarden at startup. Rotate a key in the Bitwarden web app and the rotation actually takes effect — Bitwarden defaults to source-of-truth so its values overwrite matching env vars on startup. Flip `secrets.bitwarden.override_existing: false` to invert. EU Cloud and self-hosted Bitwarden server URLs supported. Detected credentials are now labeled with their source so you can see at a glance which keys came from Bitwarden vs. the local env. ([#30035](https://github.com/NousResearch/hermes-agent/pull/30035), [#31378](https://github.com/NousResearch/hermes-agent/pull/31378), [#30364](https://github.com/NousResearch/hermes-agent/pull/30364))
+
+- **ntfy as the 23rd messaging platform — push notifications without an account** — ntfy is the self-hostable push-notification service with no signup, no API key, just a topic URL. Hermes now adapts to it as a platform plugin (zero edits to core), so your agent can send you push notifications from any cron job, kanban task completion, or chat `send_message` — to your phone, your watch, your desktop, your homelab. (salvages [#30625](https://github.com/NousResearch/hermes-agent/pull/30625) → originally [#4043](https://github.com/NousResearch/hermes-agent/pull/4043)) ([#30867](https://github.com/NousResearch/hermes-agent/pull/30867))
+
+- **Skill bundles — `/<name>` loads multiple skills at once** — A skill bundle is a named group of skills that loads them all together with one slash command. Set up your "writing day" bundle (humanizer + ideation + obsidian + youtube-content) and `/writing-day` activates all four for the session. Skills Hub now has health checks, a freshness badge, and a watchdog cron. Three new optional skills land: `code-wiki` (Karpathy's LLM-Wiki, persistent indexed dev wiki), `openhands` (delegate to OpenHands for parallel coding agents), and `web-pentest` (OWASP-style web pentest recipes). ([#28373](https://github.com/NousResearch/hermes-agent/pull/28373), [#32345](https://github.com/NousResearch/hermes-agent/pull/32345), [#32240](https://github.com/NousResearch/hermes-agent/pull/32240), [#32261](https://github.com/NousResearch/hermes-agent/pull/32261), [#32265](https://github.com/NousResearch/hermes-agent/pull/32265))
+
+- **TUI session orchestrator — multiple live sessions in one TUI window** — The Ink TUI gained an active-session switcher overlay. List, switch between, refresh, and close multiple live process-local sessions without leaving the TUI; dispatch a new session with a session-scoped model picker. Plus a wave of TUI polish — mouse-tracking DEC mode presets, scrollback preservation across branches and termux, slash-dropdown fixes, x.com link rendering, and CJK / IME input rendering improvements. (salvages [#27642](https://github.com/NousResearch/hermes-agent/pull/27642)) ([#32980](https://github.com/NousResearch/hermes-agent/pull/32980), [#30084](https://github.com/NousResearch/hermes-agent/pull/30084))
+
+- **Two new image_gen providers — Krea 2 Medium + Large, FAL ported to plugin** — Krea joins the image_gen lineup as a built-in plugin: `Krea 2 Medium` ($0.03) and `Krea 2 Large` ($0.06), auto-discovered, selectable via `hermes tools` → Image Generation → Krea. Available through both the native Krea plugin and the FAL.ai catalog. The FAL.ai backend got pulled out of the monolithic image-generation tool into `plugins/image_gen/fal/`, completing the four-way architectural parity already established by web, browser, and video_gen — new image providers are now one file, not a fork. ([#33236](https://github.com/NousResearch/hermes-agent/pull/33236), [#30380](https://github.com/NousResearch/hermes-agent/pull/30380), [#33506](https://github.com/NousResearch/hermes-agent/pull/33506))
+
+- **Nous-approved MCP catalog with interactive picker** — A curated catalog of Nous-vetted MCP servers, mirroring the optional-skills shape. Run `hermes mcp` and you get an interactive picker; install with one keystroke, credentials prompted at install time and written to `~/.hermes/.env`. Ships with the n8n manifest first. Closes the discovery gap that left users hunting GitHub for trusted MCP servers. ([#30870](https://github.com/NousResearch/hermes-agent/pull/30870))
+
+- **OpenHands orchestration skill** — A new optional skill under `optional-skills/autonomous-ai-agents/openhands/` lets the agent delegate coding tasks to the OpenHands CLI alongside `claude-code`, `codex`, and `opencode`. OpenHands is the model-agnostic member of that family — any LiteLLM-supported provider works (OpenAI, Anthropic, OpenRouter, your own), so you can route a sub-task to the cheapest model that can finish it. Drop-in worker for kanban swarms and `/delegate` flows. (closes [#477](https://github.com/NousResearch/hermes-agent/issues/477)) ([#32261](https://github.com/NousResearch/hermes-agent/pull/32261))
+
+- **Deep xAI integration round — Web Search plugin, OAuth proxy upstream, May 15 retirement detection, natural TTS, security hardening** — Six interlocking xAI improvements:
+    - **xAI Web Search** lands as a `plugins/web/xai/` provider, slots alongside Brave / Tavily / Exa / SearXNG / DDGS / Firecrawl — reuses your existing Grok OAuth or `XAI_API_KEY` credentials, no new env vars. ([#29042](https://github.com/NousResearch/hermes-agent/pull/29042))
+    - **`hermes proxy` gains an xAI upstream** — your local OpenAI-compatible endpoint can now be backed by SuperGrok OAuth, no PKCE-refresh code to write in your client. ([#28356](https://github.com/NousResearch/hermes-agent/pull/28356))
+    - **May 15 model retirement detection** — `grok-4`, `grok-4-fast{,-reasoning,-non-reasoning}`, `grok-3`, `grok-code-fast-1`, `grok-imagine-image-pro` etc. are detected in doctor and chat startup, with `hermes migrate xai` to one-shot config migration to the supported model. No more silent 404s after the retirement date. ([#29277](https://github.com/NousResearch/hermes-agent/pull/29277))
+    - **Opt-in `auto_speech_tags`** for xAI TTS — inserts light `[pause]` tags between paragraphs and sentences for more natural-sounding voice replies. Default OFF. ([#29376](https://github.com/NousResearch/hermes-agent/pull/29376))
+    - **`xai-oauth` `base_url` pinned to `x.ai` origin** — closes a silent credential-leak vector where `XAI_BASE_URL` could repoint OAuth-authenticated inference to an attacker-controlled host. ([#28952](https://github.com/NousResearch/hermes-agent/pull/28952))
+    - **OpenAI-style execution guidance applied to Grok models** — Grok and xai-oauth now get the same family-specific execution discipline block GPT/Codex have, so the model stops claiming completion without tool calls and stops suggesting workarounds instead of using existing tools. ([#27797](https://github.com/NousResearch/hermes-agent/pull/27797))
+    - Plus `x_search` degraded-results surfacing, tier-gated 403 with API-key fallback, PKCE `code_challenge` round-trip fix, dead-token quarantine on terminal refresh failure, MiniMax-style short-token refresh on per-request, and `WKE=unauthenticated` honor at both classifier sites. ([#29484](https://github.com/NousResearch/hermes-agent/pull/29484), [#28351](https://github.com/NousResearch/hermes-agent/pull/28351), [#27560](https://github.com/NousResearch/hermes-agent/pull/27560), [#28116](https://github.com/NousResearch/hermes-agent/pull/28116), [#30619](https://github.com/NousResearch/hermes-agent/pull/30619), [#30872](https://github.com/NousResearch/hermes-agent/pull/30872))
+
+---
+
+## 🏗️ Core Agent & Architecture
+
+### The Big Refactor — `run_agent.py` 16k → 3.8k
+
+- `run_agent.py` from 16,083 → 3,821 lines (-76%), extracted into 14 cohesive `agent/*` modules. `run_conversation` alone was 3,877 lines before the refactor. Every extraction keeps a thin forwarder on `AIAgent`, every test-patch path is preserved, every external caller stays compatible. ([#27248](https://github.com/NousResearch/hermes-agent/pull/27248))
+
+### Agent loop & conversation
+
+- Auxiliary task layered fallback (primary → chain → main agent → graceful fail) on capacity errors (402/429/connection). (salvages [#26811](https://github.com/NousResearch/hermes-agent/pull/26811) + [#26998](https://github.com/NousResearch/hermes-agent/pull/26998)) ([#27625](https://github.com/NousResearch/hermes-agent/pull/27625))
+- Buffer retry/fallback status; surface only on terminal failure (no more noisy "retrying..." spam in mid-run output). ([#33816](https://github.com/NousResearch/hermes-agent/pull/33816))
+- Host contract for external context engines — condenses 5 prior PRs into one extension surface. ([#33750](https://github.com/NousResearch/hermes-agent/pull/33750))
+- Fallback immediately on provider content-policy blocks. ([#33883](https://github.com/NousResearch/hermes-agent/pull/33883))
+- Re-pad `reasoning_content` on cross-provider fallback to require-side providers. (salvage [#33784](https://github.com/NousResearch/hermes-agent/pull/33784)) ([#33795](https://github.com/NousResearch/hermes-agent/pull/33795))
+- Per-turn tool-outcome verifier — patch tool gets indent preservation, CRLF preservation, per-file failure escalation. ([#32273](https://github.com/NousResearch/hermes-agent/pull/32273))
+- Single-knob native vision for custom-provider models. ([#29679](https://github.com/NousResearch/hermes-agent/pull/29679))
+- Background review fork isolated from external memory plugins. ([#27190](https://github.com/NousResearch/hermes-agent/pull/27190))
+- Background review inherits parent toolset config for `tools[]` cache parity. ([#29704](https://github.com/NousResearch/hermes-agent/pull/29704))
+- Recover from providers returning list-type tool content. ([#30259](https://github.com/NousResearch/hermes-agent/pull/30259))
+- Treat partial-stream stub responses as length truncation rather than clean stop. ([#30998](https://github.com/NousResearch/hermes-agent/pull/30998))
+- OpenAI execution guidance applied to xAI Grok / xai-oauth. ([#27797](https://github.com/NousResearch/hermes-agent/pull/27797))
+- ContextVars propagate to concurrent tool worker threads.
+- Preload `jiter` native parser. ([#33692](https://github.com/NousResearch/hermes-agent/pull/33692))
+- Expose context engine tools with saved toolsets. (salvage of [#31194](https://github.com/NousResearch/hermes-agent/pull/31194)) ([#33719](https://github.com/NousResearch/hermes-agent/pull/33719))
+
+### Sessions & memory
+
+- `session_search` rebuilt — single-shape (discovery + scroll + browse), no aux-LLM, ~20ms vs. ~90s. ([#27590](https://github.com/NousResearch/hermes-agent/pull/27590))
+- Salvage [#29182](https://github.com/NousResearch/hermes-agent/pull/29182) — opt-in JSON snapshot writer for sessions. ([#29278](https://github.com/NousResearch/hermes-agent/pull/29278))
+- Persist `platform_message_id` for recall across gateway restarts. ([#29449](https://github.com/NousResearch/hermes-agent/pull/29449))
+- Inline memory-context mentions stay visible in conversation. ([#28132](https://github.com/NousResearch/hermes-agent/pull/28132))
+- Recalled memory labeled informational, not authoritative. ([#28583](https://github.com/NousResearch/hermes-agent/pull/28583))
+- Memory + context-engine tool injection gated on `enabled_toolsets`. ([#30177](https://github.com/NousResearch/hermes-agent/pull/30177))
+- Guard against external drift in `MEMORY.md` / `USER.md`. ([#30877](https://github.com/NousResearch/hermes-agent/pull/30877))
+- Honcho runtime peer mapping — correctness follow-ups + setup wizard + docs. ([#30077](https://github.com/NousResearch/hermes-agent/pull/30077))
+- Periodic memory logging for leak detection. (salvage of [#17667](https://github.com/NousResearch/hermes-agent/pull/17667)) ([#27102](https://github.com/NousResearch/hermes-agent/pull/27102))
+
+### Codex / Responses-API maturation
+
+- TTFB watchdog for stalled Codex Responses streams. ([#32042](https://github.com/NousResearch/hermes-agent/pull/32042))
+- Actionable hint when stale-call detector fires on known silent-reject pattern. ([#32016](https://github.com/NousResearch/hermes-agent/pull/32016), [#33133](https://github.com/NousResearch/hermes-agent/pull/33133))
+- Drop SDK `responses.stream()` helper; consume events directly. ([#33042](https://github.com/NousResearch/hermes-agent/pull/33042))
+- Gracefully recover from `invalid_encrypted_content`. (salvage of [#10144](https://github.com/NousResearch/hermes-agent/pull/10144)) ([#33035](https://github.com/NousResearch/hermes-agent/pull/33035))
+- Recover Codex Responses streams with null output. ([#32963](https://github.com/NousResearch/hermes-agent/pull/32963), [#33390](https://github.com/NousResearch/hermes-agent/pull/33390))
+- Drop foreign-issuer reasoning and transient `rs_tmp` reasoning replay state. ([#33156](https://github.com/NousResearch/hermes-agent/pull/33156), [#33146](https://github.com/NousResearch/hermes-agent/pull/33146))
+- Codex 429 quota classified as rate-limit, not missing credentials. ([#33168](https://github.com/NousResearch/hermes-agent/pull/33168))
+- Codex chat path falls back to credential_pool when singleton is empty. ([#33189](https://github.com/NousResearch/hermes-agent/pull/33189))
+- Codex re-auth syncs credential_pool. ([#33164](https://github.com/NousResearch/hermes-agent/pull/33164))
+- Omit `tools` key when no tools registered. ([#33409](https://github.com/NousResearch/hermes-agent/pull/33409))
+- Parse Codex image-generation SSE directly. ([#32933](https://github.com/NousResearch/hermes-agent/pull/32933))
+
+---
+
+## 🎛️ Kanban — Multi-Agent Maturation Wave
+
+### Orchestration & dispatch
+
+- Orchestrator-driven auto-decomposition on triage. ([#27572](https://github.com/NousResearch/hermes-agent/pull/27572))
+- Kanban swarm topology helper — `hermes kanban swarm` creates a Swarm v1 graph (root + parallel workers + gated verifier + gated synthesizer + shared blackboard). (salvages [#26791](https://github.com/NousResearch/hermes-agent/pull/26791) by @Niraven) ([#28443](https://github.com/NousResearch/hermes-agent/pull/28443))
+- Dispatcher wires review agents from the review column. ([#28449](https://github.com/NousResearch/hermes-agent/pull/28449))
+- Stale-detection for running tasks in dispatcher. ([#28452](https://github.com/NousResearch/hermes-agent/pull/28452))
+- Respawn guard blocks repeat worker storms. ([#28455](https://github.com/NousResearch/hermes-agent/pull/28455))
+- Respawn guard defers `blocker_auth` instead of auto-blocking. ([#28683](https://github.com/NousResearch/hermes-agent/pull/28683))
+- Cross-profile cron jobs surface in dashboard. ([#28457](https://github.com/NousResearch/hermes-agent/pull/28457))
+- Worker visibility endpoints: `/workers/active`, `/runs/{id}`, `/inspect`. (salvages [#23761](https://github.com/NousResearch/hermes-agent/pull/23761) by @Interstellar-code) ([#28432](https://github.com/NousResearch/hermes-agent/pull/28432))
+
+### Task configuration & scheduling
+
+- Per-task model override. ([#28364](https://github.com/NousResearch/hermes-agent/pull/28364))
+- Board-level default workdir. ([#28394](https://github.com/NousResearch/hermes-agent/pull/28394))
+- Configurable worktree paths and branches. ([#28462](https://github.com/NousResearch/hermes-agent/pull/28462))
+- Scheduled task start times. ([#28384](https://github.com/NousResearch/hermes-agent/pull/28384))
+- Scheduled status for delayed follow-ups. ([#28467](https://github.com/NousResearch/hermes-agent/pull/28467))
+- Trimmed task comments. ([#28399](https://github.com/NousResearch/hermes-agent/pull/28399))
+- Initial-status for human-ops cards. ([#28414](https://github.com/NousResearch/hermes-agent/pull/28414))
+- `max_in_progress` config to cap concurrent running tasks. ([#28420](https://github.com/NousResearch/hermes-agent/pull/28420))
+- Filter tasks by workflow fields. ([#28454](https://github.com/NousResearch/hermes-agent/pull/28454))
+- `--sort` for `hermes kanban list`. ([#28427](https://github.com/NousResearch/hermes-agent/pull/28427))
+- Optional `board` parameter on all MCP tools. ([#28444](https://github.com/NousResearch/hermes-agent/pull/28444))
+- Stamp originating ACP session_id on tasks. ([#28447](https://github.com/NousResearch/hermes-agent/pull/28447))
+- `auto_promote_children` config toggle. ([#28344](https://github.com/NousResearch/hermes-agent/pull/28344))
+- `archive --rm` to hard-delete archived tasks. ([#28355](https://github.com/NousResearch/hermes-agent/pull/28355))
+- Promote dependents when parent is archived. ([#28372](https://github.com/NousResearch/hermes-agent/pull/28372))
+- Promote blocked tasks when parent dependencies complete. ([#28377](https://github.com/NousResearch/hermes-agent/pull/28377))
+- Demote ready children when parent is reopened. ([#28382](https://github.com/NousResearch/hermes-agent/pull/28382))
+- `promote` verb for manual `todo→ready` recovery + bulk `--ids`. (salvage [#29464](https://github.com/NousResearch/hermes-agent/pull/29464)) ([#31334](https://github.com/NousResearch/hermes-agent/pull/31334))
+
+### Dashboard
+
+- Drag-to-delete trash zone + bulk delete. ([#28468](https://github.com/NousResearch/hermes-agent/pull/28468))
+- Surface per-task `model_override` in show + tool output. ([#28442](https://github.com/NousResearch/hermes-agent/pull/28442))
+- Cross-profile notification delivery via `kanban.notification_sources`. ([#28395](https://github.com/NousResearch/hermes-agent/pull/28395))
+- Scratch-workspace deletion warning for users. ([#30949](https://github.com/NousResearch/hermes-agent/pull/30949))
+- Mobile dashboard UX polish. ([#28127](https://github.com/NousResearch/hermes-agent/pull/28127))
+
+### Reliability
+
+- Worker log retention configurable. ([#27867](https://github.com/NousResearch/hermes-agent/pull/27867))
+- Configurable claim TTL. ([#28392](https://github.com/NousResearch/hermes-agent/pull/28392))
+- Fingerprint crash errors to prevent fleet-wide retry exhaustion. ([#28380](https://github.com/NousResearch/hermes-agent/pull/28380))
+- Reset failure counters on `unblock_task`. ([#28379](https://github.com/NousResearch/hermes-agent/pull/28379))
+- Detect cycles in `decompose_triage_task` sibling-link pre-validation. ([#28088](https://github.com/NousResearch/hermes-agent/pull/28088))
+- Surface unusable triage auxiliary model (auto-decompose aware). ([#27871](https://github.com/NousResearch/hermes-agent/pull/27871))
+- Align failure diagnostics with retry limit. ([#27868](https://github.com/NousResearch/hermes-agent/pull/27868))
+- Align worker terminal timeout with task runtime. ([#27864](https://github.com/NousResearch/hermes-agent/pull/27864))
+- Auto-install bundled skills (kanban-worker) on init. ([#28368](https://github.com/NousResearch/hermes-agent/pull/28368))
+- Make legacy task migration idempotent. ([#28397](https://github.com/NousResearch/hermes-agent/pull/28397))
+- Serialize DB initialization. ([#28383](https://github.com/NousResearch/hermes-agent/pull/28383))
+- Persist worker session metadata on completion. ([#28387](https://github.com/NousResearch/hermes-agent/pull/28387))
+- Pass `accept-hooks` to worker chat subprocess. ([#28393](https://github.com/NousResearch/hermes-agent/pull/28393))
+- Preserve worker tools with restricted toolsets. ([#28396](https://github.com/NousResearch/hermes-agent/pull/28396))
+- Avoid unsafe Windows worker Hermes shim resolution. ([#28398](https://github.com/NousResearch/hermes-agent/pull/28398))
+- Sync slash subcommands with live parser. ([#28376](https://github.com/NousResearch/hermes-agent/pull/28376))
+- Show scheduled kanban tasks in dashboard. ([#28400](https://github.com/NousResearch/hermes-agent/pull/28400))
+- Assign single-task kanban decompositions. ([#28401](https://github.com/NousResearch/hermes-agent/pull/28401))
+- Configurable `max_tokens` for kanban specify. ([#28374](https://github.com/NousResearch/hermes-agent/pull/28374))
+- Per-job profile support for cron. ([#28124](https://github.com/NousResearch/hermes-agent/pull/28124))
+- Codex app-server: include every Kanban-pinned path in `writable_roots`. ([#28435](https://github.com/NousResearch/hermes-agent/pull/28435))
+- Cache kanban worker guidance at session init for prompt-cache reuse. ([#28425](https://github.com/NousResearch/hermes-agent/pull/28425))
+
+---
+
+## ⚡ Performance
+
+- `openai._base_client` import deferred — 240ms / 17MB off every CLI cold start. ([#28864](https://github.com/NousResearch/hermes-agent/pull/28864))
+- Agent-loop hot-path optimizations — 47% fewer per-conversation function calls (399k → 213k for 31-turn chat). ([#28866](https://github.com/NousResearch/hermes-agent/pull/28866))
+- Compression-feasibility check deferred — 170-290ms off every agent construction. ([#28957](https://github.com/NousResearch/hermes-agent/pull/28957))
+- Adaptive subprocess poll — ~195ms off every tool call, 1+ second per turn. ([#29006](https://github.com/NousResearch/hermes-agent/pull/29006))
+- Termux TUI cold start speedup. ([#29419](https://github.com/NousResearch/hermes-agent/pull/29419))
+- Termux non-TUI cold start speedup. (salvage [#29438](https://github.com/NousResearch/hermes-agent/pull/29438)) ([#30121](https://github.com/NousResearch/hermes-agent/pull/30121))
+- Termux fast-path version + deferred bare-prompt agent startup. ([#30609](https://github.com/NousResearch/hermes-agent/pull/30609))
+- Cut hermes `--version` wall time 63% — flips head-to-head vs Codex CLI. ([#31968](https://github.com/NousResearch/hermes-agent/pull/31968))
+- Date-only timestamp + loud gateway-DB roundtrip logging — improves prompt-cache hit rate. ([#27675](https://github.com/NousResearch/hermes-agent/pull/27675))
+- Cache kanban worker guidance at session init for prompt-cache reuse. ([#28425](https://github.com/NousResearch/hermes-agent/pull/28425))
+
+---
+
+## 🔧 Tool System
+
+### Tool surface
+
+- `patch`: indent preservation, CRLF preservation, per-file failure escalation. ([#32273](https://github.com/NousResearch/hermes-agent/pull/32273))
+- `terminal`: warn at call time when `background=true` runs silently. ([#31289](https://github.com/NousResearch/hermes-agent/pull/31289))
+- `terminal`: nudge homebrewed CI pollers at the tool surface. ([#33142](https://github.com/NousResearch/hermes-agent/pull/33142))
+- `x_search`: surface degraded results + validate dates. ([#29484](https://github.com/NousResearch/hermes-agent/pull/29484))
+- `x_search`: auto-enable toolset when xAI credentials are configured. ([#27376](https://github.com/NousResearch/hermes-agent/pull/27376))
+- `computer_use`: route SOM/vision captures via auxiliary.vision. ([#30126](https://github.com/NousResearch/hermes-agent/pull/30126))
+- `transcription`: reject symlinked audio inputs. ([#10082](https://github.com/NousResearch/hermes-agent/pull/10082))
+- TTS: prevent double `[pause]` in xAI auto speech tags. ([#32237](https://github.com/NousResearch/hermes-agent/pull/32237))
+- TTS: preserve native audio outside Telegram voice delivery. ([#28512](https://github.com/NousResearch/hermes-agent/pull/28512))
+- TTS: opt-in xAI `auto_speech_tags` speech-tag pauses for natural voice replies. ([#29376](https://github.com/NousResearch/hermes-agent/pull/29376))
+- Voice: chunk oversized CLI recordings. ([#30044](https://github.com/NousResearch/hermes-agent/pull/30044))
+- Voice: honor `PULSE_SERVER` / `PIPEWIRE_REMOTE` inside Docker. ([#22534](https://github.com/NousResearch/hermes-agent/pull/22534))
+
+### Browser
+
+- All cloud browser providers (Browserbase, Anchor, Camofox, Hyperbrowser, etc.) migrated to image_gen-style plugins. (salvages [#25580](https://github.com/NousResearch/hermes-agent/pull/25580)) ([#27403](https://github.com/NousResearch/hermes-agent/pull/27403))
+- Auto-launch Chromium-family browser for CDP. ([#29106](https://github.com/NousResearch/hermes-agent/pull/29106))
+- Docker: discover agent-browser Chromium binary at boot. ([#33184](https://github.com/NousResearch/hermes-agent/pull/33184))
+
+### Image generation
+
+- **Krea** provider plugin (Krea 2 Medium + Large). ([#33236](https://github.com/NousResearch/hermes-agent/pull/33236))
+- FAL backend ported to `plugins/image_gen/fal`. (salvage [#27966](https://github.com/NousResearch/hermes-agent/pull/27966)) ([#30380](https://github.com/NousResearch/hermes-agent/pull/30380))
+- Cache xAI ephemeral URL responses to disk. ([#31759](https://github.com/NousResearch/hermes-agent/pull/31759))
+
+### Web search
+
+- **xAI Web Search** as a provider plugin. ([#29042](https://github.com/NousResearch/hermes-agent/pull/29042))
+
+### MCP
+
+- **Nous-approved MCP catalog** with interactive picker. ([#30870](https://github.com/NousResearch/hermes-agent/pull/30870))
+- **TLS client certificate (mTLS) support** for HTTP and SSE MCP servers. ([#33721](https://github.com/NousResearch/hermes-agent/pull/33721))
+- Stdin paste-back fallback for headless OAuth flow. ([#32053](https://github.com/NousResearch/hermes-agent/pull/32053))
+- `skip` at paste prompt bypasses auth without disabling server. ([#32069](https://github.com/NousResearch/hermes-agent/pull/32069))
+- Registry-aware `mcp_` prefix on both ends of round-trip. ([#31700](https://github.com/NousResearch/hermes-agent/pull/31700))
+
+---
+
+## 🧩 Skills Ecosystem
+
+### Skills system
+
+- **Skill bundles** — `/<name>` loads multiple skills. ([#28373](https://github.com/NousResearch/hermes-agent/pull/28373))
+- Skills Hub: health checks, freshness badge, and a watchdog cron. ([#32345](https://github.com/NousResearch/hermes-agent/pull/32345))
+- Opt-in AST deep diagnostics on skill writes. (salvage of [#30918](https://github.com/NousResearch/hermes-agent/pull/30918)) ([#31198](https://github.com/NousResearch/hermes-agent/pull/31198))
+- Bundled/pinned skill protection in background-review prompts. ([#28338](https://github.com/NousResearch/hermes-agent/pull/28338))
+- Show user-modified skill names in bundled skill sync summary. ([#28671](https://github.com/NousResearch/hermes-agent/pull/28671))
+- Load symlinked skill slash commands. ([#27759](https://github.com/NousResearch/hermes-agent/pull/27759))
+- Deduplicate Skills Hub search results by identifier, not name. ([#29490](https://github.com/NousResearch/hermes-agent/pull/29490))
+
+### New skills
+
+- `openhands` — delegate-to-OpenHands orchestration skill (closes [#477](https://github.com/NousResearch/hermes-agent/issues/477)) ([#32261](https://github.com/NousResearch/hermes-agent/pull/32261))
+- `code-wiki` — persistent indexed dev wiki (closes [#486](https://github.com/NousResearch/hermes-agent/issues/486)) ([#32240](https://github.com/NousResearch/hermes-agent/pull/32240))
+- `web-pentest` — OWASP recipes (closes [#400](https://github.com/NousResearch/hermes-agent/issues/400)) ([#32265](https://github.com/NousResearch/hermes-agent/pull/32265))
+- `baoyu-article-illustrator` ([#28287](https://github.com/NousResearch/hermes-agent/pull/28287))
+
+---
+
+## ☁️ Providers
+
+### xAI deep integration
+
+- **xAI Web Search** as a `plugins/web/xai/` provider plugin. ([#29042](https://github.com/NousResearch/hermes-agent/pull/29042))
+- **`hermes proxy` xAI upstream** — OpenAI-compatible local proxy backed by xai-oauth. ([#28356](https://github.com/NousResearch/hermes-agent/pull/28356))
+- **May 15 model retirement detection + `hermes migrate xai`** for grok-4 / grok-3 / grok-code-fast-1 / grok-imagine-image-pro. ([#29277](https://github.com/NousResearch/hermes-agent/pull/29277))
+- **Opt-in `auto_speech_tags`** for natural xAI TTS voice replies. ([#29376](https://github.com/NousResearch/hermes-agent/pull/29376))
+- **xai-oauth base_url pinned to x.ai origin** — closes silent credential-leak vector. ([#28952](https://github.com/NousResearch/hermes-agent/pull/28952))
+- **OpenAI-style execution guidance** applied to Grok / xai-oauth models. ([#27797](https://github.com/NousResearch/hermes-agent/pull/27797))
+- xAI: detect retired May 15 models in doctor/chat startup. ([#29277](https://github.com/NousResearch/hermes-agent/pull/29277))
+- xAI: resolve Grok Build context for OAuth. ([#30579](https://github.com/NousResearch/hermes-agent/pull/30579))
+- xAI OAuth: tier-gated 403 with API-key fallback. ([#28351](https://github.com/NousResearch/hermes-agent/pull/28351))
+- xAI OAuth: PKCE `code_challenge` echo. ([#27560](https://github.com/NousResearch/hermes-agent/pull/27560))
+- xAI OAuth: quarantine dead tokens on terminal refresh failure. ([#28116](https://github.com/NousResearch/hermes-agent/pull/28116))
+- xAI OAuth: honor `WKE=unauthenticated` disambiguator at both classifier sites. ([#30872](https://github.com/NousResearch/hermes-agent/pull/30872))
+- xAI OAuth: accept bare-code manual paste (state=None). (closes [#26923](https://github.com/NousResearch/hermes-agent/issues/26923)) ([#33880](https://github.com/NousResearch/hermes-agent/pull/33880))
+- xAI OAuth: fall back to manual paste on loopback timeout. ([#33231](https://github.com/NousResearch/hermes-agent/pull/33231))
+- xAI proxy: handle 429 rate-limit responses in proxy retry path. ([#33743](https://github.com/NousResearch/hermes-agent/pull/33743))
+
+### Other providers
+
+- **OpenAI API as a first-class provider** (distinct from Codex runtime). ([#31898](https://github.com/NousResearch/hermes-agent/pull/31898))
+- **Microsoft Entra ID** auth for Azure Foundry (with 1M Anthropic-Messages beta preserved on Bearer). (salvages [#27509](https://github.com/NousResearch/hermes-agent/pull/27509), [#27022](https://github.com/NousResearch/hermes-agent/pull/27022)) ([#28101](https://github.com/NousResearch/hermes-agent/pull/28101), [#28084](https://github.com/NousResearch/hermes-agent/pull/28084))
+- **OpenRouter** sticky routing — `session_id` passed via `extra_body` so a long-running session keeps landing on the same upstream provider. (@Cybourgeoisie) ([#33939](https://github.com/NousResearch/hermes-agent/pull/33939))
+- Nous: JWT token for inference; stop replaying invalid Nous refresh tokens. (@rewbs) ([#27663](https://github.com/NousResearch/hermes-agent/pull/27663))
+- Nous Portal: one-shot setup, status CLI, and Nous-included markers. ([#30860](https://github.com/NousResearch/hermes-agent/pull/30860))
+- Anthropic adapter: extract 7 helpers from `convert_messages_to_anthropic`. (salvage [#27784](https://github.com/NousResearch/hermes-agent/pull/27784)) ([#30386](https://github.com/NousResearch/hermes-agent/pull/30386))
+- Catalog: add `qwen3.7-max` to Alibaba + Alibaba-Coding-Plan model lists. ([#33129](https://github.com/NousResearch/hermes-agent/pull/33129))
+- opencode-go: route `qwen3.7-max` via `anthropic_messages`. (@beardthelion) ([#32780](https://github.com/NousResearch/hermes-agent/pull/32780))
+- opencode-go: expose Kimi K2 + DeepSeek reasoning controls. ([#30845](https://github.com/NousResearch/hermes-agent/pull/30845))
+- Remove Vercel AI Gateway and Vercel Sandbox.
+- MiniMax OAuth: refresh short-lived access tokens per request. ([#30619](https://github.com/NousResearch/hermes-agent/pull/30619))
+- Codex OAuth: quarantine terminal refresh errors. ([#28118](https://github.com/NousResearch/hermes-agent/pull/28118))
+- Codex: drop dead model slugs that HTTP 400 on ChatGPT Pro. ([#33424](https://github.com/NousResearch/hermes-agent/pull/33424))
+- Codex: sync `manual:device_code` pool entries on re-auth. ([#33744](https://github.com/NousResearch/hermes-agent/pull/33744))
+- MiniMax OAuth: quarantine terminal refresh errors. ([#28119](https://github.com/NousResearch/hermes-agent/pull/28119))
+
+---
+
+## 🔑 Secrets
+
+- **Bitwarden Secrets Manager** integration with lazy `bws` install. ([#30035](https://github.com/NousResearch/hermes-agent/pull/30035))
+- Bitwarden: EU Cloud + self-hosted server URL support. ([#31378](https://github.com/NousResearch/hermes-agent/pull/31378))
+- Label detected credentials with their source (Bitwarden). ([#30364](https://github.com/NousResearch/hermes-agent/pull/30364))
+
+---
+
+## 📱 Messaging Platforms (Gateway)
+
+### Gateway core
+
+- **Deliverable mode** — agents ship artifacts as native uploads from any platform (Slack/Discord/Telegram/Teams/Email). ([#27813](https://github.com/NousResearch/hermes-agent/pull/27813))
+- `hermes send` — pipe any script's output to any messaging platform. (salvage of [#19631](https://github.com/NousResearch/hermes-agent/pull/19631)) ([#27188](https://github.com/NousResearch/hermes-agent/pull/27188))
+- Debounce queued text follow-ups during active sessions. (salvage of [#31235](https://github.com/NousResearch/hermes-agent/pull/31235)) ([#31341](https://github.com/NousResearch/hermes-agent/pull/31341))
+- Plugin-transformed final_response delivered through streaming gate. ([#31433](https://github.com/NousResearch/hermes-agent/pull/31433))
+- Refresh cached agent tools on `/reload-mcp`. ([#32815](https://github.com/NousResearch/hermes-agent/pull/32815))
+- Harden kanban + provider cleanup races on long-running workloads. ([#29479](https://github.com/NousResearch/hermes-agent/pull/29479))
+
+### New / reorganized adapters
+
+- **ntfy** — 23rd platform, push notifications, plugin shape, zero core edits. (salvages [#30625](https://github.com/NousResearch/hermes-agent/pull/30625) → [#4043](https://github.com/NousResearch/hermes-agent/pull/4043)) ([#30867](https://github.com/NousResearch/hermes-agent/pull/30867))
+- **Discord** adapter migrated to bundled plugin. (salvage of [#24356](https://github.com/NousResearch/hermes-agent/pull/24356)) ([#30591](https://github.com/NousResearch/hermes-agent/pull/30591))
+- **Mattermost** adapter migrated to bundled plugin. (salvage of [#30916](https://github.com/NousResearch/hermes-agent/pull/30916)) ([#31748](https://github.com/NousResearch/hermes-agent/pull/31748))
+
+### Telegram
+
+- Edit status messages in place instead of appending. (based on [#30141](https://github.com/NousResearch/hermes-agent/pull/30141) by @qike-ms) ([#30864](https://github.com/NousResearch/hermes-agent/pull/30864))
+- Skip-STT audio path + 2GB cap via local Bot API server. ([#28541](https://github.com/NousResearch/hermes-agent/pull/28541))
+- Route image documents (.png/.jpg/.webp/.gif) through vision pipeline. ([#28519](https://github.com/NousResearch/hermes-agent/pull/28519))
+- Route audio file attachments away from STT pipeline. ([#28478](https://github.com/NousResearch/hermes-agent/pull/28478))
+- `disable_topic_auto_rename` gateway flag. ([#28523](https://github.com/NousResearch/hermes-agent/pull/28523))
+- `ignore_root_dm` config to drop messages without thread_id. ([#28536](https://github.com/NousResearch/hermes-agent/pull/28536))
+- Chat-scoped auth without sender user_id. ([#28525](https://github.com/NousResearch/hermes-agent/pull/28525))
+- Fail-closed auth fallback when `TELEGRAM_ALLOWED_USERS` is empty. ([#28494](https://github.com/NousResearch/hermes-agent/pull/28494))
+- Roll over tool progress bubbles + scope audio_file_paths. ([#28482](https://github.com/NousResearch/hermes-agent/pull/28482))
+- Avoid duplicate text after auto-TTS voice replies. ([#28509](https://github.com/NousResearch/hermes-agent/pull/28509))
+- Mark final voice reply notify-worthy so Telegram delivers it audibly. ([#28504](https://github.com/NousResearch/hermes-agent/pull/28504))
+
+### Discord
+
+- Recover Windows voice opus decoding. ([#33182](https://github.com/NousResearch/hermes-agent/pull/33182))
+- `allow_any_attachment` config to accept arbitrary file types. ([#27245](https://github.com/NousResearch/hermes-agent/pull/27245))
+- Transcribe native voice notes. ([#28993](https://github.com/NousResearch/hermes-agent/pull/28993))
+- Define UI view classes after lazy install. ([#28817](https://github.com/NousResearch/hermes-agent/pull/28817))
+
+### Signal / Matrix / Feishu / Slack / WeCom
+
+- Signal: `require_mention` filter for group chats. ([#28574](https://github.com/NousResearch/hermes-agent/pull/28574))
+- Matrix: warn on clock-skew silent message drops. ([#27330](https://github.com/NousResearch/hermes-agent/pull/27330))
+- Matrix E2EE installs full dep set; plugins respect `is_connected`. ([#31688](https://github.com/NousResearch/hermes-agent/pull/31688))
+- Feishu: require webhook auth secret + honor config extras. ([#30746](https://github.com/NousResearch/hermes-agent/pull/30746))
+- Feishu: enforce auth and chat binding for approval buttons. ([#30744](https://github.com/NousResearch/hermes-agent/pull/30744))
+- Slack: socket recovery + Windows restart dedupe. ([#28873](https://github.com/NousResearch/hermes-agent/pull/28873))
+- WeCom: safe-parse untrusted XML. ([#32442](https://github.com/NousResearch/hermes-agent/pull/32442))
+
+### DingTalk / Webhooks / Microsoft Graph
+
+- DingTalk: transcribe native voice notes. ([#28993](https://github.com/NousResearch/hermes-agent/pull/28993))
+- Webhook: enforce `INSECURE_NO_AUTH` safety rail on dynamic route reloads. ([#30863](https://github.com/NousResearch/hermes-agent/pull/30863))
+- Webhook: restrict default toolset capabilities. ([#30745](https://github.com/NousResearch/hermes-agent/pull/30745))
+- Microsoft Graph: harden webhook auth requirements. ([#30169](https://github.com/NousResearch/hermes-agent/pull/30169))
+
+---
+
+## 🖥️ CLI & TUI
+
+### CLI
+
+- `/update` slash command in CLI and TUI. ([#23854](https://github.com/NousResearch/hermes-agent/pull/23854))
+- Update auto-rollback when post-pull syntax check fails. ([#28669](https://github.com/NousResearch/hermes-agent/pull/28669))
+- `--branch` flag for `hermes update`. (@jquesnelle) ([#29591](https://github.com/NousResearch/hermes-agent/pull/29591))
+- `/exit --delete` flag to remove session on quit. (salvage of [#17665](https://github.com/NousResearch/hermes-agent/pull/17665)) ([#27101](https://github.com/NousResearch/hermes-agent/pull/27101))
+- `▶ N` indicator in status bar for running `/background` tasks. ([#27175](https://github.com/NousResearch/hermes-agent/pull/27175))
+- Live background terminal-process count in status bar. ([#32061](https://github.com/NousResearch/hermes-agent/pull/32061))
+- Append session recap to `/status` output. (salvage of [#18587](https://github.com/NousResearch/hermes-agent/pull/18587)) ([#27176](https://github.com/NousResearch/hermes-agent/pull/27176))
+- Configurable paste-collapse thresholds (TUI + CLI). (salvage [#29723](https://github.com/NousResearch/hermes-agent/pull/29723)) ([#32087](https://github.com/NousResearch/hermes-agent/pull/32087))
+- `/resume` accepts position numbers. ([#31709](https://github.com/NousResearch/hermes-agent/pull/31709))
+- Bring tool-call display back — verbose mode, specific failure reasons, todo progress. ([#31293](https://github.com/NousResearch/hermes-agent/pull/31293))
+- Validate runtime token refresh in Qwen auth status. ([#31196](https://github.com/NousResearch/hermes-agent/pull/31196))
+
+### TUI
+
+- **TUI session orchestrator** — multiple live sessions in one TUI window. (salvages [#27642](https://github.com/NousResearch/hermes-agent/pull/27642)) ([#32980](https://github.com/NousResearch/hermes-agent/pull/32980))
+- `mouse_tracking` DEC mode presets. (salvage of [#26681](https://github.com/NousResearch/hermes-agent/pull/26681) by @OutThisLife) ([#30084](https://github.com/NousResearch/hermes-agent/pull/30084))
+- Termux scrollback preservation + touch-friendly defaults. ([#28910](https://github.com/NousResearch/hermes-agent/pull/28910))
+- Full assistant text in scrollback (no history truncation). ([#28829](https://github.com/NousResearch/hermes-agent/pull/28829))
+- Preserve scrollback when branching sessions. ([#30162](https://github.com/NousResearch/hermes-agent/pull/30162))
+- Preserve Python dunder identifiers in markdown. ([#28582](https://github.com/NousResearch/hermes-agent/pull/28582))
+- Active profile shown in TUI prompt. ([#28581](https://github.com/NousResearch/hermes-agent/pull/28581))
+- Improve Charizard completion menu contrast. ([#28346](https://github.com/NousResearch/hermes-agent/pull/28346))
+- Stop slash dropdown chopping last char of `/goal`. ([#31311](https://github.com/NousResearch/hermes-agent/pull/31311))
+- Clipboard copy on linux/wayland. ([#29342](https://github.com/NousResearch/hermes-agent/pull/29342))
+- Anchor `splitReasoning` unclosed-tag regex; stop eating last paragraph. ([#29426](https://github.com/NousResearch/hermes-agent/pull/29426))
+- Surface verbose tool details. ([#30225](https://github.com/NousResearch/hermes-agent/pull/30225))
+- Load Linux skills on Termux + salvage @adybag14-cyber's Termux gates. ([#30166](https://github.com/NousResearch/hermes-agent/pull/30166))
+- Handle images with codex app-server. ([#31220](https://github.com/NousResearch/hermes-agent/pull/31220))
+- Refresh virtual transcript on viewport resize. ([#31077](https://github.com/NousResearch/hermes-agent/pull/31077))
+- Ignore late thinking deltas after completion. ([#31055](https://github.com/NousResearch/hermes-agent/pull/31055))
+- Commit composer input bursts immediately. ([#31053](https://github.com/NousResearch/hermes-agent/pull/31053))
+- Log parent gateway lifecycle exits. ([#31051](https://github.com/NousResearch/hermes-agent/pull/31051))
+- Clear TTS env var on voice off + TTS indicator in status bar. ([#30987](https://github.com/NousResearch/hermes-agent/pull/30987))
+- Pass `--expose-gc` as node argv instead of NODE_OPTIONS. ([#29998](https://github.com/NousResearch/hermes-agent/pull/29998))
+- Align composer cursorLayout with wrap-ansi to kill multiline cursor drift. ([#27489](https://github.com/NousResearch/hermes-agent/pull/27489))
+- Harden Terminal.app rendering and color paths. ([#27251](https://github.com/NousResearch/hermes-agent/pull/27251))
+- Keep `/goal` verdict out of compact status row. ([#27971](https://github.com/NousResearch/hermes-agent/pull/27971))
+- Clamp curses color 8 for 8-color terminals (Docker). ([#30260](https://github.com/NousResearch/hermes-agent/pull/30260))
+
+---
+
+## 🔒 Security & Reliability
+
+### Promptware & memory hardening
+
+- **Promptware defense** — shared threat patterns + memory load-time scan + tool-result delimiters. ([#32269](https://github.com/NousResearch/hermes-agent/pull/32269))
+- Expand memory content scanning patterns to parity with skills guard. ([#9151](https://github.com/NousResearch/hermes-agent/pull/9151))
+- Harden Skills Guard multi-word prompt patterns. (@YLChen-007) ([#26852](https://github.com/NousResearch/hermes-agent/pull/26852))
+- Split cron scanner so skill prose stops false-positiving exfil patterns. ([#32339](https://github.com/NousResearch/hermes-agent/pull/32339))
+
+### File safety
+
+- Protect Hermes control-plane files from prompt injection (`auth.json`, `config.yaml`, `webhook_subscriptions.json`, `mcp-tokens/`). (salvages @PratikRai0101's [#14157](https://github.com/NousResearch/hermes-agent/pull/14157)) ([#30397](https://github.com/NousResearch/hermes-agent/pull/30397))
+- Write-deny `<root>/.env` when running under a profile. ([#29687](https://github.com/NousResearch/hermes-agent/pull/29687))
+- Defense-in-depth read-deny on credential stores. (salvages [#17659](https://github.com/NousResearch/hermes-agent/pull/17659) + [#8055](https://github.com/NousResearch/hermes-agent/pull/8055)) ([#30721](https://github.com/NousResearch/hermes-agent/pull/30721))
+- TTS `output_path` traversal + update ZIP symlink reject. (salvage [#6693](https://github.com/NousResearch/hermes-agent/pull/6693) + [#15881](https://github.com/NousResearch/hermes-agent/pull/15881)) ([#32056](https://github.com/NousResearch/hermes-agent/pull/32056))
+- Reject symlinked audio inputs. ([#10082](https://github.com/NousResearch/hermes-agent/pull/10082))
+
+### Credential safety
+
+- Avoid persisting borrowed credential secrets — runtime env-sourced keys no longer leak into `auth.json`. ([#31416](https://github.com/NousResearch/hermes-agent/pull/31416))
+- Validate Nous Portal `inference_base_url` against host allowlist. (salvages [#27612](https://github.com/NousResearch/hermes-agent/pull/27612)) ([#30611](https://github.com/NousResearch/hermes-agent/pull/30611))
+- Harden API server key placeholder handling. ([#30738](https://github.com/NousResearch/hermes-agent/pull/30738))
+- Harden Google Chat OAuth credential persistence. (@Zyrixtrex) ([#24788](https://github.com/NousResearch/hermes-agent/pull/24788))
+- xAI OAuth: pin inference `base_url` to x.ai origin. ([#28952](https://github.com/NousResearch/hermes-agent/pull/28952))
+- Quarantine dead OAuth tokens on terminal refresh failure (xAI, Codex, MiniMax). ([#28116](https://github.com/NousResearch/hermes-agent/pull/28116), [#28118](https://github.com/NousResearch/hermes-agent/pull/28118), [#28119](https://github.com/NousResearch/hermes-agent/pull/28119))
+
+### Supply-chain
+
+- **On-demand supply-chain audit via OSV.dev** — `hermes audit`. ([#31460](https://github.com/NousResearch/hermes-agent/pull/31460))
+- `hermes update` syntax-validates critical files post-pull, auto-rollback on failure. ([#28669](https://github.com/NousResearch/hermes-agent/pull/28669))
+- Quarantine `hermes.exe` vs concurrent Windows instance. ([#26677](https://github.com/NousResearch/hermes-agent/pull/26677))
+
+### Other hardening
+
+- Restrict default webhook toolset capabilities. ([#30745](https://github.com/NousResearch/hermes-agent/pull/30745))
+- Harden Microsoft Graph webhook auth requirements. ([#30169](https://github.com/NousResearch/hermes-agent/pull/30169))
+- Require source CIDR allowlisting for public msgraph webhook binds. ([#33722](https://github.com/NousResearch/hermes-agent/pull/33722))
+- Require `API_SERVER_KEY` before dispatching API server work. ([#33232](https://github.com/NousResearch/hermes-agent/pull/33232))
+- env_passthrough: apply GHSA-rhgp-j443-p4rf filter to config.yaml path. (@roadhero) ([#27794](https://github.com/NousResearch/hermes-agent/pull/27794))
+- Dashboard + WeCom: restrict markdown link schemes; safe-parse untrusted XML. ([#32442](https://github.com/NousResearch/hermes-agent/pull/32442))
+- Salvage project-plugin RCE bypass fix from PR [#29311](https://github.com/NousResearch/hermes-agent/pull/29311) (GHSA-5qr3-c538-wm9j). ([#30837](https://github.com/NousResearch/hermes-agent/pull/30837))
+- Cross-profile soft guard on file-write tools + system-prompt hint. ([#31290](https://github.com/NousResearch/hermes-agent/pull/31290))
+- Reject unsafe tar members in Android psutil compatibility installer. ([#33742](https://github.com/NousResearch/hermes-agent/pull/33742))
+- Reject non-regular tar members during tirith auto-install. ([#33786](https://github.com/NousResearch/hermes-agent/pull/33786))
+
+---
+
+## 🪟 Native Windows (Beta Continued)
+
+- Complete Windows bootstrap — `dep_ensure` + `install.ps1` + detection. (@alt-glitch) ([#27845](https://github.com/NousResearch/hermes-agent/pull/27845))
+- `install.ps1`: strip BOM, `-Commit`/`-Tag` pin params, harden git ops. (@jquesnelle) ([#28169](https://github.com/NousResearch/hermes-agent/pull/28169))
+- Consolidate ACP browser bootstrap into `install.{sh,ps1}`. (@alt-glitch) ([#27851](https://github.com/NousResearch/hermes-agent/pull/27851))
+- `hermes update` quarantines live `hermes.exe`. ([#26677](https://github.com/NousResearch/hermes-agent/pull/26677))
+- Discord voice opus decoding on Windows. ([#33182](https://github.com/NousResearch/hermes-agent/pull/33182))
+- Windows Docker Desktop compatible compose file. (@Sunil123135) ([#31031](https://github.com/NousResearch/hermes-agent/pull/31031))
+
+---
+
+## 🖥️ Web Dashboard
+
+- Hardened Slack socket recovery + Windows restart dedupe. ([#28873](https://github.com/NousResearch/hermes-agent/pull/28873))
+- Web dashboard: migrate checkboxes to `@nous-research/ui` + design-system polish. (@austinpickett) ([#28814](https://github.com/NousResearch/hermes-agent/pull/28814))
+- Web dashboard: collapsible sidebar. (@austinpickett) ([#33421](https://github.com/NousResearch/hermes-agent/pull/33421))
+- Dashboard typography & contrast pass. (salvage of [#28832](https://github.com/NousResearch/hermes-agent/pull/28832)) ([#30714](https://github.com/NousResearch/hermes-agent/pull/30714))
+- Skills page: lazy-fetch catalog instead of bundling 34MB into JS. ([#33809](https://github.com/NousResearch/hermes-agent/pull/33809))
+
+---
+
+## 🐳 Docker
+
+- **s6-overlay container supervision** — abstract `ServiceManager` protocol (systemd/launchd/Windows/s6 backends), per-profile gateway supervision in-container, container-restart reconciliation, hadolint/shellcheck CI. (salvage of [#30136](https://github.com/NousResearch/hermes-agent/pull/30136), @benbarclay) ([#31760](https://github.com/NousResearch/hermes-agent/pull/31760))
+- Auto-redirect `gateway run` to supervised mode inside the s6 image. (@benbarclay) ([#33583](https://github.com/NousResearch/hermes-agent/pull/33583))
+- Tee supervised gateway stdout to docker logs. (@benbarclay) ([#33621](https://github.com/NousResearch/hermes-agent/pull/33621))
+- Drop `docker exec` to hermes uid before invoking the CLI. (@benbarclay) ([#33628](https://github.com/NousResearch/hermes-agent/pull/33628))
+- Align HOME for dashboard and s6 gateway services. (@Dusk1e) ([#33481](https://github.com/NousResearch/hermes-agent/pull/33481))
+- Bake build-time git SHA into image so `hermes dump` reports it. (@benbarclay) ([#33655](https://github.com/NousResearch/hermes-agent/pull/33655))
+- `hermes update` prints `docker pull` guidance instead of bogus git error. (@benbarclay) ([#33659](https://github.com/NousResearch/hermes-agent/pull/33659))
+- Upgrade Node to 22 LTS via multi-stage from `node:22-bookworm-slim`. (@benbarclay) ([#33060](https://github.com/NousResearch/hermes-agent/pull/33060))
+- Drop `build-essential` from apt install. (@benbarclay) ([#33028](https://github.com/NousResearch/hermes-agent/pull/33028))
+- Propagate env through s6 to cont-init and main CMD. ([#32412](https://github.com/NousResearch/hermes-agent/pull/32412))
+- Targeted chown to preserve host file ownership in `HERMES_HOME`. ([#33033](https://github.com/NousResearch/hermes-agent/pull/33033))
+- `mkdir HERMES_HOME` as root in stage2 before chown / privilege drop. ([#33078](https://github.com/NousResearch/hermes-agent/pull/33078))
+- chown `ui-tui` and `node_modules` on UID remap so TUI esbuild works. ([#33045](https://github.com/NousResearch/hermes-agent/pull/33045))
+- Include `anthropic`, `bedrock`, `azure-identity` extras in image. ([#30504](https://github.com/NousResearch/hermes-agent/pull/30504))
+- Stop pushing per-commit SHA tags to Docker Hub. ([#29387](https://github.com/NousResearch/hermes-agent/pull/29387))
+- Simplify Docker tagging — push both `:main` and `:latest` on main push. ([#33225](https://github.com/NousResearch/hermes-agent/pull/33225))
+- Test slicing across GH actions jobs. (@ethernet8023) ([#30575](https://github.com/NousResearch/hermes-agent/pull/30575))
+- Discover agent-browser Chromium binary at boot. ([#33184](https://github.com/NousResearch/hermes-agent/pull/33184))
+
+---
+
+## 🌐 API Server
+
+- **Session control API** — `/api/sessions/*` (list/create/read/patch/delete/fork) + SSE-streaming chat. (salvages [#29302](https://github.com/NousResearch/hermes-agent/pull/29302) by @Codename-11 + multimodal followup by @Schwartz10) ([#33134](https://github.com/NousResearch/hermes-agent/pull/33134))
+- `GET /v1/skills` and `/v1/toolsets`. ([#33016](https://github.com/NousResearch/hermes-agent/pull/33016))
+- Coerce stringified booleans in stream/store/approval payloads. (salvage [#26639](https://github.com/NousResearch/hermes-agent/pull/26639)) ([#27293](https://github.com/NousResearch/hermes-agent/pull/27293))
+- Honor `key_env` in auth-failure fallback resolution. ([#30840](https://github.com/NousResearch/hermes-agent/pull/30840))
+
+---
+
+## 🎟️ ACP (VS Code / Zed / JetBrains)
+
+- Session edit auto-approval modes. (salvage of [#27034](https://github.com/NousResearch/hermes-agent/pull/27034)) ([#27862](https://github.com/NousResearch/hermes-agent/pull/27862))
+- Enrich Zed permission cards — command in title + `reject_always`. ([#28148](https://github.com/NousResearch/hermes-agent/pull/28148))
+- Replay session history before responding to `session/load`. ([#26957](https://github.com/NousResearch/hermes-agent/pull/26957), [#26943](https://github.com/NousResearch/hermes-agent/pull/26943))
+- Plugin-transformed final_response delivered through streaming gate. ([#31433](https://github.com/NousResearch/hermes-agent/pull/31433))
+
+---
+
+## 🔌 Plugin Surface
+
+- `register_tts_provider()` plugin hook. (salvage of [#30420](https://github.com/NousResearch/hermes-agent/pull/30420)) ([#31745](https://github.com/NousResearch/hermes-agent/pull/31745))
+- `register_transcription_provider()` hook + `stt.providers` command-provider registry. (salvage of [#30493](https://github.com/NousResearch/hermes-agent/pull/30493)) ([#31907](https://github.com/NousResearch/hermes-agent/pull/31907))
+- `register_auxiliary_task()` in PluginContext API. (salvage [#29817](https://github.com/NousResearch/hermes-agent/pull/29817)) ([#31177](https://github.com/NousResearch/hermes-agent/pull/31177))
+- Bundled `security-guidance` plugin. ([#33131](https://github.com/NousResearch/hermes-agent/pull/33131))
+- Discord and Mattermost migrated to bundled plugins. ([#30591](https://github.com/NousResearch/hermes-agent/pull/30591), [#31748](https://github.com/NousResearch/hermes-agent/pull/31748))
+- ntfy as platform plugin. ([#30867](https://github.com/NousResearch/hermes-agent/pull/30867))
+- Surface category-namespaced plugins in `hermes plugins list`. ([#27187](https://github.com/NousResearch/hermes-agent/pull/27187))
+- Plugin discovery failures raised to WARNING level. ([#28318](https://github.com/NousResearch/hermes-agent/pull/28318))
+- `hermes_plugins` included in gateway.log component filter. ([#28313](https://github.com/NousResearch/hermes-agent/pull/28313))
+- Seed plugin extras before `is_connected` gate. ([#31703](https://github.com/NousResearch/hermes-agent/pull/31703))
+- Dashboard: allowlist plugin assets + denylist subprocess-influencing env vars. ([#32277](https://github.com/NousResearch/hermes-agent/pull/32277))
+
+---
+
+## 📦 Distribution & Install
+
+- Install-method stamping + Docker detection. (@alt-glitch) ([#27843](https://github.com/NousResearch/hermes-agent/pull/27843))
+- Nix `#messaging` and `#full` package variants. (@alt-glitch) ([#33108](https://github.com/NousResearch/hermes-agent/pull/33108))
+- Pre-load messaging gateway deps via `--extra messaging`. (salvage [#26394](https://github.com/NousResearch/hermes-agent/pull/26394)) ([#27558](https://github.com/NousResearch/hermes-agent/pull/27558))
+- Avoid piping installer directly into `iex` (Windows). ([#28347](https://github.com/NousResearch/hermes-agent/pull/28347))
+- Ship bundled skills in wheel. ([#28421](https://github.com/NousResearch/hermes-agent/pull/28421))
+- Ship dashboard plugin assets in wheel. ([#28406](https://github.com/NousResearch/hermes-agent/pull/28406))
+- Make Camofox lazy-installed instead of eager. ([#27055](https://github.com/NousResearch/hermes-agent/pull/27055))
+- Wire STT lazy-install into transcription_tools.py. ([#30256](https://github.com/NousResearch/hermes-agent/pull/30256))
+
+---
+
+## 🐛 Notable Bug Fixes (highlights only)
+
+- Match bare custom provider by active base URL in `hermes model`. ([#28908](https://github.com/NousResearch/hermes-agent/pull/28908))
+- Route `auxiliary.vision.provider=openai` to api.openai.com, skip text-only main. ([#31452](https://github.com/NousResearch/hermes-agent/pull/31452))
+- Lint: skip per-file shell linter when LSP will handle the file. ([#29054](https://github.com/NousResearch/hermes-agent/pull/29054))
+- Treat empty credential pool entries as unauthenticated in `/model` picker. ([#28312](https://github.com/NousResearch/hermes-agent/pull/28312))
+- Reverted within window: Firecrawl integration tag, send_message @username auto-mentions, Telegram quick-command-only menus, Telegram pin-on-turn.
+
+---
+
+## 🧪 Testing
+
+- Disarm lazy-install probe so `_HAS_FASTER_WHISPER` patches work. ([#30334](https://github.com/NousResearch/hermes-agent/pull/30334))
+- Cover default board dashboard pin. ([#28361](https://github.com/NousResearch/hermes-agent/pull/28361))
+- Cover `_task_dict` `task_age` fallback. ([#28365](https://github.com/NousResearch/hermes-agent/pull/28365))
+- Allowlist `tmp_path` for `kanban_notify` artifact delivery tests. ([#30851](https://github.com/NousResearch/hermes-agent/pull/30851), [#30852](https://github.com/NousResearch/hermes-agent/pull/30852))
+- Cover null output stream terminal events in Codex. ([#33137](https://github.com/NousResearch/hermes-agent/pull/33137))
+
+---
+
+## 📚 Documentation
+
+- **30-day docs overhaul** — full correctness audit, every PR in the window covered, Nous Portal weave, sidebar reorg. ([#33782](https://github.com/NousResearch/hermes-agent/pull/33782))
+- Dedicated Nous Portal integration page and setup guide. ([#31296](https://github.com/NousResearch/hermes-agent/pull/31296))
+- Providers: move Nous Portal first, Google Gemini OAuth last. ([#31287](https://github.com/NousResearch/hermes-agent/pull/31287))
+- `session_search` rewrite for single-shape tool. ([#27840](https://github.com/NousResearch/hermes-agent/pull/27840))
+- Kanban: document failure_limit, max_retries, inline create shortcuts, goals & kanban settings. ([#28357](https://github.com/NousResearch/hermes-agent/pull/28357), [#28358](https://github.com/NousResearch/hermes-agent/pull/28358), [#28359](https://github.com/NousResearch/hermes-agent/pull/28359), [#28360](https://github.com/NousResearch/hermes-agent/pull/28360), [#28362](https://github.com/NousResearch/hermes-agent/pull/28362))
+- Kanban Codex lane skill. ([#28430](https://github.com/NousResearch/hermes-agent/pull/28430))
+- xAI OAuth: note X Premium+ also unlocks Grok OAuth. ([#29055](https://github.com/NousResearch/hermes-agent/pull/29055))
+- Docs site: Docker audio bridge notes, "Installing more tools in the container", xurl auth HOME in Docker.
+- Email: clarify gateway vs Himalaya setup. (@helix4u) ([#33634](https://github.com/NousResearch/hermes-agent/pull/33634))
+- Auth docs: replace stale `hermes login` references with `hermes auth add`. ([#32859](https://github.com/NousResearch/hermes-agent/pull/32859))
+
+---
+
+## 👥 Contributors
+
+### Core
+- @teknium1 (lead)
+
+### Notable salvages & cherry-picks
+
+- **@benbarclay** — s6-overlay container supervision (29 commits salvaged), Node 22 LTS upgrade, build-essential cleanup, `gateway run` auto-redirect in s6, tee supervised stdout to docker logs, `hermes update` Docker guidance, build-time SHA stamping
+- **@OutThisLife** — `mouse_tracking` DEC mode presets
+- **@jquesnelle** — Windows installer hardening, `--branch` flag for `hermes update`, install.ps1 BOM strip / commit-pin
+- **@alt-glitch** — Windows `dep_ensure` bootstrap, Nix package variants (`.#messaging`, `.#full`), install-method stamping, ACP browser bootstrap consolidation
+- **@austinpickett** — `/update` slash command, dashboard checkboxes → `@nous-research/ui`, mobile dashboard polish, collapsible sidebar
+- **@ethernet8023** — CI test slicing across GH Actions jobs, TUI clipboard copy fix
+- **@kshitijk4poor** — doctor section banner + fail-and-issue helpers extraction, post-tag salvage cluster (curator-fallout, kanban SQLite hardening, install world-readable uv dirs, xAI bare-code paste)
+- **@rewbs** — Nous JWT inference switch + refresh-token replay fix
+- **@Codename-11** + **@Schwartz10** — session control API (REST + SSE + multimodal followup)
+- **@Niraven** — kanban swarm topology helper
+- **@Interstellar-code** — kanban worker visibility endpoints
+- **@adybag14-cyber** — termux cold-start optimizations (multiple PRs)
+- **@qike-ms** — Telegram in-place status edits design
+- **@sprmn24** — ntfy adapter
+- **@Jaaneek** — xAI Web Search provider plugin
+- **@yannsunn** — xAI upstream adapter for `hermes proxy`
+- **@Cybourgeoisie** — OpenRouter sticky routing via session_id
+- **@memosr** — Nous Portal base_url allowlist validation
+- **@Sunil123135** — Windows Docker Desktop compose file
+- **@Dusk1e** — Docker HOME alignment for dashboard + s6 gateway services
+- **@beardthelion** — opencode-go anthropic_messages routing
+- **@YLChen-007** — Skills Guard multi-word prompt patterns
+- **@roadhero** — env_passthrough GHSA-rhgp-j443-p4rf filter
+- **@Zyrixtrex** — Google Chat OAuth credential persistence hardening
+- **@briandevans**, **@tomqiaozc** — defense-in-depth read-deny on credential stores
+- **@PratikRai0101** — control-plane file write protection
+- **@helix4u**, **@Bartok9**, **@zccyman** — auxiliary fallback ladder components
+- **@ms-alan**, **@ticketclosed-wontfix**, **@donovan-yohan** — TUI session orchestrator + follow-ups
+- **@daimon-nous[bot]** — cron per-job profile support
+- **@bisko** — re-pad `reasoning_content` on cross-provider fallback
+
+### All Contributors
+
+@02356abc, @0xchainer, @0xDevNinja, @0xjackyang, @0xsir0000, @0z1-ghb, @8bit64k, @aaronlab, @AceWattGit,
+@ACR27, @adam91holt, @AdamPlatin123, @Ade5954, @AdityaRajeshGadgil, @adybag14-cyber, @AhmetArif0, @ai-hana-ai,
+@alaamohanad169-ship-it, @alber70g, @albert748, @alt-glitch, @aqilaziz, @argabor, @asdlem, @austinpickett,
+@avifenesh, @awizemann, @B0Tch1, @Bartok9, @BaxBit, @Beandon13, @beardthelion, @benbarclay, @bensargotest-sys,
+@binhnt92, @bird, @bisko, @BlackishGreen33, @booker1207, @bradhallett, @briandevans, @Brixyy, @brndnsvr,
+@BROCCOLO1D, @btorresgil, @burjorjee, @carltonawong, @Carry00, @chaconne67, @chdlc, @chromalinx, @ChyuWei,
+@CipherFrame, @cmullins70, @CNSeniorious000, @codeblackhole1024, @Codename-11, @colin-chang, @counterposition,
+@cresslank, @CryptoByz, @cyb0rgk1tty, @Cybourgeoisie, @daizhonggeng, @darvsum, @davidcampbelldc, @deas,
+@dgians, @dillweed, @DoGMaTiiC, @donovan-yohan, @draplater, @Drexuxux, @dskwe, @dsr-restyn, @Dusk1e,
+@dusterbloom, @duyua9, @egilewski, @el-analista, @eliteworkstation94-ai, @eloklam, @EloquentBrush0x, @emonty,
+@emozilla, @erhnysr, @erikengervall, @Erosika, @ether-btc, @ethernet8023, @EvilHumphrey, @fabiosiqueira,
+@falasi, @falconexe, @fardoche6, @felix-windsor, @Fewmanism, @ffr31mr, @flamiinngo, @flanny7, @flooryyyy,
+@fonhal, @francip, @fujinice, @gianfrancopiana, @glennc, @Glucksberg, @godlin-gh, @Grogger, @guillaumemeyer,
+@Gutslabs, @H-Ali13381, @hanzckernel, @haran2001, @hawknewton, @hayka-pacha, @hehehe0803, @helix4u, @HenkDz,
+@Hermes, @hermesagent26, @Hinotoi-agent, @hongchen1993, @honor2030, @houenyang-momo, @ht1072, @hueilau,
+@iamfoz, @ilonagaja509-glitch, @InB4DevOps, @indigokarasu, @Interstellar-code, @iqdoctor, @iRonin, @Jaaneek,
+@JabberELF, @jacevys, @jackey8616, @jackjin1997, @jdelmerico, @jfuenmayor, @Jiahui-Gu, @JimLiu, @joe102084,
+@JohnC1009, @jonpol01, @Jpalmer95, @Julientalbot, @justemu, @justincc, @jvinals, @karthikeyann, @kasunvinod,
+@kchuang1015, @kenyonxu, @khungate, @kiranvk-2011, @kjames2001, @konsisumer, @kpadilha, @kriscolab,
+@krislidimo, @kronexoi, @kshitijk4poor, @kunci115, @Kylejeong2, @kylekahraman, @LaPhilosophie, @leeseoki0,
+@lemassykoi, @Lempkey, @LeonJS, @LeonSGP43, @lidge-jun, @LifeJiggy, @liuhao1024, @LizerAIDev, @loicnico96,
+@loongfay, @m0n3r0, @malaiwah, @matthewlai, @mavrickdeveloper, @maxmilian, @McClean-Edison, @memosr,
+@Mind-Dragon, @momowind, @MoonJuhan, @MoonRay305, @moortekweb-art, @MorAlekss, @ms-alan, @Nami4D,
+@nehaaprasaad, @nekwo, @nftpoetrist, @NickLarcombe, @nidhi-singh02, @Niraven, @nnnet, @noctilust, @novax635,
+@nthrow, @nv-kasikritc, @nycomar, @OCWC22, @oemtalks, @OmX, @ooovenenoso, @orcool, @oseftg, @outsourc-e,
+@OutThisLife, @Paperclip, @PaTTeeL, @pepelax, @phoenixshen, @Pluviobyte, @pnascimento9596, @pochi-gio, @pr7426,
+@PratikRai0101, @Prithvi1994, @psionic73, @ptichalouf, @Que0x, @QuenVix, @quocanh261997, @qWaitCrypto, @Qwinty,
+@r266-tech, @rak135, @rdasilva1016-ui, @rewbs, @roadhero, @rodrigoeqnit, @RonHillDev, @roycepersonalassistant,
+@rudi193-cmd, @RyanRana, @sadiksaifi, @samahn0601, @samggggflynn, @SamuelZ12, @sanghyuk-seo-nexcube,
+@Saurav0989, @savanne-kham, @Schrotti77, @Schwartz10, @SerenityTn, @sgtworkman, @sharziki, @shaun0927,
+@shellybotmoyer, @shunsuke-hikiyama, @SimbaKingjoe, @SimoKiihamaki, @sir-ad, @Slimydog21, @slowtokki0409,
+@Soju06, @someaka, @soynchux, @sprmn24, @Stark-X, @steezkelly, @stepanov1975, @stephenschoettler,
+@stevehq26-bot, @steveonjava, @Strontvod, @subtract0, @Sunil123135, @superearn-fisher, @Sylw3ster, @tchanee,
+@that-ambuj, @thedavidmurray, @TheOnlyMika, @therahul-yo, @thewillhuang, @ticketclosed-wontfix, @Timur00Kh,
+@tomqiaozc, @Tosko4, @Tranquil-Flow, @tw2818, @uzunkuyruk, @vaddisrinivas, @vanthinh6886, @vgocoder,
+@victorGPT, @vynxevainglory-ai, @waefrebeorn, @walli, @wangpuv, @wanwan2qq, @wesleysimplicio, @worlldz,
+@wpengpeng168, @WuKongAI-CMU, @wuli666, @Wysie, @wysie, @xxxigm, @yannsunn, @YanzhongSu, @YarrowQiao, @ygd58,
+@YLChen-007, @yoniebans, @yu-xin-c, @YuanHanzhong, @zapabob, @zccyman, @ziliangpeng, @zwolniony, @Zyrixtrex
+
+---
+
+**Full Changelog**: [v2026.5.16...v2026.5.28](https://github.com/NousResearch/hermes-agent/compare/v2026.5.16...v2026.5.28)
--- a/RELEASE_v0.15.1.md
+++ b/RELEASE_v0.15.1.md
@ -0,0 +1,110 @@
+# Hermes Agent v0.15.1 (v2026.5.29)
+
+**Release Date:** May 29, 2026
+**Since v0.15.0:** 28 commits · 21 merged PRs · hotfix release · 9 contributors
+
+> **The Patch Release.** A same-day hotfix for v0.15.0. Headline fix: the dashboard infinite-reload loop that hit anyone running v0.15.0 in loopback mode (Docker, hosted Hermes, fresh installs). A handful of other v0.15.0 follow-ups go along for the ride — kanban worker SIGTERM, `/model` picker unification, `/yolo` session bypass, the full 19,932-entry skills.sh catalog, `.md` media delivery restoration, gateway probe-stepdown safety, web-URL redaction passthrough, kanban worker vision on referenced images, hindsight observation-default. Docker users get an explicit `--insecure` opt-in env var (no more bind-host inference), MCP server bare-command PATH resolution, and arm64 PR-build cache fixes.
+
+---
+
+## ✨ Highlights
+
+- **Dashboard 401 reload loop fixed** — In loopback mode the dashboard's identity probe (`/api/auth/me`) returns 401 by design, but v0.15.0's stale-token reload guard treated every 401 as a rotated session token and full-page-reloaded to pick up a fresh one. Every successful sibling call cleared the one-shot reload guard, so the page reload-looped forever (Firefox: "Navigated to /sessions" storm; Chrome: React re-render storm). Fix adds an `allowUnauthorized` opt-out to `fetchJSON` that skips only the loopback stale-token reload — 401 still throws so `AuthWidget` swallows it, gated-mode `login_url` redirects are unaffected. Closes [#34206](https://github.com/NousResearch/hermes-agent/issues/34206), [#34202](https://github.com/NousResearch/hermes-agent/issues/34202). ([#30698](https://github.com/NousResearch/hermes-agent/pull/30698) — @austinpickett)
+
+- **Docker dashboard `--insecure` is now an explicit env opt-in, never derived from bind host** — Previously the Docker entrypoint inferred `--insecure` when the dashboard bound to a non-loopback host. That conflated "I want LAN access" with "I want to disable the same-origin guard." The fix splits them: bind host is bind host, and disabling the dashboard's loopback auth requires an explicit `HERMES_DASHBOARD_INSECURE=1`. Existing setups that genuinely wanted insecure binding must now set the env var. ([#34188](https://github.com/NousResearch/hermes-agent/pull/34188), [#34204](https://github.com/NousResearch/hermes-agent/pull/34204) — @benbarclay)
+
+- **MCP bare command resolution under Docker** — MCP servers configured with bare commands (`npx`, `npm`, `node`) now resolve against `/usr/local/bin` so they actually launch inside the Docker image where those binaries live. v0.15.0 left these failing silently in containers when the agent's effective PATH didn't include the Node toolchain location. ([#34186](https://github.com/NousResearch/hermes-agent/pull/34186) — @benbarclay)
+
+- **Skills page sidebar / source pills restored** — A stale `useMemo` dependency in the new dashboard skills page collapsed the source pills and category sidebar to "All" only. Fixed; both surfaces now reflect the live catalog state. ([#34194](https://github.com/NousResearch/hermes-agent/pull/34194))
+
+- **Kanban worker can be killed again** — `SIGTERM` on a kanban worker was being absorbed by an intermediate process and the worker stayed running. Closes [#28181](https://github.com/NousResearch/hermes-agent/issues/28181). ([#34045](https://github.com/NousResearch/hermes-agent/pull/34045))
+
+- **Full skills.sh catalog (858 → 19,932 entries)** — The skills hub page was pulling a partial paginated catalog. The fetch now walks the sitemap, so all 19,932 skills.sh entries surface in the picker instead of just the first 858. ([#34025](https://github.com/NousResearch/hermes-agent/pull/34025))
+
+---
+
+## 🐛 Bug Fixes
+
+### Dashboard / Web
+
+- **`/api/auth/me` 401 no longer triggers reload loop** in loopback mode — ([#30698](https://github.com/NousResearch/hermes-agent/pull/30698) — @austinpickett)
+- **Skills page source pills + category sidebar restored** — stale `useMemo` dep ([#34194](https://github.com/NousResearch/hermes-agent/pull/34194))
+
+### Docker
+
+- **`--insecure` is now explicit opt-in via env var**, not derived from bind host ([#34188](https://github.com/NousResearch/hermes-agent/pull/34188) — @benbarclay)
+- **Dashboard test suite repaired** to match the insecure-opt-in fix ([#34204](https://github.com/NousResearch/hermes-agent/pull/34204) — @benbarclay)
+- **arm64 PR builds skip the GHA cache** to avoid cache-thrash on cross-arch builders ([#33704](https://github.com/NousResearch/hermes-agent/pull/33704) — @BROCCOLO1D)
+
+### MCP
+
+- **Bare `npx`/`npm`/`node` resolve against `/usr/local/bin`** for Docker compatibility ([#34186](https://github.com/NousResearch/hermes-agent/pull/34186) — @benbarclay)
+
+### Kanban
+
+- **Worker SIGTERM actually terminates the process** ([#34045](https://github.com/NousResearch/hermes-agent/pull/34045))
+- **Workers receive images referenced in task bodies** for vision-capable models ([#34210](https://github.com/NousResearch/hermes-agent/pull/34210))
+
+### Gateway
+
+- **`.md` files deliver again** — media-delivery validation defaults to denylist-only instead of an overly-narrow allowlist ([#34022](https://github.com/NousResearch/hermes-agent/pull/34022))
+- **Probe stepdown safety** — on a context-overflow without an explicit provider context limit, the agent no longer steps down to a smaller model based on an unknown ceiling (salvage of [#33673](https://github.com/NousResearch/hermes-agent/pull/33673)) ([#33826](https://github.com/NousResearch/hermes-agent/pull/33826))
+
+### CLI
+
+- **`/yolo` mid-session enables the per-session bypass** instead of just toggling the env var (which the running agent had already snapshotted) ([#33931](https://github.com/NousResearch/hermes-agent/pull/33931) — @kshitijk4poor)
+- **`/model` and `hermes model` show the same list**, plus disk cache for picker startup ([#33867](https://github.com/NousResearch/hermes-agent/pull/33867))
+
+### Skills
+
+- **Full skills.sh catalog via sitemap** — 858 → 19,932 entries ([#34025](https://github.com/NousResearch/hermes-agent/pull/34025))
+
+### Redaction
+
+- **Web URLs pass through unchanged** — the redactor was eating query parameters that looked credential-shaped ([#34029](https://github.com/NousResearch/hermes-agent/pull/34029))
+
+---
+
+## ✨ Small Features
+
+- **Hindsight default narrowed to observation-only** for `recall_types` — tool path is also narrowed ([#34079](https://github.com/NousResearch/hermes-agent/pull/34079) — @nicoloboschi, follow-up [#34091](https://github.com/NousResearch/hermes-agent/pull/4df62d239e38bf8c212a595721c9c01e176f6c3a) — @kshitijk4poor)
+- **Memory providers receive completed-turn message context** — salvage of [#28065](https://github.com/NousResearch/hermes-agent/pull/28065) ([#34097](https://github.com/NousResearch/hermes-agent/pull/34097) — @kshitijk4poor, credit to @devwdave)
+
+---
+
+## 📚 Documentation
+
+- **`--no-supervise` / `HERMES_GATEWAY_NO_SUPERVISE` documented** in the reference docs (follow-up to [#33583](https://github.com/NousResearch/hermes-agent/pull/33583)) ([#33751](https://github.com/NousResearch/hermes-agent/pull/33751) — @r266-tech)
+
+---
+
+## 🛠️ Infrastructure
+
+- **Vercel deploy workflow accepts `workflow_dispatch`** so docs deploys can be manually triggered ([#34081](https://github.com/NousResearch/hermes-agent/pull/34081))
+- **`@nous-research/ui` bumped to 0.18.2** (Nix `npmDepsHash` also updated to match) ([#34193](https://github.com/NousResearch/hermes-agent/pull/34193) follow-ups — @austinpickett)
+
+---
+
+## 👥 Contributors
+
+### Core
+- @teknium1
+
+### Community
+- @austinpickett — dashboard 401 reload-loop fix (the headline), `@nous-research/ui` bump, Nix `npmDepsHash` updates
+- @benbarclay — Docker `--insecure` opt-in, MCP bare-command resolution, dashboard test repair
+- @kshitijk4poor — `/yolo` session bypass, completed-turn memory context salvage, hindsight follow-up docs
+- @nicoloboschi — hindsight `recall_types` observation default
+- @BROCCOLO1D — arm64 PR build cache fix
+- @r266-tech — `--no-supervise` reference docs
+- @yangguangjin — probe stepdown safety (salvage of @yanghd's #33673)
+- @devwdave — completed-turn memory context (credited via salvage)
+- @andrewhosf — co-author
+
+### Issue Reporters (the 401 loop)
+- @routesmith ([#34206](https://github.com/NousResearch/hermes-agent/issues/34206))
+- @beeaton ([#34202](https://github.com/NousResearch/hermes-agent/issues/34202))
+
+---
+
+**Full Changelog**: [v2026.5.28...v2026.5.29](https://github.com/NousResearch/hermes-agent/compare/v2026.5.28...v2026.5.29)
--- a/acp_adapter/tools.py
+++ b/acp_adapter/tools.py
@ -907,72 +907,6 @@ def _build_polished_completion_content(
    return [_text(text)]


-def _build_patch_mode_content(patch_text: str) -> List[Any]:
-    """Parse V4A patch mode input into ACP diff blocks when possible."""
-    if not patch_text:
-        return [acp.tool_content(acp.text_block(""))]
-
-    try:
-        from tools.patch_parser import OperationType, parse_v4a_patch
-
-        operations, error = parse_v4a_patch(patch_text)
-        if error or not operations:
-            return [acp.tool_content(acp.text_block(patch_text))]
-
-        content: List[Any] = []
-        for op in operations:
-            if op.operation == OperationType.UPDATE:
-                old_chunks: list[str] = []
-                new_chunks: list[str] = []
-                for hunk in op.hunks:
-                    old_lines = [line.content for line in hunk.lines if line.prefix in {" ", "-"}]
-                    new_lines = [line.content for line in hunk.lines if line.prefix in {" ", "+"}]
-                    if old_lines or new_lines:
-                        old_chunks.append("\n".join(old_lines))
-                        new_chunks.append("\n".join(new_lines))
-
-                old_text = "\n...\n".join(chunk for chunk in old_chunks if chunk)
-                new_text = "\n...\n".join(chunk for chunk in new_chunks if chunk)
-                if old_text or new_text:
-                    content.append(
-                        acp.tool_diff_content(
-                            path=op.file_path,
-                            old_text=old_text or None,
-                            new_text=new_text or "",
-                        )
-                    )
-                continue
-
-            if op.operation == OperationType.ADD:
-                added_lines = [line.content for hunk in op.hunks for line in hunk.lines if line.prefix == "+"]
-                content.append(
-                    acp.tool_diff_content(
-                        path=op.file_path,
-                        new_text="\n".join(added_lines),
-                    )
-                )
-                continue
-
-            if op.operation == OperationType.DELETE:
-                content.append(
-                    acp.tool_diff_content(
-                        path=op.file_path,
-                        old_text=f"Delete file: {op.file_path}",
-                        new_text="",
-                    )
-                )
-                continue
-
-            if op.operation == OperationType.MOVE:
-                content.append(
-                    acp.tool_content(acp.text_block(f"Move file: {op.file_path} -> {op.new_path}"))
-                )
-
-        return content or [acp.tool_content(acp.text_block(patch_text))]
-    except Exception:
-        return [acp.tool_content(acp.text_block(patch_text))]
-
-
 def _strip_diff_prefix(path: str) -> str:
    raw = str(path or "").strip()
    if raw.startswith(("a/", "b/")):
--- a/acp_registry/agent.json
+++ b/acp_registry/agent.json
@ -1,7 +1,7 @@
 {
  "id": "hermes-agent",
  "name": "Hermes Agent",
-  "version": "0.14.0",
+  "version": "0.15.1",
  "description": "Self-improving open-source AI agent by Nous Research with ACP editor integration, persistent memory, skills, and rich tool support.",
  "repository": "https://github.com/NousResearch/hermes-agent",
  "website": "https://hermes-agent.nousresearch.com/docs/user-guide/features/acp",
@ -9,7 +9,7 @@
  "license": "MIT",
  "distribution": {
    "uvx": {
-      "package": "hermes-agent[acp]==0.14.0",
+      "package": "hermes-agent[acp]==0.15.1",
      "args": ["hermes-acp"]
    }
  }
--- a/agent/init.py
+++ b/agent/init.py
@ -4,3 +4,5 @@ These modules contain pure utility functions and self-contained classes
 that were previously embedded in the 3,600-line run_agent.py. Extracting
 them makes run_agent.py focused on the AIAgent orchestrator class.
 """
+
+from . import jiter_preload as _jiter_preload  # noqa: F401
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@ -27,7 +27,6 @@ import threading
 import time
 import uuid
 from datetime import datetime
-from pathlib import Path
 from typing import Any, Dict, List, Optional
 from urllib.parse import urlparse, parse_qs, urlunparse

@ -37,7 +36,6 @@ from agent.memory_manager import StreamingContextScrubber
 from agent.model_metadata import (
    MINIMUM_CONTEXT_LENGTH,
    fetch_model_metadata,
-    get_model_context_length,
    is_local_endpoint,
    query_ollama_num_ctx,
 )
@ -52,7 +50,6 @@ from agent.tool_guardrails import (
 from hermes_cli.config import cfg_get
 from hermes_cli.timeouts import get_provider_request_timeout
 from hermes_constants import get_hermes_home
-from model_tools import check_toolset_requirements, get_tool_definitions
 from utils import base_url_host_matches

 # Use the same logger name as run_agent so tests patching ``run_agent.logger``
@ -1201,6 +1198,18 @@ def init_agent(
        _agent_section = {}
    agent._tool_use_enforcement = _agent_section.get("tool_use_enforcement", "auto")

+    # Universal task-completion guidance toggle.  Default True.  Surfaced
+    # as a separate flag from tool_use_enforcement because the guidance
+    # applies to ALL models, not just the model families enforcement
+    # targets.
+    agent._task_completion_guidance = bool(_agent_section.get("task_completion_guidance", True))
+
+    # Local Python toolchain probe toggle.  Default True.  When False,
+    # the probe is skipped entirely (no subprocess calls, no system-prompt
+    # line).  Useful for users on exotic setups where the probe heuristics
+    # are noisy.
+    agent._environment_probe = bool(_agent_section.get("environment_probe", True))
+
    # App-level API retry count (wraps each model API call).  Default 3,
    # overridable via agent.api_max_retries in config.yaml.  See #11616.
    try:
@ -1462,7 +1471,6 @@ def init_agent(

    # Reject models whose context window is below the minimum required
    # for reliable tool-calling workflows (64K tokens).
-    from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
    _ctx = getattr(agent.context_compressor, "context_length", 0)
    if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH:
        raise ValueError(
@ -1522,6 +1530,7 @@ def init_agent(
                platform=agent.platform or "cli",
                model=agent.model,
                context_length=getattr(agent.context_compressor, "context_length", 0),
+                conversation_id=getattr(agent, "_gateway_session_key", None),
            )
        except Exception as _ce_err:
            _ra().logger.debug("Context engine on_session_start: %s", _ce_err)
--- a/agent/agent_runtime_helpers.py
+++ b/agent/agent_runtime_helpers.py
@ -25,24 +25,17 @@ from __future__ import annotations
 import copy
 import json
 import logging
-import os
 import re
-import threading
 import time
-import uuid
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional

 from hermes_cli.timeouts import get_provider_request_timeout
-from agent.message_sanitization import (
-    _repair_tool_call_arguments,
-    _sanitize_surrogates,
-)
 from agent.tool_dispatch_helpers import _trajectory_normalize_msg, make_tool_result_message
 from agent.trajectory import convert_scratchpad_to_think
 from agent.credential_pool import STATUS_EXHAUSTED
-from agent.error_classifier import classify_api_error, FailoverReason
+from agent.error_classifier import FailoverReason
 from utils import base_url_host_matches, base_url_hostname, env_var_enabled, atomic_json_write

 logger = logging.getLogger(__name__)
@ -1699,6 +1692,8 @@ def invoke_tool(agent, function_name: str, function_args: dict, effective_task_i
            session_id=agent.session_id or "",
            enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
            skip_pre_tool_call_hook=True,
+            enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+            disabled_toolsets=getattr(agent, "disabled_toolsets", None),
        )


@ -1994,6 +1989,36 @@ def copy_reasoning_content_for_api(agent, source_msg: dict, api_msg: dict) -> No
    api_msg.pop("reasoning_content", None)


+def reapply_reasoning_echo_for_provider(agent, api_messages: list) -> int:
+    """Re-pad assistant turns with reasoning_content for the active provider.
+
+    ``api_messages`` is built once, before the retry loop, while the *primary*
+    provider is active.  If a mid-conversation fallback then switches to a
+    require-side provider (DeepSeek / Kimi / MiMo thinking mode), assistant
+    turns that were built when the prior provider did NOT need the echo-back go
+    out without ``reasoning_content`` and the new provider rejects them with
+    HTTP 400 ("The reasoning_content in the thinking mode must be passed back").
+
+    Calling this immediately before building the request kwargs re-applies the
+    pad against the *current* provider.  It is idempotent and a no-op unless
+    ``_needs_thinking_reasoning_pad()`` is True for the active provider, so it
+    is safe to call every iteration and covers every fallback path.
+
+    Returns the number of assistant turns that gained reasoning_content.
+    """
+    if not agent._needs_thinking_reasoning_pad():
+        return 0
+    padded = 0
+    for api_msg in api_messages:
+        if api_msg.get("role") != "assistant":
+            continue
+        if api_msg.get("reasoning_content"):
+            continue
+        copy_reasoning_content_for_api(agent, api_msg, api_msg)
+        if api_msg.get("reasoning_content"):
+            padded += 1
+    return padded
+

 def _iter_pool_sockets(client: Any):
    """Yield raw sockets reachable from an OpenAI/httpx client pool.
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -77,16 +77,16 @@ ADAPTIVE_EFFORT_MAP = {
 # xhigh as a distinct level between high and max; older adaptive-thinking
 # models (4.6) reject it with a 400.  Keep this substring list in sync with
 # the Anthropic migration guide as new model families ship.
-_XHIGH_EFFORT_SUBSTRINGS = ("4-7", "4.7")
+_XHIGH_EFFORT_SUBSTRINGS = ("4-7", "4.7", "4-8", "4.8")

 # Models where extended thinking is deprecated/removed (4.6+ behavior: adaptive
 # is the only supported mode; 4.7 additionally forbids manual thinking entirely
 # and drops temperature/top_p/top_k).
-_ADAPTIVE_THINKING_SUBSTRINGS = ("4-6", "4.6", "4-7", "4.7")
+_ADAPTIVE_THINKING_SUBSTRINGS = ("4-6", "4.6", "4-7", "4.7", "4-8", "4.8")

 # Models where temperature/top_p/top_k return 400 if set to non-default values.
 # This is the Opus 4.7 contract; future 4.x+ models are expected to follow it.
-_NO_SAMPLING_PARAMS_SUBSTRINGS = ("4-7", "4.7")
+_NO_SAMPLING_PARAMS_SUBSTRINGS = ("4-7", "4.7", "4-8", "4.8")
 _FAST_MODE_SUPPORTED_SUBSTRINGS = ("opus-4-6", "opus-4.6")

 # ── Max output token limits per Anthropic model ───────────────────────
@ -94,6 +94,8 @@ _FAST_MODE_SUPPORTED_SUBSTRINGS = ("opus-4-6", "opus-4.6")
 # max_tokens as a mandatory field.  Previously we hardcoded 16384, which
 # starves thinking-enabled models (thinking tokens count toward the limit).
 _ANTHROPIC_OUTPUT_LIMITS = {
+    # Claude 4.8
+    "claude-opus-4-8":   128_000,
    # Claude 4.7
    "claude-opus-4-7":   128_000,
    # Claude 4.6
@ -892,20 +894,6 @@ def read_claude_code_credentials() -> Optional[Dict[str, Any]]:
    return None


-def read_claude_managed_key() -> Optional[str]:
-    """Read Claude's native managed key from ~/.claude.json for diagnostics only."""
-    claude_json = Path.home() / ".claude.json"
-    if claude_json.exists():
-        try:
-            data = json.loads(claude_json.read_text(encoding="utf-8"))
-            primary_key = data.get("primaryApiKey", "")
-            if isinstance(primary_key, str) and primary_key.strip():
-                return primary_key.strip()
-        except (json.JSONDecodeError, OSError, IOError) as e:
-            logger.debug("Failed to read ~/.claude.json: %s", e)
-    return None
-
-
 def is_claude_code_token_valid(creds: Dict[str, Any]) -> bool:
    """Check if Claude Code credentials have a non-expired access token."""
    import time
@ -1254,10 +1242,16 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
    print()

    try:
-        webbrowser.open(auth_url)
-        print("  (Browser opened automatically)")
+        from hermes_cli.auth import _can_open_graphical_browser as _can_open_gui
    except Exception:
-        pass
+        _can_open_gui = lambda: True  # noqa: E731 — degrade to prior behavior
+
+    if _can_open_gui():
+        try:
+            webbrowser.open(auth_url)
+            print("  (Browser opened automatically)")
+        except Exception:
+            pass

    print()
    print("After authorizing, you'll see a code. Paste it below.")
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -700,12 +700,20 @@ class _CodexCompletionsAdapter:
            # xAI's Responses endpoint rejects ``pattern`` and ``format`` JSON Schema
            # keywords (HTTP 400). Strip them here to match the parity guarantee that
            # chat_completion_helpers.py provides for the main-agent xAI path.
+            #
+            # Deep-copy before sanitizing — ``list(tools)`` is only a shallow
+            # copy of the outer list, but the sanitizers mutate the inner
+            # parameter dicts in place.  Without a deep copy the caller's
+            # tool registry permanently loses its slash-containing enum
+            # constraints after the first auxiliary xAI call.  See #27907.
            try:
+                import copy as _copy
                from tools.schema_sanitizer import (
                    strip_pattern_and_format,
                    strip_slash_enum,
                )
-                tools, _ = strip_pattern_and_format(list(tools))
+                tools = _copy.deepcopy(list(tools))
+                tools, _ = strip_pattern_and_format(tools)
                tools, _ = strip_slash_enum(tools)
            except Exception as exc:
                logger.warning(
@ -1235,8 +1243,23 @@ def _read_nous_auth() -> Optional[dict]:


 def _nous_api_key(provider: dict) -> str:
-    """Extract the Nous runtime credential from the compatibility field."""
-    return provider.get("agent_key") or provider.get("access_token", "")
+    """Extract a usable Nous inference JWT from stored auth state."""
+    from hermes_cli.auth import _nous_invoke_jwt_is_usable
+
+    for token_key, expiry_key in (
+        ("agent_key", "agent_key_expires_at"),
+        ("access_token", "expires_at"),
+    ):
+        token = provider.get(token_key)
+        if not isinstance(token, str) or not token.strip():
+            continue
+        if _nous_invoke_jwt_is_usable(
+            token,
+            scope=provider.get("scope"),
+            expires_at=provider.get(expiry_key),
+        ):
+            return token
+    return ""


 def _nous_base_url() -> str:
@ -1248,25 +1271,16 @@ def _resolve_nous_runtime_api(*, force_refresh: bool = False) -> Optional[tuple[
    """Return fresh Nous runtime credentials when available.

    This mirrors the main agent's 401 recovery path and keeps auxiliary
-    clients aligned with the singleton auth store + JWT/mint flow instead of
+    clients aligned with the singleton auth store + JWT refresh flow instead of
    relying only on whatever raw tokens happen to be sitting in auth.json
    or the credential pool.
    """
    try:
-        from hermes_cli.auth import (
-            NOUS_INFERENCE_AUTH_MODE_AUTO,
-            NOUS_INFERENCE_AUTH_MODE_LEGACY,
-            resolve_nous_runtime_credentials,
-        )
+        from hermes_cli.auth import resolve_nous_runtime_credentials

        creds = resolve_nous_runtime_credentials(
-            min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
            timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
-            inference_auth_mode=(
-                NOUS_INFERENCE_AUTH_MODE_LEGACY
-                if force_refresh
-                else NOUS_INFERENCE_AUTH_MODE_AUTO
-            ),
+            force_refresh=force_refresh,
        )
    except Exception as exc:
        logger.debug("Auxiliary Nous runtime credential resolution failed: %s", exc)
@ -1550,13 +1564,9 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
        _mark_provider_unhealthy("nous", ttl=60)
        return None, None
    if runtime is None and nous:
-        # Runtime credential mint failed but stored Nous auth is still present.
-        # Falls back to the raw stored token below; surface a debug line so
-        # operators investigating expired/invalid sessions have a breadcrumb,
-        # without blocking the fallback path the rest of this function relies on.
        logger.debug(
-            "Auxiliary Nous: runtime credential mint failed; falling back to "
-            "stored auth.json token."
+            "Auxiliary Nous: runtime JWT refresh failed; checking stored "
+            "auth.json token."
        )
    global auxiliary_is_nous
    auxiliary_is_nous = True
@ -1594,6 +1604,13 @@ def _try_nous(vision: bool = False) -> Tuple[Optional[OpenAI], Optional[str]]:
        api_key, base_url = runtime
    else:
        api_key = _nous_api_key(nous or {})
+        if not api_key:
+            logger.warning(
+                "Auxiliary Nous client unavailable: no usable inference JWT found "
+                "(run: hermes auth add nous)."
+            )
+            _mark_provider_unhealthy("nous", ttl=60)
+            return None, None
        base_url = str((nous or {}).get("inference_base_url") or _nous_base_url()).rstrip("/")
    return (
        OpenAI(
@ -2244,11 +2261,15 @@ def _is_payment_error(exc: Exception) -> bool:
    # but sometimes wrap them in 429 or other codes.
    # Daily quota exhaustion from Bedrock, Vertex AI, and similar providers
    # uses different language but is semantically identical to credit exhaustion.
-    if status in {402, 429, None}:
+    if status in {402, 404, 429, None}:
        if any(kw in err_lower for kw in (
            "credits", "insufficient funds",
            "can only afford", "billing",
            "payment required",
+            "out of funds", "run out of funds",
+            "balance_depleted", "no usable credits",
+            "model_not_supported_on_free_tier",
+            "not available on the free tier",
            # Daily / monthly / weekly quota exhaustion keywords
            "quota exceeded", "quota_exceeded",
            "too many tokens per day", "daily limit",
@ -2260,6 +2281,18 @@ def _is_payment_error(exc: Exception) -> bool:
    return False


+def _nous_portal_account_has_fresh_paid_access() -> bool:
+    """Return True only when the fresh Nous account API says paid access is allowed."""
+    try:
+        from hermes_cli.nous_account import get_nous_portal_account_info
+
+        account_info = get_nous_portal_account_info(force_fresh=True)
+        return account_info.paid_service_access is True
+    except Exception as exc:
+        logger.debug("Auxiliary Nous paid-entitlement refresh check failed: %s", exc)
+        return False
+
+
 def _is_rate_limit_error(exc: Exception) -> bool:
    """Detect rate-limit errors that warrant provider fallback.

@ -2288,6 +2321,10 @@ def _is_rate_limit_error(exc: Exception) -> bool:
        if not any(kw in err_lower for kw in (
            "credits", "insufficient funds", "billing",
            "payment required", "can only afford",
+            "out of funds", "run out of funds",
+            "balance_depleted", "no usable credits",
+            "model_not_supported_on_free_tier",
+            "not available on the free tier",
        )):
            return True
    return False
@ -2337,7 +2374,16 @@ def _is_auth_error(exc: Exception) -> bool:
    if status == 401:
        return True
    err_lower = str(exc).lower()
-    return "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower()
+    if "error code: 401" in err_lower or "authenticationerror" in type(exc).__name__.lower():
+        return True
+    # xAI returns HTTP 403 with "unauthenticated:bad-credentials" when an OAuth2
+    # access token has expired or is invalid — semantically a 401 auth failure,
+    # even though the status code is 403 (PermissionDenied).
+    if status == 403 and "bad-credentials" in err_lower:
+        return True
+    if "unauthenticated" in err_lower and "bad-credentials" in err_lower:
+        return True
+    return False


 def _is_unsupported_parameter_error(exc: Exception, param: str) -> bool:
@ -2490,6 +2536,8 @@ def _recoverable_pool_provider(
        return "copilot"
    if base_url_host_matches(base, "api.kimi.com"):
        return "kimi-coding"
+    if base_url_host_matches(base, "api.x.ai"):
+        return "xai-oauth"
    # For api_key providers not in the hardcoded list (e.g. opencode-go), match
    # the client base URL against all registered api_key providers so that
    # credential-pool rotation works for any provider the user configured.
@ -2686,15 +2734,11 @@ def _refresh_provider_credentials(provider: str) -> bool:
            _evict_cached_clients(normalized)
            return True
        if normalized == "nous":
-            from hermes_cli.auth import (
-                NOUS_INFERENCE_AUTH_MODE_LEGACY,
-                resolve_nous_runtime_credentials,
-            )
+            from hermes_cli.auth import resolve_nous_runtime_credentials

            creds = resolve_nous_runtime_credentials(
-                min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
                timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
-                inference_auth_mode=NOUS_INFERENCE_AUTH_MODE_LEGACY,
+                force_refresh=True,
            )
            if not str(creds.get("api_key", "") or "").strip():
                return False
@ -2711,6 +2755,24 @@ def _refresh_provider_credentials(provider: str) -> bool:
                return False
            _evict_cached_clients(normalized)
            return True
+        if normalized == "xai-oauth":
+            # Preference: pool-level refresh (uses refresh_token from pool entry),
+            # then fall back to singleton auth-store resolver.
+            pool = load_pool(normalized)
+            if pool and pool.has_credentials():
+                # Ensure a current entry is selected before trying to refresh.
+                pool.select()
+                refreshed = pool.try_refresh_current()
+                if refreshed is not None and str(getattr(refreshed, "runtime_api_key", "") or "").strip():
+                    _evict_cached_clients(normalized)
+                    return True
+            from hermes_cli.auth import resolve_xai_oauth_runtime_credentials
+
+            creds = resolve_xai_oauth_runtime_credentials(force_refresh=True)
+            if not str(creds.get("api_key", "") or "").strip():
+                return False
+            _evict_cached_clients(normalized)
+            return True
    except Exception as exc:
        logger.debug("Auxiliary provider credential refresh failed for %s: %s", normalized, exc)
        return False
@ -4663,24 +4725,23 @@ def _build_call_kwargs(
        kwargs["temperature"] = temperature

    if max_tokens is not None:
-        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
-        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
-        # ZAI vision models (glm-4v-flash, glm-4v-plus, etc.) reject max_tokens with
-        # error code 1210 ("API 调用参数有误") on multimodal requests — skip it.
-        _model_lower = (model or "").lower()
-        _skip_max_tokens = (
-            provider == "zai"
-            and ("4v" in _model_lower or "5v" in _model_lower or "-v" in _model_lower)
+        # We do NOT cap output by default. Most chat-completions providers treat
+        # an omitted max_tokens as "use the model's max output", which is what we
+        # want for auxiliary tasks (compression summaries, titles, vision, etc.) —
+        # an explicit cap only risks truncating a summary or 400-ing on providers
+        # that reject the parameter outright (e.g. GitHub Copilot / newer OpenAI
+        # GPT-5 models require max_completion_tokens, not max_tokens; ZAI vision
+        # models reject it entirely with error 1210). Omitting it sidesteps all of
+        # those wire-format quirks at once.
+        #
+        # The one exception is the Anthropic Messages wire (MiniMax and any
+        # ``/anthropic`` endpoint reached through the OpenAI SDK wrapper), where
+        # max_tokens is a MANDATORY field — omitting it is a hard 400. Keep it only
+        # there.
+        _effective_base = base_url or (
+            _current_custom_base_url() if provider == "custom" else ""
        )
-        if _skip_max_tokens:
-            pass  # ZAI vision models do not accept max_tokens
-        elif provider == "custom":
-            custom_base = base_url or _current_custom_base_url()
-            if base_url_hostname(custom_base) == "api.openai.com":
-                kwargs["max_completion_tokens"] = max_tokens
-            else:
-                kwargs["max_tokens"] = max_tokens
-        else:
+        if _is_anthropic_compat_endpoint(provider, _effective_base):
            kwargs["max_tokens"] = max_tokens

    if tools:
@ -4937,6 +4998,41 @@ def call_llm(
            resolved_provider == "nous"
            or base_url_host_matches(_base_info, "inference-api.nousresearch.com")
        )
+        if (
+            _is_payment_error(first_err)
+            and client_is_nous
+            and _nous_portal_account_has_fresh_paid_access()
+        ):
+            refreshed_client, refreshed_model = _refresh_nous_auxiliary_client(
+                cache_provider=resolved_provider or "nous",
+                model=final_model,
+                async_mode=False,
+                base_url=resolved_base_url,
+                api_key=resolved_api_key,
+                api_mode=resolved_api_mode,
+                main_runtime=main_runtime,
+                is_vision=(task == "vision"),
+            )
+            if refreshed_client is not None:
+                logger.info(
+                    "Auxiliary %s: refreshed Nous runtime credentials after paid account check, retrying",
+                    task or "call",
+                )
+                if refreshed_model and refreshed_model != kwargs.get("model"):
+                    kwargs["model"] = refreshed_model
+                try:
+                    return _validate_llm_response(
+                        refreshed_client.chat.completions.create(**kwargs), task)
+                except Exception as retry_err:
+                    if not (
+                        _is_auth_error(retry_err)
+                        or _is_payment_error(retry_err)
+                        or _is_connection_error(retry_err)
+                        or _is_rate_limit_error(retry_err)
+                    ):
+                        raise
+                    first_err = retry_err
+
        if _is_auth_error(first_err) and client_is_nous:
            refreshed_client, refreshed_model = _refresh_nous_auxiliary_client(
                cache_provider=resolved_provider or "nous",
@ -5339,6 +5435,40 @@ async def async_call_llm(
            resolved_provider == "nous"
            or base_url_host_matches(_client_base, "inference-api.nousresearch.com")
        )
+        if (
+            _is_payment_error(first_err)
+            and client_is_nous
+            and _nous_portal_account_has_fresh_paid_access()
+        ):
+            refreshed_client, refreshed_model = _refresh_nous_auxiliary_client(
+                cache_provider=resolved_provider or "nous",
+                model=final_model,
+                async_mode=True,
+                base_url=resolved_base_url,
+                api_key=resolved_api_key,
+                api_mode=resolved_api_mode,
+                is_vision=(task == "vision"),
+            )
+            if refreshed_client is not None:
+                logger.info(
+                    "Auxiliary %s (async): refreshed Nous runtime credentials after paid account check, retrying",
+                    task or "call",
+                )
+                if refreshed_model and refreshed_model != kwargs.get("model"):
+                    kwargs["model"] = refreshed_model
+                try:
+                    return _validate_llm_response(
+                        await refreshed_client.chat.completions.create(**kwargs), task)
+                except Exception as retry_err:
+                    if not (
+                        _is_auth_error(retry_err)
+                        or _is_payment_error(retry_err)
+                        or _is_connection_error(retry_err)
+                        or _is_rate_limit_error(retry_err)
+                    ):
+                        raise
+                    first_err = retry_err
+
        if _is_auth_error(first_err) and client_is_nous:
            refreshed_client, refreshed_model = _refresh_nous_auxiliary_client(
                cache_provider=resolved_provider or "nous",
--- a/agent/background_review.py
+++ b/agent/background_review.py
@ -483,6 +483,11 @@ def _run_review_in_thread(
            finally:
                clear_thread_tool_whitelist()

+            # Snapshot review actions before teardown. close() is allowed to
+            # clean per-session state, but the user-visible self-improvement
+            # summary still needs the completed review agent's tool results.
+            review_messages = list(getattr(review_agent, "_session_messages", []))
+
            # Tear down memory providers while stdout is still
            # redirected so background thread teardown (Honcho flush,
            # Hindsight sync, etc.) stays silent.  The finally block
@ -495,7 +500,6 @@ def _run_review_in_thread(
                review_agent.close()
            except Exception:
                pass
-            review_messages = list(getattr(review_agent, "_session_messages", []))
            review_agent = None

        # Scan the review agent's messages for successful tool actions
--- a/agent/bedrock_adapter.py
+++ b/agent/bedrock_adapter.py
@ -1167,18 +1167,6 @@ def _extract_provider_from_arn(arn: str) -> str:
    """
    match = re.search(r"foundation-model/([^.]+)", arn)
    return match.group(1) if match else ""
-
-
-def get_bedrock_model_ids(region: str) -> List[str]:
-    """Return a flat list of available Bedrock model IDs for the given region.
-
-    Convenience wrapper around ``discover_bedrock_models()`` for use in
-    the model selection UI.
-    """
-    models = discover_bedrock_models(region)
-    return [m["id"] for m in models]
-
-
 # ---------------------------------------------------------------------------
 # Error classification — Bedrock-specific exceptions
 # ---------------------------------------------------------------------------
--- a/agent/browser_registry.py
+++ b/agent/browser_registry.py
@ -186,37 +186,6 @@ def _resolve(configured: Optional[str]) -> Optional[BrowserProvider]:
    return None


-def get_active_browser_provider() -> Optional[BrowserProvider]:
-    """Resolve the currently-active cloud browser provider.
-
-    Reads ``browser.cloud_provider`` from config.yaml; falls back per the
-    module docstring. Returns None for local mode or when no provider is
-    available.
-    """
-    try:
-        from hermes_cli.config import read_raw_config
-
-        cfg = read_raw_config()
-        browser_cfg = cfg.get("browser", {})
-    except Exception as exc:
-        logger.debug("Could not read browser config: %s", exc)
-        browser_cfg = {}
-
-    configured: Optional[str] = None
-    if isinstance(browser_cfg, dict) and "cloud_provider" in browser_cfg:
-        try:
-            from tools.tool_backend_helpers import normalize_browser_cloud_provider
-
-            configured = normalize_browser_cloud_provider(
-                browser_cfg.get("cloud_provider")
-            )
-        except Exception as exc:
-            logger.debug("normalize_browser_cloud_provider failed: %s", exc)
-            configured = None
-
-    return _resolve(configured)
-
-
 def _reset_for_tests() -> None:
    """Clear the registry. **Test-only.**"""
    with _lock:
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -15,49 +15,23 @@ sites unchanged.  Symbols that tests patch on ``run_agent`` (e.g.

 from __future__ import annotations

-import concurrent.futures
-import contextvars
-import copy
 import json
 import logging
 import os
-import random
 import re
-import sys
 import threading
 import time
 import uuid
-from datetime import datetime
-from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Dict, List, Optional, Tuple
-from urllib.parse import urlparse, parse_qs, urlunparse
+from typing import Any, Dict, Optional

 from hermes_cli.timeouts import get_provider_request_timeout, get_provider_stale_timeout
 from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
-from agent.error_classifier import classify_api_error, FailoverReason
+from agent.error_classifier import FailoverReason
 from agent.model_metadata import is_local_endpoint
 from agent.message_sanitization import (
    _sanitize_surrogates,
-    _sanitize_messages_surrogates,
-    _sanitize_structure_surrogates,
-    _sanitize_messages_non_ascii,
-    _sanitize_tools_non_ascii,
-    _sanitize_structure_non_ascii,
-    _strip_images_from_messages,
-    _strip_non_ascii,
    _repair_tool_call_arguments,
-    _escape_invalid_chars_in_json_strings,
-)
-from agent.tool_dispatch_helpers import (
-    _is_multimodal_tool_result,
-    _multimodal_text_summary,
-)
-from agent.retry_utils import jittered_backoff
-from agent.tool_guardrails import (
-    ToolGuardrailDecision,
-    append_toolguard_guidance,
-    toolguard_synthetic_result,
 )
 from tools.terminal_tool import is_persistent_env
 from utils import base_url_host_matches, base_url_hostname
@ -175,13 +149,6 @@ def interruptible_api_call(agent, api_kwargs: dict):
            request_client_holder["owner_tid"] = threading.get_ident()
        return client

-    def _take_request_client():
-        with request_client_lock:
-            client = request_client_holder.get("client")
-            request_client_holder["client"] = None
-            request_client_holder["owner_tid"] = None
-            return client
-
    def _close_request_client_once(reason: str) -> None:
        # #29507: dispatch on the calling thread.
        #
@ -310,8 +277,15 @@ def interruptible_api_call(agent, api_kwargs: dict):
    else:
        _codex_idle_timeout_default = 12.0

+    # No-byte TTFB cutoff. The OpenAI SDK's own streaming read timeout is far
+    # longer (openai 2.x DEFAULT_TIMEOUT.read = 600s), so a tight 12s default
+    # killed subscription-backed Codex requests mid-prefill before the backend
+    # had a chance to emit its first SSE event. Default to 120s — long enough to
+    # clear normal backend admission / prompt prefill, short enough to still
+    # reconnect promptly when the socket is genuinely wedged. Set
+    # HERMES_CODEX_TTFB_TIMEOUT_SECONDS=0 to disable this watchdog entirely.
    _ttfb_enabled = _codex_watchdog_enabled
-    _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 12.0)
+    _ttfb_timeout = _env_float("HERMES_CODEX_TTFB_TIMEOUT_SECONDS", 120.0)
    if _ttfb_timeout <= 0:
        _ttfb_enabled = False
    elif _openai_codex_backend:
@ -333,7 +307,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
                _ttfb_disable_above,
            )
        else:
-            _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 20.0)
+            _ttfb_cap = _env_float("HERMES_CODEX_TTFB_MAX_SECONDS", 120.0)
            if _ttfb_cap > 0 and _ttfb_timeout > _ttfb_cap:
                logger.info(
                    "Capping openai-codex no-byte TTFB timeout from %.0fs to %.0fs "
@ -403,13 +377,13 @@ def interruptible_api_call(agent, api_kwargs: dict):
                _elapsed, _ttfb_timeout, api_kwargs.get("model", "unknown"),
            )
            if _silent_hint:
-                agent._emit_status(
+                agent._buffer_status(
                    f"⚠️ No first byte from provider in {int(_elapsed)}s "
                    f"(codex stream, model: {api_kwargs.get('model', 'unknown')}). "
                    f"Reconnecting. {_silent_hint}"
                )
            else:
-                agent._emit_status(
+                agent._buffer_status(
                    f"⚠️ No first byte from provider in {int(_elapsed)}s "
                    f"(codex stream, model: {api_kwargs.get('model', 'unknown')}). "
                    f"Reconnecting."
@ -455,7 +429,7 @@ def interruptible_api_call(agent, api_kwargs: dict):
                api_kwargs.get("model", "unknown"),
                f"{_est_tokens_for_codex_watchdog:,}",
            )
-            agent._emit_status(
+            agent._buffer_status(
                f"⚠️ Codex stream sent no events for {int(_event_stale_elapsed)}s "
                f"after first byte (model: {api_kwargs.get('model', 'unknown')}). "
                f"Reconnecting."
@ -493,13 +467,13 @@ def interruptible_api_call(agent, api_kwargs: dict):
                api_kwargs.get("model", "unknown"), f"{_est_ctx:,}",
            )
            if _silent_hint:
-                agent._emit_status(
+                agent._buffer_status(
                    f"⚠️ No response from provider for {int(_elapsed)}s "
                    f"(non-streaming, model: {api_kwargs.get('model', 'unknown')}). "
                    f"{_silent_hint}"
                )
            else:
-                agent._emit_status(
+                agent._buffer_status(
                    f"⚠️ No response from provider for {int(_elapsed)}s "
                    f"(non-streaming, model: {api_kwargs.get('model', 'unknown')}). "
                    f"Aborting call."
@ -614,12 +588,23 @@ def build_api_kwargs(agent, api_messages: list) -> dict:
        # It also rejects ``enum`` values containing ``/`` (HuggingFace IDs
        # like ``Qwen/Qwen3.5-0.8B`` shipped by MCP servers) — same 400 with
        # the same opaque message; strip those enums too.
+        #
+        # Deep-copy ``tools_for_api`` before sanitizing: the sanitizers
+        # mutate in place (documented contract on ``strip_slash_enum`` /
+        # ``strip_pattern_and_format``), and ``tools_for_api`` is a direct
+        # reference to ``agent.tools``.  Without the copy, the first xAI
+        # request permanently strips constraints from the shared per-agent
+        # tool registry — every subsequent non-xAI call from the same
+        # agent (auxiliary task routed to Anthropic, OpenRouter fallback,
+        # main-model swap) sees the already-stripped schema.  See #27907.
        if is_xai_responses:
            try:
+                import copy as _copy
                from tools.schema_sanitizer import (
                    strip_pattern_and_format,
                    strip_slash_enum,
                )
+                tools_for_api = _copy.deepcopy(tools_for_api)
                tools_for_api, _ = strip_pattern_and_format(tools_for_api)
                tools_for_api, _ = strip_slash_enum(tools_for_api)
            except Exception as exc:
@ -1262,7 +1247,7 @@ def try_activate_fallback(agent, reason: "FailoverReason | None" = None) -> bool
                api_mode=agent.api_mode,
            )

-        agent._emit_status(
+        agent._buffer_status(
            f"🔄 Primary model failed — switching to fallback: "
            f"{fb_model} via {fb_provider}"
        )
@ -1636,13 +1621,6 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
            request_client_holder["owner_tid"] = threading.get_ident()
        return client

-    def _take_request_client():
-        with request_client_lock:
-            client = request_client_holder.get("client")
-            request_client_holder["client"] = None
-            request_client_holder["owner_tid"] = None
-            return client
-
    def _close_request_client_once(reason: str) -> None:
        # See #29507 explanation in the non-streaming variant above. A
        # stranger thread (the interrupt-check / stale-stream detector loop)
@ -2251,7 +2229,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                            mid_tool_call=False,
                            diag=request_client_holder.get("diag"),
                        )
-                        agent._emit_status(
+                        agent._buffer_status(
                            "❌ Provider returned malformed streaming data after "
                            f"{_max_stream_retries + 1} attempts. "
                            "The provider may be experiencing issues — "
@ -2358,7 +2336,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                _stale_elapsed, _stream_stale_timeout,
                api_kwargs.get("model", "unknown"), f"{_est_ctx:,}",
            )
-            agent._emit_status(
+            agent._buffer_status(
                f"⚠️ No response from provider for {int(_stale_elapsed)}s "
                f"(model: {api_kwargs.get('model', 'unknown')}, "
                f"context: ~{_est_ctx:,} tokens). "
--- a/agent/codex_responses_adapter.py
+++ b/agent/codex_responses_adapter.py
@ -980,6 +980,48 @@ def _extract_responses_reasoning_text(item: Any) -> str:
    return ""


+def _format_responses_error(error_obj: Any, response_status: str) -> str:
+    """Build a human-readable error string from a Responses ``response.error`` payload.
+
+    The OpenAI Responses API carries failure details under ``response.error``
+    on terminal ``response.failed`` events, in the shape
+    ``{"code": "rate_limit_exceeded", "message": "Slow down", "param": ...}``.
+    Earlier code only surfaced ``message``, which left users staring at bare
+    strings like ``"Slow down"`` while the failure mode (rate limit vs
+    context-length vs internal_error vs model-overloaded) was hidden in
+    ``code``. We now prefix ``code`` when both are present so consumers can
+    distinguish failure modes without parsing the bare message.
+
+    Falls back to ``code`` alone when ``message`` is empty, and to a stable
+    default referencing the response status when no error payload is
+    available at all. Adapted from anomalyco/opencode#28757.
+    """
+    # Pull code and message from either dict or attribute-style payloads.
+    code: Any = None
+    message: Any = None
+    if isinstance(error_obj, dict):
+        code = error_obj.get("code")
+        message = error_obj.get("message")
+    elif error_obj is not None:
+        code = getattr(error_obj, "code", None)
+        message = getattr(error_obj, "message", None)
+
+    code_str = str(code).strip() if isinstance(code, str) else (str(code).strip() if code else "")
+    message_str = str(message).strip() if isinstance(message, str) else (str(message).strip() if message else "")
+
+    if code_str and message_str:
+        return f"{code_str}: {message_str}"
+    if message_str:
+        return message_str
+    if code_str:
+        return code_str
+    if error_obj:
+        # Last-resort: stringify whatever the provider sent so it's at least
+        # visible in logs/UI rather than silently swallowed.
+        return str(error_obj)
+    return f"Responses API returned status '{response_status}'"
+
+
 # ---------------------------------------------------------------------------
 # Full response normalization
 # ---------------------------------------------------------------------------
@ -1023,10 +1065,7 @@ def _normalize_codex_response(

    if response_status in {"failed", "cancelled"}:
        error_obj = getattr(response, "error", None)
-        if isinstance(error_obj, dict):
-            error_msg = error_obj.get("message") or str(error_obj)
-        else:
-            error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'"
+        error_msg = _format_responses_error(error_obj, response_status)
        raise RuntimeError(error_msg)

    content_parts: List[str] = []
--- a/agent/codex_runtime.py
+++ b/agent/codex_runtime.py
@ -16,7 +16,6 @@ compatibility.

 from __future__ import annotations

-import json
 import logging
 import os
 import time
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -75,6 +75,44 @@ _IMAGE_TOKEN_ESTIMATE = 1600
 _IMAGE_CHAR_EQUIVALENT = _IMAGE_TOKEN_ESTIMATE * _CHARS_PER_TOKEN
 _SUMMARY_FAILURE_COOLDOWN_SECONDS = 600

+# Hard ceiling for the deterministic summary-failure handoff.  The fallback is
+# only meant to preserve continuity anchors from the dropped window, not to
+# become another unbounded transcript copy after the LLM summarizer failed.
+_FALLBACK_SUMMARY_MAX_CHARS = 8_000
+_FALLBACK_TURN_MAX_CHARS = 700
+
+
+_PATH_MENTION_RE = re.compile(r"(?:/|~/?|[A-Za-z]:\\)[^\s`'\")\]}<>]+")
+
+
+def _dedupe_append(items: list[str], value: str, *, limit: int) -> None:
+    value = value.strip()
+    if value and value not in items and len(items) < limit:
+        items.append(value)
+
+
+def _extract_tool_call_name_and_args(tool_call: Any) -> tuple[str, str]:
+    """Return a best-effort ``(name, arguments)`` pair for dict/object tool calls."""
+    if isinstance(tool_call, dict):
+        fn = tool_call.get("function") or {}
+        return str(fn.get("name") or "unknown"), str(fn.get("arguments") or "")
+
+    fn = getattr(tool_call, "function", None)
+    if fn is None:
+        return "unknown", ""
+    return str(getattr(fn, "name", None) or "unknown"), str(getattr(fn, "arguments", None) or "")
+
+
+def _extract_tool_call_id(tool_call: Any) -> str:
+    if isinstance(tool_call, dict):
+        return str(tool_call.get("id") or "")
+    return str(getattr(tool_call, "id", "") or "")
+
+
+def _collect_path_mentions(text: str, relevant_files: list[str], *, limit: int = 12) -> None:
+    for match in _PATH_MENTION_RE.findall(text):
+        _dedupe_append(relevant_files, match.rstrip(".,:;"), limit=limit)
+

 def _content_length_for_budget(raw_content: Any) -> int:
    """Return the effective char-length of a message's content for token budgeting.
@ -221,6 +259,114 @@ def _truncate_tool_call_args_json(args: str, head_chars: int = 200) -> str:
    return json.dumps(shrunken, ensure_ascii=False)


+_IMAGE_PART_TYPES = frozenset({"image_url", "input_image", "image"})
+
+
+def _is_image_part(part: Any) -> bool:
+    """True if ``part`` is a multimodal image content block.
+
+    Recognizes all three shapes the agent handles:
+      - OpenAI chat.completions: ``{"type": "image_url", "image_url": ...}``
+      - OpenAI Responses API:    ``{"type": "input_image", "image_url": "..."}``
+      - Anthropic native:        ``{"type": "image", "source": {...}}``
+    """
+    if not isinstance(part, dict):
+        return False
+    return part.get("type") in _IMAGE_PART_TYPES
+
+
+def _content_has_images(content: Any) -> bool:
+    """True if a message's ``content`` is a multimodal list with image parts."""
+    if not isinstance(content, list):
+        return False
+    return any(_is_image_part(p) for p in content)
+
+
+def _strip_images_from_content(content: Any) -> Any:
+    """Return a copy of ``content`` with every image part replaced by a
+    short text placeholder.
+
+    - String content is returned unchanged.
+    - Non-list, non-string content is returned unchanged.
+    - List content: image parts become ``{"type": "text", "text": "[Attached
+      image — stripped after compression]"}``; other parts are preserved as-is.
+
+    Input is never mutated.
+    """
+    if not isinstance(content, list):
+        return content
+    if not any(_is_image_part(p) for p in content):
+        return content
+
+    new_parts: List[Any] = []
+    for p in content:
+        if _is_image_part(p):
+            new_parts.append({
+                "type": "text",
+                "text": "[Attached image — stripped after compression]",
+            })
+        else:
+            new_parts.append(p)
+    return new_parts
+
+
+def _strip_historical_media(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Replace image parts in older messages with placeholder text.
+
+    The anchor is the *last* user message that has any image content. Every
+    message before that anchor gets its image parts replaced with a short
+    placeholder so the outgoing request stops re-shipping the same multi-MB
+    base-64 image blobs on every turn.
+
+    If no user message carries images, the list is returned unchanged.
+    If the only user message with images is the very first one (nothing
+    earlier to strip), the list is returned unchanged.
+
+    Shallow copies of touched messages only; input is never mutated.
+    Port of Kilo-Org/kilocode#9434 (adapted for the OpenAI-style message
+    shape the hermes compressor emits).
+    """
+    if not messages:
+        return messages
+
+    # Find the newest user message that carries at least one image part.
+    # We anchor on image-bearing user messages (not all user messages) so
+    # a plain text follow-up after a big-image turn still strips the old
+    # image — matching the problem kilocode#9434 set out to solve.
+    anchor = -1
+    for i in range(len(messages) - 1, -1, -1):
+        msg = messages[i]
+        if not isinstance(msg, dict):
+            continue
+        if msg.get("role") != "user":
+            continue
+        if _content_has_images(msg.get("content")):
+            anchor = i
+            break
+
+    if anchor <= 0:
+        # No image-bearing user message, or it's the very first message —
+        # nothing before it to strip.
+        return messages
+
+    changed = False
+    result: List[Dict[str, Any]] = []
+    for i, msg in enumerate(messages):
+        if i >= anchor or not isinstance(msg, dict):
+            result.append(msg)
+            continue
+        content = msg.get("content")
+        if not _content_has_images(content):
+            result.append(msg)
+            continue
+        new_msg = msg.copy()
+        new_msg["content"] = _strip_images_from_content(content)
+        result.append(new_msg)
+        changed = True
+
+    return result if changed else messages
+
+
 def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> str:
    """Create an informative 1-line summary of a tool call + result.

@ -372,6 +518,10 @@ class ContextCompressor(ContextEngine):
        self._last_compression_savings_pct = 100.0
        self._ineffective_compression_count = 0
        self._summary_failure_cooldown_until = 0.0  # transient errors must not block a fresh session
+        self.last_real_prompt_tokens = 0
+        self.last_compression_rough_tokens = 0
+        self.last_rough_tokens_when_real_prompt_fit = 0
+        self.awaiting_real_usage_after_compression = False

    def update_model(
        self,
@ -429,8 +579,8 @@ class ContextCompressor(ContextEngine):
        self.quiet_mode = quiet_mode
        # When True, summary-generation failure aborts compression entirely
        # (returns messages unchanged, sets _last_compress_aborted=True).
-        # When False (default = historical behavior), insert a static
-        # "summary unavailable" placeholder and drop the middle window.
+        # When False (default = historical behavior), insert a
+        # deterministic "summary unavailable" handoff and drop the middle window.
        self.abort_on_summary_failure = abort_on_summary_failure

        self.context_length = get_model_context_length(
@ -469,6 +619,10 @@ class ContextCompressor(ContextEngine):

        self.last_prompt_tokens = 0
        self.last_completion_tokens = 0
+        self.last_real_prompt_tokens = 0
+        self.last_compression_rough_tokens = 0
+        self.last_rough_tokens_when_real_prompt_fit = 0
+        self.awaiting_real_usage_after_compression = False

        self.summary_model = summary_model_override or ""

@ -502,6 +656,44 @@ class ContextCompressor(ContextEngine):
        self.last_prompt_tokens = usage.get("prompt_tokens", 0)
        self.last_completion_tokens = usage.get("completion_tokens", 0)
        self.last_total_tokens = usage.get("total_tokens", self.last_prompt_tokens + self.last_completion_tokens)
+        if self.last_prompt_tokens > 0:
+            self.last_real_prompt_tokens = self.last_prompt_tokens
+            if self.last_prompt_tokens < self.threshold_tokens:
+                if self.awaiting_real_usage_after_compression and self.last_compression_rough_tokens > 0:
+                    self.last_rough_tokens_when_real_prompt_fit = self.last_compression_rough_tokens
+            else:
+                self.last_rough_tokens_when_real_prompt_fit = 0
+        self.awaiting_real_usage_after_compression = False
+
+    def should_defer_preflight_to_real_usage(self, rough_tokens: int) -> bool:
+        """Return True when a high rough preflight estimate is known-noisy.
+
+        ``estimate_request_tokens_rough(..., tools=...)`` intentionally
+        overestimates schema-heavy requests so Hermes compresses before a
+        provider rejects the payload. After a successful compressed API call,
+        though, provider ``prompt_tokens`` are a better signal than repeating
+        compaction from the same rough schema overhead. Defer only while the
+        rough estimate has grown modestly since a request the provider proved
+        fit under the threshold.
+        """
+        if rough_tokens < self.threshold_tokens:
+            return False
+        if self.last_real_prompt_tokens <= 0:
+            return False
+        if self.last_real_prompt_tokens >= self.threshold_tokens:
+            return False
+
+        baseline = self.last_rough_tokens_when_real_prompt_fit or self.last_compression_rough_tokens
+        if baseline <= 0:
+            return False
+
+        growth = max(0, rough_tokens - baseline)
+        tolerated_growth = max(4096, int(self.threshold_tokens * 0.05))
+        if growth > tolerated_growth:
+            return False
+
+        self.last_rough_tokens_when_real_prompt_fit = max(baseline, rough_tokens)
+        return True

    def should_compress(self, prompt_tokens: int = None) -> bool:
        """Check if context exceeds the compression threshold.
@ -776,6 +968,195 @@ class ContextCompressor(ContextEngine):

        return "\n\n".join(parts)

+    def _build_static_fallback_summary(
+        self,
+        turns_to_summarize: List[Dict[str, Any]],
+        reason: str | None = None,
+    ) -> str:
+        """Build a deterministic handoff when the LLM summarizer is unavailable.
+
+        This is intentionally much less rich than an LLM-written summary, but it
+        is still better than a bare "N messages were removed" marker.  It keeps
+        the most useful continuity anchors that can be extracted locally:
+        recent user asks, assistant/tool actions, files/commands mentioned in
+        tool calls, and any error text.  The result uses the normal summary
+        structure so downstream prompts can recover gracefully after a provider
+        outage or summary-model failure.
+        """
+        user_asks: list[str] = []
+        assistant_actions: list[str] = []
+        tool_actions: list[str] = []
+        relevant_files: list[str] = []
+        blockers: list[str] = []
+        last_dropped_turns: list[str] = []
+
+        def _compact_fallback_turn(value: Any) -> str:
+            text = redact_sensitive_text(_content_text_for_contains(value))
+            text = re.sub(r"\bgh[pousr]_[A-Za-z0-9_]{8,}\b", "[REDACTED]", text)
+            text = re.sub(r"\s+", " ", text).strip()
+            if len(text) > _FALLBACK_TURN_MAX_CHARS:
+                text = text[: _FALLBACK_TURN_MAX_CHARS - 15].rstrip() + " ...[truncated]"
+            return re.sub(r"\bgh[pousr]_[A-Za-z0-9_.-]+", "[REDACTED]", text)
+
+        def _remember_dropped_turn(label: str, text: str, *, limit: int = 8) -> None:
+            text = text.strip()
+            if not text:
+                return
+            last_dropped_turns.append(f"{label}: {text}")
+            if len(last_dropped_turns) > limit:
+                del last_dropped_turns[0]
+
+        def _collect_paths_from_jsonish(obj: Any) -> None:
+            if isinstance(obj, dict):
+                for key, val in obj.items():
+                    if key in {"path", "workdir", "file_path", "output_path"} and isinstance(val, str):
+                        _dedupe_append(relevant_files, val, limit=12)
+                    _collect_paths_from_jsonish(val)
+            elif isinstance(obj, list):
+                for val in obj:
+                    _collect_paths_from_jsonish(val)
+            elif isinstance(obj, str):
+                _collect_path_mentions(obj, relevant_files)
+
+        call_id_to_tool: dict[str, tuple[str, str]] = {}
+        for msg in turns_to_summarize:
+            if msg.get("role") == "assistant" and msg.get("tool_calls"):
+                for tc in msg.get("tool_calls") or []:
+                    name, raw_args = _extract_tool_call_name_and_args(tc)
+                    args = redact_sensitive_text(raw_args)
+                    call_id = _extract_tool_call_id(tc)
+                    if call_id:
+                        call_id_to_tool[call_id] = (name, args)
+                    if args:
+                        try:
+                            parsed = json.loads(args)
+                        except Exception:
+                            parsed = args
+                        _collect_paths_from_jsonish(parsed)
+
+        for msg in turns_to_summarize:
+            role = msg.get("role", "unknown")
+            text = _compact_fallback_turn(msg.get("content"))
+            _collect_path_mentions(text, relevant_files)
+
+            turn_text = text
+            turn_tool_names: list[str] = []
+            if role == "assistant" and msg.get("tool_calls"):
+                for tc in msg.get("tool_calls") or []:
+                    name, _args = _extract_tool_call_name_and_args(tc)
+                    turn_tool_names.append(name)
+                if turn_tool_names:
+                    prefix = "tool calls: " + ", ".join(turn_tool_names[:6])
+                    turn_text = f"{prefix}; {turn_text}" if turn_text else prefix
+            _remember_dropped_turn(str(role).upper(), turn_text)
+
+            if len(text) > 600:
+                text = text[:420].rstrip() + " ... " + text[-160:].lstrip()
+
+            if role == "user" and text:
+                user_asks.append(text)
+            elif role == "assistant":
+                tool_names: list[str] = []
+                for tc in msg.get("tool_calls") or []:
+                    name, _args = _extract_tool_call_name_and_args(tc)
+                    tool_names.append(name)
+                if tool_names:
+                    assistant_actions.append(
+                        "Called tool(s): " + ", ".join(tool_names[:6])
+                    )
+                elif text:
+                    assistant_actions.append(text)
+            elif role == "tool":
+                call_id = str(msg.get("tool_call_id") or "")
+                tool_name, tool_args = call_id_to_tool.get(call_id, ("unknown", ""))
+                tool_actions.append(
+                    _summarize_tool_result(tool_name, tool_args, text or "")
+                )
+                if re.search(
+                    r"\b(error|failed|exception|traceback|timeout|timed out|fatal)\b",
+                    text,
+                    re.I,
+                ):
+                    blockers.append(text[:500])
+
+        def _bullets(items: list[str], limit: int = 8) -> str:
+            unique: list[str] = []
+            seen: set[str] = set()
+            for item in items:
+                item = item.strip()
+                if not item or item in seen:
+                    continue
+                seen.add(item)
+                unique.append(item)
+                if len(unique) >= limit:
+                    break
+            return "\n".join(f"- {item}" for item in unique) if unique else "None."
+
+        completed: list[str] = []
+        for idx, item in enumerate((assistant_actions + tool_actions)[:12], start=1):
+            completed.append(f"{idx}. {item}")
+
+        active_task = (
+            f"User asked: {user_asks[-1]!r}"
+            if user_asks
+            else "Unknown from deterministic fallback."
+        )
+        previous_summary_note = ""
+        if self._previous_summary:
+            previous_summary_note = (
+                "\n\nPrevious compaction summary was present and should still be treated as "
+                "background continuity context, but the latest LLM summary update failed."
+            )
+
+        reason_text = f" Summary failure reason: {reason}." if reason else ""
+        body = f"""## Active Task
+{active_task}
+
+## Goal
+Recovered from a deterministic fallback because the LLM context summarizer was unavailable. Continue from the protected recent messages after this summary and use current file/system state for exact details.{previous_summary_note}
+
+## Constraints & Preferences
+- This fallback was generated locally without an LLM summary call.
+- Secrets and credentials were redacted before preservation.
+- The summary may be incomplete; prefer verifying current files, git state, processes, and test results instead of assuming omitted details.
+
+## Completed Actions
+{chr(10).join(completed) if completed else "None recoverable from compacted turns."}
+
+## Active State
+Unknown from deterministic fallback. Inspect current repository/session state if needed.
+
+## In Progress
+{active_task}
+
+## Blocked
+{_bullets(blockers, limit=5)}
+
+## Key Decisions
+None recoverable from deterministic fallback.
+
+## Resolved Questions
+None recoverable from deterministic fallback.
+
+## Pending User Asks
+{active_task}
+
+## Relevant Files
+{_bullets(relevant_files, limit=12)}
+
+## Remaining Work
+Continue from the most recent unfulfilled user ask and protected tail messages. Verify state with tools before making claims.
+
+## Last Dropped Turns
+{_bullets(last_dropped_turns, limit=8)}
+
+## Critical Context
+Summary generation was unavailable, so this is a best-effort deterministic fallback for {len(turns_to_summarize)} compacted message(s).{reason_text}"""
+        summary = self._with_summary_prefix(redact_sensitive_text(body.strip()))
+        if len(summary) > _FALLBACK_SUMMARY_MAX_CHARS:
+            summary = summary[: _FALLBACK_SUMMARY_MAX_CHARS - 42].rstrip() + "\n...[fallback summary truncated]"
+        return summary
+
    def _fallback_to_main_for_compression(self, e: Exception, reason: str) -> None:
        """Switch from a separate ``summary_model`` back to the main model.

@ -803,7 +1184,11 @@ class ContextCompressor(ContextEngine):
        self.summary_model = ""  # empty = use main model
        self._summary_failure_cooldown_until = 0.0  # no cooldown — retry immediately

-    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]:
+    def _generate_summary(
+        self,
+        turns_to_summarize: List[Dict[str, Any]],
+        focus_topic: Optional[str] = None,
+    ) -> Optional[str]:
        """Generate a structured summary of conversation turns.

        Uses a structured template (Goal, Progress, Decisions, Resolved/Pending
@ -1500,9 +1885,9 @@ The user has requested that this compaction PRIORITISE preserving all informatio
        #   True  → ABORT compression entirely. Return messages unchanged
        #           and set _last_compress_aborted=True so callers can warn
        #           the user and stop the auto-compress retry loop.
-        #   False → Fall through to the legacy fallback path below: insert
-        #           a static "summary unavailable" placeholder and drop the
-        #           middle window.  Records _last_summary_fallback_used /
+        #   False → Fall through to the default fallback path below: insert
+        #           a deterministic "summary unavailable" handoff and drop
+        #           the middle window.  Records _last_summary_fallback_used /
        #           _last_summary_dropped_count for gateway hygiene to
        #           surface a warning.
        # Default is False (historical behavior).
@ -1535,21 +1920,18 @@ The user has requested that this compaction PRIORITISE preserving all informatio
                    )
            compressed.append(msg)

-        # Legacy fallback path: LLM summary failed and abort_on_summary_failure
-        # is False (the default).  Insert a static placeholder so the model
-        # knows context was lost rather than silently dropping everything.
+        # If LLM summary failed, insert a deterministic fallback so the model
+        # gets at least locally recoverable continuity anchors instead of a
+        # content-free "N messages were removed" marker.
        if not summary:
            if not self.quiet_mode:
-                logger.warning("Summary generation failed — inserting static fallback context marker")
+                logger.warning("Summary generation failed — inserting deterministic fallback context summary")
            n_dropped = compress_end - compress_start
            self._last_summary_dropped_count = n_dropped
            self._last_summary_fallback_used = True
-            summary = (
-                f"{SUMMARY_PREFIX}\n"
-                f"Summary generation was unavailable. {n_dropped} message(s) were "
-                f"removed to free context space but could not be summarized. The removed "
-                f"messages contained earlier work in this session. Continue based on the "
-                f"recent messages below and the current state of any files or resources."
+            summary = self._build_static_fallback_summary(
+                turns_to_summarize,
+                reason=self._last_summary_error,
            )

        _merge_summary_into_tail = False
@ -1609,6 +1991,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio

        compressed = self._sanitize_tool_pairs(compressed)

+        # Replace image parts in all compressed messages before the newest
+        # image-bearing user turn with a short text placeholder. Without
+        # this, tail messages keep their original multi-MB base-64 image
+        # payloads forever, which can push every subsequent API request
+        # past the provider's body-size limit and wedge the session.
+        # Port of Kilo-Org/kilocode#9434.
+        compressed = _strip_historical_media(compressed)
+
        new_estimate = estimate_messages_tokens_rough(compressed)
        saved_estimate = display_tokens - new_estimate

--- a/agent/context_engine.py
+++ b/agent/context_engine.py
@ -71,7 +71,12 @@ class ContextEngine(ABC):
    def update_from_response(self, usage: Dict[str, Any]) -> None:
        """Update tracked token usage from an API response.

-        Called after every LLM call with the usage dict from the response.
+        Called after every LLM call with a normalized usage dict. The legacy
+        keys ``prompt_tokens``, ``completion_tokens``, and ``total_tokens``
+        are always present. Newer hosts also include canonical buckets:
+        ``input_tokens``, ``output_tokens``, ``cache_read_tokens``,
+        ``cache_write_tokens``, and ``reasoning_tokens``. Engines should
+        treat those fields as optional for compatibility with older hosts.
        """

    @abstractmethod
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@ -34,13 +34,33 @@ import tempfile
 import uuid
 from datetime import datetime
 from pathlib import Path
-from typing import Any, List, Optional, Tuple
+from typing import Any, Optional, Tuple

 from agent.model_metadata import estimate_request_tokens_rough

 logger = logging.getLogger(__name__)


+def _compression_lock_holder(agent: Any) -> str:
+    """Build a unique holder id for the lock: pid:tid:agent-instance:uuid.
+
+    The pid+tid prefix lets ops tell crashed/abandoned holders apart from
+    live ones (expiry-based recovery uses the timestamp, but ``holder``
+    is what shows up in diagnostics + log lines). The agent instance id
+    and a per-acquire uuid disambiguate two co-resident agents on the
+    same thread (background_review forks run on a worker thread, but
+    on machines where compression itself dispatches to a thread pool
+    we want each acquire to be unique).
+    """
+    import threading
+    return (
+        f"pid={os.getpid()}"
+        f":tid={threading.get_ident()}"
+        f":agent={id(agent):x}"
+        f":nonce={uuid.uuid4().hex[:8]}"
+    )
+
+
 def check_compression_model_feasibility(agent: Any) -> None:
    """Warn at session start if the auxiliary compression model's context
    window is smaller than the main model's compression threshold.
@ -305,6 +325,103 @@ def compress_context(
        "🗜️ Compacting context — summarizing earlier conversation so I can continue..."
    )

+    # ── Compression lock ────────────────────────────────────────────────
+    # Atomic, state.db-backed lock per session_id.  Without this, two
+    # AIAgent instances that share the same session_id (most commonly the
+    # parent-turn agent and its background-review fork — see
+    # ``agent/background_review.py``: ``review_agent.session_id =
+    # agent.session_id``) can each call compress() on overlapping
+    # snapshots of the same conversation.  Both succeed, both rotate
+    # ``agent.session_id`` to a fresh id, both create child sessions in
+    # state.db parented to the same old id.  The gateway's SessionEntry
+    # only catches one rotation, so the other child becomes an orphan
+    # that silently accumulates writes — Damien's repro shape.
+    #
+    # Acquire keyed on the OLD session_id (the rotation target's parent),
+    # because that's the id that competing paths see and read from
+    # SessionEntry at the start of their own compression attempt.
+    #
+    # If we can't acquire the lock, another path is mid-compression on
+    # this session.  Aborting is correct: the messages are unchanged, the
+    # other path's rotation will produce the canonical new session_id,
+    # and our caller's auto-compress loop sees ``len(returned) == len(input)``
+    # and stops retrying for this cycle. The session is NOT corrupted —
+    # we just sit out this round and let the winner finish.
+    _lock_db = getattr(agent, "_session_db", None)
+    _lock_sid = agent.session_id or ""
+    _lock_holder: Optional[str] = None
+    # Probe whether the lock subsystem is actually available on this
+    # SessionDB instance.  A process running mismatched module versions
+    # (e.g. ``conversation_compression.py`` reloaded after a pull but the
+    # long-lived ``hermes_state.SessionDB`` class still bound to the
+    # pre-#34351 version in memory) has the call site but not the method.
+    # In that case ``try_acquire_compression_lock`` raises AttributeError —
+    # NOT a ``sqlite3.Error`` — so the method's own fail-open guard never
+    # runs and the exception propagates to the outer agent loop, which
+    # prints the error and retries.  Because compression never succeeds,
+    # the token count never drops and the loop re-triggers compaction
+    # forever (the "API call #47/#48/#49 ... has no attribute
+    # try_acquire_compression_lock" spin).  Fail OPEN here: if the lock
+    # subsystem is missing or broken in any unexpected way, skip locking
+    # and proceed with compression.  Skipping the lock risks a rare
+    # concurrent-compression session fork; an infinite no-progress loop
+    # that never compresses at all is strictly worse.
+    if _lock_db is not None and _lock_sid:
+        _lock_holder = _compression_lock_holder(agent)
+        try:
+            _lock_acquired = _lock_db.try_acquire_compression_lock(
+                _lock_sid, _lock_holder
+            )
+        except Exception as _lock_err:
+            # Broken/absent lock subsystem (version skew, etc.).  Log once
+            # per session and proceed WITHOUT the lock rather than letting
+            # the exception spin the outer loop.
+            _lock_holder = None  # we don't own anything to release
+            if getattr(agent, "_last_compression_lock_error_sid", None) != _lock_sid:
+                agent._last_compression_lock_error_sid = _lock_sid
+                logger.warning(
+                    "compression lock subsystem unavailable for session=%s "
+                    "(%s: %s) — proceeding without lock. This usually means a "
+                    "stale in-memory module after an update; restart the "
+                    "process (or `hermes update`) to resync.",
+                    _lock_sid, type(_lock_err).__name__, _lock_err,
+                )
+            _lock_acquired = True  # treat as acquired-but-unlocked; proceed
+        if not _lock_acquired:
+            try:
+                existing = _lock_db.get_compression_lock_holder(_lock_sid)
+            except Exception:
+                existing = None
+            logger.warning(
+                "compression skipped: another path is compressing session=%s "
+                "(holder=%s) — returning messages unchanged to avoid session fork",
+                _lock_sid, existing,
+            )
+            _lock_holder = None  # don't release a lock we don't own
+            # Surface to the user once — quiet for downstream auto-compress loops
+            if getattr(agent, "_last_compression_lock_warning_sid", None) != _lock_sid:
+                agent._last_compression_lock_warning_sid = _lock_sid
+                try:
+                    agent._emit_warning(
+                        "⚠ Skipping concurrent compression — another path "
+                        "is already compressing this session. Will retry "
+                        "after it finishes."
+                    )
+                except Exception:
+                    pass
+            _existing_sp = getattr(agent, "_cached_system_prompt", None)
+            if not _existing_sp:
+                _existing_sp = agent._build_system_prompt(system_message)
+            return messages, _existing_sp
+
+    def _release_lock() -> None:
+        """Release the lock keyed on the OLD session_id (before rotation)."""
+        if _lock_db is not None and _lock_sid and _lock_holder:
+            try:
+                _lock_db.release_compression_lock(_lock_sid, _lock_holder)
+            except Exception as _rel_err:
+                logger.debug("compression lock release failed: %s", _rel_err)
+
    # Notify external memory provider before compression discards context
    if agent._memory_manager:
        try:
@ -318,6 +435,11 @@ def compress_context(
        # Plugin context engine with strict signature that doesn't accept
        # focus_topic / force — fall back to calling without them.
        compressed = agent.context_compressor.compress(messages, current_tokens=approx_tokens)
+    except BaseException:
+        # ANY exception during compress() must release the lock so the
+        # session isn't permanently blocked from future compression.
+        _release_lock()
+        raise

    # If compression aborted (aux LLM failed to produce a usable summary)
    # the compressor returns the input messages unchanged.  Surface the
@ -336,6 +458,7 @@ def compress_context(
        _existing_sp = getattr(agent, "_cached_system_prompt", None)
        if not _existing_sp:
            _existing_sp = agent._build_system_prompt(system_message)
+        _release_lock()  # compression aborted — no rotation will happen
        return messages, _existing_sp

    summary_error = getattr(agent.context_compressor, "_last_summary_error", None)
@ -421,6 +544,7 @@ def compress_context(
                agent.session_id or "",
                boundary_reason="compression",
                old_session_id=_old_sid,
+                conversation_id=getattr(agent, "_gateway_session_key", None),
            )
    except Exception as _ce_err:
        logger.debug("context engine on_session_start (compression): %s", _ce_err)
@ -479,6 +603,12 @@ def compress_context(
        agent.session_id or "none", _pre_msg_count, len(compressed),
        f"{_compressed_est:,}",
    )
+    # Release the lock on the OLD session_id only AFTER rotation completed
+    # and all post-rotation bookkeeping (memory manager, context engine,
+    # file dedup) ran. A concurrent path that wakes up the moment we
+    # release will see the NEW session_id in state.db / SessionEntry and
+    # acquire on that — no race against our just-finished work.
+    _release_lock()
    return compressed, new_system_prompt


--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -27,8 +27,6 @@ import time
 import uuid
 from typing import Any, Dict, List, Optional

-from agent.anthropic_adapter import _is_oauth_token
-from agent.auxiliary_client import set_runtime_main
 from agent.codex_responses_adapter import _summarize_user_message_for_log
 from agent.display import KawaiiSpinner
 from agent.error_classifier import FailoverReason, classify_api_error
@ -49,25 +47,17 @@ from agent.model_metadata import (
    MINIMUM_CONTEXT_LENGTH,
    estimate_messages_tokens_rough,
    estimate_request_tokens_rough,
-    get_next_probe_tier,
+    get_context_length_from_provider_error,
    parse_available_output_tokens_from_error,
-    parse_context_limit_from_error,
    save_context_length,
 )
-from agent.nous_rate_guard import (
-    clear_nous_rate_limit,
-    is_genuine_nous_rate_limit,
-    nous_rate_limit_remaining,
-    record_nous_rate_limit,
-)
 from agent.process_bootstrap import _install_safe_stdio
 from agent.prompt_caching import apply_anthropic_cache_control
 from agent.retry_utils import jittered_backoff
 from agent.trajectory import has_incomplete_scratchpad
 from agent.usage_pricing import estimate_usage_cost, normalize_usage
-from hermes_constants import display_hermes_home as _dhh_fn, PARTIAL_STREAM_STUB_ID
+from hermes_constants import PARTIAL_STREAM_STUB_ID
 from hermes_logging import set_session_context
-from tools.schema_sanitizer import strip_pattern_and_format
 from tools.skill_provenance import set_current_write_origin
 from utils import base_url_host_matches, env_var_enabled

@ -127,6 +117,104 @@ def _ra():
    return run_agent


+def _nous_entitlement_message(capability: str) -> str:
+    try:
+        from hermes_cli.nous_account import (
+            format_nous_portal_entitlement_message,
+            get_nous_portal_account_info,
+        )
+
+        account_info = get_nous_portal_account_info(force_fresh=True)
+        message = format_nous_portal_entitlement_message(
+            account_info,
+            capability=capability,
+        )
+        return message or ""
+    except Exception:
+        return ""
+
+
+def _print_nous_entitlement_guidance(agent, capability: str) -> bool:
+    message = _nous_entitlement_message(capability)
+    if not message:
+        return False
+    for line in message.splitlines():
+        agent._vprint(f"{agent.log_prefix}   💡 {line}", force=True)
+    return True
+
+
+def _is_nous_inference_route(provider: str, base_url: str) -> bool:
+    provider = (provider or "").strip().lower()
+    if provider == "nous":
+        return True
+    base = str(base_url or "")
+    return (
+        base_url_host_matches(base, "inference-api.nousresearch.com")
+        or base_url_host_matches(base, "inference.nousresearch.com")
+    )
+
+
+def _billing_or_entitlement_message(
+    *,
+    capability: str,
+    provider: str,
+    base_url: str,
+    model: str,
+) -> str:
+    if _is_nous_inference_route(provider, base_url):
+        return _nous_entitlement_message(capability)
+
+    provider_label = (provider or "").strip() or "the selected provider"
+    model_label = (model or "").strip() or "the selected model"
+    lines = [
+        (
+            f"{provider_label} reported that billing, credits, or account "
+            f"entitlement is exhausted for {model_label}."
+        ),
+        "Add credits or update billing with that provider, then retry.",
+    ]
+    if base_url_host_matches(str(base_url or ""), "openrouter.ai"):
+        lines.append("OpenRouter credits: https://openrouter.ai/settings/credits")
+    lines.append("You can switch providers temporarily with /model <model> --provider <provider>.")
+    return "\n".join(lines)
+
+
+def _print_billing_or_entitlement_guidance(
+    agent,
+    *,
+    capability: str,
+    provider: str,
+    base_url: str,
+    model: str,
+) -> bool:
+    message = _billing_or_entitlement_message(
+        capability=capability,
+        provider=provider,
+        base_url=base_url,
+        model=model,
+    )
+    if not message:
+        return False
+    for line in message.splitlines():
+        agent._vprint(f"{agent.log_prefix}   💡 {line}", force=True)
+    return True
+
+
+def _try_refresh_nous_paid_entitlement_credentials(agent) -> bool:
+    """Refresh Nous runtime credentials after a fresh paid-entitlement check."""
+    try:
+        from hermes_cli.nous_account import get_nous_portal_account_info
+
+        account_info = get_nous_portal_account_info(force_fresh=True)
+        if account_info.paid_service_access is not True:
+            return False
+        return agent._try_refresh_nous_client_credentials(
+            force=True,
+        )
+    except Exception:
+        return False
+
+
 def _restore_or_build_system_prompt(agent, system_message, conversation_history):
    """Restore the cached system prompt from the session DB or build it fresh.

@ -310,7 +398,6 @@ def run_conversation(

    # Tag all log records on this thread with the session ID so
    # ``hermes logs --session <id>`` can filter a single conversation.
-    from hermes_logging import set_session_context
    set_session_context(agent.session_id)

    # Bind the skill write-origin ContextVar for this thread so tool
@ -319,7 +406,6 @@ def run_conversation(
    # a foreground user-directed turn. Set at the top of each call;
    # the review fork runs on its own thread with a fresh context,
    # so the foreground value here does not leak into it.
-    from tools.skill_provenance import set_current_write_origin
    set_current_write_origin(getattr(agent, "_memory_write_origin", "assistant_tool"))

    # If the previous turn activated fallback, restore the primary
@ -1017,6 +1103,7 @@ def run_conversation(
        codex_auth_retry_attempted=False
        anthropic_auth_retry_attempted=False
        nous_auth_retry_attempted=False
+        nous_paid_entitlement_refresh_attempted=False
        copilot_auth_retry_attempted=False
        thinking_sig_retry_attempted = False
        invalid_encrypted_content_retry_attempted = False
@ -1050,17 +1137,18 @@ def run_conversation(
                            f"Nous Portal rate limit active — "
                            f"resets in {_fmt_nous_remaining(_nous_remaining)}."
                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}⏳ {_nous_msg} Trying fallback...",
-                            force=True,
+                        agent._buffer_vprint(
+                            f"⏳ {_nous_msg} Trying fallback..."
                        )
-                        agent._emit_status(f"⏳ {_nous_msg}")
+                        agent._buffer_status(f"⏳ {_nous_msg}")
                        if agent._try_activate_fallback():
                            retry_count = 0
                            compression_attempts = 0
                            primary_recovery_attempted = False
                            continue
-                        # No fallback available — return with clear message
+                        # No fallback available — surface buffered context
+                        # so user sees the rate-limit message that led here.
+                        agent._flush_status_buffer()
                        agent._persist_session(messages, conversation_history)
                        return {
                            "final_response": (
@ -1082,6 +1170,14 @@ def run_conversation(

            try:
                agent._reset_stream_delivery_tracking()
+                # api_messages is built once, before this retry loop, while the
+                # primary provider is active.  A mid-conversation fallback can
+                # switch to a require-side provider (DeepSeek / Kimi / MiMo) that
+                # rejects assistant turns lacking reasoning_content.  Re-apply the
+                # echo-back pad for the *current* provider here (idempotent no-op
+                # unless the active provider needs it) so the fallback request
+                # isn't sent with stale, primary-shaped reasoning fields.
+                agent._reapply_reasoning_echo_for_provider(api_messages)
                api_kwargs = agent._build_api_kwargs(api_messages)
                if agent._force_ascii_payload:
                    _sanitize_structure_non_ascii(api_kwargs)
@ -1275,9 +1371,10 @@ def run_conversation(
                            error_details.append("response.choices is empty")

                if response_invalid:
-                    # Stop spinner before printing error messages
+                    # Stop spinner silently — retry status is now buffered
+                    # and only surfaced if every retry+fallback exhausts.
                    if thinking_spinner:
-                        thinking_spinner.stop("(´;ω;`) oops, retrying...")
+                        thinking_spinner.stop("")
                        thinking_spinner = None
                    if agent.thinking_callback:
                        agent.thinking_callback("")
@ -1290,7 +1387,7 @@ def run_conversation(
                    # rate-limit symptom.  Switch to fallback immediately
                    # rather than retrying with extended backoff.
                    if agent._fallback_index < len(agent._fallback_chain):
-                        agent._emit_status("⚠️ Empty/malformed response — switching to fallback...")
+                        agent._buffer_status("⚠️ Empty/malformed response — switching to fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
@ -1352,20 +1449,22 @@ def run_conversation(
                    else:
                        _failure_hint = f"response time {api_duration:.1f}s"

-                    agent._vprint(f"{agent.log_prefix}⚠️  Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True)
-                    agent._vprint(f"{agent.log_prefix}   🏢 Provider: {provider_name}", force=True)
+                    agent._buffer_vprint(f"⚠️  Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}")
+                    agent._buffer_vprint(f"   🏢 Provider: {provider_name}")
                    cleaned_provider_error = agent._clean_error_message(error_msg)
-                    agent._vprint(f"{agent.log_prefix}   📝 Provider message: {cleaned_provider_error}", force=True)
-                    agent._vprint(f"{agent.log_prefix}   ⏱️  {_failure_hint}", force=True)
+                    agent._buffer_vprint(f"   📝 Provider message: {cleaned_provider_error}")
+                    agent._buffer_vprint(f"   ⏱️  {_failure_hint}")
                    
                    if retry_count >= max_retries:
                        # Try fallback before giving up
-                        agent._emit_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
+                        agent._buffer_status(f"⚠️ Max retries ({max_retries}) for invalid responses — trying fallback...")
                        if agent._try_activate_fallback():
                            retry_count = 0
                            compression_attempts = 0
                            primary_recovery_attempted = False
                            continue
+                        # Terminal — flush buffered retry trace so user sees what happened.
+                        agent._flush_status_buffer()
                        agent._emit_status(f"❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
                        logger.error(f"{agent.log_prefix}Invalid API response after {max_retries} retries.")
                        agent._persist_session(messages, conversation_history)
@ -1379,7 +1478,7 @@ def run_conversation(
                    
                    # Backoff before retry — jittered exponential: 5s base, 120s cap
                    wait_time = jittered_backoff(retry_count, base_delay=5.0, max_delay=120.0)
-                    agent._vprint(f"{agent.log_prefix}⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...", force=True)
+                    agent._buffer_vprint(f"⏳ Retrying in {wait_time:.1f}s ({_failure_hint})...")
                    logger.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")
                    
                    # Sleep in small increments to stay responsive to interrupts
@ -1606,14 +1705,14 @@ def run_conversation(
                        if assistant_message is not None and _trunc_has_tool_calls:
                            if truncated_tool_call_retries < 1:
                                truncated_tool_call_retries += 1
-                                agent._vprint(
-                                    f"{agent.log_prefix}⚠️  Truncated tool call detected — retrying API call...",
-                                    force=True,
+                                agent._buffer_vprint(
+                                    f"⚠️  Truncated tool call detected — retrying API call..."
                                )
                                # Don't append the broken response to messages;
                                # just re-run the same API call from the current
                                # message state, giving the model another chance.
                                continue
+                            agent._flush_status_buffer()
                            agent._vprint(
                                f"{agent.log_prefix}⚠️  Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
                                force=True,
@ -1647,6 +1746,7 @@ def run_conversation(
                        }
                    else:
                        # First message was truncated - mark as failed
+                        agent._flush_status_buffer()
                        agent._vprint(f"{agent.log_prefix}❌ First response truncated - cannot recover", force=True)
                        agent._persist_session(messages, conversation_history)
                        return {
@ -1668,10 +1768,19 @@ def run_conversation(
                    prompt_tokens = canonical_usage.prompt_tokens
                    completion_tokens = canonical_usage.output_tokens
                    total_tokens = canonical_usage.total_tokens
+                    # Forward canonical token + cache buckets so context engines
+                    # can make decisions on cache hit ratios / reasoning costs,
+                    # not just legacy aggregate tokens. Legacy keys stay for
+                    # back-compat with engines that only read prompt/completion/total.
                    usage_dict = {
                        "prompt_tokens": prompt_tokens,
                        "completion_tokens": completion_tokens,
                        "total_tokens": total_tokens,
+                        "input_tokens": canonical_usage.input_tokens,
+                        "output_tokens": canonical_usage.output_tokens,
+                        "cache_read_tokens": canonical_usage.cache_read_tokens,
+                        "cache_write_tokens": canonical_usage.cache_write_tokens,
+                        "reasoning_tokens": canonical_usage.reasoning_tokens,
                    }
                    agent.context_compressor.update_from_response(usage_dict)

@ -1789,6 +1898,11 @@ def run_conversation(
                        )
                
                has_retried_429 = False  # Reset on success
+                # Note: don't clear the retry buffer here — an "API call
+                # success" only means we got bytes back, not that we got
+                # usable content. Empty responses still loop through the
+                # empty-retry path below; the buffer is cleared when
+                # genuinely successful content is detected later (~L4127).
                # Clear Nous rate limit state on successful request —
                # proves the limit has reset and other sessions can
                # resume hitting Nous.
@ -1815,9 +1929,10 @@ def run_conversation(
                break

            except Exception as api_error:
-                # Stop spinner before printing error messages
+                # Stop spinner silently — retry status is buffered and
+                # only flushed when every retry+fallback is exhausted.
                if thinking_spinner:
-                    thinking_spinner.stop("(╥_╥) error, retrying...")
+                    thinking_spinner.stop("")
                    thinking_spinner = None
                if agent.thinking_callback:
                    agent.thinking_callback("")
@ -1872,14 +1987,12 @@ def run_conversation(
                    if _surrogates_found or _is_surrogate_error:
                        agent._unicode_sanitization_passes += 1
                        if _surrogates_found:
-                            agent._vprint(
-                                f"{agent.log_prefix}⚠️  Stripped invalid surrogate characters from messages. Retrying...",
-                                force=True,
+                            agent._buffer_vprint(
+                                f"⚠️  Stripped invalid surrogate characters from messages. Retrying..."
                            )
                        else:
-                            agent._vprint(
-                                f"{agent.log_prefix}⚠️  Surrogate encoding error — retrying after full-payload sanitization...",
-                                force=True,
+                            agent._buffer_vprint(
+                                f"⚠️  Surrogate encoding error — retrying after full-payload sanitization..."
                            )
                        continue
                    if _is_ascii_codec:
@ -2093,6 +2206,23 @@ def run_conversation(
                    classified.should_rotate_credential, classified.should_fallback,
                )

+                if (
+                    classified.reason == FailoverReason.billing
+                    and _is_nous_inference_route(
+                        getattr(agent, "provider", "") or "",
+                        getattr(agent, "base_url", "") or "",
+                    )
+                    and not nous_paid_entitlement_refresh_attempted
+                ):
+                    nous_paid_entitlement_refresh_attempted = True
+                    if _try_refresh_nous_paid_entitlement_credentials(agent):
+                        agent._vprint(
+                            f"{agent.log_prefix}🔐 Nous paid access verified — "
+                            "refreshed runtime credentials and retrying request...",
+                            force=True,
+                        )
+                        continue
+
                recovered_with_pool, has_retried_429 = agent._recover_with_credential_pool(
                    status_code=status_code,
                    has_retried_429=has_retried_429,
@ -2190,7 +2320,7 @@ def run_conversation(
                    codex_auth_retry_attempted = True
                    if agent._try_refresh_codex_client_credentials(force=True):
                        _label = "xAI OAuth" if agent.provider == "xai-oauth" else "Codex"
-                        agent._vprint(f"{agent.log_prefix}🔐 {_label} auth refreshed after 401. Retrying request...")
+                        agent._buffer_vprint(f"🔐 {_label} auth refreshed after 401. Retrying request...")
                        continue
                if (
                    agent.api_mode == "chat_completions"
@ -2217,7 +2347,8 @@ def run_conversation(
                    print(f"{agent.log_prefix}🔐 Nous 401 — Portal authentication failed.")
                    if _body_text:
                        print(f"{agent.log_prefix}   Response: {_body_text}")
-                    print(f"{agent.log_prefix}   Most likely: Portal OAuth expired, account out of credits, or agent key revoked.")
+                    if not _print_nous_entitlement_guidance(agent, "Nous model access"):
+                        print(f"{agent.log_prefix}   Most likely: Portal OAuth expired, account out of credits, or agent key revoked.")
                    print(f"{agent.log_prefix}   Troubleshooting:")
                    print(f"{agent.log_prefix}     • Re-authenticate: hermes auth add nous")
                    print(f"{agent.log_prefix}     • Check credits / billing: https://portal.nousresearch.com")
@ -2230,7 +2361,7 @@ def run_conversation(
                ):
                    copilot_auth_retry_attempted = True
                    if agent._try_refresh_copilot_client_credentials():
-                        agent._vprint(f"{agent.log_prefix}🔐 Copilot credentials refreshed after 401. Retrying request...")
+                        agent._buffer_vprint(f"🔐 Copilot credentials refreshed after 401. Retrying request...")
                        continue
                if (
                    agent.api_mode == "anthropic_messages"
@ -2405,41 +2536,37 @@ def run_conversation(
                _base = getattr(agent, "base_url", "unknown")
                _model = getattr(agent, "model", "unknown")
                _status_code_str = f" [HTTP {status_code}]" if status_code else ""
-                agent._vprint(f"{agent.log_prefix}⚠️  API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}", force=True)
-                agent._vprint(f"{agent.log_prefix}   🔌 Provider: {_provider}  Model: {_model}", force=True)
-                agent._vprint(f"{agent.log_prefix}   🌐 Endpoint: {_base}", force=True)
-                agent._vprint(f"{agent.log_prefix}   📝 Error: {_error_summary}", force=True)
+                agent._buffer_vprint(f"⚠️  API call failed (attempt {retry_count}/{max_retries}): {error_type}{_status_code_str}")
+                agent._buffer_vprint(f"   🔌 Provider: {_provider}  Model: {_model}")
+                agent._buffer_vprint(f"   🌐 Endpoint: {_base}")
+                agent._buffer_vprint(f"   📝 Error: {_error_summary}")
                if status_code and status_code < 500:
                    _err_body = getattr(api_error, "body", None)
                    _err_body_str = str(_err_body)[:300] if _err_body else None
                    if _err_body_str:
-                        agent._vprint(f"{agent.log_prefix}   📋 Details: {_err_body_str}", force=True)
-                agent._vprint(f"{agent.log_prefix}   ⏱️  Elapsed: {elapsed_time:.2f}s  Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens")
+                        agent._buffer_vprint(f"   📋 Details: {_err_body_str}")
+                agent._buffer_vprint(f"   ⏱️  Elapsed: {elapsed_time:.2f}s  Context: {len(api_messages)} msgs, ~{approx_tokens:,} tokens")

                # Actionable hint for OpenRouter "no tool endpoints" error.
-                # This fires regardless of whether fallback succeeds — the
-                # user needs to know WHY their model failed so they can fix
-                # their provider routing, not just silently fall back.
+                # Buffered like the rest of the retry trace — surfaced only
+                # if every retry+fallback exhausts.  Avoids spamming users
+                # who recover automatically via fallback.
                if (
                    agent._is_openrouter_url()
                    and "support tool use" in error_msg
                ):
-                    agent._vprint(
-                        f"{agent.log_prefix}   💡 No OpenRouter providers for {_model} support tool calling with your current settings.",
-                        force=True,
+                    agent._buffer_vprint(
+                        f"   💡 No OpenRouter providers for {_model} support tool calling with your current settings."
                    )
                    if agent.providers_allowed:
-                        agent._vprint(
-                            f"{agent.log_prefix}      Your provider_routing.only restriction is filtering out tool-capable providers.",
-                            force=True,
+                        agent._buffer_vprint(
+                            f"      Your provider_routing.only restriction is filtering out tool-capable providers."
                        )
-                        agent._vprint(
-                            f"{agent.log_prefix}      Try removing the restriction or adding providers that support tools for this model.",
-                            force=True,
+                        agent._buffer_vprint(
+                            f"      Try removing the restriction or adding providers that support tools for this model."
                        )
-                    agent._vprint(
-                        f"{agent.log_prefix}      Check which providers support tools: https://openrouter.ai/models/{_model}",
-                        force=True,
+                    agent._buffer_vprint(
+                        f"      Check which providers support tools: https://openrouter.ai/models/{_model}"
                    )

                # Check for interrupt before deciding to retry
@ -2489,11 +2616,10 @@ def run_conversation(
                            # user later enables extra usage the 1M limit
                            # should come back automatically.
                            compressor._context_probe_persistable = False
-                        agent._vprint(
-                            f"{agent.log_prefix}⚠️  Anthropic long-context tier "
+                        agent._buffer_vprint(
+                            f"⚠️  Anthropic long-context tier "
                            f"requires extra usage — reducing context: "
-                            f"{old_ctx:,} → {_reduced_ctx:,} tokens",
-                            force=True,
+                            f"{old_ctx:,} → {_reduced_ctx:,} tokens"
                        )

                    compression_attempts += 1
@ -2509,7 +2635,7 @@ def run_conversation(
                        # messages to the new session, not skipping them.
                        conversation_history = None
                        if len(messages) < original_len or old_ctx > _reduced_ctx:
-                            agent._emit_status(
+                            agent._buffer_status(
                                f"🗜️ Context reduced to {_reduced_ctx:,} tokens "
                                f"(was {old_ctx:,}), retrying..."
                            )
@ -2538,7 +2664,12 @@ def run_conversation(
                        base_url=getattr(agent, "base_url", None),
                    )
                    if not pool_may_recover:
-                        agent._emit_status("⚠️ Rate limited — switching to fallback provider...")
+                        if classified.reason == FailoverReason.billing:
+                            agent._buffer_status(
+                                "⚠️ Billing or credits exhausted — switching to fallback provider..."
+                            )
+                        else:
+                            agent._buffer_status("⚠️ Rate limited — switching to fallback provider...")
                        if agent._try_activate_fallback(reason=classified.reason):
                            retry_count = 0
                            compression_attempts = 0
@ -2650,6 +2781,8 @@ def run_conversation(
                if is_payload_too_large:
                    compression_attempts += 1
                    if compression_attempts > max_compression_attempts:
+                        # Terminal — surface the buffered retry trace.
+                        agent._flush_status_buffer()
                        agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                        logger.error(f"{agent.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
@ -2663,7 +2796,7 @@ def run_conversation(
                            "failed": True,
                            "compression_exhausted": True,
                        }
-                    agent._emit_status(f"⚠️  Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")
+                    agent._buffer_status(f"⚠️  Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")

                    original_len = len(messages)
                    messages, active_system_prompt = agent._compress_context(
@ -2676,11 +2809,14 @@ def run_conversation(
                    conversation_history = None

                    if len(messages) < original_len:
-                        agent._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
+                        agent._buffer_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
                        time.sleep(2)  # Brief pause between compression retries
                        restart_with_compressed_messages = True
                        break
                    else:
+                        # Terminal — surface buffered context so the user
+                        # sees what compression attempts were made.
+                        agent._flush_status_buffer()
                        agent._vprint(f"{agent.log_prefix}❌ Payload too large and cannot compress further.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                        logger.error(f"{agent.log_prefix}413 payload too large. Cannot compress further.")
@ -2724,16 +2860,16 @@ def run_conversation(
                        # touching context_length or triggering compression.
                        safe_out = max(1, available_out - 64)  # small safety margin
                        agent._ephemeral_max_output_tokens = safe_out
-                        agent._vprint(
-                            f"{agent.log_prefix}⚠️  Output cap too large for current prompt — "
+                        agent._buffer_vprint(
+                            f"⚠️  Output cap too large for current prompt — "
                            f"retrying with max_tokens={safe_out:,} "
-                            f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})",
-                            force=True,
+                            f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})"
                        )
                        # Still count against compression_attempts so we don't
                        # loop forever if the error keeps recurring.
                        compression_attempts += 1
                        if compression_attempts > max_compression_attempts:
+                            agent._flush_status_buffer()
                            agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
                            agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                            logger.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
@ -2750,9 +2886,13 @@ def run_conversation(
                        restart_with_compressed_messages = True
                        break

-                    # Error is about the INPUT being too large — reduce context_length.
-                    # Try to parse the actual limit from the error message
-                    parsed_limit = parse_context_limit_from_error(error_msg)
+                    # Error is about the INPUT being too large.  Only reduce
+                    # context_length when the provider explicitly reports the
+                    # real lower limit.  If the provider only says "input
+                    # exceeds the context window", keep the configured window
+                    # and try compression; guessing probe tiers can incorrectly
+                    # turn a user-configured 1M window into 256K/128K/64K.
+                    new_ctx = get_context_length_from_provider_error(error_msg, old_ctx)
                    _provider_lower = (getattr(agent, "provider", "") or "").lower()
                    _base_lower = (getattr(agent, "base_url", "") or "").rstrip("/").lower()
                    is_minimax_provider = (
@ -2764,24 +2904,12 @@ def run_conversation(
                    )
                    minimax_delta_only_overflow = (
                        is_minimax_provider
-                        and parsed_limit is None
+                        and new_ctx is None
                        and "context window exceeds limit (" in error_msg
                    )
-                    if parsed_limit and parsed_limit < old_ctx:
-                        new_ctx = parsed_limit
-                        agent._vprint(f"{agent.log_prefix}Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True)
-                    elif minimax_delta_only_overflow:
-                        new_ctx = old_ctx
-                        agent._vprint(
-                            f"{agent.log_prefix}Provider reported overflow amount only; "
-                            f"keeping context_length at {old_ctx:,} tokens and compressing.",
-                            force=True,
-                        )
-                    else:
-                        # Step down to the next probe tier
-                        new_ctx = get_next_probe_tier(old_ctx)

-                    if new_ctx and new_ctx < old_ctx:
+                    if new_ctx is not None:
+                        agent._buffer_vprint(f"Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})")
                        compressor.update_model(
                            model=agent.model,
                            context_length=new_ctx,
@ -2791,23 +2919,26 @@ def run_conversation(
                            api_mode=agent.api_mode,
                        )
                        # Context probing flags — only set on built-in
-                        # compressor (plugin engines manage their own).
+                        # compressor (plugin engines manage their own).  This
+                        # value came from the provider, so it is safe to cache.
                        if hasattr(compressor, "_context_probed"):
                            compressor._context_probed = True
-                            # Only persist limits parsed from the provider's
-                            # error message (a real number).  Guessed fallback
-                            # tiers from get_next_probe_tier() should stay
-                            # in-memory only — persisting them pollutes the
-                            # cache with wrong values.
-                            compressor._context_probe_persistable = bool(
-                                parsed_limit and parsed_limit == new_ctx
-                            )
-                        agent._vprint(f"{agent.log_prefix}⚠️  Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True)
+                            compressor._context_probe_persistable = True
+                        agent._buffer_vprint(f"⚠️  Context length exceeded — using provider limit: {old_ctx:,} → {new_ctx:,} tokens")
+                    elif minimax_delta_only_overflow:
+                        agent._buffer_vprint(
+                            f"Provider reported overflow amount only; "
+                            f"keeping context_length at {old_ctx:,} tokens and compressing."
+                        )
                    else:
-                        agent._vprint(f"{agent.log_prefix}⚠️  Context length exceeded at minimum tier — attempting compression...", force=True)
+                        agent._buffer_vprint(
+                            f"⚠️  Context length exceeded, but provider did not report a max context length; "
+                            f"keeping context_length at {old_ctx:,} tokens and compressing."
+                        )

                    compression_attempts += 1
                    if compression_attempts > max_compression_attempts:
+                        agent._flush_status_buffer()
                        agent._vprint(f"{agent.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True)
                        logger.error(f"{agent.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
@ -2821,7 +2952,7 @@ def run_conversation(
                            "failed": True,
                            "compression_exhausted": True,
                        }
-                    agent._emit_status(f"🗜️ Context too large (~{approx_tokens:,} tokens) — compressing ({compression_attempts}/{max_compression_attempts})...")
+                    agent._buffer_status(f"🗜️ Context too large (~{approx_tokens:,} tokens) — compressing ({compression_attempts}/{max_compression_attempts})...")

                    original_len = len(messages)
                    messages, active_system_prompt = agent._compress_context(
@ -2835,12 +2966,13 @@ def run_conversation(

                    if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
                        if len(messages) < original_len:
-                            agent._emit_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
+                            agent._buffer_status(f"🗜️ Compressed {original_len} → {len(messages)} messages, retrying...")
                        time.sleep(2)  # Brief pause between compression retries
                        restart_with_compressed_messages = True
                        break
                    else:
                        # Can't compress further and already at minimum tier
+                        agent._flush_status_buffer()
                        agent._vprint(f"{agent.log_prefix}❌ Context length exceeded and cannot compress further.", force=True)
                        agent._vprint(f"{agent.log_prefix}   💡 The conversation has accumulated too much content. Try /new to start fresh, or /compress to manually trigger compression.", force=True)
                        logger.error(f"{agent.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
@ -2929,7 +3061,10 @@ def run_conversation(
                if is_client_error:
                    # Try fallback before aborting — a different provider
                    # may not have the same issue (rate limit, auth, etc.)
-                    agent._emit_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._buffer_status("⚠️ Provider safety filter blocked this request — trying fallback...")
+                    else:
+                        agent._buffer_status(f"⚠️ Non-retryable error (HTTP {status_code}) — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
@ -2939,16 +3074,38 @@ def run_conversation(
                        agent._dump_api_request_debug(
                            api_kwargs, reason="non_retryable_client_error", error=api_error,
                        )
-                    agent._emit_status(
-                        f"❌ Non-retryable error (HTTP {status_code}): "
-                        f"{agent._summarize_api_error(api_error)}"
-                    )
+                    # Terminal — flush buffered context so the user sees
+                    # what was tried before the abort.
+                    agent._flush_status_buffer()
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._emit_status(
+                            f"❌ Provider safety filter blocked this request: "
+                            f"{agent._summarize_api_error(api_error)}"
+                        )
+                    else:
+                        agent._emit_status(
+                            f"❌ Non-retryable error (HTTP {status_code}): "
+                            f"{agent._summarize_api_error(api_error)}"
+                        )
                    agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
                    agent._vprint(f"{agent.log_prefix}   🔌 Provider: {_provider}  Model: {_model}", force=True)
                    agent._vprint(f"{agent.log_prefix}   🌐 Endpoint: {_base}", force=True)
                    # Actionable guidance for common auth errors
                    if classified.is_auth or classified.reason == FailoverReason.billing:
-                        if _provider in {"openai-codex", "xai-oauth", "nous"} and status_code == 401:
+                        if classified.reason == FailoverReason.billing and _print_billing_or_entitlement_guidance(
+                            agent,
+                            capability="model access",
+                            provider=_provider,
+                            base_url=str(_base),
+                            model=_model,
+                        ):
+                            pass
+                        elif _provider == "nous" and _print_nous_entitlement_guidance(
+                            agent,
+                            "Nous model access",
+                        ):
+                            pass
+                        elif _provider in {"openai-codex", "xai-oauth", "nous"} and status_code == 401:
                            if _provider == "openai-codex":
                                agent._vprint(f"{agent.log_prefix}   💡 Codex OAuth token was rejected (HTTP 401). Your token may have been", force=True)
                                agent._vprint(f"{agent.log_prefix}      refreshed by another client (Codex CLI, VS Code). To fix:", force=True)
@ -2976,6 +3133,28 @@ def run_conversation(
                                agent._vprint(f"{agent.log_prefix}      • Check credits: https://openrouter.ai/settings/credits", force=True)
                    else:
                        agent._vprint(f"{agent.log_prefix}   💡 This type of error won't be fixed by retrying.", force=True)
+                    # Content-policy blocks deserve their own actionable
+                    # guidance — neither "fix your API key" nor "retry won't
+                    # help" tells the user what to actually do. The provider
+                    # has refused this specific prompt, so the recovery is
+                    # either a rephrase or routing to a different model.
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        agent._vprint(
+                            f"{agent.log_prefix}   💡 The provider's safety filter rejected this specific prompt.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      • Try rephrasing the request, narrowing the context, or splitting into smaller steps.",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}      • Configure a fallback provider so future blocks route automatically:",
+                            force=True,
+                        )
+                        agent._vprint(
+                            f"{agent.log_prefix}        hermes fallback add   (interactive picker — same as `hermes model`)",
+                            force=True,
+                        )
                    logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
                    # Skip session persistence when the error is likely
                    # context-overflow related (status 400 + large session).
@ -2990,6 +3169,23 @@ def run_conversation(
                        )
                    else:
                        agent._persist_session(messages, conversation_history)
+                    if classified.reason == FailoverReason.content_policy_blocked:
+                        _summary = agent._summarize_api_error(api_error)
+                        _policy_response = (
+                            f"⚠️  The model provider's safety filter blocked this request "
+                            f"(not a Hermes/gateway failure).\n\n"
+                            f"Provider message: {_summary}\n\n"
+                            f"Try rephrasing the request, narrowing the context, or "
+                            f"adding a fallback provider with `hermes fallback add`."
+                        )
+                        return {
+                            "final_response": _policy_response,
+                            "messages": messages,
+                            "api_calls": api_call_count,
+                            "completed": False,
+                            "failed": True,
+                            "error": f"content_policy_blocked: {_summary}",
+                        }
                    return {
                        "final_response": None,
                        "messages": messages,
@ -3011,14 +3207,32 @@ def run_conversation(
                        retry_count = 0
                        continue
                    # Try fallback before giving up entirely
-                    agent._emit_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
+                    agent._buffer_status(f"⚠️ Max retries ({max_retries}) exhausted — trying fallback...")
                    if agent._try_activate_fallback():
                        retry_count = 0
                        compression_attempts = 0
                        primary_recovery_attempted = False
                        continue
+                    # Terminal — flush buffered retry/fallback trace.
+                    agent._flush_status_buffer()
                    _final_summary = agent._summarize_api_error(api_error)
-                    if is_rate_limited:
+                    _billing_guidance = ""
+                    if classified.reason == FailoverReason.billing:
+                        agent._emit_status(f"❌ Billing or credits exhausted — {_final_summary}")
+                        _billing_guidance = _billing_or_entitlement_message(
+                            capability="model access",
+                            provider=_provider,
+                            base_url=str(_base),
+                            model=_model,
+                        )
+                        _print_billing_or_entitlement_guidance(
+                            agent,
+                            capability="model access",
+                            provider=_provider,
+                            base_url=str(_base),
+                            model=_model,
+                        )
+                    elif is_rate_limited:
                        agent._emit_status(f"❌ Rate limited after {max_retries} retries — {_final_summary}")
                    else:
                        agent._emit_status(f"❌ API failed after {max_retries} retries — {_final_summary}")
@ -3063,7 +3277,12 @@ def run_conversation(
                            api_kwargs, reason="max_retries_exhausted", error=api_error,
                        )
                    agent._persist_session(messages, conversation_history)
-                    _final_response = f"API call failed after {max_retries} retries: {_final_summary}"
+                    if classified.reason == FailoverReason.billing:
+                        _final_response = f"Billing or credits exhausted: {_final_summary}"
+                        if _billing_guidance:
+                            _final_response += f"\n\n{_billing_guidance}"
+                    else:
+                        _final_response = f"API call failed after {max_retries} retries: {_final_summary}"
                    if _is_stream_drop:
                        _final_response += (
                            "\n\nThe provider's stream connection keeps "
@ -3095,9 +3314,9 @@ def run_conversation(
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
                if is_rate_limited:
-                    agent._emit_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
+                    agent._buffer_status(f"⏱️ Rate limited. Waiting {wait_time:.1f}s (attempt {retry_count + 1}/{max_retries})...")
                else:
-                    agent._emit_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
+                    agent._buffer_status(f"⏳ Retrying in {wait_time:.1f}s (attempt {retry_count}/{max_retries})...")
                logger.warning(
                    "Retrying API call in %ss (attempt %s/%s) %s error=%s",
                    wait_time,
@ -3256,14 +3475,15 @@ def run_conversation(
            if has_incomplete_scratchpad(assistant_message.content or ""):
                agent._incomplete_scratchpad_retries += 1
                
-                agent._vprint(f"{agent.log_prefix}⚠️  Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
+                agent._buffer_vprint(f"⚠️  Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
                
                if agent._incomplete_scratchpad_retries <= 2:
-                    agent._vprint(f"{agent.log_prefix}🔄 Retrying API call ({agent._incomplete_scratchpad_retries}/2)...")
+                    agent._buffer_vprint(f"🔄 Retrying API call ({agent._incomplete_scratchpad_retries}/2)...")
                    # Don't add the broken message, just retry
                    continue
                else:
                    # Max retries - discard this turn and save as partial
+                    agent._flush_status_buffer()
                    agent._vprint(f"{agent.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
                    agent._incomplete_scratchpad_retries = 0
                    
@ -3371,9 +3591,10 @@ def run_conversation(
                    available = ", ".join(sorted(agent.valid_tool_names))
                    invalid_name = invalid_tool_calls[0]
                    invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name
-                    agent._vprint(f"{agent.log_prefix}⚠️  Unknown tool '{invalid_preview}' — sending error to model for agent-correction ({agent._invalid_tool_retries}/3)")
+                    agent._buffer_vprint(f"⚠️  Unknown tool '{invalid_preview}' — sending error to model for agent-correction ({agent._invalid_tool_retries}/3)")

                    if agent._invalid_tool_retries >= 3:
+                        agent._flush_status_buffer()
                        agent._vprint(f"{agent.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
                        agent._invalid_tool_retries = 0
                        agent._persist_session(messages, conversation_history)
@ -3457,16 +3678,16 @@ def run_conversation(
                    agent._invalid_json_retries += 1

                    tool_name, error_msg = invalid_json_args[0]
-                    agent._vprint(f"{agent.log_prefix}⚠️  Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")
+                    agent._buffer_vprint(f"⚠️  Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")

                    if agent._invalid_json_retries < 3:
-                        agent._vprint(f"{agent.log_prefix}🔄 Retrying API call ({agent._invalid_json_retries}/3)...")
+                        agent._buffer_vprint(f"🔄 Retrying API call ({agent._invalid_json_retries}/3)...")
                        # Don't add anything to messages, just retry the API call
                        continue
                    else:
                        # Instead of returning partial, inject tool error results so the model can recover.
                        # Using tool results (not user messages) preserves role alternation.
-                        agent._vprint(f"{agent.log_prefix}⚠️  Injecting recovery tool results for invalid JSON...")
+                        agent._buffer_vprint(f"⚠️  Injecting recovery tool results for invalid JSON...")
                        agent._invalid_json_retries = 0  # Reset for next attempt
                        
                        # Append the assistant message with its (broken) tool_calls
@ -3774,7 +3995,7 @@ def run_conversation(
                            "Empty response after tool calls — nudging model "
                            "to continue processing"
                        )
-                        agent._emit_status(
+                        agent._buffer_status(
                            "⚠️ Model returned empty after tool calls — "
                            "nudging to continue"
                        )
@ -3820,7 +4041,7 @@ def run_conversation(
                            "prefilling to continue (%d/2)",
                            agent._thinking_prefill_retries,
                        )
-                        agent._emit_status(
+                        agent._buffer_status(
                            f"↻ Thinking-only response — prefilling to continue "
                            f"({agent._thinking_prefill_retries}/2)"
                        )
@ -3855,7 +4076,7 @@ def run_conversation(
                            "retry %d/3 (model=%s)",
                            agent._empty_content_retries, agent.model,
                        )
-                        agent._emit_status(
+                        agent._buffer_status(
                            f"⚠️ Empty response from model — retrying "
                            f"({agent._empty_content_retries}/3)"
                        )
@ -3874,13 +4095,13 @@ def run_conversation(
                            agent._empty_content_retries, agent.model,
                            agent.provider,
                        )
-                        agent._emit_status(
+                        agent._buffer_status(
                            "⚠️ Model returning empty responses — "
                            "switching to fallback provider..."
                        )
                        if agent._try_activate_fallback():
                            agent._empty_content_retries = 0
-                            agent._emit_status(
+                            agent._buffer_status(
                                f"↻ Switched to fallback: {agent.model} "
                                f"({agent.provider})"
                            )
@ -3894,6 +4115,9 @@ def run_conversation(
                    # Exhausted retries and fallback chain (or no
                    # fallback configured).  Fall through to the
                    # "(empty)" terminal.
+                    # Surface the buffered retry/fallback trace so the
+                    # user can see what was attempted before "(empty)".
+                    agent._flush_status_buffer()
                    _turn_exit_reason = "empty_response_exhausted"
                    reasoning_text = agent._extract_reasoning(assistant_message)
                    agent._drop_trailing_empty_response_scaffolding(messages)
@ -3938,6 +4162,9 @@ def run_conversation(
                # Reset retry counter/signature on successful content
                agent._empty_content_retries = 0
                agent._thinking_prefill_retries = 0
+                # Successful content reached — drop any buffered retry
+                # status from earlier failed attempts in this turn.
+                agent._clear_status_buffer()

                if (
                    agent.api_mode == "codex_responses"
@ -4074,36 +4301,54 @@ def run_conversation(
            )
        final_response = agent._handle_max_iterations(messages, api_call_count)

-        # If running as a kanban worker, block the task so the dispatcher
-        # knows the worker could not complete (rather than treating it as a
+        # If running as a kanban worker, signal the dispatcher that the
+        # worker could not complete (rather than treating it as a
        # protocol violation).  The agent loop strips tools before calling
        # _handle_max_iterations, so the model cannot call kanban_block
        # itself — we must do it on its behalf.
+        #
+        # We route through ``_record_task_failure(outcome="timed_out")``
+        # rather than ``kanban_block`` so this counts toward the
+        # ``consecutive_failures`` counter and the dispatcher's
+        # ``failure_limit`` circuit breaker (#29747 gap 2).  Without this,
+        # a task whose worker keeps exhausting its budget would block
+        # silently each run, get auto-promoted by the operator (or never
+        # surface), and re-block in an endless loop with no signal.
        _kanban_task = os.environ.get("HERMES_KANBAN_TASK")
        if _kanban_task:
            try:
-                _ra().handle_function_call(
-                    "kanban_block",
-                    {
-                        "task_id": _kanban_task,
-                        "reason": (
+                from hermes_cli import kanban_db as _kb
+                _conn = _kb.connect()
+                try:
+                    _kb._record_task_failure(
+                        _conn,
+                        _kanban_task,
+                        error=(
                            f"Iteration budget exhausted "
                            f"({api_call_count}/{agent.max_iterations}) — "
                            "task could not complete within the allowed "
                            "iterations"
                        ),
-                    },
-                    task_id=effective_task_id,
-                )
-                logger.info(
-                    "kanban_block called for task %s after iteration "
-                    "exhaustion (%d/%d)",
-                    _kanban_task, api_call_count, agent.max_iterations,
-                )
+                        outcome="timed_out",
+                        release_claim=True,
+                        end_run=True,
+                        event_payload_extra={
+                            "budget_used": api_call_count,
+                            "budget_max": agent.max_iterations,
+                        },
+                    )
+                    logger.info(
+                        "recorded budget-exhausted failure for task %s (%d/%d)",
+                        _kanban_task, api_call_count, agent.max_iterations,
+                    )
+                finally:
+                    try:
+                        _conn.close()
+                    except Exception:
+                        pass
            except Exception:
                logger.warning(
-                    "Failed to call kanban_block after iteration "
-                    "exhaustion for task %s",
+                    "Failed to record budget-exhausted failure for task %s",
                    _kanban_task,
                    exc_info=True,
                )
@ -4321,6 +4566,7 @@ def run_conversation(
        original_user_message=original_user_message,
        final_response=final_response,
        interrupted=interrupted,
+        messages=messages,
    )

    # Background memory/skill review — runs AFTER the response is delivered
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@ -14,7 +14,7 @@ from datetime import datetime, timezone
 from typing import Any, Dict, List, Optional, Set, Tuple

 from hermes_constants import OPENROUTER_BASE_URL
-from hermes_cli.config import get_env_value, load_env
+from hermes_cli.config import load_env
 from agent.credential_persistence import (
    is_borrowed_credential_source,
    sanitize_borrowed_credential_payload,
@ -22,7 +22,6 @@ from agent.credential_persistence import (
 import hermes_cli.auth as auth_mod
 from hermes_cli.auth import (
    CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
-    DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
    PROVIDER_REGISTRY,
    _auth_store_lock,
    _codex_access_token_is_expiring,
@ -55,6 +54,38 @@ def _load_config_safe() -> Optional[dict]:

 STATUS_OK = "ok"
 STATUS_EXHAUSTED = "exhausted"
+# Terminal failure — the credential will never recover on its own.  Used for
+# upstream-permanent OAuth states like ``token_invalidated`` / ``token_revoked``
+# where retrying after a TTL cooldown is guaranteed to fail.  ``DEAD`` entries
+# are excluded from rotation unconditionally and only clear when an explicit
+# write-side sync (e.g. ``_save_codex_tokens`` after a fresh device-code
+# login) rewrites the tokens.
+STATUS_DEAD = "dead"
+
+# OAuth error reasons that indicate the credential is permanently invalid
+# server-side and cannot be recovered by retry/refresh.  Sourced from
+# OpenAI Codex Responses API, Anthropic, xAI, and Google OAuth spec.
+_TERMINAL_AUTH_REASONS = frozenset({
+    "token_invalidated",   # OpenAI Codex: "Your authentication token has been invalidated."
+    "token_revoked",        # OAuth 2.0 RFC 7009: token explicitly revoked
+    "invalid_token",        # RFC 6750: bearer token is malformed/expired/revoked
+    "invalid_grant",        # RFC 6749: refresh_token rejected during refresh
+    "unauthorized_client",  # RFC 6749: client no longer authorized
+    "refresh_token_reused", # Single-use refresh token consumed by another process
+})
+
+# How long a DEAD manual credential is preserved before being pruned.
+# Manual entries (``manual:*``) are independent credentials with no singleton
+# to re-seed from, so pruning them after a quiet window cleans up dead state
+# without losing recoverability — the user always has the option to re-add
+# via ``hermes auth add``.
+#
+# Singleton-seeded entries (``device_code``, ``loopback_pkce``, ``claude_code``)
+# are NOT pruned because ``_seed_from_singletons`` would just re-create them
+# on the next ``load_pool()`` with the same stale singleton tokens, defeating
+# the cleanup.  They remain in the pool marked DEAD until an explicit re-auth
+# write-side sync (``_save_codex_tokens`` etc.) clears the status.
+DEAD_MANUAL_PRUNE_TTL_SECONDS = 24 * 60 * 60  # 24 hours

 AUTH_TYPE_OAUTH = "oauth"
 AUTH_TYPE_API_KEY = "api_key"
@ -171,8 +202,22 @@ class PooledCredential:
    def runtime_api_key(self) -> str:
        if self.provider == "nous":
            # Nous stores the runtime inference credential in agent_key for
-            # compatibility. It may be a NAS invoke JWT or legacy opaque key.
-            return str(self.agent_key or self.access_token or "")
+            # compatibility. It must be a NAS invoke JWT.
+            for token, expires_at in (
+                (self.agent_key, self.agent_key_expires_at),
+                (self.access_token, self.expires_at),
+            ):
+                if (
+                    isinstance(token, str)
+                    and token.strip()
+                    and auth_mod._nous_invoke_jwt_is_usable(
+                        token,
+                        scope=getattr(self, "scope", None),
+                        expires_at=expires_at,
+                    )
+                ):
+                    return token.strip()
+            return ""
        return str(self.access_token or "")

    @property
@ -438,6 +483,29 @@ class CredentialPool:
            [entry.to_dict() for entry in self._entries],
        )

+    def _is_terminal_auth_failure(
+        self,
+        status_code: Optional[int],
+        normalized_error: Dict[str, Any],
+    ) -> bool:
+        """Detect upstream-permanent OAuth failures that won't recover on TTL.
+
+        Only fires for 401 responses whose error code/reason matches a known
+        terminal OAuth state (token_invalidated, token_revoked, invalid_grant,
+        etc.).  Distinguishes permanent failures from transient ones like
+        token_expired (refreshable) or generic 401 without a specific reason
+        (could be a server-side glitch worth retrying).
+
+        Returns False for non-401 status codes — 429 rate limits and 402
+        billing failures are transient by nature and should keep TTL semantics.
+        """
+        if status_code != 401:
+            return False
+        reason = normalized_error.get("reason")
+        if not isinstance(reason, str):
+            return False
+        return reason.strip().lower() in _TERMINAL_AUTH_REASONS
+
    def _mark_exhausted(
        self,
        entry: PooledCredential,
@ -445,9 +513,20 @@ class CredentialPool:
        error_context: Optional[Dict[str, Any]] = None,
    ) -> PooledCredential:
        normalized_error = _normalize_error_context(error_context)
+        # Permanent OAuth failures (token_invalidated, token_revoked, etc.)
+        # transition to STATUS_DEAD instead of STATUS_EXHAUSTED.  Without this,
+        # a revoked credential gets a 1-hour TTL cooldown and then re-enters
+        # rotation, failing immediately every hour until the user manually
+        # removes it (issue #32849).  DEAD entries are excluded from rotation
+        # unconditionally and only clear via an explicit re-auth write-side
+        # sync (``_save_codex_tokens`` after a fresh device-code login).
+        if self._is_terminal_auth_failure(status_code, normalized_error):
+            terminal_status = STATUS_DEAD
+        else:
+            terminal_status = STATUS_EXHAUSTED
        updated = replace(
            entry,
-            last_status=STATUS_EXHAUSTED,
+            last_status=terminal_status,
            last_status_at=time.time(),
            last_error_code=status_code,
            last_error_reason=normalized_error.get("reason"),
@ -852,12 +931,7 @@ class CredentialPool:
                if synced is not entry:
                    entry = synced
                auth_mod.resolve_nous_runtime_credentials(
-                    min_key_ttl_seconds=DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
-                    inference_auth_mode=(
-                        auth_mod.NOUS_INFERENCE_AUTH_MODE_LEGACY
-                        if force
-                        else auth_mod.NOUS_INFERENCE_AUTH_MODE_AUTO
-                    ),
+                    force_refresh=force,
                )
                updated = self._sync_nous_entry_from_auth_store(entry)
            else:
@ -1139,7 +1213,7 @@ class CredentialPool:
                auth_mod.XAI_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
            )
        if self.provider == "nous":
-            # Nous refresh/mint can require network access and should happen when
+            # Nous refresh can require network access and should happen when
            # runtime credentials are actually resolved, not merely when the pool
            # is enumerated for listing, migration, or selection.
            return False
@ -1158,13 +1232,14 @@ class CredentialPool:
        """
        now = time.time()
        cleared_any = False
+        entries_to_prune: List[str] = []
        available: List[PooledCredential] = []
        for entry in self._entries:
            # For anthropic claude_code entries, sync from the credentials file
            # before any status/refresh checks. This picks up tokens refreshed
            # by other processes (Claude Code CLI, other Hermes profiles).
            if (self.provider == "anthropic" and entry.source == "claude_code"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_anthropic_entry_from_credentials_file(entry)
                if synced is not entry:
                    entry = synced
@ -1175,7 +1250,7 @@ class CredentialPool:
            # exhausted status stale.
            if (self.provider == "nous"
                    and entry.source == "device_code"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_nous_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
@ -1187,7 +1262,7 @@ class CredentialPool:
            # future for ChatGPT weekly windows).
            if (self.provider == "openai-codex"
                    and entry.source == "device_code"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_codex_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
@ -1198,11 +1273,41 @@ class CredentialPool:
            # xAI Grok OAuth login) has since rotated in auth.json.
            if (self.provider == "xai-oauth"
                    and entry.source == "loopback_pkce"
-                    and entry.last_status == STATUS_EXHAUSTED):
+                    and entry.last_status in {STATUS_EXHAUSTED, STATUS_DEAD}):
                synced = self._sync_xai_oauth_entry_from_auth_store(entry)
                if synced is not entry:
                    entry = synced
                    cleared_any = True
+            if entry.last_status == STATUS_DEAD:
+                # Manual DEAD credentials get pruned after a 24h quiet window
+                # so the pool doesn't accumulate dead entries forever.  The
+                # user can always re-add via ``hermes auth add``.  Singleton-
+                # seeded DEAD entries are kept so the audit trail (label,
+                # last_error_reason, timestamps) stays visible — pruning them
+                # would just be undone by ``_seed_from_singletons`` on the
+                # next load anyway.
+                if _is_manual_source(entry.source):
+                    dead_at = entry.last_status_at or 0
+                    if dead_at and now - dead_at > DEAD_MANUAL_PRUNE_TTL_SECONDS:
+                        _label = entry.label or entry.id[:8]
+                        logger.warning(
+                            "credential pool: pruning DEAD manual entry %s "
+                            "(reason=%s, age=%.1fh) — re-add via `hermes auth add %s`",
+                            _label,
+                            entry.last_error_reason or "unknown",
+                            (now - dead_at) / 3600.0,
+                            self.provider,
+                        )
+                        # Mark for removal after the loop completes; we can't
+                        # mutate self._entries while iterating.
+                        entries_to_prune.append(entry.id)
+                        cleared_any = True
+                # Permanently failed credentials never re-enter rotation via
+                # TTL.  They only clear when a write-side re-auth sync rewrites
+                # the tokens (e.g. ``_save_codex_tokens`` after a fresh
+                # device-code login).  The auth.json-sync paths below handle
+                # the re-auth case for OAuth singletons.
+                continue
            if entry.last_status == STATUS_EXHAUSTED:
                exhausted_until = _exhausted_until(entry)
                if exhausted_until is not None and now < exhausted_until:
@ -1226,6 +1331,9 @@ class CredentialPool:
                    continue
                entry = refreshed
            available.append(entry)
+        if entries_to_prune:
+            pruned_ids = set(entries_to_prune)
+            self._entries = [e for e in self._entries if e.id not in pruned_ids]
        if cleared_any:
            self._persist()
        return available
@ -1293,11 +1401,22 @@ class CredentialPool:
            if entry is None:
                return None
            _label = entry.label or entry.id[:8]
-            logger.info(
-                "credential pool: marking %s exhausted (status=%s), rotating",
-                _label, status_code,
-            )
            self._mark_exhausted(entry, status_code, error_context)
+            # Re-read the updated entry to log the correct terminal state.
+            updated_entry = next(
+                (e for e in self._entries if e.id == entry.id), entry,
+            )
+            if updated_entry.last_status == STATUS_DEAD:
+                logger.warning(
+                    "credential pool: marking %s DEAD (status=%s, reason=%s) — "
+                    "permanently failed, will NOT re-enter rotation until re-auth",
+                    _label, status_code, updated_entry.last_error_reason or "unknown",
+                )
+            else:
+                logger.info(
+                    "credential pool: marking %s exhausted (status=%s), rotating",
+                    _label, status_code,
+                )
            self._current_id = None
            next_entry = self._select_unlocked()
            if next_entry:
@ -1637,9 +1756,9 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
                    "inference_base_url": state.get("inference_base_url"),
                    "agent_key": state.get("agent_key"),
                    "agent_key_expires_at": state.get("agent_key_expires_at"),
-                    # Carry the mint/refresh timestamps into the pool so
+                    # Carry the refresh timestamps into the pool so
                    # freshness-sensitive consumers (self-heal hooks, pool
-                    # pruning by age) can distinguish just-minted credentials
+                    # pruning by age) can distinguish just-refreshed credentials
                    # from stale ones.  Without these, fresh device_code
                    # entries get obtained_at=None and look older than they
                    # are (#15099).
--- a/agent/curator_backup.py
+++ b/agent/curator_backup.py
@ -39,12 +39,9 @@ from __future__ import annotations

 import json
 import logging
-import os
 import re
 import shutil
 import tarfile
-import tempfile
-import time
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
--- a/agent/display.py
+++ b/agent/display.py
@ -904,10 +904,6 @@ def get_cute_tool_message(
            extra = f" +{len(urls)-1}" if len(urls) > 1 else ""
            return _wrap(f"┊ 📄 fetch     {_trunc(domain, 35)}{extra}  {dur}")
        return _wrap(f"┊ 📄 fetch     pages  {dur}")
-    if tool_name == "web_crawl":
-        url = args.get("url", "")
-        domain = url.replace("https://", "").replace("http://", "").split("/")[0]
-        return _wrap(f"┊ 🕸️  crawl     {_trunc(domain, 35)}  {dur}")
    if tool_name == "terminal":
        return _wrap(f"┊ 💻 $         {_trunc(args.get('command', ''), 42)}  {dur}")
    if tool_name == "process":
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@ -44,9 +44,10 @@ class FailoverReason(enum.Enum):
    payload_too_large = "payload_too_large"  # 413 — compress payload
    image_too_large = "image_too_large"   # Native image part exceeds provider's per-image limit — shrink and retry

-    # Model
+    # Model / provider policy
    model_not_found = "model_not_found"  # 404 or invalid model — fallback to different model
    provider_policy_blocked = "provider_policy_blocked"  # Aggregator (e.g. OpenRouter) blocked the only endpoint due to account data/privacy policy
+    content_policy_blocked = "content_policy_blocked"  # Provider safety filter rejected this prompt — deterministic per-request, don't retry unchanged

    # Request format
    format_error = "format_error"        # 400 bad request — abort or strip + retry
@ -97,13 +98,20 @@ _BILLING_PATTERNS = [
    "insufficient_quota",
    "insufficient balance",
    "credit balance",
+    "credits exhausted",
    "credits have been exhausted",
+    "no usable credits",
    "top up your credits",
    "payment required",
    "billing hard limit",
    "exceeded your current quota",
    "account is deactivated",
    "plan does not include",
+    "out of funds",
+    "run out of funds",
+    "balance_depleted",
+    "model_not_supported_on_free_tier",
+    "not available on the free tier",
 ]

 # Patterns that indicate rate limiting (transient, will resolve)
@ -282,6 +290,45 @@ _PROVIDER_POLICY_BLOCKED_PATTERNS = [
    "no endpoints found matching your data policy",
 ]

+# Provider content-policy / safety-filter blocks. Distinct from
+# ``provider_policy_blocked`` above (which is an OpenRouter *account*-level
+# data/privacy guardrail) — these are *per-prompt* safety decisions made by
+# the upstream model provider. They are deterministic for the unchanged
+# request, so retrying the same prompt three times just reproduces the same
+# block and burns paid attempts on a refusal. The recovery is to switch to a
+# configured fallback model/provider immediately, or surface the block to
+# the user with actionable guidance if no fallback exists.
+#
+# Patterns are intentionally narrow — each phrase is a verbatim string from
+# a specific provider's safety pipeline, not a generic word like "policy" or
+# "violation" that could collide with billing/auth/format errors:
+#   • OpenAI Codex cybersecurity refusal (gpt-5.5, the case from #18028)
+#   • OpenAI moderation refusal ("violates our usage policies", with
+#     "usage policies" disambiguating from billing's "exceeded ... policy")
+#   • Anthropic safety refusal ("prompt was flagged by ... safety system")
+#   • OpenAI Responses content filter
+_CONTENT_POLICY_BLOCKED_PATTERNS = [
+    # OpenAI Codex (#18028) — message may arrive without an HTTP status
+    "flagged for possible cybersecurity risk",
+    "trusted access for cyber",
+    # OpenAI moderation — chat completions / responses
+    "violates our usage policies",
+    "violates openai's usage policies",
+    "your request was flagged by",
+    # Anthropic safety system
+    "prompt was flagged by our safety",
+    "responses cannot be generated due to safety",
+    # Generic content-filter wording seen on Azure / OpenAI Responses.
+    # ``content_filter`` (underscore) is the OpenAI-standard error/finish
+    # token surfaced verbatim by their SDKs when a request is blocked.
+    # ``responsibleaipolicyviolation`` is Azure OpenAI's error code.
+    # Deliberately NOT matching the space variant ("content filter") — it
+    # appears in benign config descriptions and tooltip text that providers
+    # echo back; the underscore form is provider-specific enough.
+    "content_filter",
+    "responsibleaipolicyviolation",
+]
+
 # Auth patterns (non-status-code signals)
 _AUTH_PATTERNS = [
    "invalid api key",
@ -485,6 +532,20 @@ def classify_api_error(

    # ── 1. Provider-specific patterns (highest priority) ────────────

+    # Provider content-policy / safety-filter block. The provider has made a
+    # deterministic refusal decision about THIS prompt — retrying unchanged
+    # just reproduces the same refusal and burns paid attempts. Must run
+    # before status-based classification so a 400 safety block isn't
+    # downgraded to a generic ``format_error`` and a status-less block
+    # (OpenAI Codex SDK can raise without one) isn't left in the retryable
+    # ``unknown`` bucket. See issue #18028.
+    if any(p in error_msg for p in _CONTENT_POLICY_BLOCKED_PATTERNS):
+        return _result(
+            FailoverReason.content_policy_blocked,
+            retryable=False,
+            should_fallback=True,
+        )
+
    # Anthropic thinking block signature invalid (400).
    # Don't gate on provider — OpenRouter proxies Anthropic errors, so the
    # provider may be "openrouter" even though the error is Anthropic-specific.
@ -690,8 +751,13 @@ def _classify_by_status(
        )

    if status_code == 403:
-        # OpenRouter 403 "key limit exceeded" is actually billing
-        if "key limit exceeded" in error_msg or "spending limit" in error_msg:
+        # OpenRouter 403 "key limit exceeded" is actually billing. Other
+        # providers also use 403 for account-plan or credit exhaustion.
+        if (
+            "key limit exceeded" in error_msg
+            or "spending limit" in error_msg
+            or any(p in error_msg for p in _BILLING_PATTERNS)
+        ):
            return result_fn(
                FailoverReason.billing,
                retryable=False,
@ -708,6 +774,17 @@ def _classify_by_status(
        return _classify_402(error_msg, result_fn)

    if status_code == 404:
+        # Nous API currently surfaces HA/NAS credit depletion as a paid model
+        # becoming unavailable on the Free Tier, returned as 404 rather than
+        # 402. Treat that as entitlement/billing exhaustion, not a missing
+        # model, so the retry loop can show credit/top-up guidance.
+        if any(p in error_msg for p in _BILLING_PATTERNS):
+            return result_fn(
+                FailoverReason.billing,
+                retryable=False,
+                should_rotate_credential=True,
+                should_fallback=True,
+            )
        # OpenRouter policy-block 404 — distinct from "model not found".
        # The model exists; the user's account privacy setting excludes the
        # only endpoint serving it. Falling back to another provider won't
@ -973,7 +1050,15 @@ def _classify_by_error_code(
            should_rotate_credential=True,
        )

-    if code_lower in {"insufficient_quota", "billing_not_active", "payment_required"}:
+    if code_lower in {
+        "insufficient_quota",
+        "billing_not_active",
+        "payment_required",
+        "insufficient_credits",
+        "no_usable_credits",
+        "balance_depleted",
+        "model_not_supported_on_free_tier",
+    }:
        return result_fn(
            FailoverReason.billing,
            retryable=False,
--- a/agent/file_safety.py
+++ b/agent/file_safety.py
@ -249,6 +249,10 @@ def get_read_block_error(path: str) -> Optional[str]:
        ".env",
        "webhook_subscriptions.json",
        os.path.join("auth", "google_oauth.json"),
+        # Bitwarden Secrets Manager disk cache: stores plaintext secret values
+        # to avoid re-fetching across back-to-back CLI invocations. The file
+        # was introduced by #31968 but not added to this guard.
+        os.path.join("cache", "bws_cache.json"),
    )
    for hd in hermes_dirs:
        for name in credential_file_names:
--- a/agent/google_code_assist.py
+++ b/agent/google_code_assist.py
@ -31,7 +31,6 @@ import json
 import logging
 import time
 import urllib.error
-import urllib.parse
 import urllib.request
 import uuid
 from dataclasses import dataclass, field
--- a/agent/google_oauth.py
+++ b/agent/google_oauth.py
@ -899,7 +899,15 @@ def start_oauth_flow(
        try:
            import webbrowser

-            webbrowser.open(auth_url, new=1, autoraise=True)
+            try:
+                from hermes_cli.auth import (
+                    _can_open_graphical_browser as _can_open_gui,
+                )
+            except Exception:
+                _can_open_gui = lambda: True  # noqa: E731
+
+            if _can_open_gui():
+                webbrowser.open(auth_url, new=1, autoraise=True)
        except Exception as exc:
            logger.debug("webbrowser.open failed: %s", exc)

--- a/agent/image_routing.py
+++ b/agent/image_routing.py
@ -37,6 +37,8 @@ from __future__ import annotations
 import base64
 import logging
 import mimetypes
+import os
+import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple

@ -46,6 +48,102 @@ logger = logging.getLogger(__name__)
 _VALID_MODES = frozenset({"auto", "native", "text"})


+# Image extensions used by extract_image_refs(). Kept tight on purpose — we
+# only auto-attach things the model can actually see. Documents/archives are
+# excluded because the gateway's broader extract_local_files() also routes
+# them differently (send_document), and we don't want to attach a PDF as a
+# vision part.
+_IMAGE_EXTS = (
+    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".heic",
+)
+_IMAGE_EXT_PATTERN = "|".join(e.lstrip(".") for e in _IMAGE_EXTS)
+
+# Absolute / home-relative local image path. Matches the same shape gateway's
+# extract_local_files() uses: anchors to ``~/`` or ``/``, ignores matches inside
+# URLs (the ``(?<![/:\w.])`` lookbehind), and case-insensitive on the extension.
+_LOCAL_IMAGE_PATH_RE = re.compile(
+    r"(?<![/:\w.])(?:~/|/)(?:[\w.\-]+/)*[\w.\-]+\.(?:" + _IMAGE_EXT_PATTERN + r")\b",
+    re.IGNORECASE,
+)
+
+# http(s) URL ending in an image extension (optionally followed by a
+# query string). Case-insensitive on the extension. Strict ``http(s)://``
+# scheme so we don't accidentally grab ``file://`` URLs or other shapes.
+_IMAGE_URL_RE = re.compile(
+    r"https?://[^\s<>\"']+?\.(?:" + _IMAGE_EXT_PATTERN + r")(?:\?[^\s<>\"']*)?",
+    re.IGNORECASE,
+)
+
+
+def extract_image_refs(text: str) -> Tuple[List[str], List[str]]:
+    """Scan free-form text for image references the model should see.
+
+    Returns ``(local_paths, urls)``:
+
+      * ``local_paths`` — absolute (``/``) or home-relative (``~/``) paths
+        whose suffix is an image extension AND whose expanded form exists
+        on disk as a file. Order-preserving, deduplicated.
+      * ``urls`` — ``http(s)://…`` URLs whose path ends in an image
+        extension (a ``?query`` is allowed after the extension).
+        Order-preserving, deduplicated.
+
+    Matches inside fenced code blocks (``` ``` ```) and inline backticks
+    (`` `…` ``) are skipped so that snippets pasted into a task body for
+    reference aren't mistaken for live attachments. This mirrors the
+    behaviour of ``gateway.platforms.base.BaseAdapter.extract_local_files``.
+
+    Local paths are validated against the filesystem; URLs are not
+    (the provider fetches them at request time).
+    """
+    if not isinstance(text, str) or not text:
+        return [], []
+
+    # Build spans covered by fenced code blocks and inline code so we can
+    # ignore references the author embedded purely as example text.
+    code_spans: list[tuple[int, int]] = []
+    for m in re.finditer(r"```[^\n]*\n.*?```", text, re.DOTALL):
+        code_spans.append((m.start(), m.end()))
+    for m in re.finditer(r"`[^`\n]+`", text):
+        code_spans.append((m.start(), m.end()))
+
+    def _in_code(pos: int) -> bool:
+        return any(s <= pos < e for s, e in code_spans)
+
+    local_paths: list[str] = []
+    seen_paths: set[str] = set()
+    for match in _LOCAL_IMAGE_PATH_RE.finditer(text):
+        if _in_code(match.start()):
+            continue
+        raw = match.group(0)
+        expanded = os.path.expanduser(raw)
+        try:
+            if not os.path.isfile(expanded):
+                continue
+        except OSError:
+            # ENAMETOOLONG / EINVAL on pathological inputs — skip rather than crash.
+            continue
+        if expanded in seen_paths:
+            continue
+        seen_paths.add(expanded)
+        local_paths.append(expanded)
+
+    urls: list[str] = []
+    seen_urls: set[str] = set()
+    for match in _IMAGE_URL_RE.finditer(text):
+        if _in_code(match.start()):
+            continue
+        url = match.group(0)
+        # Strip trailing punctuation that's almost certainly prose, not part
+        # of the URL (e.g. "see https://x.com/a.png." or "/a.png)").
+        url = url.rstrip(".,;:!?)]>")
+        if url in seen_urls:
+            continue
+        seen_urls.add(url)
+        urls.append(url)
+
+    return local_paths, urls
+
+
 # Strict YAML/JSON boolean coercion for capability overrides.
 #
 # ``bool("false")`` is True in Python because non-empty strings are truthy, so
@ -320,20 +418,29 @@ def _file_to_data_url(path: Path) -> Optional[str]:
 def build_native_content_parts(
    user_text: str,
    image_paths: List[str],
+    image_urls: Optional[List[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Build an OpenAI-style ``content`` list for a user turn.

    Shape:
      [{"type": "text", "text": "...\\n\\n[Image attached at: /local/path]"},
       {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
+       {"type": "image_url", "image_url": {"url": "https://example.com/a.png"}},
       ...]

-    The local path of each successfully attached image is appended to the
-    text part as ``[Image attached at: <path>]``. The model still sees the
-    pixels via the ``image_url`` part (full native vision); the path note
-    just gives it a string handle so MCP/skill tools that take an image
-    path or URL argument can be invoked on the same image without an
-    extra round-trip. This parallels the text-mode hint produced by
+    Local paths are read from disk and embedded as base64 ``data:`` URLs.
+    Remote URLs (``http(s)://``) are passed through verbatim — the provider
+    fetches them server-side. The model still sees the pixels either way.
+
+    For each successfully attached image, a hint is appended to the text
+    part:
+
+      * local path → ``[Image attached at: <path>]``
+      * URL        → ``[Image attached: <url>]``
+
+    The hint gives the model a string handle so MCP/skill tools that take
+    an image path or URL argument can be invoked on the same image without
+    an extra round-trip. This parallels the text-mode hint produced by
    ``Runner._enrich_message_with_vision`` (``vision_analyze using image_url:
    <path>``) so behaviour is consistent across both image input modes.

@ -342,12 +449,14 @@ def build_native_content_parts(
    ceiling), the agent's retry loop transparently shrinks and retries
    once — see ``run_agent._try_shrink_image_parts_in_messages``.

-    Returns (content_parts, skipped_paths). Skipped paths are files that
-    couldn't be read from disk and are NOT advertised in the path hints.
+    Returns (content_parts, skipped). Skipped entries are local paths
+    that couldn't be read from disk; URLs are never skipped (they're
+    not validated here).
    """
    skipped: List[str] = []
    image_parts: List[Dict[str, Any]] = []
    attached_paths: List[str] = []
+    attached_urls: List[str] = []

    for raw_path in image_paths:
        p = Path(raw_path)
@ -364,16 +473,26 @@ def build_native_content_parts(
        })
        attached_paths.append(str(raw_path))

+    for url in image_urls or []:
+        url = (url or "").strip()
+        if not url:
+            continue
+        image_parts.append({
+            "type": "image_url",
+            "image_url": {"url": url},
+        })
+        attached_urls.append(url)
+
    text = (user_text or "").strip()

    # If at least one image attached, build a single text part that combines
-    # the user's caption (or a neutral default) with one path hint per image.
-    if attached_paths:
+    # the user's caption (or a neutral default) with one hint per image.
+    if attached_paths or attached_urls:
        base_text = text or "What do you see in this image?"
-        path_hints = "\n".join(
-            f"[Image attached at: {p}]" for p in attached_paths
-        )
-        combined_text = f"{base_text}\n\n{path_hints}"
+        hint_lines: List[str] = []
+        hint_lines.extend(f"[Image attached at: {p}]" for p in attached_paths)
+        hint_lines.extend(f"[Image attached: {u}]" for u in attached_urls)
+        combined_text = f"{base_text}\n\n" + "\n".join(hint_lines)
        parts: List[Dict[str, Any]] = [{"type": "text", "text": combined_text}]
        parts.extend(image_parts)
        return parts, skipped
@ -388,4 +507,5 @@ def build_native_content_parts(
 __all__ = [
    "decide_image_input_mode",
    "build_native_content_parts",
+    "extract_image_refs",
 ]
--- a/agent/jiter_preload.py
+++ b/agent/jiter_preload.py
@ -0,0 +1,39 @@
+"""Best-effort early import for the OpenAI SDK's native streaming parser.
+
+The OpenAI SDK imports ``jiter`` while constructing streaming chat-completion
+responses.  On some Windows installs the native extension can be imported
+directly from the Hermes venv, but the first import fails when it happens later
+inside the threaded streaming request path.  Loading it once during agent
+package import avoids that import-order failure while preserving the normal
+SDK error path for genuinely missing or broken installs.
+"""
+
+from __future__ import annotations
+
+import importlib
+
+_JITER_PRELOADED = False
+_JITER_PRELOAD_ERROR: Exception | None = None
+
+
+def preload_jiter_native_extension() -> bool:
+    """Import jiter's native extension early if it is available."""
+
+    global _JITER_PRELOADED, _JITER_PRELOAD_ERROR
+
+    if _JITER_PRELOADED:
+        return True
+
+    try:
+        importlib.import_module("jiter.jiter")
+        from jiter import from_json as _from_json  # noqa: F401
+    except Exception as exc:
+        _JITER_PRELOAD_ERROR = exc
+        return False
+
+    _JITER_PRELOADED = True
+    _JITER_PRELOAD_ERROR = None
+    return True
+
+
+preload_jiter_native_extension()
--- a/agent/lsp/cli.py
+++ b/agent/lsp/cli.py
@ -16,7 +16,6 @@ from __future__ import annotations

 import argparse
 import sys
-from typing import Optional


 def register_subparser(subparsers: argparse._SubParsersAction) -> None:
@ -249,7 +248,6 @@ def _cmd_restart() -> int:

 def _cmd_which(server_id: str) -> int:
    from agent.lsp.install import INSTALL_RECIPES, hermes_lsp_bin_dir
-    import os
    import shutil as _shutil

    recipe = INSTALL_RECIPES.get(server_id)
--- a/agent/lsp/manager.py
+++ b/agent/lsp/manager.py
@ -39,25 +39,20 @@ import logging
 import os
 import threading
 import time
-from concurrent.futures import Future as ConcurrentFuture
 from typing import Any, Callable, Dict, List, Optional, Tuple

 from agent.lsp import eventlog
 from agent.lsp.client import (
    DIAGNOSTICS_DOCUMENT_WAIT,
    LSPClient,
-    file_uri,
 )
 from agent.lsp.servers import (
    ServerContext,
-    ServerDef,
-    SpawnSpec,
    find_server_for_file,
    language_id_for,
 )
 from agent.lsp.workspace import (
    clear_cache,
-    is_inside_workspace,
    resolve_workspace_for_file,
 )

--- a/agent/lsp/servers.py
+++ b/agent/lsp/servers.py
@ -25,7 +25,7 @@ import shutil
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple

-from agent.lsp.workspace import nearest_root, normalize_path
+from agent.lsp.workspace import nearest_root

 logger = logging.getLogger("agent.lsp.servers")

--- a/agent/memory_manager.py
+++ b/agent/memory_manager.py
@ -368,11 +368,42 @@ class MemoryManager:

    # -- Sync ----------------------------------------------------------------

-    def sync_all(self, user_content: str, assistant_content: str, *, session_id: str = "") -> None:
+    @staticmethod
+    def _provider_sync_accepts_messages(provider: MemoryProvider) -> bool:
+        """Return whether sync_turn accepts a messages keyword."""
+        try:
+            signature = inspect.signature(provider.sync_turn)
+        except (TypeError, ValueError):
+            return True
+        params = list(signature.parameters.values())
+        if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params):
+            return True
+        return "messages" in signature.parameters
+
+    def sync_all(
+        self,
+        user_content: str,
+        assistant_content: str,
+        *,
+        session_id: str = "",
+        messages: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
        """Sync a completed turn to all providers."""
        for provider in self._providers:
            try:
-                provider.sync_turn(user_content, assistant_content, session_id=session_id)
+                if messages is not None and self._provider_sync_accepts_messages(provider):
+                    provider.sync_turn(
+                        user_content,
+                        assistant_content,
+                        session_id=session_id,
+                        messages=messages,
+                    )
+                else:
+                    provider.sync_turn(
+                        user_content,
+                        assistant_content,
+                        session_id=session_id,
+                    )
            except Exception as e:
                logger.warning(
                    "Memory provider '%s' sync_turn failed: %s",
--- a/agent/memory_provider.py
+++ b/agent/memory_provider.py
@ -112,11 +112,22 @@ class MemoryProvider(ABC):
        that do background prefetching should override this.
        """

-    def sync_turn(self, user_content: str, assistant_content: str, *, session_id: str = "") -> None:
+    def sync_turn(
+        self,
+        user_content: str,
+        assistant_content: str,
+        *,
+        session_id: str = "",
+        messages: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
        """Persist a completed turn to the backend.

        Called after each turn. Should be non-blocking — queue for
        background processing if the backend has latency.
+
+        ``messages`` is the OpenAI-style conversation message list as of the
+        completed turn, including any assistant tool calls and tool results.
+        Providers that do not need raw turn context can ignore it.
        """

    @abstractmethod
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -141,6 +141,8 @@ DEFAULT_CONTEXT_LENGTHS = {
    # fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
    # substring of "anthropic/claude-sonnet-4.6").
    # OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
+    "claude-opus-4-8": 1000000,
+    "claude-opus-4.8": 1000000,
    "claude-opus-4-7": 1000000,
    "claude-opus-4.7": 1000000,
    "claude-opus-4-6": 1000000,
@ -911,12 +913,33 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
    return None


+def get_context_length_from_provider_error(
+    error_msg: str,
+    current_context_length: int,
+) -> Optional[int]:
+    """Return a provider-reported lower context limit, if one is present.
+
+    Context-overflow recovery must not invent a new model window size.  Some
+    providers only say that the input exceeds the context window without
+    reporting the actual maximum.  In that case callers should keep the
+    configured context length and try compression only, rather than stepping
+    down through guessed probe tiers (1M → 256K → 128K → ...).
+    """
+    parsed_limit = parse_context_limit_from_error(error_msg)
+    if parsed_limit is None:
+        return None
+    if parsed_limit < current_context_length:
+        return parsed_limit
+    return None
+
+
 def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
    """Detect an "output cap too large" error and return how many output tokens are available.

    Background — two distinct context errors exist:
      1. "Prompt too long"  — the INPUT itself exceeds the context window.
-           Fix: compress history and/or halve context_length.
+           Fix: compress history, and only reduce context_length if the
+           provider explicitly reports the actual lower limit.
      2. "max_tokens too large" — input is fine, but input + requested_output > window.
           Fix: reduce max_tokens (the output cap) for this call.
           Do NOT touch context_length — the window hasn't shrunk.
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@ -7,7 +7,6 @@ assemble pieces, then combines them with memory and ephemeral prompts.
 import json
 import logging
 import os
-import re
 import threading
 from collections import OrderedDict
 from pathlib import Path
@ -236,6 +235,11 @@ KANBAN_GUIDANCE = (
    "- Do not shell out to `hermes kanban <verb>` for board operations. Use "
    "the `kanban_*` tools — they work across all terminal backends.\n"
    "- Do not complete a task you didn't actually finish. Block it.\n"
+    "- Do not call `clarify` to ask questions. You are running headless — "
+    "there is no live user to answer. The call will time out and the task "
+    "will sit silently in `running` with no signal to the operator. Instead: "
+    "`kanban_comment` the context, then `kanban_block(reason=...)` so the "
+    "task surfaces on the board as needing input.\n"
    "- Do not assign follow-up work to yourself. Assign it to the right "
    "specialist profile.\n"
    "- Do not call `delegate_task` as a board substitute. `delegate_task` is "
@ -262,6 +266,37 @@ TOOL_USE_ENFORCEMENT_GUIDANCE = (
 # Add new patterns here when a model family needs explicit steering.
 TOOL_USE_ENFORCEMENT_MODELS = ("gpt", "codex", "gemini", "gemma", "grok", "glm", "qwen", "deepseek")

+# Universal "finish the job" guidance — applied to ALL models, not gated
+# by model family.  Addresses two cross-model failure modes:
+#   1. Stopping after a stub: writing a tiny file or running one command
+#      and then ending the turn with a description of the plan instead
+#      of the finished artifact.  (Observed on Opus during a real
+#      Sarasota real-estate build task: 3 API calls, 85-byte file,
+#      one terminal command, finish_reason=stop.)
+#   2. Fabricating output when a real path is blocked.  When `pip` or a
+#      tool fails, some models will synthesize plausible-looking results
+#      (fake addresses, fake JSON, fake numbers) instead of reporting
+#      the blocker.  (Observed on DeepSeek v4-flash on the same task:
+#      pushed through PEP-668 wall, then returned fabricated listings.)
+#
+# Short on purpose.  This block is shipped to every user, every session,
+# in the cached system prompt — token cost is paid once at install and
+# then amortised across all sessions via prefix caching.  Keep it tight.
+TASK_COMPLETION_GUIDANCE = (
+    "# Finishing the job\n"
+    "When the user asks you to build, run, or verify something, the deliverable is "
+    "a working artifact backed by real tool output — not a description of one. "
+    "Do not stop after writing a stub, a plan, or a single command. Keep working "
+    "until you have actually exercised the code or produced the requested result, "
+    "then report what real execution returned.\n"
+    "If a tool, install, or network call fails and blocks the real path, say so "
+    "directly and try an alternative (different package manager, different "
+    "approach, ask the user). NEVER substitute plausible-looking fabricated "
+    "output (made-up data, invented file contents, synthesised API responses) "
+    "for results you couldn't actually produce. Reporting a blocker honestly "
+    "is always better than inventing a result."
+)
+
 # OpenAI GPT/Codex-specific execution guidance.  Addresses known failure modes
 # where GPT models abandon work on partial results, skip prerequisite lookups,
 # hallucinate instead of using tools, and declare "done" without verification.
@ -813,6 +848,27 @@ def build_environment_hints() -> str:

    if is_wsl():
        hints.append(WSL_ENVIRONMENT_HINT)
+
+    # Embedder-supplied environment description. Lets a host that wraps Hermes
+    # (e.g. a sandbox runner / managed platform) explain the environment the
+    # agent is running in — proxy, credential handling, mount layout — without
+    # forking the identity slot (SOUL.md). Read once at prompt-build time, so
+    # it's part of the stable, cache-safe system prompt. The env var is the
+    # build-time/embedder mechanism (set in a container ENV); config.yaml
+    # ``agent.environment_hint`` is the user-facing surface. Env var wins.
+    extra = (os.getenv("HERMES_ENVIRONMENT_HINT") or "").strip()
+    if not extra:
+        try:
+            from hermes_cli.config import load_config
+
+            extra = str(
+                (load_config().get("agent", {}) or {}).get("environment_hint", "")
+            ).strip()
+        except Exception as e:
+            logger.debug("Could not read agent.environment_hint from config: %s", e)
+    if extra:
+        hints.append(extra)
+
    return "\n\n".join(hints)


--- a/agent/redact.py
+++ b/agent/redact.py
@ -331,7 +331,7 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    """Apply all redaction patterns to a block of text.

    Safe to call on any string -- non-matching text passes through unchanged.
-    Disabled by default — enable via security.redact_secrets: true in config.yaml.
+    Enabled by default. Disable via security.redact_secrets: false in config.yaml.
    Set force=True for safety boundaries that must never return raw secrets
    regardless of the user's global logging redaction preference.

@ -406,19 +406,14 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
    if "eyJ" in text:
        text = _JWT_RE.sub(lambda m: _mask_token(m.group(0)), text)

-    # URL userinfo (http(s)://user:pass@host) — redact for non-DB schemes.
-    # DB schemes are handled above by _DB_CONNSTR_RE.
-    if "://" in text:
-        text = _redact_url_userinfo(text)
-
-        # URL query params containing opaque tokens (?access_token=…&code=…)
-        if "?" in text:
-            text = _redact_url_query_params(text)
-
-    # HTTP access logs can contain relative request targets with query params
-    # and no URL scheme, e.g. `"POST /hook?password=... HTTP/1.1"`.
-    if "?" in text and "=" in text and _has_http_method_substring(text):
-        text = _redact_http_request_target_query_params(text)
+    # NOTE: Web-URL redaction (query params + userinfo + HTTP access-log
+    # request targets) is intentionally OFF. Many legitimate workflows pass
+    # opaque tokens through query strings — magic-link checkouts, OAuth
+    # callbacks the agent is meant to follow, pre-signed share URLs — and
+    # blanket-redacting param values by name breaks those skills mid-flow.
+    # Known credential shapes (sk-, ghp_, JWTs, etc.) inside URLs are still
+    # caught by _PREFIX_RE and _JWT_RE above. DB connection-string passwords
+    # are still caught by _DB_CONNSTR_RE.

    # Form-urlencoded bodies (only triggers on clean k=v&k=v inputs).
    if "&" in text and "=" in text:
--- a/agent/secret_sources/bitwarden.py
+++ b/agent/secret_sources/bitwarden.py
@ -37,7 +37,6 @@ import platform
 import shutil
 import stat
 import subprocess
-import sys
 import tempfile
 import time
 import urllib.error
--- a/agent/stream_diag.py
+++ b/agent/stream_diag.py
@ -258,7 +258,7 @@ def emit_stream_drop(
        except Exception:
            pass
    try:
-        agent._emit_status(
+        agent._buffer_status(
            f"⚠️ {provider} stream {kind} ({type(error).__name__}){_suffix} "
            f"— reconnecting, retry {attempt}/{max_attempts}"
        )
--- a/agent/system_prompt.py
+++ b/agent/system_prompt.py
@ -37,6 +37,7 @@ from agent.prompt_builder import (
    PLATFORM_HINTS,
    SESSION_SEARCH_GUIDANCE,
    SKILLS_GUIDANCE,
+    TASK_COMPLETION_GUIDANCE,
    TOOL_USE_ENFORCEMENT_GUIDANCE,
    TOOL_USE_ENFORCEMENT_MODELS,
 )
@ -100,6 +101,15 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None)
    # Pointer to the hermes-agent skill + docs for user questions about Hermes itself.
    stable_parts.append(HERMES_AGENT_HELP_GUIDANCE)

+    # Universal task-completion / no-fabrication guidance.  Applied to ALL
+    # models regardless of tool_use_enforcement gating — the failure modes
+    # this targets (stopping after a stub; fabricating output when a real
+    # path is blocked) are not model-family specific.  Gated only by
+    # config.yaml ``agent.task_completion_guidance`` (default True) so
+    # users who want a leaner prompt can turn it off.
+    if getattr(agent, "_task_completion_guidance", True) and agent.valid_tool_names:
+        stable_parts.append(TASK_COMPLETION_GUIDANCE)
+
    # Tool-aware behavioral guidance: only inject when the tools are loaded
    tool_guidance = []
    if "memory" in agent.valid_tool_names:
@ -205,6 +215,23 @@ def build_system_prompt_parts(agent: Any, system_message: Optional[str] = None)
    if _env_hints:
        stable_parts.append(_env_hints)

+    # Local Python toolchain probe — names python/pip/uv/PEP-668 state when
+    # something is non-default so the model can pick the right install
+    # strategy without discovering by failure.  Emits a single line; emits
+    # NOTHING when the environment is clean (no token cost).  Skipped
+    # entirely for remote terminal backends (the host's Python state is
+    # irrelevant when tools run inside docker/modal/ssh).  Gated by
+    # config.yaml ``agent.environment_probe`` (default True).
+    if getattr(agent, "_environment_probe", True):
+        try:
+            from tools.env_probe import get_environment_probe_line
+            _probe_line = get_environment_probe_line()
+            if _probe_line:
+                stable_parts.append(_probe_line)
+        except Exception:
+            # Probe failure must never block prompt build.
+            pass
+
    # Active-profile hint — names the Hermes profile the agent is running
    # under so it doesn't conflate ~/.hermes/skills/ (default profile) with
    # ~/.hermes/profiles/<active>/skills/ (this profile's). Deterministic
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@ -13,14 +13,13 @@ extracted functions reach back through the ``run_agent`` module via
 from __future__ import annotations

 import concurrent.futures
-import contextvars
 import json
 import logging
 import os
 import random
 import threading
 import time
-from typing import Any, Optional
+from typing import Optional

 from agent.display import (
    KawaiiSpinner,
@ -38,12 +37,9 @@ from agent.tool_dispatch_helpers import (
    make_tool_result_message,
 )
 from tools.terminal_tool import (
-    _get_approval_callback,
-    _get_sudo_password_callback,
-    set_approval_callback as _set_approval_callback,
-    set_sudo_password_callback as _set_sudo_password_callback,
    get_active_env,
 )
+from tools.thread_context import propagate_context_to_thread
 from tools.tool_result_storage import (
    maybe_persist_tool_result,
    enforce_turn_budget,
@ -62,6 +58,55 @@ def _ra():
    return run_agent


+def _tool_search_scoped_names(agent) -> frozenset:
+    """Return the deferrable tool names the session may invoke via tool_call.
+
+    The Tool Search unwrap dispatches the underlying tool directly, bypassing
+    the bridge branch (and its scope check) in
+    ``model_tools.handle_function_call``. To keep a restricted-toolset session
+    (subagent, kanban worker, curated gateway session) from reaching tools it
+    was never granted, the unwrap validates the underlying name against this
+    set: the deferrable subset of the session's own enabled/disabled toolset
+    scope.
+
+    Result is cached on the agent and refreshed when the tool registry's
+    generation changes (e.g. an MCP server reconnects), so the common case is
+    a dict lookup, not a full tool-defs rebuild on every tool call.
+    """
+    try:
+        import model_tools
+        from tools import tool_search as _ts
+        from tools.registry import registry as _registry
+    except Exception:
+        return frozenset()
+
+    enabled = getattr(agent, "enabled_toolsets", None)
+    disabled = getattr(agent, "disabled_toolsets", None)
+    cache_key = (
+        getattr(_registry, "_generation", 0),
+        frozenset(enabled) if enabled is not None else None,
+        frozenset(disabled) if disabled is not None else None,
+    )
+    cached = getattr(agent, "_tool_search_scope_cache", None)
+    if cached is not None and cached[0] == cache_key:
+        return cached[1]
+    try:
+        scoped_defs = model_tools.get_tool_definitions(
+            enabled_toolsets=enabled,
+            disabled_toolsets=disabled,
+            quiet_mode=True,
+            skip_tool_search_assembly=True,
+        ) or []
+        names = _ts.scoped_deferrable_names(scoped_defs)
+    except Exception:
+        names = frozenset()
+    try:
+        agent._tool_search_scope_cache = (cache_key, names)
+    except Exception:
+        pass
+    return names
+
+
 def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
    """Execute multiple tool calls concurrently using a thread pool.

@ -100,6 +145,41 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
        if not isinstance(function_args, dict):
            function_args = {}

+        # ── Tool Search unwrap ────────────────────────────────────────
+        # When the model invokes the tool_call bridge, peel it open so
+        # every downstream check (checkpointing, guardrails, plugin
+        # pre-tool-call hooks, the display/activity feed, the post-call
+        # callback) sees the underlying tool — not the bridge. This is
+        # the OpenClaw lesson: hooks must observe the real tool name.
+        #
+        # The original tool_call entry on ``tool_call.function`` is left
+        # untouched so the conversation transcript and the matching
+        # tool_call_id are preserved exactly as the model emitted them.
+        #
+        # Scope gate: the unwrap dispatches the underlying tool directly
+        # (bypassing the bridge branch in handle_function_call and its
+        # scope check), so we enforce session toolset scope HERE. A tool
+        # the session was not granted is rejected before any checkpoint,
+        # hook, or dispatch fires.
+        _ts_scope_block = None
+        try:
+            from tools import tool_search as _ts
+            if function_name == _ts.TOOL_CALL_NAME:
+                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
+                if not _err and _underlying:
+                    if _underlying in _tool_search_scoped_names(agent):
+                        function_name = _underlying
+                        function_args = _underlying_args
+                    else:
+                        _ts_scope_block = json.dumps({
+                            "error": (
+                                f"'{_underlying}' is not available in this session. "
+                                "Use tool_search to find tools you can call."
+                            ),
+                        }, ensure_ascii=False)
+        except Exception:
+            pass
+
        # Checkpoint for file-mutating tools
        if function_name in {"write_file", "patch"} and agent._checkpoint_mgr.enabled:
            try:
@ -124,21 +204,25 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe

        block_result = None
        blocked_by_guardrail = False
-        try:
-            from hermes_cli.plugins import get_pre_tool_call_block_message
-            block_message = get_pre_tool_call_block_message(
-                function_name, function_args, task_id=effective_task_id or "",
-            )
-        except Exception:
-            block_message = None
-
-        if block_message is not None:
-            block_result = json.dumps({"error": block_message}, ensure_ascii=False)
+        if _ts_scope_block is not None:
+            # Out-of-scope tool_call: reject before hooks/guardrails/dispatch.
+            block_result = _ts_scope_block
        else:
-            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
-            if not guardrail_decision.allows_execution:
-                block_result = agent._guardrail_block_result(guardrail_decision)
-                blocked_by_guardrail = True
+            try:
+                from hermes_cli.plugins import get_pre_tool_call_block_message
+                block_message = get_pre_tool_call_block_message(
+                    function_name, function_args, task_id=effective_task_id or "",
+                )
+            except Exception:
+                block_message = None
+
+            if block_message is not None:
+                block_result = json.dumps({"error": block_message}, ensure_ascii=False)
+            else:
+                guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
+                if not guardrail_decision.allows_execution:
+                    block_result = agent._guardrail_block_result(guardrail_decision)
+                    blocked_by_guardrail = True

        parsed_calls.append((tool_call, function_name, function_args, block_result, blocked_by_guardrail))

@ -186,14 +270,6 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
    agent._current_tool = tool_names_str
    agent._touch_activity(f"executing {num_tools} tools concurrently: {tool_names_str}")

-    # Capture CLI callbacks from the agent thread so worker threads can
-    # register them locally.  Without this, _get_approval_callback() in
-    # terminal_tool returns None in ThreadPoolExecutor workers, causing
-    # the dangerous-command prompt to fall back to input() — which
-    # deadlocks against prompt_toolkit's raw terminal mode (#13617).
-    _parent_approval_cb = _get_approval_callback()
-    _parent_sudo_cb = _get_sudo_password_callback()
-
    def _run_tool(index, tool_call, function_name, function_args):
        """Worker function executed in a thread."""
        # Register this worker tid so the agent can fan out an interrupt
@ -220,18 +296,9 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            set_activity_callback(agent._touch_activity)
        except Exception:
            pass
-        # Propagate approval/sudo callbacks to this worker thread.
-        # Mirrors cli.py run_agent() pattern (GHSA-qg5c-hvr5-hjgr).
-        if _parent_approval_cb is not None:
-            try:
-                _set_approval_callback(_parent_approval_cb)
-            except Exception:
-                pass
-        if _parent_sudo_cb is not None:
-            try:
-                _set_sudo_password_callback(_parent_sudo_cb)
-            except Exception:
-                pass
+        # Approval/sudo callbacks (thread-local) and the agent turn's
+        # ContextVars are propagated by propagate_context_to_thread() at the
+        # submit site below (GHSA-qg5c-hvr5-hjgr, #13617).
        start = time.time()
        try:
            result = agent._invoke_tool(
@ -261,13 +328,6 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            _ra()._set_interrupt(False, _worker_tid)
        except Exception:
            pass
-        # Clear thread-local callbacks so a recycled worker thread
-        # doesn't hold stale references to a disposed CLI instance.
-        try:
-            _set_approval_callback(None)
-            _set_sudo_password_callback(None)
-        except Exception:
-            pass

    # Start spinner for CLI mode (skip when TUI handles tool progress)
    spinner = None
@ -287,9 +347,12 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
            max_workers = min(len(runnable_calls), _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                for i, tc, name, args in runnable_calls:
-                    # Propagate ContextVars (e.g. _approval_session_key); mirrors asyncio.to_thread.
-                    ctx = contextvars.copy_context()
-                    f = executor.submit(ctx.run, _run_tool, i, tc, name, args)
+                    # Propagate the agent turn's ContextVars (e.g.
+                    # _approval_session_key) AND thread-local approval/sudo
+                    # callbacks into the worker thread; clears callbacks on exit.
+                    f = executor.submit(
+                        propagate_context_to_thread(_run_tool), i, tc, name, args
+                    )
                    futures.append(f)

                # Wait for all to complete with periodic heartbeats so the
@ -497,16 +560,39 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
        if not isinstance(function_args, dict):
            function_args = {}

-        # Check plugin hooks for a block directive before executing.
-        _block_msg: Optional[str] = None
+        # Tool Search unwrap — see execute_tool_calls_concurrent for full
+        # rationale, including the scope gate (the unwrap dispatches the
+        # underlying tool directly, so session toolset scope is enforced here).
+        _ts_scope_block: Optional[str] = None
        try:
-            from hermes_cli.plugins import get_pre_tool_call_block_message
-            _block_msg = get_pre_tool_call_block_message(
-                function_name, function_args, task_id=effective_task_id or "",
-            )
+            from tools import tool_search as _ts
+            if function_name == _ts.TOOL_CALL_NAME:
+                _underlying, _underlying_args, _err = _ts.resolve_underlying_call(function_args)
+                if not _err and _underlying:
+                    if _underlying in _tool_search_scoped_names(agent):
+                        function_name = _underlying
+                        function_args = _underlying_args
+                    else:
+                        _ts_scope_block = (
+                            f"'{_underlying}' is not available in this session. "
+                            "Use tool_search to find tools you can call."
+                        )
        except Exception:
            pass

+        # Check plugin hooks for a block directive before executing.
+        _block_msg: Optional[str] = None
+        if _ts_scope_block is not None:
+            _block_msg = _ts_scope_block
+        else:
+            try:
+                from hermes_cli.plugins import get_pre_tool_call_block_message
+                _block_msg = get_pre_tool_call_block_message(
+                    function_name, function_args, task_id=effective_task_id or "",
+                )
+            except Exception:
+                pass
+
        _guardrail_block_decision: ToolGuardrailDecision | None = None
        if _block_msg is None:
            guardrail_decision = agent._tool_guardrails.before_call(function_name, function_args)
@ -752,6 +838,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                    session_id=agent.session_id or "",
                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
                    skip_pre_tool_call_hook=True,
+                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                )
                _spinner_result = function_result
            except Exception as tool_error:
@ -772,6 +860,8 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
                    session_id=agent.session_id or "",
                    enabled_tools=list(agent.valid_tool_names) if agent.valid_tool_names else None,
                    skip_pre_tool_call_hook=True,
+                    enabled_toolsets=getattr(agent, "enabled_toolsets", None),
+                    disabled_toolsets=getattr(agent, "disabled_toolsets", None),
                )
            except Exception as tool_error:
                function_result = f"Error executing tool '{function_name}': {tool_error}"
--- a/agent/transports/chat_completions.py
+++ b/agent/transports/chat_completions.py
@ -10,7 +10,7 @@ reasoning configuration, temperature handling, and extra_body assembly.
 """

 import copy
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict

 from agent.lmstudio_reasoning import resolve_lmstudio_effort
 from agent.moonshot_schema import is_moonshot_model, sanitize_moonshot_tools
@ -476,13 +476,17 @@ class ChatCompletionsTransport(ProviderTransport):
        ephemeral = params.get("ephemeral_max_output_tokens")
        user_max = params.get("max_tokens")
        anthropic_max = params.get("anthropic_max_output")
+        # Per-model default cap — profiles override get_max_tokens() when
+        # they front several backends with different completion-token limits
+        # (e.g. opencode-go: mimo-v2.5-pro = 131072).
+        profile_max = profile.get_max_tokens(model)

        if ephemeral is not None and max_tokens_fn:
            api_kwargs.update(max_tokens_fn(ephemeral))
        elif user_max is not None and max_tokens_fn:
            api_kwargs.update(max_tokens_fn(user_max))
-        elif profile.default_max_tokens and max_tokens_fn:
-            api_kwargs.update(max_tokens_fn(profile.default_max_tokens))
+        elif profile_max and max_tokens_fn:
+            api_kwargs.update(max_tokens_fn(profile_max))
        elif anthropic_max is not None:
            api_kwargs["max_tokens"] = anthropic_max

--- a/agent/transports/codex_app_server.py
+++ b/agent/transports/codex_app_server.py
@ -23,7 +23,7 @@ import subprocess
 import threading
 import time
 from dataclasses import dataclass, field
-from typing import Any, Callable, Optional
+from typing import Any, Optional

 # Default minimum codex version we test against. The PR sets this from the
 # `codex --version` parsed at install time; bumping is a one-line change here.
--- a/agent/transports/codex_app_server_session.py
+++ b/agent/transports/codex_app_server_session.py
@ -31,6 +31,7 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional

+from agent.codex_responses_adapter import _format_responses_error
 from agent.redact import redact_sensitive_text
 from agent.transports.codex_app_server import (
    CodexAppServerClient,
@ -581,7 +582,7 @@ class CodexAppServerSession:
                        (note.get("params") or {}).get("turn") or {}
                    ).get("error")
                    if err_obj:
-                        err_msg = err_obj.get("message") or str(err_obj)
+                        err_msg = _format_responses_error(err_obj, str(turn_status))
                        # If the turn failed for an auth/refresh reason,
                        # rewrite the error into a re-auth hint AND mark
                        # the session for retirement.
--- a/agent/usage_pricing.py
+++ b/agent/usage_pricing.py
@ -83,6 +83,34 @@ _UTC_NOW = lambda: datetime.now(timezone.utc)
 # Official docs snapshot entries. Models whose published pricing and cache
 # semantics are stable enough to encode exactly.
 _OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
+    # ── Anthropic Claude 4.8 ─────────────────────────────────────────────
+    # Same $5/$25 base pricing as 4.6/4.7.  Fast-mode variant is a separate
+    # model ID with 2x premium (vs the 6x premium on older Opus generations).
+    # Source: https://openrouter.ai/anthropic/claude-opus-4.8
+    (
+        "anthropic",
+        "claude-opus-4-8",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("5.00"),
+        output_cost_per_million=Decimal("25.00"),
+        cache_read_cost_per_million=Decimal("0.50"),
+        cache_write_cost_per_million=Decimal("6.25"),
+        source="official_docs_snapshot",
+        source_url="https://platform.claude.com/docs/en/about-claude/pricing",
+        pricing_version="anthropic-pricing-2026-05",
+    ),
+    (
+        "anthropic",
+        "claude-opus-4-8-fast",
+    ): PricingEntry(
+        input_cost_per_million=Decimal("10.00"),
+        output_cost_per_million=Decimal("50.00"),
+        cache_read_cost_per_million=Decimal("1.00"),
+        cache_write_cost_per_million=Decimal("12.50"),
+        source="official_docs_snapshot",
+        source_url="https://openrouter.ai/anthropic/claude-opus-4.8-fast",
+        pricing_version="anthropic-pricing-2026-05",
+    ),
    # ── Anthropic Claude 4.7 ─────────────────────────────────────────────
    # Opus 4.5/4.6/4.7 share $5/$25 pricing (new tokenizer, up to 35% more
    # tokens for the same text).
--- a/agent/web_search_provider.py
+++ b/agent/web_search_provider.py
@ -61,14 +61,14 @@ from typing import Any, Dict, List


 class WebSearchProvider(abc.ABC):
-    """Abstract base class for a web search/extract/crawl backend.
+    """Abstract base class for a web search/extract backend.

    Subclasses must implement :meth:`is_available` and at least one of
-    :meth:`search` / :meth:`extract` / :meth:`crawl`. The
-    :meth:`supports_search` / :meth:`supports_extract` / :meth:`supports_crawl`
-    capability flags let the registry route each tool call to the right
-    provider, and let multi-capability providers (Firecrawl, Tavily, Exa,
-    …) advertise multiple capabilities from a single class.
+    :meth:`search` / :meth:`extract`. The :meth:`supports_search` /
+    :meth:`supports_extract` capability flags let the registry route each
+    tool call to the right provider, and let multi-capability providers
+    (Firecrawl, Tavily, Exa, …) advertise multiple capabilities from a
+    single class.
    """

    @property
@ -113,22 +113,6 @@ class WebSearchProvider(abc.ABC):
        """
        return False

-    def supports_crawl(self) -> bool:
-        """Return True if this provider implements :meth:`crawl`.
-
-        Crawl differs from extract in that the agent provides a *seed URL*
-        and the provider walks linked pages on its own — useful for
-        documentation sites where the agent doesn't know all relevant
-        URLs upfront. Tavily is the only built-in backend that natively
-        crawls today; Firecrawl provides a similar capability that we
-        don't currently surface as a tool.
-
-        Providers that don't crawl should leave this as False; the
-        dispatcher in :func:`tools.web_tools.web_crawl_tool` will fall
-        back to its auxiliary-model summarization path.
-        """
-        return False
-
    def search(self, query: str, limit: int = 5) -> Dict[str, Any]:
        """Execute a web search.

@ -173,26 +157,6 @@ class WebSearchProvider(abc.ABC):
            f"{self.name} does not support extract (override supports_extract)"
        )

-    def crawl(self, url: str, **kwargs: Any) -> Any:
-        """Crawl a seed URL and return results.
-
-        Override when :meth:`supports_crawl` returns True. The default
-        raises NotImplementedError; callers should gate on
-        :meth:`supports_crawl` before calling.
-
-        Return shape: ``{"results": [{"url": str, "title": str,
-        "content": str, ...}, ...]}`` matching what
-        :func:`tools.web_tools.web_crawl_tool` post-processing expects.
-
-        Implementations MAY be ``async def``.
-
-        ``kwargs`` may carry forward-compat fields (e.g. ``max_depth``,
-        ``include_domains``) — implementations should ignore unknown keys.
-        """
-        raise NotImplementedError(
-            f"{self.name} does not support crawl (override supports_crawl)"
-        )
-
    def get_setup_schema(self) -> Dict[str, Any]:
        """Return provider metadata for the ``hermes tools`` picker.

--- a/agent/web_search_registry.py
+++ b/agent/web_search_registry.py
@ -11,7 +11,7 @@ Active selection
 ----------------
 The active provider is chosen by configuration with this precedence:

-1. ``web.search_backend`` / ``web.extract_backend`` / ``web.crawl_backend``
+1. ``web.search_backend`` / ``web.extract_backend``
   (per-capability override).
 2. ``web.backend`` (shared fallback).
 3. If exactly one capability-eligible provider is registered AND available,
@ -24,10 +24,10 @@ The active provider is chosen by configuration with this precedence:
 5. Otherwise ``None`` — the tool surfaces a helpful error pointing at
   ``hermes tools``.

-The capability filter (``supports_search`` / ``supports_extract`` /
-``supports_crawl``) is applied at every step so a search-only provider
-(``brave-free``) configured as ``web.extract_backend`` correctly falls
-through to an extract-capable backend.
+The capability filter (``supports_search`` / ``supports_extract``) is
+applied at every step so a search-only provider (``brave-free``)
+configured as ``web.extract_backend`` correctly falls through to an
+extract-capable backend.
 """

 from __future__ import annotations
@ -131,7 +131,7 @@ _LEGACY_PREFERENCE = (


 def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearchProvider]:
-    """Resolve the active provider for a capability ("search" | "extract" | "crawl").
+    """Resolve the active provider for a capability ("search" | "extract").

    Resolution rules (in order):

@ -168,8 +168,6 @@ def _resolve(configured: Optional[str], *, capability: str) -> Optional[WebSearc
            return bool(p.supports_search())
        if capability == "extract":
            return bool(p.supports_extract())
-        if capability == "crawl":
-            return bool(p.supports_crawl())
        return False

    def _is_available_safe(p: WebSearchProvider) -> bool:
@ -241,21 +239,6 @@ def get_active_extract_provider() -> Optional[WebSearchProvider]:
    return _resolve(explicit, capability="extract")


-def get_active_crawl_provider() -> Optional[WebSearchProvider]:
-    """Resolve the currently-active web crawl provider.
-
-    Reads ``web.crawl_backend`` (preferred) or ``web.backend`` (shared
-    fallback) from config.yaml; falls back per the module docstring.
-
-    Crawl is a niche capability — among built-in providers only Tavily and
-    Firecrawl implement it. Callers should expect ``None`` and fall back to
-    a different strategy (e.g. summarize-via-LLM) when neither is
-    configured.
-    """
-    explicit = _read_config_key("web", "crawl_backend") or _read_config_key("web", "backend")
-    return _resolve(explicit, capability="crawl")
-
-
 def _reset_for_tests() -> None:
    """Clear the registry. **Test-only.**"""
    with _lock:
--- a/apps/bootstrap-installer/vite.config.ts
+++ b/apps/bootstrap-installer/vite.config.ts
@ -6,7 +6,7 @@ import path from 'node:path'
 // Hermes Setup — Tauri-targeted Vite config.
 //
 // Port 5175 keeps us out of the way of:
-//   apps/dashboard       (vite default 5173)
+//   web       (vite default 5173)
 //   apps/desktop dev     (5174 per its package.json)
 //
 // `clearScreen: false` is the Tauri convention — they spawn vite as a child
--- a/apps/dashboard/public/ds-assets/filler-bg0.jpg
+++ b/apps/dashboard/public/ds-assets/filler-bg0.jpg
--- a/apps/dashboard/public/fonts/Collapse-BoldItalic.woff2
+++ b/apps/dashboard/public/fonts/Collapse-BoldItalic.woff2
--- a/apps/dashboard/public/fonts/Collapse-Italic.woff2
+++ b/apps/dashboard/public/fonts/Collapse-Italic.woff2
--- a/apps/dashboard/public/fonts/Collapse-Light.woff2
+++ b/apps/dashboard/public/fonts/Collapse-Light.woff2
--- a/apps/dashboard/public/fonts/Collapse-LightItalic.woff2
+++ b/apps/dashboard/public/fonts/Collapse-LightItalic.woff2
--- a/apps/dashboard/public/fonts/Collapse-Thin.woff2
+++ b/apps/dashboard/public/fonts/Collapse-Thin.woff2
--- a/apps/dashboard/public/fonts/Collapse-ThinItalic.woff2
+++ b/apps/dashboard/public/fonts/Collapse-ThinItalic.woff2
--- a/apps/dashboard/public/fonts/Neuebit-Bold.woff2
+++ b/apps/dashboard/public/fonts/Neuebit-Bold.woff2
--- a/apps/dashboard/src/components/BottomPickSheet.tsx
+++ b/apps/dashboard/src/components/BottomPickSheet.tsx
@ -1,225 +0,0 @@
-import {
-  type PointerEvent as ReactPointerEvent,
-  type ReactNode,
-  useEffect,
-  useRef,
-  useState,
-} from "react";
-import { createPortal } from "react-dom";
-import { Typography } from "@/components/NouiTypography";
-import { cn, themedBody } from "@/lib/utils";
-
-const CLOSE_DRAG_MIN_PX = 72;
-const CLOSE_DRAG_RATIO = 0.18;
-const SHEET_TRANSITION_MS = 280;
-
-/**
- * Mobile-first picker shell: fixed backdrop + bottom sheet, portaled to `body`
- * so nested overflow/transform in the sidebar cannot clip menus (theme /
- * language switchers). Open/close uses slide + fade; teardown is delayed until
- * the exit animation finishes so animations can complete.
- *
- * Drag the header/handle downward to dismiss (skipped when reduced motion is on).
- */
-export function BottomPickSheet({
-  backdropDismissLabel = "Dismiss",
-  children,
-  onClose,
-  open,
-  title,
-}: BottomPickSheetProps) {
-  const [renderPortal, setRenderPortal] = useState(open);
-  const [entered, setEntered] = useState(false);
-  const [dragOffsetPx, setDragOffsetPx] = useState(0);
-  const [dragActive, setDragActive] = useState(false);
-
-  const closeTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
-  const sheetRef = useRef<HTMLDivElement>(null);
-  const dragTrackingRef = useRef(false);
-  const dragStartYRef = useRef(0);
-  const dragOffsetRef = useRef(0);
-
-  const reducedMotion =
-    typeof window !== "undefined" &&
-    window.matchMedia("(prefers-reduced-motion: reduce)").matches;
-
-  const syncDragPx = (next: number) => {
-    dragOffsetRef.current = next;
-    setDragOffsetPx(next);
-  };
-
-  useEffect(() => {
-    if (closeTimerRef.current) {
-      clearTimeout(closeTimerRef.current);
-      closeTimerRef.current = null;
-    }
-
-    const ms = reducedMotion ? 0 : SHEET_TRANSITION_MS;
-
-    let openRafId = 0;
-    let exitRafId = 0;
-
-    if (open) {
-      openRafId = requestAnimationFrame(() => {
-        dragTrackingRef.current = false;
-        dragOffsetRef.current = 0;
-        setDragActive(false);
-        setDragOffsetPx(0);
-        setRenderPortal(true);
-        requestAnimationFrame(() => {
-          requestAnimationFrame(() => setEntered(true));
-        });
-      });
-    } else {
-      exitRafId = requestAnimationFrame(() => {
-        dragTrackingRef.current = false;
-        setDragActive(false);
-        setEntered(false);
-        closeTimerRef.current = window.setTimeout(() => {
-          dragOffsetRef.current = 0;
-          setDragOffsetPx(0);
-          setRenderPortal(false);
-          closeTimerRef.current = null;
-        }, ms);
-      });
-    }
-
-    return () => {
-      cancelAnimationFrame(openRafId);
-      cancelAnimationFrame(exitRafId);
-      if (closeTimerRef.current) {
-        clearTimeout(closeTimerRef.current);
-        closeTimerRef.current = null;
-      }
-    };
-  }, [open, reducedMotion]);
-
-  useEffect(() => {
-    if (!renderPortal) return;
-    const prev = document.body.style.overflow;
-    document.body.style.overflow = "hidden";
-    return () => {
-      document.body.style.overflow = prev;
-    };
-  }, [renderPortal]);
-
-  if (!renderPortal || typeof document === "undefined") return null;
-
-  const durationClass = reducedMotion ? "duration-0" : "duration-[280ms]";
-
-  const draggingVisual = dragActive || dragOffsetPx > 0;
-
-  const onDragPointerDown = (e: ReactPointerEvent<HTMLDivElement>) => {
-    if (reducedMotion || !entered) return;
-    if (e.pointerType === "mouse" && e.button !== 0) return;
-
-    dragTrackingRef.current = true;
-    setDragActive(true);
-    dragStartYRef.current = e.clientY;
-    syncDragPx(0);
-    e.currentTarget.setPointerCapture(e.pointerId);
-  };
-
-  const onDragPointerMove = (e: ReactPointerEvent<HTMLDivElement>) => {
-    if (!dragTrackingRef.current) return;
-    const dy = e.clientY - dragStartYRef.current;
-    const next = Math.max(0, dy);
-    const sheetH = sheetRef.current?.offsetHeight ?? 560;
-    syncDragPx(Math.min(next, sheetH));
-  };
-
-  const endDrag = (e: ReactPointerEvent<HTMLDivElement>) => {
-    if (!dragTrackingRef.current) return;
-    dragTrackingRef.current = false;
-    setDragActive(false);
-    try {
-      e.currentTarget.releasePointerCapture(e.pointerId);
-    } catch {
-      /* already released */
-    }
-
-    const sheetH = sheetRef.current?.offsetHeight ?? 560;
-    const threshold = Math.max(CLOSE_DRAG_MIN_PX, sheetH * CLOSE_DRAG_RATIO);
-    const d = dragOffsetRef.current;
-
-    if (d >= threshold) {
-      onClose();
-      return;
-    }
-    syncDragPx(0);
-  };
-
-  return createPortal(
-    <div className="fixed inset-0 z-[200] flex flex-col justify-end">
-      <button
-        type="button"
-        aria-label={backdropDismissLabel}
-        className={cn(
-          "absolute inset-0 bg-black/55 backdrop-blur-[2px]",
-          "transition-opacity ease-out motion-reduce:transition-none",
-          durationClass,
-          entered ? "opacity-100" : "opacity-0",
-        )}
-        onClick={onClose}
-      />
-
-      <div
-        aria-label={title}
-        aria-modal="true"
-        ref={sheetRef}
-        className={cn(
-          themedBody,
-          "relative flex max-h-[85dvh] min-h-0 flex-col rounded-t-xl border border-current/20",
-          "bg-background-base/98 pb-[max(1rem,env(safe-area-inset-bottom))]",
-          "shadow-[0_-12px_40px_-8px_rgba(0,0,0,0.55)] backdrop-blur-md",
-          "ease-out motion-reduce:transition-none transform-gpu",
-          draggingVisual ? "transition-none" : cn("transition-transform", durationClass),
-          entered ? "translate-y-0" : "translate-y-full",
-        )}
-        role="dialog"
-        style={
-          entered && dragOffsetPx > 0
-            ? { transform: `translateY(${dragOffsetPx}px)` }
-            : undefined
-        }
-      >
-        <div
-          className={cn(
-            "flex shrink-0 flex-col gap-2 border-b border-current/15 px-4 pb-3 pt-2",
-            "touch-none select-none",
-            reducedMotion ? "cursor-default" : "cursor-grab active:cursor-grabbing",
-          )}
-          onPointerCancel={endDrag}
-          onPointerDown={onDragPointerDown}
-          onPointerMove={onDragPointerMove}
-          onPointerUp={endDrag}
-        >
-          <div
-            aria-hidden
-            className="mx-auto h-1 w-10 shrink-0 rounded-full bg-current/20"
-          />
-
-          <Typography
-            mondwest
-            className="text-display text-xs tracking-[0.12em] text-text-tertiary"
-          >
-            {title}
-          </Typography>
-        </div>
-
-        <div className="min-h-0 flex-1 overflow-y-auto overscroll-contain">
-          {children}
-        </div>
-      </div>
-    </div>,
-    document.body,
-  );
-}
-
-interface BottomPickSheetProps {
-  backdropDismissLabel?: string;
-  children: ReactNode;
-  onClose: () => void;
-  open: boolean;
-  title: string;
-}
--- a/apps/dashboard/src/components/NouiTypography.tsx
+++ b/apps/dashboard/src/components/NouiTypography.tsx
@ -1,63 +0,0 @@
-import { forwardRef, type ElementType, type HTMLAttributes, type ReactNode } from "react";
-import { cn } from "@/lib/utils";
-
-type TypographyProps = HTMLAttributes<HTMLElement> & {
-  as?: ElementType;
-  children?: ReactNode;
-  compressed?: boolean;
-  courier?: boolean;
-  expanded?: boolean;
-  mondwest?: boolean;
-  mono?: boolean;
-  sans?: boolean;
-  variant?: "sm" | "md" | "lg" | "xl";
-};
-
-const variantClasses: Record<NonNullable<TypographyProps["variant"]>, string> = {
-  sm: "leading-[1.4] text-[.9375rem] tracking-[0.1875rem]",
-  md: "text-[2.625rem] leading-[1] tracking-[0.0525rem]",
-  lg: "text-[2.625rem] leading-[1] tracking-[0.0525rem]",
-  xl: "text-[4.5rem] leading-[1] tracking-[0.135rem]",
-};
-
-export const Typography = forwardRef<HTMLElement, TypographyProps>(function Typography(
-  {
-    as: Component = "span",
-    className,
-    compressed,
-    courier,
-    expanded,
-    mondwest,
-    mono,
-    sans,
-    variant,
-    ...props
-  },
-  ref,
-) {
-  const hasFontVariant = compressed || courier || expanded || mondwest || mono || sans;
-
-  return (
-    <Component
-      className={cn(
-        compressed && "font-compressed",
-        courier && "font-courier",
-        expanded && "font-expanded",
-        mondwest && "font-mondwest tracking-[0.1875rem]",
-        mono && "font-mono",
-        (!hasFontVariant || sans) && "font-sans",
-        variant && variantClasses[variant],
-        className,
-      )}
-      ref={ref}
-      {...props}
-    />
-  );
-});
-
-export const H2 = forwardRef<HTMLHeadingElement, Omit<TypographyProps, "as">>(function H2(
-  { className, variant = "lg", ...props },
-  ref,
-) {
-  return <Typography as="h2" className={cn("font-bold", className)} variant={variant} ref={ref} {...props} />;
-});
--- a/apps/dashboard/src/components/Toast.tsx
+++ b/apps/dashboard/src/components/Toast.tsx
@ -1,40 +0,0 @@
-import { useEffect, useState } from "react";
-import { createPortal } from "react-dom";
-
-export function Toast({ toast }: { toast: { message: string; type: "success" | "error" } | null }) {
-  const [visible, setVisible] = useState(false);
-  const [current, setCurrent] = useState(toast);
-
-  useEffect(() => {
-    if (toast) {
-      setCurrent(toast);
-      setVisible(true);
-    } else {
-      setVisible(false);
-      const timer = setTimeout(() => setCurrent(null), 200);
-      return () => clearTimeout(timer);
-    }
-  }, [toast]);
-
-  if (!current) return null;
-
-  // Portal to document.body so the toast escapes any ancestor stacking context
-  // (e.g. <main> has `relative z-2`, which would trap z-50 below the header's z-40).
-  return createPortal(
-    <div
-      role="status"
-      aria-live="polite"
-      className={`fixed top-16 right-4 z-50 border px-4 py-2.5 font-courier text-xs tracking-wider uppercase backdrop-blur-sm ${
-        current.type === "success"
-          ? "bg-success/15 text-success border-success/30"
-          : "bg-destructive/15 text-destructive border-destructive/30"
-      }`}
-      style={{
-        animation: visible ? "toast-in 200ms ease-out forwards" : "toast-out 200ms ease-in forwards",
-      }}
-    >
-      {current.message}
-    </div>,
-    document.body,
-  );
-}
--- a/apps/dashboard/src/components/ui/card.tsx
+++ b/apps/dashboard/src/components/ui/card.tsx
@ -1,63 +0,0 @@
-import { cn, themedBody } from "@/lib/utils";
-
-/**
- * Themed card primitive. Themes can restyle every card without touching
- * call sites by setting CSS vars under the `card` component-style bucket:
- *
- *   componentStyles:
- *     card:
- *       clipPath: "polygon(10px 0, 100% 0, 100% calc(100% - 10px), calc(100% - 10px) 100%, 0 100%, 0 10px)"
- *       border: "1px solid var(--color-ring)"
- *       background: "linear-gradient(180deg, var(--color-card) 0%, transparent 100%)"
- *       boxShadow: "0 0 0 1px var(--color-ring) inset, 0 0 24px -8px var(--warm-glow)"
- *
- * All properties are optional — vars that aren't set compute to their
- * CSS initial value, so the default shadcn-y card keeps looking normal
- * for themes that don't override anything.
- */
-const CARD_STYLE: React.CSSProperties = {
-  clipPath: "var(--component-card-clip-path)",
-  borderImage: "var(--component-card-border-image)",
-  background: "var(--component-card-background)",
-  boxShadow: "var(--component-card-box-shadow)",
-};
-
-export function Card({ className, style, ...props }: React.HTMLAttributes<HTMLDivElement>) {
-  return (
-    <div
-      className={cn(
-        "border border-border bg-card/80 text-card-foreground w-full",
-        themedBody,
-        className,
-      )}
-      style={{ ...CARD_STYLE, ...style }}
-      {...props}
-    />
-  );
-}
-
-export function CardHeader({ className, ...props }: React.HTMLAttributes<HTMLDivElement>) {
-  return <div className={cn("flex flex-col gap-1.5 p-4 border-b border-border", className)} {...props} />;
-}
-
-export function CardTitle({ className, ...props }: React.HTMLAttributes<HTMLHeadingElement>) {
-  return (
-    <h3
-      className={cn(
-        "font-mondwest text-display text-sm tracking-[0.12em] text-text-primary",
-        className,
-      )}
-      {...props}
-    />
-  );
-}
-
-export function CardDescription({ className, ...props }: React.HTMLAttributes<HTMLParagraphElement>) {
-  return (
-    <p className={cn("font-mondwest normal-case text-xs text-muted-foreground", className)} {...props} />
-  );
-}
-
-export function CardContent({ className, ...props }: React.HTMLAttributes<HTMLDivElement>) {
-  return <div className={cn("p-4", className)} {...props} />;
-}
--- a/apps/dashboard/src/components/ui/confirm-dialog.tsx
+++ b/apps/dashboard/src/components/ui/confirm-dialog.tsx
@ -1,137 +0,0 @@
-import { useEffect, useRef } from "react";
-import { createPortal } from "react-dom";
-import { AlertTriangle } from "lucide-react";
-import { Button } from "@nous-research/ui/ui/components/button";
-import { cn, themedBody } from "@/lib/utils";
-
-export function ConfirmDialog({
-  cancelLabel = "Cancel",
-  confirmLabel = "Confirm",
-  description,
-  destructive = false,
-  loading = false,
-  onCancel,
-  onConfirm,
-  open,
-  title,
-}: ConfirmDialogProps) {
-  const dialogRef = useRef<HTMLDivElement>(null);
-
-  // Focus the confirm button when opened; trap ESC to cancel.
-  useEffect(() => {
-    if (!open) return;
-
-    const prevActive = document.activeElement as HTMLElement | null;
-    dialogRef.current
-      ?.querySelector<HTMLButtonElement>("[data-confirm]")
-      ?.focus();
-
-    const onKey = (e: KeyboardEvent) => {
-      if (e.key === "Escape") {
-        e.preventDefault();
-        onCancel();
-      }
-    };
-
-    document.addEventListener("keydown", onKey);
-    const prevOverflow = document.body.style.overflow;
-    document.body.style.overflow = "hidden";
-
-    return () => {
-      document.removeEventListener("keydown", onKey);
-      document.body.style.overflow = prevOverflow;
-      prevActive?.focus?.();
-    };
-  }, [open, onCancel]);
-
-  if (!open) return null;
-
-  return createPortal(
-    <div
-      role="dialog"
-      aria-modal="true"
-      aria-labelledby="confirm-dialog-title"
-      aria-describedby={description ? "confirm-dialog-desc" : undefined}
-      onClick={(e) => {
-        if (e.target === e.currentTarget) onCancel();
-      }}
-      className={cn(
-        "fixed inset-0 z-50 flex items-center justify-center",
-        "bg-black/60 backdrop-blur-sm",
-        "animate-[fade-in_150ms_ease-out]",
-      )}
-    >
-      <div
-        ref={dialogRef}
-        className={cn(
-          themedBody,
-          "relative w-full max-w-md mx-4",
-          "border border-border bg-card shadow-lg",
-          "animate-[dialog-in_180ms_ease-out]",
-        )}
-      >
-        <div className="flex items-start gap-3 p-4 border-b border-border">
-          {destructive && (
-            <div
-              aria-hidden
-              className="mt-0.5 shrink-0 text-destructive"
-            >
-              <AlertTriangle className="h-4 w-4" />
-            </div>
-          )}
-
-          <div className="flex-1 min-w-0 flex flex-col gap-1">
-            <h2
-              id="confirm-dialog-title"
-              className="font-mondwest text-display text-sm font-bold tracking-[0.12em] blend-lighter"
-            >
-              {title}
-            </h2>
-
-            {description && (
-              <p
-                id="confirm-dialog-desc"
-                className="font-mondwest normal-case text-xs text-muted-foreground leading-relaxed"
-              >
-                {description}
-              </p>
-            )}
-          </div>
-        </div>
-
-        <div className="flex items-center justify-end gap-2 p-3">
-          <Button
-            type="button"
-            outlined
-            onClick={onCancel}
-            disabled={loading}
-          >
-            {cancelLabel}
-          </Button>
-          <Button
-            data-confirm
-            type="button"
-            destructive={destructive}
-            onClick={onConfirm}
-            disabled={loading}
-          >
-            {loading ? "…" : confirmLabel}
-          </Button>
-        </div>
-      </div>
-    </div>,
-    document.body,
-  );
-}
-
-interface ConfirmDialogProps {
-  cancelLabel?: string;
-  confirmLabel?: string;
-  description?: string;
-  destructive?: boolean;
-  loading?: boolean;
-  onCancel: () => void;
-  onConfirm: () => void;
-  open: boolean;
-  title: string;
-}
--- a/apps/dashboard/src/components/ui/input.tsx
+++ b/apps/dashboard/src/components/ui/input.tsx
@ -1,16 +0,0 @@
-import { cn } from "@/lib/utils";
-
-export function Input({ className, ...props }: React.InputHTMLAttributes<HTMLInputElement>) {
-  return (
-    <input
-      className={cn(
-        "flex h-9 w-full border border-border bg-background/40 px-3 py-1 font-courier text-sm transition-colors",
-        "placeholder:text-muted-foreground",
-        "focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-foreground/30 focus-visible:border-foreground/25",
-        "disabled:cursor-not-allowed disabled:opacity-50",
-        className,
-      )}
-      {...props}
-    />
-  );
-}
--- a/apps/dashboard/src/components/ui/label.tsx
+++ b/apps/dashboard/src/components/ui/label.tsx
@ -1,13 +0,0 @@
-import { cn } from "@/lib/utils";
-
-export function Label({ className, ...props }: React.LabelHTMLAttributes<HTMLLabelElement>) {
-  return (
-    <label
-      className={cn(
-        "font-mondwest text-xs tracking-[0.1em] uppercase leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70",
-        className,
-      )}
-      {...props}
-    />
-  );
-}
--- a/apps/dashboard/src/components/ui/separator.tsx
+++ b/apps/dashboard/src/components/ui/separator.tsx
@ -1,19 +0,0 @@
-import { cn } from "@/lib/utils";
-
-export function Separator({
-  className,
-  orientation = "horizontal",
-  ...props
-}: React.HTMLAttributes<HTMLDivElement> & { orientation?: "horizontal" | "vertical" }) {
-  return (
-    <div
-      role="separator"
-      className={cn(
-        "shrink-0 bg-border",
-        orientation === "horizontal" ? "h-px w-full" : "h-full w-px",
-        className,
-      )}
-      {...props}
-    />
-  );
-}
--- a/apps/dashboard/src/hooks/useBelowBreakpoint.ts
+++ b/apps/dashboard/src/hooks/useBelowBreakpoint.ts
@ -1,19 +0,0 @@
-import { useEffect, useState } from "react";
-
-/** True when viewport width is strictly below `px` (matches Tailwind `min-width: px`). */
-export function useBelowBreakpoint(px: number) {
-  const query = `(max-width: ${px - 1}px)`;
-  const [matches, setMatches] = useState(() =>
-    typeof window !== "undefined" ? window.matchMedia(query).matches : false,
-  );
-
-  useEffect(() => {
-    const mql = window.matchMedia(query);
-    const sync = () => setMatches(mql.matches);
-    sync();
-    mql.addEventListener("change", sync);
-    return () => mql.removeEventListener("change", sync);
-  }, [query]);
-
-  return matches;
-}
--- a/apps/dashboard/src/hooks/useConfirmDelete.ts
+++ b/apps/dashboard/src/hooks/useConfirmDelete.ts
@ -1,41 +0,0 @@
-import { useCallback, useState } from "react";
-
-export function useConfirmDelete<TId>({
-  onDelete,
-}: {
-  onDelete: (id: TId) => Promise<void>;
-}) {
-  const [pendingId, setPendingId] = useState<TId | null>(null);
-  const [isDeleting, setIsDeleting] = useState(false);
-
-  const requestDelete = useCallback((id: TId) => {
-    setPendingId(id);
-  }, []);
-
-  const cancel = useCallback(() => {
-    if (!isDeleting) setPendingId(null);
-  }, [isDeleting]);
-
-  const confirm = useCallback(async () => {
-    if (pendingId === null) return;
-    const id = pendingId;
-    setIsDeleting(true);
-    try {
-      await onDelete(id);
-      setPendingId(null);
-    } catch {
-      // Dialog stays open; caller can surface errors in onDelete before rethrowing
-    } finally {
-      setIsDeleting(false);
-    }
-  }, [pendingId, onDelete]);
-
-  return {
-    cancel,
-    confirm,
-    isDeleting,
-    isOpen: pendingId !== null,
-    pendingId,
-    requestDelete,
-  } as const;
-}
--- a/apps/dashboard/src/hooks/useToast.ts
+++ b/apps/dashboard/src/hooks/useToast.ts
@ -1,15 +0,0 @@
-import { useCallback, useState } from "react";
-
-export function useToast(duration = 3000) {
-  const [toast, setToast] = useState<{ message: string; type: "success" | "error" } | null>(null);
-
-  const showToast = useCallback(
-    (message: string, type: "success" | "error") => {
-      setToast({ message, type });
-      setTimeout(() => setToast(null), duration);
-    },
-    [duration],
-  );
-
-  return { toast, showToast };
-}
--- a/apps/dashboard/src/lib/gatewayClient.ts
+++ b/apps/dashboard/src/lib/gatewayClient.ts
@ -1,38 +0,0 @@
-import {
-  JsonRpcGatewayClient,
-  type ConnectionState,
-  type GatewayEvent,
-  type GatewayEventName,
-} from "@hermes/shared";
-
-import { HERMES_BASE_PATH } from "@/lib/api";
-
-export type { ConnectionState, GatewayEvent, GatewayEventName };
-
-/**
- * Browser wrapper for the shared tui_gateway JSON-RPC client.
- *
- * Dashboard resolves its token and host from the served page. Desktop uses the
- * same shared protocol client, but supplies an absolute wsUrl from Electron.
- */
-export class GatewayClient extends JsonRpcGatewayClient {
-  async connect(token?: string): Promise<void> {
-    const resolved = token ?? window.__HERMES_SESSION_TOKEN__ ?? "";
-    if (!resolved) {
-      throw new Error(
-        "Session token not available — page must be served by the Hermes dashboard",
-      );
-    }
-
-    const scheme = location.protocol === "https:" ? "wss:" : "ws:";
-    await super.connect(
-      `${scheme}//${location.host}${HERMES_BASE_PATH}/api/ws?token=${encodeURIComponent(resolved)}`,
-    );
-  }
-}
-
-declare global {
-  interface Window {
-    __HERMES_SESSION_TOKEN__?: string;
-  }
-}
--- a/apps/desktop/README.md
+++ b/apps/desktop/README.md
@ -4,7 +4,7 @@ Native Electron shell for Hermes. It packages the desktop renderer, a bundled He

 ## Setup

-Install workspace dependencies from the repo root so `apps/desktop`, `apps/dashboard`, and `apps/shared` stay linked:
+Install workspace dependencies from the repo root so `apps/desktop`, `web`, and `apps/shared` stay linked:

 ```bash
 npm install
@ -80,7 +80,7 @@ hermes dashboard --tui --no-open
 For dashboard HMR, start Vite in another terminal:

 ```bash
-cd apps/dashboard
+cd web
 npm run dev
 ```

--- a/apps/desktop/electron/main.cjs
+++ b/apps/desktop/electron/main.cjs
@ -926,6 +926,35 @@ function getVenvPython(venvRoot) {
  return path.join(venvRoot, IS_WINDOWS ? path.join('Scripts', 'python.exe') : path.join('bin', 'python'))
 }

+// resolveGitBinary — locate git.exe on Windows. A fresh installer-driven
+// install only has PortableGit under %LOCALAPPDATA%\hermes\git (never on
+// PATH), so a bare spawn('git') ENOENTs and self-update checks fail with
+// "Couldn't check for updates". Mirror findGitBash: PortableGit first, then
+// standard Git-for-Windows locations, then PATH. Cached after first probe.
+let _gitBinaryCache = null
+function resolveGitBinary() {
+  if (_gitBinaryCache) return _gitBinaryCache
+  if (!IS_WINDOWS) {
+    _gitBinaryCache = findOnPath('git') || 'git'
+    return _gitBinaryCache
+  }
+
+  const localAppData = process.env.LOCALAPPDATA || ''
+  const candidates = []
+  if (localAppData) {
+    candidates.push(path.join(localAppData, 'hermes', 'git', 'cmd', 'git.exe'))
+    candidates.push(path.join(localAppData, 'hermes', 'git', 'bin', 'git.exe'))
+  }
+  candidates.push(path.join(process.env['ProgramFiles'] || 'C:\\Program Files', 'Git', 'cmd', 'git.exe'))
+  candidates.push(path.join(process.env['ProgramFiles(x86)'] || 'C:\\Program Files (x86)', 'Git', 'cmd', 'git.exe'))
+  if (localAppData) {
+    candidates.push(path.join(localAppData, 'Programs', 'Git', 'cmd', 'git.exe'))
+  }
+
+  _gitBinaryCache = candidates.find(fileExists) || findOnPath('git') || 'git'
+  return _gitBinaryCache
+}
+
 function recentHermesLog() {
  return hermesLog.slice(-20).join('\n')
 }
@ -962,7 +991,7 @@ function resolveUpdateRoot() {

 function runGit(args, options = {}) {
  return new Promise((resolve, reject) => {
-    const child = spawn('git', IS_WINDOWS ? ['-c', 'windows.appendAtomically=false', ...args] : args, {
+    const child = spawn(resolveGitBinary(), IS_WINDOWS ? ['-c', 'windows.appendAtomically=false', ...args] : args, {
      cwd: options.cwd,
      env: { ...process.env, ...(options.env || {}), GIT_TERMINAL_PROMPT: '0' },
      stdio: ['ignore', 'pipe', 'pipe']
--- a/apps/desktop/src/app/page-search-shell.tsx
+++ b/apps/desktop/src/app/page-search-shell.tsx
@ -29,12 +29,22 @@ export function PageSearchShell({
      className={cn('flex h-full min-w-0 flex-col overflow-hidden bg-(--ui-chat-surface-background)', className)}
    >
      <div className="relative z-10 grid gap-2 border-b border-(--ui-stroke-tertiary) px-3 py-2.5">
-        <PageSearchInput
-          onChange={onSearchChange}
-          placeholder={searchPlaceholder}
-          trailingAction={searchTrailingAction}
-          value={searchValue}
-        />
+        {/* Reserve the top-right titlebar tools + native window-controls
+            footprint so the full-width search input never slides under them
+            (this header sits in the titlebar row at the window top). */}
+        <div
+          style={{
+            paddingRight:
+              'max(0px, calc(var(--titlebar-tools-right, 0px) + var(--titlebar-tools-width, 0px) - 0.75rem))'
+          }}
+        >
+          <PageSearchInput
+            onChange={onSearchChange}
+            placeholder={searchPlaceholder}
+            trailingAction={searchTrailingAction}
+            value={searchValue}
+          />
+        </div>
        {filters ? <div className="flex flex-wrap items-center justify-center gap-1.5">{filters}</div> : null}
      </div>
      <div className="min-h-0 flex-1 overflow-hidden bg-(--ui-chat-surface-background)">{children}</div>
--- a/apps/desktop/src/components/assistant-ui/thread.tsx
+++ b/apps/desktop/src/components/assistant-ui/thread.tsx
@ -517,11 +517,17 @@ const AssistantActionBar: FC<MessageActionProps> = ({ messageId, messageText, on
    <div className="relative flex w-full shrink-0 justify-end">
      <ActionBarPrimitive.Root
        className={cn(
+          // NOTE: intentionally NOT `hideWhenRunning`. That prop unmounts the
+          // bar while the thread streams, which collapses every completed
+          // assistant message's footer by this bar's height and shifts the
+          // whole conversation when the turn resolves. The bar is already
+          // invisible by default (opacity-0 + pointer-events-none, reveals on
+          // hover), so keeping it mounted reserves stable layout height with
+          // no visual change during streaming.
          'relative flex flex-row items-center justify-end gap-2 py-1.5 opacity-0 pointer-events-none group-hover:pointer-events-auto group-hover:opacity-100 focus-within:pointer-events-auto focus-within:opacity-100',
          menuOpen && 'pointer-events-auto opacity-100 [&_button]:opacity-100'
        )}
        data-slot="aui_msg-actions"
-        hideWhenRunning
      >
        <CopyButton appearance="icon" buttonSize="icon" disabled={!messageText} label="Copy" text={messageText} />
        <ActionBarPrimitive.Reload asChild>
--- a/apps/desktop/src/components/desktop-install-overlay.tsx
+++ b/apps/desktop/src/components/desktop-install-overlay.tsx
@ -46,6 +46,7 @@ interface StageRowProps {
  descriptor: DesktopBootstrapStageDescriptor
  result: DesktopBootstrapStageResult | undefined
  isCurrent: boolean
+  now: number
 }

 const STATE_LABEL: Record<DesktopBootstrapStageState, string> = {
@ -77,8 +78,18 @@ function formatDuration(ms: number | null | undefined): string {
  return `${m}m ${rs}s`
 }

-function StageRow({ descriptor, result, isCurrent }: StageRowProps) {
+// Live elapsed for a running stage, as m:ss (or s for sub-minute).
+function formatElapsed(ms: number): string {
+  const s = Math.max(0, Math.floor(ms / 1000))
+  if (s < 60) return `${s}s`
+  const m = Math.floor(s / 60)
+  return `${m}:${String(s - m * 60).padStart(2, '0')}`
+}
+
+function StageRow({ descriptor, result, isCurrent, now }: StageRowProps) {
  const state: DesktopBootstrapStageState = result?.state || 'pending'
+  const elapsed =
+    state === 'running' && typeof result?.startedAt === 'number' ? formatElapsed(now - result.startedAt) : ''
  const icon = useMemo(() => {
    switch (state) {
      case 'running':
@ -119,7 +130,7 @@ function StageRow({ descriptor, result, isCurrent }: StageRowProps) {
            {formatStageName(descriptor.name)}
          </span>
          <span className="flex-shrink-0 text-xs tabular-nums text-muted-foreground">
-            {state === 'running' ? STATE_LABEL[state] : null}
+            {state === 'running' ? (elapsed ? `${STATE_LABEL[state]} · ${elapsed}` : STATE_LABEL[state]) : null}
            {state === 'succeeded' || state === 'skipped' ? formatDuration(result?.durationMs) : null}
            {state === 'failed' ? STATE_LABEL[state] : null}
          </span>
@ -147,7 +158,7 @@ function applyEvent(state: DesktopBootstrapState, ev: DesktopBootstrapEvent): De
  if (ev.type === 'manifest') {
    const stages: Record<string, DesktopBootstrapStageResult> = {}
    for (const stage of ev.stages) {
-      stages[stage.name] = { state: 'pending', durationMs: null, json: null, error: null }
+      stages[stage.name] = { state: 'pending', durationMs: null, startedAt: null, json: null, error: null }
    }
    return {
      ...state,
@ -159,6 +170,7 @@ function applyEvent(state: DesktopBootstrapState, ev: DesktopBootstrapEvent): De
    }
  }
  if (ev.type === 'stage') {
+    const prev = state.stages[ev.name]
    return {
      ...state,
      stages: {
@ -166,6 +178,9 @@ function applyEvent(state: DesktopBootstrapState, ev: DesktopBootstrapEvent): De
        [ev.name]: {
          state: ev.state,
          durationMs: ev.durationMs ?? null,
+          // Stamp the start time on the running transition so the UI can show
+          // a live elapsed timer; preserve it across repeated running events.
+          startedAt: ev.state === 'running' ? prev?.startedAt ?? Date.now() : prev?.startedAt ?? null,
          json: ev.json ?? null,
          error: ev.error ?? null
        }
@ -202,8 +217,17 @@ export function DesktopInstallOverlay({ enabled = true }: DesktopInstallOverlayP
  const [state, setState] = useState<DesktopBootstrapState>(EMPTY_STATE)
  const [logOpen, setLogOpen] = useState(false)
  const [copied, setCopied] = useState(false)
+  const [now, setNow] = useState(() => Date.now())
  const logEndRef = useRef<HTMLDivElement | null>(null)

+  // Tick once a second while a bootstrap is in flight so running steps show a
+  // live elapsed timer. Stops when nothing is active to avoid idle renders.
+  useEffect(() => {
+    if (!state.active) return
+    const id = window.setInterval(() => setNow(Date.now()), 1000)
+    return () => window.clearInterval(id)
+  }, [state.active])
+
  // Subscribe to bootstrap events + load initial snapshot
  useEffect(() => {
    if (!enabled) return
@ -325,6 +349,8 @@ export function DesktopInstallOverlay({ enabled = true }: DesktopInstallOverlayP
  const totalCount = stages.length
  const failed = Boolean(state.error)
  const progressPct = totalCount > 0 ? Math.round((completedCount / totalCount) * 100) : 0
+  const currentStartedAt = currentStage ? state.stages[currentStage]?.startedAt : null
+  const currentElapsed = typeof currentStartedAt === 'number' ? formatElapsed(now - currentStartedAt) : ''

  return (
    <div className="fixed inset-0 z-[1400] flex items-center justify-center bg-background/90 backdrop-blur-md p-4">
@ -350,6 +376,7 @@ export function DesktopInstallOverlay({ enabled = true }: DesktopInstallOverlayP
                <span>
                  {completedCount} of {totalCount} steps complete
                  {currentStage && ` -- now: ${formatStageName(currentStage)}`}
+                  {currentElapsed && ` (${currentElapsed})`}
                </span>
                <span className="tabular-nums">{progressPct}%</span>
              </div>
@ -390,6 +417,7 @@ export function DesktopInstallOverlay({ enabled = true }: DesktopInstallOverlayP
                  descriptor={stage}
                  result={state.stages[stage.name]}
                  isCurrent={stage.name === currentStage}
+                  now={now}
                />
              ))}
            </ol>
--- a/apps/desktop/src/global.d.ts
+++ b/apps/desktop/src/global.d.ts
@ -201,6 +201,7 @@ export type DesktopBootstrapStageState =
 export interface DesktopBootstrapStageResult {
  state: DesktopBootstrapStageState
  durationMs: number | null
+  startedAt: number | null
  json: { ok: boolean; skipped?: boolean; reason?: string | null; stage: string } | null
  error: string | null
 }
--- a/apps/desktop/src/styles.css
+++ b/apps/desktop/src/styles.css
@ -962,6 +962,11 @@ canvas {
  width: 0.6em;
  height: 1em;
  margin-left: 0.18em;
+  /* Net-zero the caret's inline advance so it paints at the text end without
+     consuming layout width. Otherwise its ~0.78em footprint can wrap the last
+     line mid-stream, and removing the caret on completion un-wraps it — the
+     visible "layout shift after the cursor goes away". */
+  margin-right: calc(-0.6em - 0.18em);
  vertical-align: middle;
  border-radius: 0.09375rem;
  background: repeating-conic-gradient(currentColor 0% 25%, transparent 0% 50%) 0 0 / 0.125rem 0.125rem;
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@ -163,7 +163,7 @@ model:
 # -----------------------------------------------------------------------------
 # Working directory behavior:
 #   - CLI (`hermes` command): Uses "." (current directory where you run hermes)
-#   - Messaging (Telegram/Discord): Uses MESSAGING_CWD from .env (default: home)
+#   - Gateway/messaging/cron: Uses terminal.cwd here; legacy .env cwd values are deprecated
 terminal:
  backend: "local"
  cwd: "."  # For local backend: "." = current directory. Ignored for remote backends unless a backend documents otherwise.
--- a/cli.py
+++ b/cli.py
@ -74,10 +74,15 @@ except (ImportError, AttributeError):
    _STEADY_CURSOR = None

 try:
-    from hermes_cli.pt_input_extras import install_shift_enter_alias, install_ctrl_enter_alias
+    from hermes_cli.pt_input_extras import (
+        install_ctrl_enter_alias,
+        install_ignored_terminal_sequences,
+        install_shift_enter_alias,
+    )
    install_shift_enter_alias()
    install_ctrl_enter_alias()
-    del install_shift_enter_alias, install_ctrl_enter_alias
+    install_ignored_terminal_sequences()
+    del install_shift_enter_alias, install_ctrl_enter_alias, install_ignored_terminal_sequences
 except Exception:
    pass
 import threading
@ -168,7 +173,7 @@ from hermes_cli.browser_connect import (
    try_launch_chrome_debug,
 )
 from hermes_cli.env_loader import load_hermes_dotenv
-from utils import base_url_host_matches, is_truthy_value
+from utils import base_url_host_matches

 _hermes_home = get_hermes_home()
 _project_env = Path(__file__).parent / '.env'
@ -382,6 +387,10 @@ def load_cli_config() -> Dict[str, Any]:
            "inactivity_timeout": 120,  # Auto-cleanup inactive browser sessions after 2 min
            "record_sessions": False,  # Auto-record browser sessions as WebM videos
            "engine": "auto",  # Browser engine: auto (Chrome), lightpanda, chrome
+            "camofox": {
+                "rewrite_loopback_urls": False,
+                "loopback_host_alias": "host.docker.internal",
+            },
        },
        "compression": {
            "enabled": True,      # Auto-compress when approaching context limit
@ -576,6 +585,8 @@ def load_cli_config() -> Dict[str, Any]:
        "docker_env": "TERMINAL_DOCKER_ENV",
        "docker_mount_cwd_to_workspace": "TERMINAL_DOCKER_MOUNT_CWD_TO_WORKSPACE",
        "docker_run_as_host_user": "TERMINAL_DOCKER_RUN_AS_HOST_USER",
+        "docker_persist_across_processes": "TERMINAL_DOCKER_PERSIST_ACROSS_PROCESSES",
+        "docker_orphan_reaper": "TERMINAL_DOCKER_ORPHAN_REAPER",
        "sandbox_dir": "TERMINAL_SANDBOX_DIR",
        # Persistent shell (non-local backends)
        "persistent_shell": "TERMINAL_PERSISTENT_SHELL",
@ -2475,8 +2486,9 @@ _TERMINAL_INPUT_MODE_RESET_SEQ = (
 def _preserve_ctrl_enter_newline() -> bool:
    """Detect environments where Ctrl+Enter must produce a newline, not submit.

-    Native Windows, WSL, SSH sessions, and Windows Terminal all send Ctrl+Enter
-    as bare LF (c-j). On those terminals c-j must NOT be bound to submit;
+    Windows Terminal, WSL, SSH sessions, Ghostty, and some modern terminals
+    deliver Ctrl+Enter/Ctrl+J as bare LF (c-j). On those terminals c-j must
+    NOT be bound to submit;
    binding it to submit makes Ctrl+Enter (intended as 'newline like Alt+Enter')
    submit instead. Local POSIX TTYs that deliver Enter as LF (docker exec,
    some thin PTYs without SSH) still need c-j bound to submit, so we keep
@ -2490,6 +2502,12 @@ def _preserve_ctrl_enter_newline() -> bool:
        return True
    if os.environ.get("WT_SESSION"):
        return True
+    if os.environ.get("GHOSTTY_RESOURCES_DIR") or os.environ.get("GHOSTTY_BIN_DIR"):
+        return True
+    if os.environ.get("TERM", "").lower() == "xterm-ghostty":
+        return True
+    if os.environ.get("TERM_PROGRAM", "").lower() == "ghostty":
+        return True
    if "microsoft" in os.environ.get("WSL_DISTRO_NAME", "").lower():
        return True
    # WSL detection — env vars can be scrubbed under sudo, also peek /proc.
@ -2510,7 +2528,7 @@ def _bind_prompt_submit_keys(kb, handler) -> None:
    some thin PTYs (docker exec, certain SSH flavors) deliver Enter as LF
    instead of CR — without this, Enter appears dead on those terminals.

-    Exception: on Windows, WSL, SSH sessions, and Windows Terminal,
+    Exception: on Windows, WSL, SSH sessions, Windows Terminal, and Ghostty,
    c-j is the wire encoding of Ctrl+Enter (a distinct keystroke from
    plain Enter / c-m). We leave c-j unbound there so the c-j newline
    handler registered separately can fire — giving the user an
@ -3230,6 +3248,12 @@ class HermesCLI:
        self._slash_confirm_state = None
        self._slash_confirm_deadline = 0
        self._model_picker_state = None
+        # Armed when a bare `/resume` prints the recent-sessions list so the
+        # very next bare numeric input (e.g. `3`) resolves to that session.
+        # Holds the exact list used for index resolution; one-shot (cleared on
+        # the next submitted input, whether it's the selection or anything
+        # else). See #34584.
+        self._pending_resume_sessions = None
        self._secret_state = None
        self._secret_deadline = 0
        self._spinner_text: str = ""  # thinking spinner text for TUI
@ -3748,7 +3772,7 @@ class HermesCLI:
            percent_label = f"{percent}%" if percent is not None else "--"
            duration_label = snapshot["duration"]

-            yolo_active = bool(os.getenv("HERMES_YOLO_MODE"))
+            yolo_active = self._is_session_yolo_active()
            if width < 52:
                text = f"⚕ {snapshot['model_short']} · {duration_label}"
                if yolo_active:
@ -3809,7 +3833,7 @@ class HermesCLI:
            # line and produce duplicated status bar rows over long sessions.
            width = self._get_tui_terminal_width()
            duration_label = snapshot["duration"]
-            yolo_active = bool(os.getenv("HERMES_YOLO_MODE"))
+            yolo_active = self._is_session_yolo_active()

            if width < 52:
                frags = [
@ -6658,10 +6682,21 @@ class HermesCLI:
        if not target:
            _cprint("  Usage: /resume <number|session_id_or_title>")
            if self._show_recent_sessions(reason="resume"):
+                # Arm a one-shot pending-resume selection so the user can type
+                # just the number (`3`) on the next line instead of having to
+                # retype `/resume 3`. The list here must match the one shown by
+                # _show_recent_sessions and used for index resolution below —
+                # all three go through _list_recent_sessions(limit=10). See
+                # #34584.
+                self._pending_resume_sessions = self._list_recent_sessions(limit=10)
                return
            _cprint("  Tip:   Use /history or `hermes sessions list` to find sessions.")
            return

+        # Any explicit /resume <target> supersedes a previously-armed bare
+        # numbered prompt.
+        self._pending_resume_sessions = None
+
        if not self._session_db:
            from hermes_state import format_session_db_unavailable
            _cprint(f"  {format_session_db_unavailable()}")
@ -6775,6 +6810,44 @@ class HermesCLI:
        else:
            _cprint(f"  ↻ Resumed session {target_id}{title_part} — no messages, starting fresh.")

+    def _consume_pending_resume_selection(self, text: str) -> bool:
+        """Resolve a bare numeric reply that follows a bare ``/resume`` prompt.
+
+        After ``/resume`` (no args) prints the recent-sessions list it arms
+        ``self._pending_resume_sessions``. The next submitted input is given
+        one chance to be a bare session number (``3``); if so we resume that
+        session here. Anything else (another command, free text, blank) simply
+        disarms the prompt and is handled normally by the caller.
+
+        Returns True if the input was consumed as a resume selection (caller
+        must not treat it as chat); False otherwise. The pending state is
+        always one-shot: it is cleared on the first submitted input regardless
+        of outcome. See #34584.
+        """
+        pending = self._pending_resume_sessions
+        if not pending:
+            return False
+        # One-shot: disarm now so a non-matching input can't leave the prompt
+        # armed and hijack a later number the user meant as chat.
+        self._pending_resume_sessions = None
+
+        if not isinstance(text, str):
+            return False
+        stripped = text.strip()
+        # Only a pure number selects; let "/resume 3", titles, or any other
+        # text fall through to normal handling.
+        if not stripped.isdigit():
+            return False
+
+        index = int(stripped)
+        if index < 1 or index > len(pending):
+            _cprint(f"  Resume index {index} is out of range.")
+            _cprint("  Use /resume with no arguments to see available sessions.")
+            return True
+
+        self._handle_resume_command(f"/resume {index}")
+        return True
+
    def _handle_sessions_command(self, cmd_original: str) -> None:
        """Handle /sessions [list|<id_or_title>] — browse or resume previous sessions.

@ -6890,6 +6963,7 @@ class HermesCLI:
            pass

        # Switch to the new session
+        self._transfer_session_yolo(self.session_id, new_session_id)
        self.session_id = new_session_id
        self.session_start = now
        self._pending_title = None
@ -7569,8 +7643,19 @@ class HermesCLI:
        parts = cmd_original.split(None, 1)  # split off '/model'
        raw_args = parts[1].strip() if len(parts) > 1 else ""

-        # Parse --provider and --global flags
-        model_input, explicit_provider, persist_global = parse_model_flags(raw_args)
+        # Parse --provider, --global, and --refresh flags
+        model_input, explicit_provider, persist_global, force_refresh = parse_model_flags(raw_args)
+
+        # --refresh: wipe the on-disk picker cache before building the
+        # provider list. Forces a live re-fetch of every authed provider's
+        # /v1/models endpoint on this open.
+        if force_refresh:
+            try:
+                from hermes_cli.models import clear_provider_models_cache
+                clear_provider_models_cache()
+                _cprint("  Cleared model picker cache. Refreshing...")
+            except Exception:
+                pass

        # Single inventory context — replaces the inline config-slice the
        # dashboard / TUI used to duplicate. Overlay live session state
@ -7609,6 +7694,7 @@ class HermesCLI:
                _cprint("")
                _cprint("  /model <name>                        switch model")
                _cprint("  /model --provider <slug>             switch provider")
+                _cprint("  /model --refresh                     re-fetch live model lists")
                return

            self._open_model_picker(
@ -8285,7 +8371,14 @@ class HermesCLI:
        _base_word = cmd_lower.split()[0].lstrip("/")
        _cmd_def = _resolve_cmd(_base_word)
        canonical = _cmd_def.name if _cmd_def else _base_word
-        
+
+        # A bare `/resume` prompt is one-shot: any command other than the
+        # resume/sessions handlers (which manage the pending state themselves)
+        # disarms it so a later number isn't swallowed as a stale selection.
+        # See #34584.
+        if canonical not in {"resume", "sessions"}:
+            self._pending_resume_sessions = None
+
        if canonical in {"quit", "exit"}:
            # Parse --delete flag: /exit --delete also removes the current
            # session's transcripts + SQLite history. Ported from
@ -9590,20 +9683,92 @@ class HermesCLI:
        }
        _cprint(labels.get(self.tool_progress_mode, ""))

-    def _toggle_yolo(self):
-        """Toggle YOLO mode — skip all dangerous command approval prompts."""
-        import os
-        from hermes_cli.colors import Colors as _Colors
+    def _transfer_session_yolo(self, old_session_id: str, new_session_id: str) -> None:
+        """Move YOLO bypass state from an old session key to a new one.

-        current = is_truthy_value(os.environ.get("HERMES_YOLO_MODE"))
-        if current:
-            os.environ.pop("HERMES_YOLO_MODE", None)
+        Called whenever ``self.session_id`` is reassigned mid-run — ``/branch``
+        forks into a new session, and auto-compression rotates the agent's
+        session id into a fresh continuation session. Without this transfer
+        the user's ``/yolo ON`` toggle would silently revert on the very next
+        turn (the same UX failure mode that motivated this entire fix), since
+        ``_session_yolo`` is keyed by session id.
+
+        Mirrors ``tui_gateway/server.py`` (~line 1297-1305) which performs the
+        same transfer for the TUI's session-rename path. No-op when YOLO
+        wasn't enabled or when the ids match.
+        """
+        if not old_session_id or not new_session_id or old_session_id == new_session_id:
+            return
+        try:
+            from tools.approval import (
+                disable_session_yolo,
+                enable_session_yolo,
+                is_session_yolo_enabled,
+            )
+        except Exception:
+            return
+        if is_session_yolo_enabled(old_session_id):
+            enable_session_yolo(new_session_id)
+            disable_session_yolo(old_session_id)
+
+    def _is_session_yolo_active(self) -> bool:
+        """Whether YOLO bypass is currently enabled for this CLI session.
+
+        Reads from ``tools.approval._session_yolo`` (the same set that
+        ``enable_session_yolo`` / ``disable_session_yolo`` write to) so the
+        status bar reflects the actual bypass state instead of a stale env
+        var. Also honors the process-start ``--yolo`` flag, which freezes
+        ``HERMES_YOLO_MODE`` into ``_YOLO_MODE_FROZEN`` before tool imports
+        happen.
+        """
+        try:
+            from tools.approval import (
+                _YOLO_MODE_FROZEN,
+                is_session_yolo_enabled,
+            )
+        except Exception:
+            return False
+        if _YOLO_MODE_FROZEN:
+            return True
+        # Use ``getattr`` so test fixtures that build a CLI via ``__new__``
+        # (skipping ``__init__``) don't trip an AttributeError here; the
+        # status-bar builders swallow exceptions silently but lose every
+        # field after the failure.
+        session_key = getattr(self, "session_id", None) or "default"
+        return is_session_yolo_enabled(session_key)
+
+    def _toggle_yolo(self):
+        """Toggle YOLO mode — skip all dangerous command approval prompts.
+
+        Per-session toggle that mirrors the gateway and TUI ``/yolo`` handlers
+        (see ``gateway/run.py:_handle_yolo_command`` and
+        ``tui_gateway/server.py`` key=="yolo"). We deliberately do NOT mutate
+        ``HERMES_YOLO_MODE`` here — that env var is read once at module import
+        time into ``tools.approval._YOLO_MODE_FROZEN`` to keep prompt-injected
+        skills from flipping the bypass mid-session, so setting it after CLI
+        startup is a silent no-op. Routing through ``enable_session_yolo`` /
+        ``disable_session_yolo`` gives the same auditable, per-session bypass
+        the other surfaces have. ``run_conversation`` binds
+        ``self.session_id`` as the active approval session key via
+        ``set_current_session_key`` so the bypass takes effect on the very
+        next dangerous command in this run.
+        """
+        from hermes_cli.colors import Colors as _Colors
+        from tools.approval import (
+            disable_session_yolo,
+            enable_session_yolo,
+            is_session_yolo_enabled,
+        )
+
+        session_key = self.session_id or "default"
+        if is_session_yolo_enabled(session_key):
+            disable_session_yolo(session_key)
            _cprint(
                f"  ⚠ YOLO mode {_Colors.BOLD}{_Colors.RED}OFF{_Colors.RESET}"
                " — dangerous commands will require approval."
            )
        else:
-            os.environ["HERMES_YOLO_MODE"] = "1"
+            enable_session_yolo(session_key)
            _cprint(
                f"  ⚡ YOLO mode {_Colors.BOLD}{_Colors.GREEN}ON{_Colors.RESET}"
                " — all commands auto-approved. Use with caution."
@ -9765,10 +9930,20 @@ class HermesCLI:
    def _manual_compress(self, cmd_original: str = ""):
        """Manually trigger context compression on the current conversation.

-        Accepts an optional focus topic: ``/compress <focus>`` guides the
-        summariser to preserve information related to *focus* while being
-        more aggressive about discarding everything else.  Inspired by
-        Claude Code's ``/compact <focus>`` feature.
+        Two modes:
+
+        * ``/compress [<focus>]`` — compress the *whole* history. An
+          optional focus topic guides the summariser to preserve
+          information related to *focus* while being more aggressive
+          about discarding everything else.  Inspired by Claude Code's
+          ``/compact <focus>`` feature.
+        * ``/compress here [N]`` — boundary-aware compression. Summarize
+          everything *except* the most recent ``N`` exchanges (default
+          2), which are preserved verbatim. Inspired by Claude Code's
+          Rewind "Summarize up to here" action (v2.1.139, May 2026,
+          https://code.claude.com/docs/en/whats-new/2026-w20). Lets the
+          user pick the compression boundary instead of leaving it to
+          the automatic token-budget heuristic.
        """
        if not self.conversation_history or len(self.conversation_history) < 4:
            print("(._.) Not enough conversation to compress (need at least 4 messages).")
@ -9782,12 +9957,21 @@ class HermesCLI:
            print("(._.) Compression is disabled in config.")
            return

-        # Extract optional focus topic from the command (e.g. "/compress database schema")
-        focus_topic = ""
+        from hermes_cli.partial_compress import (
+            parse_partial_compress_args,
+            rejoin_compressed_head_and_tail,
+            split_history_for_partial_compress,
+        )
+
+        # Args after the command word (e.g. "/compress here 3" -> "here 3").
+        raw_args = ""
        if cmd_original:
-            parts = cmd_original.strip().split(None, 1)
-            if len(parts) > 1:
-                focus_topic = parts[1].strip()
+            _parts = cmd_original.strip().split(None, 1)
+            if len(_parts) > 1:
+                raw_args = _parts[1].strip()
+
+        partial, keep_last, focus_topic = parse_partial_compress_args(raw_args)
+        focus_topic = focus_topic or ""

        original_count = len(self.conversation_history)
        with self._busy_command("Compressing context..."):
@ -9795,6 +9979,22 @@ class HermesCLI:
                from agent.model_metadata import estimate_request_tokens_rough
                from agent.manual_compression_feedback import summarize_manual_compression
                original_history = list(self.conversation_history)
+
+                # Boundary-aware split: only the head is summarized; the
+                # most recent `keep_last` exchanges ride along verbatim.
+                tail: list = []
+                head = original_history
+                if partial:
+                    head, tail = split_history_for_partial_compress(
+                        original_history, keep_last
+                    )
+                    if not tail:
+                        # Split degenerated (everything would be kept, or
+                        # no head left to compress). Fall back to full
+                        # compression so the user still gets an action.
+                        partial = False
+                        head = original_history
+
                # Include system prompt + tool schemas in the estimate —
                # a transcript-only number understates real request pressure
                # and can even appear to grow after compression because a
@ -9806,7 +10006,11 @@ class HermesCLI:
                    system_prompt=_sys_prompt,
                    tools=_tools,
                )
-                if focus_topic:
+                if partial:
+                    print(f"🗜️  Summarizing up to here: compressing {len(head)} of "
+                          f"{original_count} messages (~{approx_tokens:,} tokens), "
+                          f"keeping last {keep_last} exchange(s) verbatim...")
+                elif focus_topic:
                    print(f"🗜️  Compressing {original_count} messages (~{approx_tokens:,} tokens), "
                          f"focus: \"{focus_topic}\"...")
                else:
@ -9819,12 +10023,21 @@ class HermesCLI:
                # which already contain the agent identity — resulting in the
                # identity block appearing twice (issue #15281).
                compressed, _ = self.agent._compress_context(
-                    original_history,
+                    head,
                    None,
                    approx_tokens=approx_tokens,
                    focus_topic=focus_topic or None,
                    force=True,
                )
+                # Re-append the verbatim tail after the compressed head.
+                # The split guarantees `tail` begins on a user turn, so the
+                # compressed-head -> tail boundary is normally valid
+                # (the head's compressed output ends on assistant/tool).
+                # rejoin_compressed_head_and_tail() additionally guards the
+                # seam against any illegal user->user / assistant->assistant
+                # adjacency, defending provider role-alternation rules.
+                if partial and tail:
+                    compressed = rejoin_compressed_head_and_tail(compressed, tail)
                self.conversation_history = compressed
                # _compress_context ends the old session and creates a new child
                # session on the agent (run_agent.py::_compress_context). Sync the
@ -10650,7 +10863,8 @@ class HermesCLI:
        if not reqs.get("stt_available", reqs.get("stt_key_set")):
            raise RuntimeError(
                "Voice mode requires an STT provider for transcription.\n"
-                "Option 1: pip install faster-whisper  (free, local)\n"
+                "Option 1: uv pip install faster-whisper  "
+                "(free, local; `pip install faster-whisper` also works if pip is on PATH)\n"
                "Option 2: Set GROQ_API_KEY (free tier)\n"
                "Option 3: Set VOICE_TOOLS_OPENAI_KEY (paid)"
            )
@ -11739,6 +11953,23 @@ class HermesCLI:
                    set_secret_capture_callback(self._secret_capture_callback)
                except Exception:
                    pass
+                # Bind this turn's approval session key into the contextvar so
+                # ``tools.approval.is_current_session_yolo_enabled()`` resolves
+                # against the same key that ``/yolo`` toggles under (see
+                # ``_toggle_yolo`` → ``enable_session_yolo(self.session_id)``).
+                # Mirrors ``tui_gateway/server.py`` and ``gateway/run.py`` which
+                # bind the same contextvar before invoking the agent.
+                try:
+                    from tools.approval import (
+                        reset_current_session_key,
+                        set_current_session_key,
+                    )
+                    _approval_session_token = set_current_session_key(
+                        self.session_id or "default"
+                    )
+                except Exception:
+                    reset_current_session_key = None  # type: ignore[assignment]
+                    _approval_session_token = None
                agent_message = _voice_prefix + message if _voice_prefix else message
                # Prepend pending model switch note so the model knows about the switch
                _msn = getattr(self, '_pending_model_switch_note', None)
@ -11780,6 +12011,15 @@ class HermesCLI:
                        set_secret_capture_callback(None)
                    except Exception:
                        pass
+                    # Release the per-turn approval session key. ``_session_yolo``
+                    # state itself is preserved across turns (so /yolo persists
+                    # for the whole CLI run); we just unbind the contextvar so a
+                    # reused thread doesn't see stale identity on its next run.
+                    if _approval_session_token is not None and reset_current_session_key is not None:
+                        try:
+                            reset_current_session_key(_approval_session_token)
+                        except Exception:
+                            pass

            # Start agent in background thread (daemon so it cannot keep the
            # process alive when the user closes the terminal tab — SIGHUP
@ -11910,6 +12150,7 @@ class HermesCLI:
                and getattr(self.agent, "session_id", None)
                and self.agent.session_id != self.session_id
            ):
+                self._transfer_session_yolo(self.session_id, self.agent.session_id)
                self.session_id = self.agent.session_id
                self._pending_title = None

@ -12574,7 +12815,21 @@ class HermesCLI:
        
        # Key bindings for the input area
        kb = KeyBindings()
-        
+
+        from prompt_toolkit.keys import Keys as _IgnoreKeys
+
+        @kb.add(_IgnoreKeys.Ignore, eager=True)
+        def handle_ignored_terminal_sequence(event):
+            """Consume parser-level ignored terminal sequences before self-insert.
+
+            install_ignored_terminal_sequences() in hermes_cli.pt_input_extras
+            registers focus reports (CSI I / CSI O) as Keys.Ignore at the
+            VT100 parser level. Without this no-op binding the default
+            self-insert path would still fire and the bytes would land in
+            the buffer.
+            """
+            return None
+
        def handle_enter(event):
            """Handle Enter key - submit input.
            
@ -13817,7 +14072,12 @@ class HermesCLI:
            reserved_below = 6

            available = max(0, term_rows - reserved_below)
-            mandatory_full = chrome_full + len(choice_wrapped) + len(other_wrapped)
+            # The compact decision must reserve room for at least one question
+            # row on top of the choices, otherwise full chrome (3 blank
+            # separators) gets kept when there is no room for it and the panel
+            # overflows the viewport — HSplit then clips the panel's tail,
+            # silently dropping the choices (the reported bug).
+            mandatory_full = chrome_full + 1 + len(choice_wrapped) + len(other_wrapped)

            use_compact_chrome = mandatory_full > available
            chrome_rows = chrome_tight if use_compact_chrome else chrome_full
@ -13825,9 +14085,24 @@ class HermesCLI:
            max_question_rows = max(1, available - chrome_rows - len(choice_wrapped) - len(other_wrapped))
            max_question_rows = min(max_question_rows, 12)  # soft cap on huge terminals

+            # When the choices alone (plus compact chrome) already exceed the
+            # viewport, drop the question entirely — the choices are the only
+            # thing the user must see to make a selection. Without this the
+            # question would still claim its 1-row floor above and push the
+            # tail of the choices off-screen (HSplit clips the overflow).
+            choices_overflow = chrome_rows + len(choice_wrapped) + len(other_wrapped) >= available
+            if choices_overflow:
+                max_question_rows = 0
+
            question_wrapped = _wrap_panel_text(question, inner_text_width)
-            if len(question_wrapped) > max_question_rows:
-                keep = max(1, max_question_rows - 1)
+            if max_question_rows <= 0:
+                question_wrapped = []
+            elif len(question_wrapped) > max_question_rows:
+                # The truncation marker is itself a row, so it must count
+                # against the budget. With a 1-row budget there is no room for
+                # both a question line and the marker — show the marker alone
+                # so the rendered question never exceeds max_question_rows.
+                keep = max(0, max_question_rows - 1)
                question_wrapped = question_wrapped[:keep] + ["… (question truncated)"]

            lines = []
@ -14361,6 +14636,17 @@ class HermesCLI:
                                + (f"\n{_remainder}" if _remainder else "")
                            )

+                    # A bare number right after a bare `/resume` prompt selects
+                    # that session (see #34584). Checked before chat routing so
+                    # the digit isn't sent to the agent as a message.
+                    if (
+                        not _file_drop
+                        and self._pending_resume_sessions
+                        and isinstance(user_input, str)
+                        and self._consume_pending_resume_selection(user_input)
+                    ):
+                        continue
+
                    if not _file_drop and isinstance(user_input, str) and _looks_like_slash_command(user_input):
                        _cprint(f"\n⚙️  {user_input}")
                        try:
@ -14950,6 +15236,39 @@ def main(
                    time.sleep(_grace)
        except Exception:
            pass  # never block signal handling
+        # Kanban worker exit path (#28181): SIGTERM hits a dispatcher-spawned
+        # worker that's likely in a non-daemon thread waiting on a child
+        # subprocess in _wait_for_process. Raising KeyboardInterrupt only
+        # unwinds the main thread; the worker thread keeps running, the
+        # process gets reparented to init, and the dispatcher's _pid_alive
+        # check returns True forever — task stuck in 'running' indefinitely.
+        # Skip the controlled-unwind dance and call os._exit(0) so the kernel
+        # reclaims the PID immediately and detect_crashed_workers can reclaim
+        # the stale claim on the next tick. Flush logging + stdout/stderr
+        # first so the final debug trace isn't lost; SIGALRM deadman guards
+        # the flush against any rare blocking-I/O case (the reporter measured
+        # flush in <1ms; the alarm is a failsafe, not the common path).
+        if os.environ.get("HERMES_KANBAN_TASK"):
+            try:
+                import signal as _sig_mod
+                if hasattr(_sig_mod, "SIGALRM"):
+                    # Cancel any pre-existing alarm to avoid colliding with
+                    # caller-installed timers.
+                    _sig_mod.signal(_sig_mod.SIGALRM, lambda *_: os._exit(0))
+                    _sig_mod.alarm(2)
+            except Exception:
+                pass
+            try:
+                import logging as _lg
+                _lg.shutdown()
+            except Exception:
+                pass
+            for _stream in (sys.stdout, sys.stderr):
+                try:
+                    _stream.flush()
+                except Exception:
+                    pass
+            os._exit(0)
        raise KeyboardInterrupt()
    try:
        import signal as _signal
@ -14962,13 +15281,50 @@ def main(
    # Handle single query mode
    if query or image:
        query, single_query_images = _collect_query_images(query, image)
+        # Kanban workers spawn with ``hermes chat -q "work kanban task <id>"``;
+        # the actual task description lives in the task body. Mirror the
+        # gateway/CLI behaviour for inbound images by scanning the body for
+        # local image paths and http(s) image URLs and attaching them to the
+        # worker's first turn. Without this, users who paste a screenshot
+        # path or URL into a kanban task body never get it routed to the
+        # model's vision input.
+        single_query_image_urls: list[str] = []
+        _kanban_task_id = os.environ.get("HERMES_KANBAN_TASK", "").strip()
+        if _kanban_task_id:
+            try:
+                from hermes_cli import kanban_db as _kb
+                from agent.image_routing import extract_image_refs as _extract_refs
+
+                _conn = _kb.connect()
+                try:
+                    _task = _kb.get_task(_conn, _kanban_task_id)
+                finally:
+                    try:
+                        _conn.close()
+                    except Exception:
+                        pass
+                _body = getattr(_task, "body", "") if _task is not None else ""
+                if _body:
+                    _kb_paths, _kb_urls = _extract_refs(_body)
+                    if _kb_paths:
+                        # Dedupe against any --image the user already passed.
+                        _seen = {str(p) for p in single_query_images}
+                        for _p in _kb_paths:
+                            if _p not in _seen:
+                                _seen.add(_p)
+                                single_query_images.append(Path(_p))
+                    if _kb_urls:
+                        single_query_image_urls.extend(_kb_urls)
+            except Exception as _exc:
+                # Best-effort enrichment; never block worker startup on it.
+                logger.debug("kanban image-ref extraction failed: %s", _exc)
        if quiet:
            # Quiet mode: suppress banner, spinner, tool previews.
            # Only print the final response and parseable session info.
            cli.tool_progress_mode = "off"
            if cli._ensure_runtime_credentials():
                effective_query: Any = query
-                if single_query_images:
+                if single_query_images or single_query_image_urls:
                    # Honour the same image-routing decision used by the
                    # interactive path. With a vision-capable model (incl.
                    # custom-provider models declared via
@ -14997,19 +15353,26 @@ def main(
                            _parts, _skipped = _build_parts(
                                query if isinstance(query, str) else "",
                                [str(p) for p in single_query_images],
+                                image_urls=list(single_query_image_urls) or None,
                            )
                            if any(p.get("type") == "image_url" for p in _parts):
                                effective_query = _parts
                            else:
                                # All images unreadable — text fallback.
+                                # ``_preprocess_images_with_vision`` only knows
+                                # about local files; URLs would be lost there,
+                                # so keep the original query text intact when
+                                # only URLs were supplied.
+                                if single_query_images:
+                                    effective_query = cli._preprocess_images_with_vision(
+                                        query, single_query_images, announce=False,
+                                    )
+                        except Exception:
+                            if single_query_images:
                                effective_query = cli._preprocess_images_with_vision(
                                    query, single_query_images, announce=False,
                                )
-                        except Exception:
-                            effective_query = cli._preprocess_images_with_vision(
-                                query, single_query_images, announce=False,
-                            )
-                    else:
+                    elif single_query_images:
                        effective_query = cli._preprocess_images_with_vision(
                            query,
                            single_query_images,
--- a/docker/hermes-exec-shim.sh
+++ b/docker/hermes-exec-shim.sh
@ -0,0 +1,87 @@
+#!/bin/sh
+# shellcheck shell=sh
+# /opt/hermes/bin/hermes — `docker exec` privilege-drop shim.
+#
+# Background
+# ----------
+# The s6 image runs the supervised gateway/main process as the unprivileged
+# `hermes` user (UID 10000). When an operator runs `docker exec <c> hermes ...`
+# the default UID is root (0), and any file the command writes under
+# $HERMES_HOME — auth.json, .env, config.yaml — ends up root-owned and
+# unreadable to the supervised gateway. The most common manifestation: the
+# user runs `docker exec <c> hermes login`, this writes
+# /opt/data/auth.json as root:root mode 0600, and from then on the gateway
+# returns "Provider authentication failed: Hermes is not logged into Nous
+# Portal" on every incoming message — even though `docker exec <c> hermes
+# chat -q ping` (also running as root) succeeds because root happens to be
+# able to read its own root-owned file. See systematic-debugging skill
+# notes attached to this fix.
+#
+# Fix
+# ---
+# This shim sits at /opt/hermes/bin/hermes and is placed earliest on PATH.
+# When invoked as root, it drops to the hermes user (via s6-setuidgid)
+# before exec'ing the real venv binary, so anything that writes under
+# $HERMES_HOME is uid-aligned with the supervised processes. When invoked
+# as any non-root UID — including the supervised processes themselves,
+# `docker exec --user hermes`, kanban subagents, etc. — it short-circuits
+# straight to the venv binary with no privilege change. Net: one extra
+# fork on the docker-exec-as-root path, zero behavioral change on every
+# other path.
+#
+# Recursion safety: the shim exec's the venv binary by *absolute path*
+# (/opt/hermes/.venv/bin/hermes), so the second hop cannot re-enter this
+# shim regardless of PATH state. No sentinel env var needed.
+#
+# Opt-out: set HERMES_DOCKER_EXEC_AS_ROOT=1 (1/true/yes, case-insensitive)
+# to keep running as root. Reserved for diagnostic sessions where the
+# operator deliberately wants root semantics — e.g. inspecting root-only
+# state via the hermes CLI. Default is to drop.
+
+set -e
+
+REAL=/opt/hermes/.venv/bin/hermes
+
+# Defensive: if the venv binary is missing (corrupted image, partial
+# install), fail loudly rather than silently masking it.
+if [ ! -x "$REAL" ]; then
+    echo "hermes-shim: $REAL not found or not executable" >&2
+    exit 127
+fi
+
+# Already non-root? Just exec the real binary. This is the hot path for
+# supervised processes (uid 10000) and for `docker exec --user hermes`.
+if [ "$(id -u)" != "0" ]; then
+    exec "$REAL" "$@"
+fi
+
+# Root, with opt-out set? Honor it.
+case "${HERMES_DOCKER_EXEC_AS_ROOT:-}" in
+    1|true|TRUE|True|yes|YES|Yes)
+        exec "$REAL" "$@"
+        ;;
+esac
+
+# Root, no opt-out. Drop to the hermes user.
+#
+# s6-setuidgid lives under /command/ which is NOT on `docker exec`'s PATH
+# (s6-overlay only puts /command/ on PATH for supervision-tree children).
+# Reference it by absolute path so the drop is robust against PATH
+# manipulation.
+S6_SUID=/command/s6-setuidgid
+if [ ! -x "$S6_SUID" ]; then
+    # Non-s6 image (someone stripped s6-overlay, or a hand-built variant).
+    # Fail loud rather than silently re-execing as root and leaking the
+    # bug this shim exists to prevent.
+    echo "hermes-shim: $S6_SUID not found; refusing to silently run as root." >&2
+    echo "hermes-shim: re-run with --user hermes or set HERMES_DOCKER_EXEC_AS_ROOT=1." >&2
+    exit 126
+fi
+
+# Reset HOME to the hermes user's home before dropping privileges. Without
+# this, $HOME stays /root and any library that resolves paths off $HOME
+# (XDG caches, lockfiles, .config writes) will try to write to /root and
+# fail with EACCES. Mirrors main-wrapper.sh.
+export HOME=/opt/data
+
+exec "$S6_SUID" hermes "$REAL" "$@"
--- a/docker/s6-rc.d/dashboard/run
+++ b/docker/s6-rc.d/dashboard/run
@ -19,6 +19,10 @@ case "${HERMES_DASHBOARD:-}" in
        ;;
 esac

+# with-contenv repopulates HOME from /init as /root. Reset it before
+# dropping privileges so HOME-anchored state lands under /opt/data.
+export HOME=/opt/data
+
 cd /opt/data
 # shellcheck disable=SC1091
 . /opt/hermes/.venv/bin/activate
@ -26,13 +30,21 @@ cd /opt/data
 dash_host="${HERMES_DASHBOARD_HOST:-0.0.0.0}"
 dash_port="${HERMES_DASHBOARD_PORT:-9119}"

-# Binding to anything other than localhost requires --insecure — the
-# dashboard refuses otherwise because it exposes API keys. Inside a
-# container this is the expected deployment.
+# `--insecure` is opt-in via HERMES_DASHBOARD_INSECURE. The dashboard's
+# OAuth auth gate engages automatically on non-loopback binds when a
+# DashboardAuthProvider is registered (e.g. the bundled dashboard_auth/nous
+# provider, which auto-registers when HERMES_DASHBOARD_OAUTH_CLIENT_ID is
+# set). If no provider is registered, start_server fails closed with a
+# specific operator-facing error.
+#
+# This used to derive --insecure from the bind host ("anything non-loopback
+# implies insecure"), but that predates the OAuth gate and silently
+# disabled it on every container-deployed dashboard. The gate is now the
+# authority; operators on trusted LANs / behind a reverse proxy without
+# the OAuth contract opt in explicitly.
 insecure=""
-case "$dash_host" in
-    127.0.0.1|localhost) ;;
-    *) insecure="--insecure" ;;
+case "${HERMES_DASHBOARD_INSECURE:-}" in
+    1|true|TRUE|True|yes|YES|Yes) insecure="--insecure" ;;
 esac

 # shellcheck disable=SC2086  # word-splitting of $insecure is intentional
--- a/docker/stage2-hook.sh
+++ b/docker/stage2-hook.sh
@ -33,6 +33,15 @@ INSTALL_DIR="/opt/hermes"
 mkdir -p "$HERMES_HOME"

 # --- UID/GID remap ---
+# Accept PUID/PGID as aliases for HERMES_UID/HERMES_GID.  NAS users (UGOS,
+# Synology, unRAID) expect the LinuxServer.io PUID/PGID convention and
+# bind-mount /opt/data from a host directory owned by their own UID; without
+# this alias those vars are silently ignored and the s6-setuidgid drop to
+# UID 10000 leaves the runtime unable to read the volume.  HERMES_UID/
+# HERMES_GID still win when both are set.  See #15290, salvages #25872.
+HERMES_UID="${HERMES_UID:-${PUID:-}}"
+HERMES_GID="${HERMES_GID:-${PGID:-}}"
+
 if [ -n "${HERMES_UID:-}" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then
    echo "[stage2] Changing hermes UID to $HERMES_UID"
    usermod -u "$HERMES_UID" hermes
@ -44,6 +53,62 @@ if [ -n "${HERMES_GID:-}" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then
    groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
 fi

+# --- Docker socket group membership (docker-in-docker / DooD) ---
+# When the user bind-mounts the host Docker daemon socket
+# (`-v /var/run/docker.sock:/var/run/docker.sock`) to use the `docker`
+# terminal backend from inside the container, the socket is owned by the
+# host's `docker` group (or root). The supervised hermes user (UID 10000)
+# is not a member of any group that matches the socket's GID, so every
+# `docker` invocation EACCES'es and `check_terminal_requirements()` fails.
+# See #16703.
+#
+# Granting the supp group via `docker run --group-add <gid>` alone is
+# NOT sufficient with our s6-setuidgid privilege drop: s6-setuidgid (and
+# gosu, the older shim) calls initgroups() for the target user, which
+# rebuilds the supplementary group list from /etc/group. Without an
+# /etc/group entry whose GID matches the socket, the kernel-granted
+# supp group is silently wiped between PID 1 and the dropped process.
+# Confirmed empirically: `--group-add 998` alone leaves the dropped
+# hermes process with `Groups: 10000` (998 gone); after this hook adds
+# the entry, the dropped process has `Groups: 998 10000` as expected.
+#
+# Fix: detect the socket's GID at boot and ensure /etc/group has a
+# matching entry that includes hermes. Idempotent across container
+# restarts. Skipped silently when no socket is bind-mounted.
+#
+# Handles the awkward corner cases:
+#   - socket owned by GID 0 (root) — some Podman setups; usermod -aG root
+#   - socket GID already used by a known container group (e.g. tty=5):
+#     reuse that group's name rather than creating a duplicate
+#   - hermes is already a member of the right group (idempotent restart)
+#   - chown/groupadd failures under rootless containers — non-fatal
+for sock in /var/run/docker.sock /run/docker.sock; do
+    [ -S "$sock" ] || continue
+    sock_gid=$(stat -c '%g' "$sock" 2>/dev/null) || continue
+    [ -n "$sock_gid" ] || continue
+    # Already a member? Nothing to do.
+    if id -G hermes 2>/dev/null | tr ' ' '\n' | grep -qx "$sock_gid"; then
+        echo "[stage2] hermes already in group $sock_gid for $sock"
+        break
+    fi
+    # Resolve or create a group name for this GID.
+    sock_group=$(getent group "$sock_gid" 2>/dev/null | cut -d: -f1)
+    if [ -z "$sock_group" ]; then
+        sock_group="hostdocker"
+        if ! groupadd -g "$sock_gid" "$sock_group" 2>/dev/null; then
+            echo "[stage2] Warning: groupadd -g $sock_gid $sock_group failed; skipping docker socket group setup"
+            break
+        fi
+        echo "[stage2] Created group $sock_group (GID $sock_gid) for Docker socket"
+    fi
+    if usermod -aG "$sock_group" hermes 2>/dev/null; then
+        echo "[stage2] Added hermes to group $sock_group (GID $sock_gid) for $sock"
+    else
+        echo "[stage2] Warning: usermod -aG $sock_group hermes failed; docker backend may fail with EACCES"
+    fi
+    break
+done
+
 # --- Fix ownership of data volume ---
 # When HERMES_UID is remapped or the top-level $HERMES_HOME isn't owned by
 # the runtime hermes UID, restore ownership to hermes — but ONLY for the
--- a/docs/security/network-egress-isolation.md
+++ b/docs/security/network-egress-isolation.md
@ -0,0 +1,195 @@
+# Network Egress Isolation for Docker Deployments
+
+When running Hermes inside Docker, the default `network_mode: host` gives the
+agent process unrestricted outbound network access. This guide shows how to
+segment traffic so the agent core can only reach the services it needs, while
+blocking arbitrary outbound connections.
+
+This is primarily a defense against prompt injection attacks that attempt to
+exfiltrate data via `curl`, `wget`, or raw HTTP from tool-generated shell
+commands.
+
+## Threat Model
+
+The Hermes [SECURITY.md](../../SECURITY.md) §2 defines the trust model. The
+terminal backend is the primary execution boundary. However, when running with
+`network_mode: host`, any command the agent executes can reach any endpoint on
+the network, including external ones.
+
+Network egress isolation adds a second layer: even if a malicious command
+executes inside the container, it cannot reach endpoints outside the
+explicitly allowlisted set.
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────┐
+│  Docker Network: internal (no internet)     │
+│                                             │
+│   ┌──────────────┐   ┌──────────────────┐   │
+│   │ hermes-agent │   │ hermes-dashboard │   │
+│   └──────┬───────┘   └────────┬─────────┘   │
+│          │                    │              │
+│          ▼                    │              │
+│   ┌──────────────┐            │              │
+│   │ hermes-gtw   │◄───────────┘              │
+│   └──────┬───────┘                           │
+│          │                                   │
+└──────────┼───────────────────────────────────┘
+           │
+┌──────────┼───────────────────────────────────┐
+│  Docker Network: egress (internet-capable)   │
+│          │                                   │
+│          ▼                                   │
+│   ┌─────────────────┐                        │
+│   │ egress-proxy     │──► allowlisted hosts  │
+│   │ (squid / envoy)  │                       │
+│   └─────────────────┘                        │
+└──────────────────────────────────────────────┘
+```
+
+Two Docker networks:
+
+- **`internal`** — no default route, no internet access. The agent, dashboard,
+  and gateway run here.
+- **`egress`** — has internet access. Only services that need to reach external
+  APIs are attached to this network.
+
+The gateway service is dual-homed (attached to both networks) so it can
+receive inbound messages from Telegram/Slack/etc. and forward them to the
+agent on the internal network.
+
+## Compose Configuration
+
+Override the default `docker-compose.yml` with a
+`docker-compose.override.yml`:
+
+```yaml
+# docker-compose.override.yml
+# Network egress isolation for production deployments.
+#
+# Usage:
+#   HERMES_UID=$(id -u) HERMES_GID=$(id -g) docker compose up -d
+#
+# This overrides network_mode: host with isolated Docker networks.
+
+networks:
+  internal:
+    driver: bridge
+    internal: true          # no default route, no internet
+  egress:
+    driver: bridge
+
+services:
+  gateway:
+    network_mode: ""        # clear the host-mode default
+    networks:
+      - internal
+      - egress              # needs outbound for Telegram, LLM APIs
+    ports:
+      - "127.0.0.1:9119:9119"   # dashboard proxy, localhost only
+
+  dashboard:
+    network_mode: ""
+    networks:
+      - internal            # internal only, no egress needed
+```
+
+### With an Egress Proxy (Recommended)
+
+For tighter control, route all outbound traffic through an HTTP proxy with
+an explicit allowlist:
+
+```yaml
+# docker-compose.override.yml (with egress proxy)
+
+networks:
+  internal:
+    driver: bridge
+    internal: true
+  egress:
+    driver: bridge
+
+services:
+  gateway:
+    network_mode: ""
+    networks:
+      - internal
+      - egress
+    environment:
+      - HTTP_PROXY=http://egress-proxy:3128
+      - HTTPS_PROXY=http://egress-proxy:3128
+      - NO_PROXY=hermes,hermes-dashboard,localhost
+
+  dashboard:
+    network_mode: ""
+    networks:
+      - internal
+
+  egress-proxy:
+    image: ubuntu/squid:6.10-24.04_edge
+    networks:
+      - egress
+    volumes:
+      - ./config/squid-allowlist.conf:/etc/squid/conf.d/allowlist.conf:ro
+    restart: unless-stopped
+```
+
+Example `config/squid-allowlist.conf`:
+
+```
+# Only allow HTTPS CONNECT to these hosts
+acl allowed_hosts dstdomain api.openai.com
+acl allowed_hosts dstdomain api.anthropic.com
+acl allowed_hosts dstdomain openrouter.ai
+acl allowed_hosts dstdomain generativelanguage.googleapis.com
+acl allowed_hosts dstdomain api.telegram.org
+acl allowed_hosts dstdomain api.github.com
+acl allowed_hosts dstdomain discord.com
+
+http_access allow CONNECT allowed_hosts
+http_access deny all
+```
+
+Adjust the allowlist to match your LLM provider and messaging platform.
+
+## Validating the Setup
+
+After bringing up the stack, verify isolation:
+
+```bash
+# From the agent container: this should FAIL (no egress)
+docker compose exec gateway \
+  curl -sf --max-time 5 https://example.com && echo "FAIL: egress not blocked" || echo "OK: egress blocked"
+
+# From the agent container: this should SUCCEED (internal network)
+docker compose exec gateway \
+  curl -sf --max-time 5 http://hermes-dashboard:9119/health && echo "OK: internal reachable" || echo "FAIL"
+
+# If using egress proxy: this should SUCCEED (allowlisted)
+docker compose exec gateway \
+  curl -sf --max-time 5 --proxy http://egress-proxy:3128 https://api.openai.com/v1/models && echo "OK" || echo "FAIL"
+```
+
+## Limitations
+
+- **DNS resolution:** The `internal` network can still resolve external DNS
+  names unless you also run a local DNS resolver that blocks external queries.
+  For most threat models this is acceptable since DNS resolution alone does not
+  exfiltrate meaningful data.
+
+- **Not a substitute for sandbox backends:** This guide isolates the agent
+  *container's* network. If you use the default local terminal backend, tool
+  commands execute inside the same container. For stronger isolation, combine
+  network segmentation with a sandboxed terminal backend (Docker, Modal,
+  Daytona).
+
+- **Platform adapters need egress:** The gateway service needs outbound access
+  to reach messaging platform APIs. If you add new platform adapters, add their
+  API endpoints to the proxy allowlist.
+
+## Related
+
+- [SECURITY.md](../../SECURITY.md) — Hermes trust model and vulnerability reporting
+- [Terminal backends](../../README.md) — sandboxed execution targets
+- [docker-compose.yml](../../docker-compose.yml) — default compose configuration
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@ -24,7 +24,8 @@ Exposes an HTTP server with endpoints:

 Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat,
 AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent
-through this adapter by pointing at http://localhost:8642/v1.
+through this adapter by pointing at http://localhost:8642/v1 and
+authenticating with API_SERVER_KEY.

 Requires:
 - aiohttp (already available in the gateway)
@ -844,11 +845,11 @@ class APIServerAdapter(BasePlatformAdapter):
        Validate Bearer token from Authorization header.

        Returns None if auth is OK, or a 401 web.Response on failure.
-        If no API key is configured, all requests are allowed (only when API
-        server is local).
+        connect() refuses to start the API server without API_SERVER_KEY, so
+        the no-key branch only exists for tests or unsupported manual wiring.
        """
        if not self._api_key:
-            return None  # No key configured — allow all (local-only use)
+            return None

        auth_header = request.headers.get("Authorization", "")
        if auth_header.startswith("Bearer "):
@ -1604,6 +1605,7 @@ class APIServerAdapter(BasePlatformAdapter):
                )
                final_response = result.get("final_response", "") if isinstance(result, dict) else ""
                effective_session_id = result.get("session_id", session_id) if isinstance(result, dict) else session_id
+                turn_messages = self._turn_transcript_messages(history, user_message, result) if isinstance(result, dict) else []
                await queue.put(_event_payload("assistant.completed", {
                    "session_id": effective_session_id,
                    "message_id": message_id,
@ -1616,6 +1618,7 @@ class APIServerAdapter(BasePlatformAdapter):
                    "session_id": effective_session_id,
                    "message_id": message_id,
                    "completed": True,
+                    "messages": turn_messages,
                    "usage": usage,
                }))
            except Exception as exc:
@ -3328,6 +3331,44 @@ class APIServerAdapter(BasePlatformAdapter):
            return len(prior)
        return 0

+    @classmethod
+    def _turn_transcript_messages(
+        cls,
+        conversation_history: List[Dict[str, Any]],
+        user_message: Any,
+        result: Dict[str, Any],
+    ) -> List[Dict[str, Any]]:
+        """Return this turn's assistant/tool messages in client-safe shape.
+
+        The streaming SSE contract delivers all assistant text as
+        ``assistant.delta`` events under one ``message_id`` interleaved with
+        ``tool.*`` events, and a single ``assistant.completed`` carrying only
+        the final reply.  A client that accumulates deltas into one buffer
+        cannot reconstruct *intermediate* assistant text segments that preceded
+        tool calls — so when the page is re-opened mid/post-stream those
+        segments appear lost, even though state.db persisted them correctly.
+
+        Emitting the authoritative per-turn transcript on ``run.completed`` lets
+        any SSE consumer reconcile its live view against ground truth without a
+        separate ``GET /messages`` round-trip.  Purely additive: clients that
+        ignore the field are unaffected.  Refs #34703.
+        """
+        agent_messages = result.get("messages") if isinstance(result, dict) else None
+        if not isinstance(agent_messages, list) or not agent_messages:
+            return []
+        start = cls._response_messages_turn_start_index(
+            conversation_history, user_message, result
+        )
+        turn = agent_messages[start:]
+        out: List[Dict[str, Any]] = []
+        for msg in turn:
+            if not isinstance(msg, dict):
+                continue
+            if msg.get("role") not in {"assistant", "tool"}:
+                continue
+            out.append(cls._message_response(msg))
+        return out
+
    @staticmethod
    def _extract_output_items(result: Dict[str, Any], start_index: int = 0) -> List[Dict[str, Any]]:
        """
@ -4099,11 +4140,13 @@ class APIServerAdapter(BasePlatformAdapter):
            if hasattr(sweep_task, "add_done_callback"):
                sweep_task.add_done_callback(self._background_tasks.discard)

-            # Refuse to start network-accessible without authentication
-            if is_network_accessible(self._host) and not self._api_key:
+            # Refuse to start without authentication. The API server can
+            # dispatch terminal-capable agent work, so every deployment needs
+            # an explicit API_SERVER_KEY regardless of bind address.
+            if not self._api_key:
                logger.error(
-                    "[%s] Refusing to start: binding to %s requires API_SERVER_KEY. "
-                    "Set API_SERVER_KEY or use the default 127.0.0.1.",
+                    "[%s] Refusing to start: API_SERVER_KEY is required for the API server, "
+                    "including loopback-only binds on %s.",
                    self.name, self._host,
                )
                return False
@ -4141,14 +4184,6 @@ class APIServerAdapter(BasePlatformAdapter):
            await self._site.start()

            self._mark_connected()
-            if not self._api_key:
-                logger.warning(
-                    "[%s] ⚠️  No API key configured (API_SERVER_KEY / platforms.api_server.key). "
-                    "All requests will be accepted without authentication. "
-                    "Set an API key for production deployments to prevent "
-                    "unauthorized access to sessions, responses, and cron jobs.",
-                    self.name,
-                )
            logger.info(
                "[%s] API server listening on http://%s:%d (model: %s)",
                self.name, self._host, self._port, self._model_name,
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -472,6 +472,7 @@ def is_host_excluded_by_no_proxy(hostname: str, no_proxy_value: str | None = Non
    return False


+import dataclasses
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
@ -829,6 +830,13 @@ _HERMES_HOME = get_hermes_home()
 MEDIA_DELIVERY_ALLOW_DIRS_ENV = "HERMES_MEDIA_ALLOW_DIRS"
 MEDIA_DELIVERY_TRUST_RECENT_ENV = "HERMES_MEDIA_TRUST_RECENT_FILES"
 MEDIA_DELIVERY_TRUST_RECENT_SECONDS_ENV = "HERMES_MEDIA_TRUST_RECENT_SECONDS"
+# Strict mode toggles the original allowlist+recency path-validation behavior.
+# Off by default — symmetric with inbound (we accept any document type the
+# user uploads), and with the denylist still blocking obvious credential /
+# system paths. Operators running public-facing gateways where prompt
+# injection from one user could exfiltrate the host's secrets to that same
+# user should set this to true.
+MEDIA_DELIVERY_STRICT_ENV = "HERMES_MEDIA_DELIVERY_STRICT"
 MEDIA_DELIVERY_SAFE_ROOTS = (
    IMAGE_CACHE_DIR,
    AUDIO_CACHE_DIR,
@ -840,6 +848,13 @@ MEDIA_DELIVERY_SAFE_ROOTS = (
    _HERMES_HOME / "video_cache",
    _HERMES_HOME / "document_cache",
    _HERMES_HOME / "browser_screenshots",
+    # Canonical cache layout — listed alongside the legacy *_cache dirs so
+    # generated artifacts deliver on installs that have both (#31733).
+    _HERMES_HOME / "cache" / "images",
+    _HERMES_HOME / "cache" / "audio",
+    _HERMES_HOME / "cache" / "videos",
+    _HERMES_HOME / "cache" / "documents",
+    _HERMES_HOME / "cache" / "screenshots",
 )

 # Default recency window for trusting freshly-produced files (seconds).
@ -918,6 +933,21 @@ def _media_delivery_recency_seconds() -> float:
    return float(_MEDIA_DELIVERY_TRUST_RECENT_DEFAULT_SECONDS)


+def _media_delivery_strict_mode() -> bool:
+    """Return True when path validation should require allowlist/recency match.
+
+    Off by default. In non-strict mode, ``validate_media_delivery_path``
+    accepts any existing regular file that isn't under the credential /
+    system-path denylist — restoring the pre-#29523 behavior for the
+    single-user case. Strict mode preserves the original
+    allowlist+recency-window logic for operators running public-facing
+    gateways where prompt injection from one user shouldn't be able to
+    exfiltrate the host's secrets to that same user.
+    """
+    raw = os.environ.get(MEDIA_DELIVERY_STRICT_ENV, "0").strip().lower()
+    return raw in ("1", "true", "yes", "on")
+
+
 def _media_delivery_denied_paths() -> List[Path]:
    """Return absolute denylist paths under which delivery is never allowed."""
    denied = [Path(p) for p in _MEDIA_DELIVERY_DENIED_PREFIXES]
@ -972,10 +1002,22 @@ def _path_is_within(path: Path, root: Path) -> bool:
 def validate_media_delivery_path(path: str) -> Optional[str]:
    """Return a safe absolute file path for native media delivery, else None.

-    MEDIA tags and bare local paths in model output are untrusted text. Only
-    existing regular files under Hermes-managed media caches, or roots the
-    operator explicitly allowlists, may be uploaded as native attachments.
-    Symlinks are resolved before the containment check.
+    Default mode (single-user / private gateway): accept any existing regular
+    file that isn't under the credential / system-path denylist
+    (``_MEDIA_DELIVERY_DENIED_PREFIXES`` + ``~/.ssh``, ``~/.aws``, etc.).
+    This matches the symmetry of inbound delivery — Telegram/Discord/Slack
+    will hand the agent any file the user uploads, and the agent can hand
+    back any file that isn't a credential.
+
+    Strict mode (opt-in via ``gateway.strict`` in ``config.yaml`` or
+    ``HERMES_MEDIA_DELIVERY_STRICT=1``): the file MUST live under a
+    Hermes-managed cache, under an operator-allowlisted root
+    (``HERMES_MEDIA_ALLOW_DIRS``), or be freshly produced inside the
+    configured recency window. Suitable for public-facing bots where
+    prompt injection from one user shouldn't be able to exfiltrate the
+    host's secrets to that same user.
+
+    Symlinks are resolved before any containment / denylist check.
    """
    if not path:
        return None
@ -987,7 +1029,11 @@ def validate_media_delivery_path(path: str) -> Optional[str]:
    if not candidate:
        return None

-    expanded = Path(os.path.expanduser(candidate))
+    try:
+        expanded = Path(os.path.expanduser(candidate))
+    except (OSError, RuntimeError, ValueError):
+        # expanduser raises ValueError("embedded null byte") for a ~\x00 path.
+        return None
    if not expanded.is_absolute():
        return None

@ -999,6 +1045,8 @@ def validate_media_delivery_path(path: str) -> Optional[str]:
    if not resolved.is_file():
        return None

+    # Cache / operator allowlist is always honored — these are unconditionally
+    # trusted regardless of mode.
    for root in _media_delivery_allowed_roots():
        try:
            resolved_root = root.expanduser().resolve(strict=False)
@ -1007,9 +1055,18 @@ def validate_media_delivery_path(path: str) -> Optional[str]:
        if _path_is_within(resolved, resolved_root):
            return str(resolved)

-    # Outside the cache/operator allowlist: fall back to recency-based trust
-    # for files the agent has just produced (e.g. ``pandoc -o /tmp/report.pdf``
-    # or ``write_file("/home/user/report.pdf", ...)``). System paths and
+    # Non-strict mode (default): accept anything not on the denylist.
+    # The denylist still blocks /etc, /proc, ~/.ssh, ~/.aws, ~/.hermes/.env,
+    # ~/.hermes/auth.json, etc. — so the obvious prompt-injection sites
+    # (``MEDIA:/etc/passwd``, ``MEDIA:~/.ssh/id_rsa``) remain rejected.
+    if not _media_delivery_strict_mode():
+        if _path_under_denied_prefix(resolved):
+            return None
+        return str(resolved)
+
+    # Strict mode: fall back to recency-based trust for freshly-produced
+    # files (e.g. ``pandoc -o /tmp/report.pdf`` or
+    # ``write_file("/home/user/report.pdf", ...)``). System paths and
    # credential locations remain blocked even when "recent" — see
    # ``_MEDIA_DELIVERY_DENIED_PREFIXES`` for the denylist.
    window = _media_delivery_recency_seconds()
@ -1020,6 +1077,17 @@ def validate_media_delivery_path(path: str) -> Optional[str]:
    return None


+# Neutralise control chars and the Unicode line separators (NEL, LS, PS) that
+# str.splitlines() / log aggregators treat as breaks, so a model-emitted path
+# can't forge a second log line. Truncated to keep records bounded.
+_LOG_UNSAFE_CHARS = re.compile(r"[\x00-\x1f\x7f\x85\u2028\u2029]")
+
+
+def _log_safe_path(path: str) -> str:
+    """Return a single-line, length-bounded path for log output."""
+    return _LOG_UNSAFE_CHARS.sub("?", str(path))[:200]
+
+
 SUPPORTED_DOCUMENT_TYPES = {
    ".pdf": "application/pdf",
    ".md": "text/markdown",
@ -1063,6 +1131,75 @@ SUPPORTED_IMAGE_DOCUMENT_TYPES = {
 }


+# ---------------------------------------------------------------------------
+# Media-delivery extension allowlist — SINGLE SOURCE OF TRUTH
+#
+# Both extractors that turn response text into native attachments derive their
+# extension set from this tuple:
+#   * ``extract_media()``       — explicit ``MEDIA:<path>`` tags
+#   * ``extract_local_files()`` — bare absolute/home paths the agent mentions
+#
+# Historically these two carried independently-maintained extension lists.
+# ``extract_media`` had a narrow list (no .md/.json/.yaml/.xml/.html/...) while
+# ``extract_local_files`` had a broad one. Combined with the unconditional
+# ``MEDIA:\\s*\\S+`` cleanup at the dispatch sites, that mismatch created a
+# silent black hole: a ``MEDIA:/report.md`` tag failed the narrow extract_media
+# match, got stripped from the body by the loose cleanup regex, and was then
+# invisible to extract_local_files — the file was never delivered (issue
+# #34517). Keeping one list eliminates the drift; building the cleanup regexes
+# from the same set means a tag is only stripped when its extension is one we
+# can actually deliver, so an unknown-extension path survives in the body
+# instead of vanishing.
+#
+# Covers images (inline), video (inline where supported), audio (voice/audio),
+# documents/spreadsheets/presentations (send_document), archives, and rendered
+# web output. The dispatch partition (image vs video vs document) lives in
+# ``gateway/run.py``.
+# ---------------------------------------------------------------------------
+
+MEDIA_DELIVERY_EXTS: Tuple[str, ...] = (
+    # Images (embed inline)
+    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg",
+    # Video (embed inline where supported)
+    ".mp4", ".mov", ".avi", ".mkv", ".webm",
+    # Audio (delivered as voice/audio where supported)
+    ".mp3", ".wav", ".ogg", ".opus", ".m4a", ".flac",
+    # Documents (uploaded as file attachments)
+    ".pdf", ".docx", ".doc", ".odt", ".rtf", ".txt", ".md", ".epub",
+    # Spreadsheets / data
+    ".xlsx", ".xls", ".ods", ".csv", ".tsv", ".json", ".xml", ".yaml", ".yml",
+    # Presentations
+    ".pptx", ".ppt", ".odp", ".key",
+    # Archives
+    ".zip", ".tar", ".gz", ".tgz", ".bz2", ".xz", ".7z", ".rar", ".apk", ".ipa",
+    # Web / rendered output
+    ".html", ".htm",
+)
+
+# Regex alternation fragment of bare extensions (no leading dot), e.g.
+# ``png|jpe?g|...``. ``jpe?g`` collapses jpg/jpeg into one branch. Sorted
+# longest-first so the alternation never matches a shorter ext as a prefix of
+# a longer one (e.g. ``.tar`` before ``.tar.gz`` components).
+_MEDIA_EXT_ALTERNATION = "|".join(
+    sorted((e.lstrip(".") for e in MEDIA_DELIVERY_EXTS), key=len, reverse=True)
+)
+
+# Anchored ``MEDIA:<path>`` cleanup pattern. Unlike the old loose
+# ``MEDIA:\\s*\\S+``, this only strips a tag whose path ends in a known
+# deliverable extension (optionally quoted/backticked). A ``MEDIA:`` tag with
+# an unknown extension is left in the text so it can still be picked up by the
+# bare-path detector (extract_local_files) downstream rather than silently
+# deleted. Shared by the non-streaming dispatch path and the streaming
+# consumer so both behave identically.
+MEDIA_TAG_CLEANUP_RE = re.compile(
+    r'''[`"']?MEDIA:\s*'''
+    r'''(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|'''
+    r'''(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:''' + _MEDIA_EXT_ALTERNATION + r'''))'''
+    r'''(?=[\s`"',;:)\]}]|$)[`"']?''',
+    re.IGNORECASE,
+)
+
+
 def get_document_cache_dir() -> Path:
    """Return the document cache directory, creating it if it doesn't exist."""
    DOCUMENT_CACHE_DIR.mkdir(parents=True, exist_ok=True)
@ -1516,6 +1653,10 @@ class BasePlatformAdapter(ABC):
        self.config = config
        self.platform = platform
        self._message_handler: Optional[MessageHandler] = None
+        # Optional hook (e.g. Telegram DM topic recovery) that rewrites
+        # ``event.source.thread_id`` before session keying. Returns the
+        # corrected thread_id or None to leave the source untouched.
+        self._topic_recovery_fn: Optional[Callable[[Any], Optional[str]]] = None
        self._running = False
        self._fatal_error_code: Optional[str] = None
        self._fatal_error_message: Optional[str] = None
@ -1583,6 +1724,29 @@ class BasePlatformAdapter(ABC):
        """
        return len

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """Whether this adapter gates inbound access before dispatch.
+
+        Some adapters (WeCom, Weixin, Yuanbao, QQBot) implement a documented
+        config-driven access surface — ``dm_policy`` / ``group_policy`` /
+        ``allow_from`` / ``group_allow_from`` in ``PlatformConfig.extra`` — and
+        enforce it at intake: a message is dropped inside the adapter and never
+        reaches the gateway unless it already passed that policy.
+
+        The gateway's env-based allowlist check runs *after* the adapter, so for
+        these platforms a message arriving at ``_is_user_authorized`` has, by
+        definition, already been authorized by the adapter. Without this flag the
+        gateway would then deny it again (no env allowlist → default deny),
+        silently breaking ``dm_policy: open`` and config-only allowlists.
+
+        Adapters that own their access policy override this to return ``True``.
+        The gateway treats that as "already authorized at intake" and skips the
+        env-allowlist default-deny. Adapters that delegate access control to the
+        gateway leave it ``False`` (the default).
+        """
+        return False
+
    def supports_draft_streaming(
        self,
        chat_type: Optional[str] = None,
@ -1771,6 +1935,40 @@ class BasePlatformAdapter(ABC):
        """
        self._message_handler = handler

+    def set_topic_recovery_fn(
+        self,
+        fn: Optional[Callable[[Any], Optional[str]]],
+    ) -> None:
+        """Install a thread_id-recovery hook (Telegram DM topic mode).
+
+        The hook is called with ``event.source`` before session keying;
+        a non-None return value replaces ``source.thread_id``. Pass
+        ``None`` to clear the hook.
+        """
+        # Guard against subclasses that initialize via ``object.__new__`` in
+        # tests and never run ``BasePlatformAdapter.__init__``.
+        self._topic_recovery_fn = fn  # type: ignore[attr-defined]
+
+    def _apply_topic_recovery(self, event: MessageEvent) -> None:
+        """Rewrite ``event.source.thread_id`` in place if the hook returns one."""
+        recover = getattr(self, "_topic_recovery_fn", None)
+        if recover is None:
+            return
+        source = getattr(event, "source", None)
+        if source is None:
+            return
+        try:
+            recovered = recover(source)
+        except Exception:
+            logger.debug("topic recovery hook failed", exc_info=True)
+            return
+        if recovered is None or str(recovered) == str(source.thread_id or ""):
+            return
+        try:
+            event.source = dataclasses.replace(source, thread_id=str(recovered))
+        except Exception:
+            logger.debug("topic recovery rewrite failed", exc_info=True)
+
    def set_busy_session_handler(self, handler: Optional[Callable[[MessageEvent, str], Awaitable[bool]]]) -> None:
        """Set an optional handler for messages arriving during active sessions."""
        self._busy_session_handler = handler
@ -2354,11 +2552,12 @@ class BasePlatformAdapter(ABC):
        """Drop unsafe MEDIA paths and normalize accepted paths."""
        safe_media: List[Tuple[str, bool]] = []
        for media_path, is_voice in media_files or []:
-            safe_path = validate_media_delivery_path(str(media_path))
+            raw = str(media_path)
+            safe_path = validate_media_delivery_path(raw)
            if safe_path:
                safe_media.append((safe_path, bool(is_voice)))
            else:
-                logger.warning("Skipping unsafe MEDIA directive path outside allowed roots")
+                logger.warning("Skipping unsafe MEDIA directive path: %s", _log_safe_path(raw))
        return safe_media

    @staticmethod
@ -2366,11 +2565,12 @@ class BasePlatformAdapter(ABC):
        """Drop unsafe bare local file paths and normalize accepted paths."""
        safe_paths: List[str] = []
        for file_path in file_paths or []:
-            safe_path = validate_media_delivery_path(str(file_path))
+            raw = str(file_path)
+            safe_path = validate_media_delivery_path(raw)
            if safe_path:
                safe_paths.append(safe_path)
            else:
-                logger.warning("Skipping unsafe local file path outside allowed roots")
+                logger.warning("Skipping unsafe local file path: %s", _log_safe_path(raw))
        return safe_paths

    @staticmethod
@ -2411,17 +2611,22 @@ class BasePlatformAdapter(ABC):
        cleaned = cleaned.replace("[[as_document]]", "")
        
        # Extract MEDIA:<path> tags, allowing optional whitespace after the colon
-        # and quoted/backticked paths for LLM-formatted outputs.
-        media_pattern = re.compile(
-            r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|(?:~/|/)\S+(?:[^\S\n]+\S+)*?\.(?:png|jpe?g|gif|webp|mp4|mov|avi|mkv|webm|ogg|opus|mp3|wav|m4a|flac|epub|pdf|zip|rar|7z|docx?|xlsx?|pptx?|txt|csv|apk|ipa)(?=[\s`"',;:)\]}]|$))[`"']?'''
-        )
+        # and quoted/backticked paths for LLM-formatted outputs. The extension
+        # set is the shared MEDIA_DELIVERY_EXTS source of truth (built once into
+        # MEDIA_TAG_CLEANUP_RE) so it can never drift from extract_local_files.
+        media_pattern = MEDIA_TAG_CLEANUP_RE
        for match in media_pattern.finditer(content):
            path = match.group("path").strip()
            if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
                path = path[1:-1].strip()
            path = path.lstrip("`\"'").rstrip("`\"',.;:)}]")
            if path:
-                media.append((os.path.expanduser(path), has_voice_tag))
+                try:
+                    media.append((os.path.expanduser(path), has_voice_tag))
+                except (OSError, RuntimeError, ValueError):
+                    # Skip a crafted ~\x00 path rather than aborting extraction
+                    # and dropping every other attachment in the response.
+                    continue

        # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
        if media:
@ -2455,24 +2660,7 @@ class BasePlatformAdapter(ABC):
            Tuple of (list of expanded file paths, cleaned text with the
            raw path strings removed).
        """
-        _LOCAL_MEDIA_EXTS = (
-            # Images (embed inline)
-            '.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff', '.svg',
-            # Video (embed inline where supported)
-            '.mp4', '.mov', '.avi', '.mkv', '.webm',
-            # Audio (delivered as voice/audio where supported)
-            '.mp3', '.wav', '.ogg', '.m4a', '.flac',
-            # Documents (uploaded as file attachments)
-            '.pdf', '.docx', '.doc', '.odt', '.rtf', '.txt', '.md',
-            # Spreadsheets / data
-            '.xlsx', '.xls', '.ods', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml',
-            # Presentations
-            '.pptx', '.ppt', '.odp', '.key',
-            # Archives
-            '.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar',
-            # Web / rendered output
-            '.html', '.htm',
-        )
+        _LOCAL_MEDIA_EXTS = MEDIA_DELIVERY_EXTS
        ext_part = '|'.join(e.lstrip('.') for e in _LOCAL_MEDIA_EXTS)

        # (?<![/:\w.]) prevents matching inside URLs (e.g. https://…/img.png)
@ -3287,7 +3475,12 @@ class BasePlatformAdapter(ABC):
            return

        coerce_plaintext_gateway_command(event)
-        
+
+        # Rewrite ``event.source.thread_id`` via the installed recovery hook
+        # (Telegram DM topic mode) so the session key, guard checks, and
+        # downstream delivery all agree on the same lane.
+        self._apply_topic_recovery(event)
+
        session_key = build_session_key(
            event.source,
            group_sessions_per_user=self.config.extra.get("group_sessions_per_user", True),
@ -3588,7 +3781,12 @@ class BasePlatformAdapter(ABC):
                # Strip any remaining internal directives from message body (fixes #1561)
                text_content = text_content.replace("[[audio_as_voice]]", "").strip()
                text_content = text_content.replace("[[as_document]]", "").strip()
-                text_content = re.sub(r"MEDIA:\s*\S+", "", text_content).strip()
+                # Strip only MEDIA: tags whose path has a deliverable extension
+                # (shared MEDIA_TAG_CLEANUP_RE). A MEDIA: tag with an unknown
+                # extension is intentionally left in the body so extract_local_files
+                # below can still pick up the bare path — otherwise the file would
+                # be silently dropped (issue #34517).
+                text_content = MEDIA_TAG_CLEANUP_RE.sub("", text_content).strip()
                if images:
                    logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))

--- a/gateway/platforms/feishu.py
+++ b/gateway/platforms/feishu.py
@ -48,6 +48,7 @@ user is seen through different apps in the future.
 from __future__ import annotations

 import asyncio
+import collections
 import hashlib
 import hmac
 import itertools
@ -1408,6 +1409,8 @@ class FeishuAdapter(BasePlatformAdapter):
    """Feishu/Lark bot adapter."""

    MAX_MESSAGE_LENGTH = 8000
+    # Max distinct chat IDs retained in _chat_locks before LRU eviction kicks in.
+    CHAT_LOCK_MAX_SIZE: int = 1000
    # Threshold for detecting Feishu client-side message splits.
    # When a chunk is near the ~4096-char practical limit, a continuation
    # is almost certain.
@ -1445,7 +1448,7 @@ class FeishuAdapter(BasePlatformAdapter):
        self._pending_inbound_lock = threading.Lock()
        self._pending_drain_scheduled = False
        self._pending_inbound_max_depth = 1000  # cap queue; drop oldest beyond
-        self._chat_locks: Dict[str, asyncio.Lock] = {}  # chat_id → lock (per-chat serial processing)
+        self._chat_locks: "collections.OrderedDict[str, asyncio.Lock]" = collections.OrderedDict()  # chat_id → lock (per-chat serial processing, LRU-bounded)
        self._sent_message_ids_to_chat: Dict[str, str] = {}  # message_id → chat_id (for reaction routing)
        self._sent_message_id_order: List[str] = []  # LRU order for _sent_message_ids_to_chat
        self._chat_info_cache: Dict[str, Dict[str, Any]] = {}
@ -2835,11 +2838,28 @@ class FeishuAdapter(BasePlatformAdapter):
    # =========================================================================

    def _get_chat_lock(self, chat_id: str) -> asyncio.Lock:
-        """Return (creating if needed) the per-chat asyncio.Lock for serial message processing."""
+        """Return (creating if needed) the per-chat asyncio.Lock for serial message processing.
+
+        Bounded with LRU eviction so a long-running gateway that sees many
+        distinct chats does not grow ``_chat_locks`` without limit. Locks that
+        are currently held are never evicted; if every entry is locked we fall
+        back to dropping the least-recently-used one.
+        """
        lock = self._chat_locks.get(chat_id)
-        if lock is None:
-            lock = asyncio.Lock()
-            self._chat_locks[chat_id] = lock
+        if lock is not None:
+            self._chat_locks.move_to_end(chat_id)
+            return lock
+        if len(self._chat_locks) >= self.CHAT_LOCK_MAX_SIZE:
+            evicted = False
+            for key in list(self._chat_locks):
+                if not self._chat_locks[key].locked():
+                    self._chat_locks.pop(key)
+                    evicted = True
+                    break
+            if not evicted:
+                self._chat_locks.pop(next(iter(self._chat_locks)))
+        lock = asyncio.Lock()
+        self._chat_locks[chat_id] = lock
        return lock

    async def _handle_message_with_guards(self, event: MessageEvent) -> None:
--- a/gateway/platforms/matrix.py
+++ b/gateway/platforms/matrix.py
@ -2236,7 +2236,8 @@ class MatrixAdapter(BasePlatformAdapter):
            if prompt and not prompt.resolved:
                if room_id != prompt.chat_id:
                    return
-                if self._allowed_user_ids and sender not in self._allowed_user_ids:
+                _allow_all = os.getenv("GATEWAY_ALLOW_ALL_USERS", "").lower() in {"true", "1", "yes"}
+                if not _allow_all and not (self._allowed_user_ids and sender in self._allowed_user_ids):
                    logger.info(
                        "Matrix: ignoring approval reaction from unauthorized user %s on %s",
                        sender, reacts_to,
--- a/gateway/platforms/msgraph_webhook.py
+++ b/gateway/platforms/msgraph_webhook.py
@ -25,6 +25,7 @@ from gateway.platforms.base import (
    MessageEvent,
    MessageType,
    SendResult,
+    is_network_accessible,
 )

 logger = logging.getLogger(__name__)
@ -132,12 +133,24 @@ class MSGraphWebhookAdapter(BasePlatformAdapter):
    def set_notification_scheduler(self, scheduler: Optional[NotificationScheduler]) -> None:
        self._notification_scheduler = scheduler

+    def _source_allowlist_required_but_missing(self) -> bool:
+        return is_network_accessible(self._host) and not self._allowed_source_networks
+
    async def connect(self) -> bool:
        if self._client_state is None:
            logger.error(
                "[msgraph_webhook] Refusing to start without extra.client_state configured"
            )
            return False
+        if self._source_allowlist_required_but_missing():
+            logger.error(
+                "[msgraph_webhook] Refusing to start: binding to %s requires "
+                "extra.allowed_source_cidrs. Configure the Microsoft Graph "
+                "source CIDRs or bind to loopback (127.0.0.1/::1) behind a "
+                "tunnel or reverse proxy.",
+                self._host,
+            )
+            return False

        app = web.Application()
        app.router.add_get(self._health_path, self._handle_health)
@ -177,6 +190,8 @@ class MSGraphWebhookAdapter(BasePlatformAdapter):
        return {"name": chat_id, "type": "webhook"}

    async def _handle_health(self, request: "web.Request") -> "web.Response":
+        if not self._source_ip_allowed(request):
+            return web.Response(status=403)
        return web.json_response(
            {
                "status": "ok",
@ -271,9 +286,12 @@ class MSGraphWebhookAdapter(BasePlatformAdapter):
    def _source_ip_allowed(self, request: "web.Request") -> bool:
        """Return True if the request's source IP is in the configured allowlist.

-        When ``allowed_source_cidrs`` is empty (the default), everything is
-        allowed — preserves behavior for dev tunnels / localhost setups.
+        Loopback-only binds may omit ``allowed_source_cidrs`` for local reverse
+        proxies and dev tunnels. Network-accessible binds fail closed until an
+        explicit CIDR allowlist is configured.
        """
+        if self._source_allowlist_required_but_missing():
+            return False
        if not self._allowed_source_networks:
            return True
        peer = request.remote or ""
--- a/gateway/platforms/qqbot/adapter.py
+++ b/gateway/platforms/qqbot/adapter.py
@ -126,7 +126,6 @@ from gateway.platforms.qqbot.chunked_upload import (
 )
 from gateway.platforms.qqbot.keyboards import (
    ApprovalRequest,
-    ApprovalSender,
    InlineKeyboard,
    InteractionEvent,
    build_approval_keyboard,
@ -270,6 +269,11 @@ class QQAdapter(BasePlatformAdapter):
    def name(self) -> str:
        return "QQBot"

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """QQBot gates DM/group access at intake via dm_policy/group_policy."""
+        return True
+
    # ------------------------------------------------------------------
    # Connection lifecycle
    # ------------------------------------------------------------------
--- a/gateway/platforms/qqbot/chunked_upload.py
+++ b/gateway/platforms/qqbot/chunked_upload.py
@ -37,7 +37,7 @@ import asyncio
 import functools
 import hashlib
 import logging
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Awaitable, Callable, Dict, List, Optional

--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@ -1690,7 +1690,6 @@ class TelegramAdapter(BasePlatformAdapter):
                    BotCommandScopeAllPrivateChats,
                    BotCommandScopeAllGroupChats,
                    BotCommandScopeDefault,
-                    BotCommandScopeChat,
                )
                from hermes_cli.commands import telegram_menu_commands
                # Telegram allows up to 100 commands but has an undocumented
@ -5027,8 +5026,14 @@ class TelegramAdapter(BasePlatformAdapter):
    # ------------------------------------------------------------------

    def _text_batch_key(self, event: MessageEvent) -> str:
-        """Session-scoped key for text message batching."""
+        """Session-scoped key for text message batching.
+
+        Applies the installed topic-recovery hook first so DM-topic batches
+        coalesce on (and dispatch to) the recovered lane rather than the
+        raw inbound ``message_thread_id`` Telegram may have attached.
+        """
        from gateway.session import build_session_key
+        self._apply_topic_recovery(event)
        return build_session_key(
            event.source,
            group_sessions_per_user=self.config.extra.get("group_sessions_per_user", True),
--- a/gateway/platforms/wecom.py
+++ b/gateway/platforms/wecom.py
@ -847,6 +847,11 @@ class WeComAdapter(BasePlatformAdapter):
    # Policy helpers
    # ------------------------------------------------------------------

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """WeCom gates DM/group access at intake via dm_policy/group_policy."""
+        return True
+
    def _is_dm_allowed(self, sender_id: str) -> bool:
        if self._dm_policy == "disabled":
            return False
--- a/gateway/platforms/weixin.py
+++ b/gateway/platforms/weixin.py
@ -658,52 +658,6 @@ def _split_table_row(line: str) -> List[str]:
    return [cell.strip() for cell in row.split("|")]


-def _rewrite_headers_for_weixin(line: str) -> str:
-    match = _HEADER_RE.match(line)
-    if not match:
-        return line.rstrip()
-    level = len(match.group(1))
-    title = match.group(2).strip()
-    if level == 1:
-        return f"【{title}】"
-    return f"**{title}**"
-
-
-def _rewrite_table_block_for_weixin(lines: List[str]) -> str:
-    if len(lines) < 2:
-        return "\n".join(lines)
-    headers = _split_table_row(lines[0])
-    body_rows = [_split_table_row(line) for line in lines[2:] if line.strip()]
-    if not headers or not body_rows:
-        return "\n".join(lines)
-
-    formatted_rows: List[str] = []
-    for row in body_rows:
-        pairs = []
-        for idx, header in enumerate(headers):
-            if idx >= len(row):
-                break
-            label = header or f"Column {idx + 1}"
-            value = row[idx].strip()
-            if value:
-                pairs.append((label, value))
-        if not pairs:
-            continue
-        if len(pairs) == 1:
-            label, value = pairs[0]
-            formatted_rows.append(f"- {label}: {value}")
-            continue
-        if len(pairs) == 2:
-            label, value = pairs[0]
-            other_label, other_value = pairs[1]
-            formatted_rows.append(f"- {label}: {value}")
-            formatted_rows.append(f"  {other_label}: {other_value}")
-            continue
-        summary = " | ".join(f"{label}: {value}" for label, value in pairs)
-        formatted_rows.append(f"- {summary}")
-    return "\n".join(formatted_rows) if formatted_rows else "\n".join(lines)
-
-
 def _normalize_markdown_blocks(content: str) -> str:
    lines = content.splitlines()
    result: List[str] = []
@ -1443,6 +1397,11 @@ class WeixinAdapter(BasePlatformAdapter):
        logger.info("[%s] inbound from=%s type=%s media=%d", self.name, _safe_id(sender_id), source.chat_type, len(media_paths))
        await self.handle_message(event)

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """Weixin gates DM/group access at intake via dm_policy/group_policy."""
+        return True
+
    def _is_dm_allowed(self, sender_id: str) -> bool:
        if self._dm_policy == "disabled":
            return False
--- a/gateway/platforms/yuanbao.py
+++ b/gateway/platforms/yuanbao.py
@ -2230,6 +2230,45 @@ class MediaResolveMiddleware(InboundMiddleware):

    name = "media-resolve"

+    # --- Resource download cache (keyed by resourceId) ---
+    # Avoids redundant downloads of the same resource within the TTL window.
+    # The same resourceId can be referenced multiple times in a session (own
+    # attachment, then quoted again, then observed in a group backfill); each
+    # reference otherwise triggers a fresh token exchange + download.
+    _resource_cache: ClassVar[Dict[str, Tuple[str, str, float]]] = {}  # rid -> (local_path, mime, ts)
+    _RESOURCE_CACHE_TTL_S: ClassVar[int] = 24 * 60 * 60  # 24 hours
+    _RESOURCE_CACHE_MAX_SIZE: ClassVar[int] = 256
+
+    @classmethod
+    def _get_cached_resource(cls, resource_id: str) -> Optional[Tuple[str, str]]:
+        """Return cached ``(local_path, mime)`` if still valid and file exists, else None."""
+        if not resource_id:
+            return None
+        entry = cls._resource_cache.get(resource_id)
+        if entry is None:
+            return None
+        local_path, mime, ts = entry
+        if time.time() - ts > cls._RESOURCE_CACHE_TTL_S:
+            cls._resource_cache.pop(resource_id, None)
+            return None
+        # Verify the cached file still exists on disk (cache dir may be swept).
+        if not os.path.isfile(local_path):
+            cls._resource_cache.pop(resource_id, None)
+            return None
+        return local_path, mime
+
+    @classmethod
+    def _put_cached_resource(cls, resource_id: str, local_path: str, mime: str) -> None:
+        """Store download result in cache. Evicts oldest entries when over capacity."""
+        if not resource_id:
+            return
+        if len(cls._resource_cache) >= cls._RESOURCE_CACHE_MAX_SIZE:
+            # Drop the oldest 25% of entries by timestamp.
+            sorted_keys = sorted(cls._resource_cache, key=lambda k: cls._resource_cache[k][2])
+            for k in sorted_keys[: cls._RESOURCE_CACHE_MAX_SIZE // 4]:
+                cls._resource_cache.pop(k, None)
+        cls._resource_cache[resource_id] = (local_path, mime, time.time())
+
    @staticmethod
    def _guess_image_ext_from_url(url: str) -> str:
        """Guess image extension from URL path."""
@ -2327,8 +2366,23 @@ class MediaResolveMiddleware(InboundMiddleware):
    async def _download_and_cache(
        cls, adapter, *, fetch_url: str, kind: str,
        file_name: Optional[str] = None, log_tag: str = "",
+        resource_id: str = "",
    ) -> Optional[Tuple[str, str]]:
-        """Download a Yuanbao resource and cache locally. Returns ``(local_path, mime)`` or ``None``."""
+        """Download a Yuanbao resource and cache locally. Returns ``(local_path, mime)`` or ``None``.
+
+        When *resource_id* is provided, an in-memory cache keyed by resourceId
+        is consulted first to skip redundant downloads of the same resource
+        within the TTL window.
+        """
+        if resource_id:
+            hit = cls._get_cached_resource(resource_id)
+            if hit is not None:
+                logger.debug(
+                    "[%s] resource cache hit: rid=%s path=%s",
+                    adapter.name, resource_id, hit[0],
+                )
+                return hit
+
        try:
            file_bytes, content_type = await media_download_url(
                fetch_url, max_size_mb=adapter.MEDIA_MAX_SIZE_MB,
@ -2353,6 +2407,7 @@ class MediaResolveMiddleware(InboundMiddleware):
            mime = guess_mime_type(f"image{ext}")
            if not mime.startswith("image/"):
                mime = content_type if content_type.startswith("image/") else "image/jpeg"
+            cls._put_cached_resource(resource_id, local_path, mime)
            return local_path, mime

        # kind == "file"
@ -2368,6 +2423,7 @@ class MediaResolveMiddleware(InboundMiddleware):
            )
            return None
        mime = guess_mime_type(file_name) or content_type or "application/octet-stream"
+        cls._put_cached_resource(resource_id, local_path, mime)
        return local_path, mime

    @classmethod
@ -2393,6 +2449,9 @@ class MediaResolveMiddleware(InboundMiddleware):
            if kind not in _RESOLVABLE_MEDIA_KINDS or not url:
                continue

+            # Extract resourceId from the placeholder URL for cache dedup.
+            rid = ExtractContentMiddleware._parse_resource_id(url)
+
            try:
                fetch_url = await cls._resolve_download_url(adapter, url)
            except Exception as exc:
@ -2408,6 +2467,7 @@ class MediaResolveMiddleware(InboundMiddleware):
                kind=kind,
                file_name=str(ref.get("name") or "").strip() or None,
                log_tag=f"placeholder_url={url[:80]}",
+                resource_id=rid,
            )
            if cached is None:
                continue
@ -2480,6 +2540,7 @@ class MediaResolveMiddleware(InboundMiddleware):
                kind=kind,
                file_name=filename or None,
                log_tag=f"rid={rid}",
+                resource_id=rid,
            )
            if cached is None:
                continue
@ -2563,6 +2624,7 @@ class DispatchMiddleware(InboundMiddleware):
                        kind=kind,
                        file_name=filename or None,
                        log_tag=f"quote rid={rid}",
+                        resource_id=rid,
                    )
                    if cached is None:
                        continue
@ -4629,6 +4691,11 @@ class YuanbaoAdapter(BasePlatformAdapter):
    # Abstract method implementations
    # ------------------------------------------------------------------

+    @property
+    def enforces_own_access_policy(self) -> bool:
+        """Yuanbao gates DM/group access at intake via dm_policy/group_policy."""
+        return True
+
    async def connect(self) -> bool:
        """Connect to Yuanbao WS gateway and authenticate.

--- a/Show more
+++ b/Show more