From 4849a8e55583d5eb83c838c7c7be659c19201a3e Mon Sep 17 00:00:00 2001 From: xxxigm Date: Sun, 24 May 2026 21:01:23 +0700 Subject: [PATCH 01/26] hermes_state: add SessionDB.delete_telegram_topic_binding (#31501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Targeted ``(chat_id, thread_id)`` prune for the ``telegram_dm_topic_bindings`` table — the missing piece for #31501, where the Telegram adapter detects a topic the user deleted out-of-band but the binding row keeps living in state.db. The recovery logic in ``gateway.run._recover_telegram_topic_thread_id`` then steers every future inbound message back to the dead topic, dropping tool progress, approvals and replies into the wrong place. Returns the number of rows deleted; silently no-ops when the topic-mode tables haven't been migrated yet (read-only / pristine profile) so the helper is safe to call from a send-fallback hot path before the schema has run. --- hermes_state.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/hermes_state.py b/hermes_state.py index c4d07268972..d307db7a735 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -4598,6 +4598,49 @@ class SessionDB: return None return dict(row) if row else None + def delete_telegram_topic_binding( + self, + *, + chat_id: str, + thread_id: str, + ) -> int: + """Remove the binding row for a single (chat, thread) pair. + + Called when the Telegram Bot API confirms a topic was deleted + externally (``Thread not found`` after the same-thread retry + already failed). Without this prune, the stale row keeps + living in ``telegram_dm_topic_bindings`` and the + recovery logic in ``gateway.run._recover_telegram_topic_thread_id`` + cheerfully redirects future inbound messages to the deleted + topic, causing tool progress, approvals, and replies to land + in the wrong place. Issue #31501. + + Returns the number of rows deleted (0 when the binding was + already absent or the topic-mode tables haven't been + migrated yet — both are silent no-ops; we never raise from + a cleanup hot path). + """ + chat_id = str(chat_id) + thread_id = str(thread_id) + deleted = {"count": 0} + + def _do(conn): + try: + cursor = conn.execute( + """ + DELETE FROM telegram_dm_topic_bindings + WHERE chat_id = ? AND thread_id = ? + """, + (chat_id, thread_id), + ) + deleted["count"] = cursor.rowcount or 0 + except sqlite3.OperationalError: + # Tables don't exist yet — nothing to prune. + deleted["count"] = 0 + + self._execute_write(_do) + return deleted["count"] + def bind_telegram_topic( self, *, From 142a5751a2b3ee2be8ac405942879efac81c228f Mon Sep 17 00:00:00 2001 From: xxxigm Date: Sun, 24 May 2026 21:01:38 +0700 Subject: [PATCH 02/26] gateway/telegram: prune stale DM topic binding on Thread-not-found (#31501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both fallback sites that currently log "Thread X not found, retrying without message_thread_id" now also drop the ``telegram_dm_topic_bindings`` row keyed on ``(chat_id, thread_id)``: * The streaming send loop (``send`` body) — fires on the second failure, after the same-thread one-shot retry confirms the thread really is gone (the first attempt is left alone because Bot API has been observed to return a transient "Thread not found" that recovers on immediate retry). * The control-message helper ``_send_message_with_thread_fallback`` (approval prompts, model picker, update prompts) — single-shot retry, prune unconditionally on the BadRequest match. Without this prune, a user who deletes a Telegram DM topic in the client keeps getting their next inbound message recovered back to the dead thread by ``_recover_telegram_topic_thread_id`` in ``gateway/run.py``, which walks the per-user binding list newest-first and treats the deleted thread as authoritative. The reproduction in the bug report is exactly this: tool progress, approvals, activity messages and replies all land in the wrong place until the user manually runs DELETE on state.db. Cleanup is best-effort — we log at INFO when it succeeds, swallow any exception from the SessionDB call, and the user-facing send proceeds either way. Refs #31501 --- plugins/platforms/telegram/adapter.py | 56 ++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/plugins/platforms/telegram/adapter.py b/plugins/platforms/telegram/adapter.py index 026ee7bc55c..2de169ee092 100644 --- a/plugins/platforms/telegram/adapter.py +++ b/plugins/platforms/telegram/adapter.py @@ -810,6 +810,47 @@ class TelegramAdapter(BasePlatformAdapter): def _is_thread_not_found_error(error: Exception) -> bool: return "thread not found" in str(error).lower() + def _prune_stale_dm_topic_binding( + self, chat_id: Any, thread_id: Any, + ) -> None: + """Drop the stale ``telegram_dm_topic_bindings`` row for a + topic Telegram has confirmed deleted. + + Without this prune the recovery logic in + ``gateway.run._recover_telegram_topic_thread_id`` keeps + steering future inbound messages to the dead thread (the + bug behind #31501 — tool progress, approvals, replies all + end up in the wrong place even though the user has moved + on to a fresh topic). Best-effort: we never raise from a + send-fallback path — a failed cleanup must not turn into a + failed user-facing send. + """ + if chat_id is None or thread_id is None: + return + store = getattr(self, "_session_store", None) + if store is None: + return + db = getattr(store, "_db", None) + if db is None or not hasattr(db, "delete_telegram_topic_binding"): + return + try: + removed = db.delete_telegram_topic_binding( + chat_id=str(chat_id), thread_id=str(thread_id), + ) + except Exception: + logger.debug( + "[%s] delete_telegram_topic_binding failed for " + "chat=%s thread=%s — skipping prune", + self.name, chat_id, thread_id, exc_info=True, + ) + return + if removed: + logger.info( + "[%s] Pruned stale Telegram DM topic binding " + "chat=%s thread=%s (Bot API: thread not found)", + self.name, chat_id, thread_id, + ) + @staticmethod def _is_bad_request_error(error: Exception) -> bool: name = error.__class__.__name__.lower() @@ -2670,11 +2711,17 @@ class TelegramAdapter(BasePlatformAdapter): continue # Second failure: the thread is genuinely gone. # Retry without ``message_thread_id`` so the - # message still reaches the chat. + # message still reaches the chat, and prune + # the stale binding so future inbound + # messages aren't redirected back to it + # (#31501). logger.warning( "[%s] Thread %s not found, retrying without message_thread_id", self.name, effective_thread_id, ) + self._prune_stale_dm_topic_binding( + chat_id, effective_thread_id, + ) used_thread_fallback = True effective_thread_id = None thread_kwargs = {"message_thread_id": None} @@ -3355,6 +3402,13 @@ class TelegramAdapter(BasePlatformAdapter): self.name, message_thread_id, ) + # Same prune as the streaming send path — the + # control-message retry tells us the topic is gone, + # so the binding row in state.db must go too + # (#31501). + self._prune_stale_dm_topic_binding( + kwargs.get("chat_id"), message_thread_id, + ) retry_kwargs = dict(kwargs) retry_kwargs.pop("message_thread_id", None) return await self._bot.send_message(**retry_kwargs) From 11246dbe215fc39a42094d3a35cae86f348cf8fe Mon Sep 17 00:00:00 2001 From: xxxigm Date: Sun, 24 May 2026 21:06:13 +0700 Subject: [PATCH 03/26] tests: regression coverage for stale topic-binding prune (#31501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thirteen tests across four layers: * ``SessionDB.delete_telegram_topic_binding`` — pin the new helper's contract: removes only the (chat_id, thread_id) row it was asked about, leaves siblings alone, returns 0 silently when the row never existed, and is a no-op on a pristine database whose topic-mode tables haven't been migrated yet. * ``TelegramAdapter._prune_stale_dm_topic_binding`` — the glue must drop the binding when ``self._session_store._db`` exposes the helper, swallow exceptions so a failed cleanup never breaks the user-facing send, and refuse to issue a DELETE for ``chat_id=None`` / ``thread_id=None`` so a bookkeeping miss can't accidentally null-match every row. * Source-level guards on ``TelegramAdapter.send`` and ``_send_message_with_thread_fallback`` — the prune call must sit beside the two existing "Thread X not found, retrying without message_thread_id" warnings, before the retry runs, so a future refactor can't silently drop the cleanup wire. * End-to-end semantic — once a topic is pruned, the ``GatewayRunner._recover_telegram_topic_thread_id`` walk steers future inbound messages to the surviving binding instead of the dead one. This is the exact behaviour change the bug report's reproduction asks for: no more landings in the wrong topic until the operator hand-edits ``state.db``. Refs #31501 --- ...elegram_prune_stale_topic_binding_31501.py | 394 ++++++++++++++++++ 1 file changed, 394 insertions(+) create mode 100644 tests/gateway/test_telegram_prune_stale_topic_binding_31501.py diff --git a/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py b/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py new file mode 100644 index 00000000000..349ae856904 --- /dev/null +++ b/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py @@ -0,0 +1,394 @@ +"""Regression tests for #31501 — prune stale Telegram DM topic bindings. + +When a Telegram user deletes a DM topic in the client, the Bot API +responds to the gateway's next send with ``Thread not found``. The +adapter falls back to a plain send (no ``message_thread_id``), but +prior to this fix it left the corresponding row in +``telegram_dm_topic_bindings`` untouched. +``gateway.run._recover_telegram_topic_thread_id`` then walked the +user's bindings newest-first on every later inbound message and +cheerfully redirected them back to the deleted topic — tool +progress, approvals and replies all silently landed in the wrong +place until the operator manually ran ``DELETE`` on ``state.db``. + +The fix has three pieces — these tests pin all three: + +1. ``SessionDB.delete_telegram_topic_binding`` — the targeted + prune helper (new public API). +2. ``TelegramAdapter._prune_stale_dm_topic_binding`` — the + adapter glue that calls the helper from a send-fallback hot + path without raising on cleanup failure. +3. The two "Thread not found" call sites in the streaming send + loop and the control-message helper now invoke (2) — we pin + this with a source-level guard rather than spinning the full + send pipeline. +""" + +from __future__ import annotations + +import inspect +from types import SimpleNamespace + +import pytest + +from hermes_state import SessionDB + + +# --------------------------------------------------------------------------- +# SessionDB.delete_telegram_topic_binding +# --------------------------------------------------------------------------- + + +def _seed_binding( + db: SessionDB, + *, + chat_id: str = "5595856929", + thread_id: str = "15287", + user_id: str = "5595856929", + session_id: str = "sess-target", +) -> None: + db.create_session( + session_id=session_id, + source="telegram", + user_id=user_id, + ) + db.bind_telegram_topic( + chat_id=chat_id, + thread_id=thread_id, + user_id=user_id, + session_key=f"agent:main:telegram:dm:{chat_id}:{thread_id}", + session_id=session_id, + ) + + +class TestDeleteTelegramTopicBinding: + def test_removes_matching_row_and_returns_count(self, tmp_path): + db = SessionDB(db_path=tmp_path / "state.db") + _seed_binding(db, thread_id="15287") + # Sanity check — binding present before prune. + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) is not None + + removed = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) + + assert removed == 1 + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) is None + db.close() + + def test_does_not_touch_unrelated_bindings(self, tmp_path): + # Critical for the fix: a chat with multiple topics must + # only lose the one Telegram confirmed deleted, never the + # rest. Otherwise the user's healthy topics also vanish + # from recovery's view. + db = SessionDB(db_path=tmp_path / "state.db") + _seed_binding(db, thread_id="15287", session_id="sess-stale") + _seed_binding(db, thread_id="15418", session_id="sess-fresh") + + removed = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) + assert removed == 1 + + # Stale binding is gone; the fresh one survives. + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) is None + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15418", + ) is not None + db.close() + + def test_missing_row_returns_zero_silently(self, tmp_path): + db = SessionDB(db_path=tmp_path / "state.db") + _seed_binding(db, thread_id="15287") + + # Different thread_id — must not raise, just report 0. + removed = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="99999", + ) + assert removed == 0 + # Original binding still intact. + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) is not None + db.close() + + def test_pristine_database_with_no_topic_tables_is_silent_noop(self, tmp_path): + # Fresh profile that has never run /topic — the topic-mode + # tables don't exist yet. The send-fallback hot path can + # still hit this code, so we must not crash. + db = SessionDB(db_path=tmp_path / "state.db") + # Confirm precondition: tables really aren't there. + tables = { + row[0] + for row in db._conn.execute( + "SELECT name FROM sqlite_master WHERE type='table' " + "AND name LIKE 'telegram_dm%'" + ).fetchall() + } + assert "telegram_dm_topic_bindings" not in tables + + removed = db.delete_telegram_topic_binding( + chat_id="any", thread_id="any", + ) + assert removed == 0 + db.close() + + def test_idempotent_under_repeated_calls(self, tmp_path): + db = SessionDB(db_path=tmp_path / "state.db") + _seed_binding(db, thread_id="15287") + + first = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) + second = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) + + assert first == 1 + assert second == 0 # already gone, no spurious "1" + db.close() + + +# --------------------------------------------------------------------------- +# Adapter glue — _prune_stale_dm_topic_binding +# --------------------------------------------------------------------------- + + +def _bare_adapter(db: SessionDB | None = None): + # The adapter accesses the SessionDB via + # ``self._session_store._db`` (set by GatewayRunner via + # ``set_session_store``). Build a minimal stand-in with just + # the surface the prune helper touches; we don't need the + # python-telegram-bot import-graph here. ``name`` is a + # property that delegates to ``platform.value.title()``, so + # we set ``platform`` rather than poking ``name`` directly. + from gateway.config import Platform + from plugins.platforms.telegram.adapter import TelegramAdapter + + adapter = object.__new__(TelegramAdapter) + adapter.platform = Platform.TELEGRAM + if db is not None: + adapter._session_store = SimpleNamespace(_db=db) + return adapter + + +class TestPruneStaleDmTopicBindingHelper: + def test_drops_binding_when_session_store_db_is_present(self, tmp_path): + db = SessionDB(db_path=tmp_path / "state.db") + _seed_binding(db, thread_id="15287") + + adapter = _bare_adapter(db) + adapter._prune_stale_dm_topic_binding("5595856929", 15287) + + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) is None + db.close() + + def test_silent_when_session_store_unavailable(self): + # No ``_session_store`` attribute — the helper must not + # explode (the streaming send path hits this in tests + # that bypass the gateway runner). + adapter = _bare_adapter() + adapter._prune_stale_dm_topic_binding("123", "456") + + def test_silent_when_db_lacks_helper(self): + # Old SessionDB without the new method (e.g. running + # against an older state.db schema). Must be a no-op + # rather than AttributeError. + adapter = _bare_adapter() + adapter._session_store = SimpleNamespace( + _db=SimpleNamespace(), # no methods at all + ) + adapter._prune_stale_dm_topic_binding("123", "456") + + def test_swallows_db_exceptions_so_send_continues(self): + class ExplodingDb: + def delete_telegram_topic_binding(self, **_): + raise RuntimeError("disk full or whatever") + + adapter = _bare_adapter() + adapter._session_store = SimpleNamespace(_db=ExplodingDb()) + + # The point of the helper is that a failed cleanup must + # NEVER turn into a failed user-facing send. No exception + # should escape. + adapter._prune_stale_dm_topic_binding("123", "456") + + def test_skips_when_chat_or_thread_missing(self, tmp_path): + # Defensive — control-message paths sometimes call us + # with chat_id=None when kwargs lack the key. We must + # not produce a spurious DELETE that matches every row + # with a NULL chat_id. + db = SessionDB(db_path=tmp_path / "state.db") + _seed_binding(db, thread_id="15287") + + adapter = _bare_adapter(db) + + adapter._prune_stale_dm_topic_binding(None, "15287") + adapter._prune_stale_dm_topic_binding("5595856929", None) + + # Still there — neither call generated a DELETE. + assert db.get_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) is not None + db.close() + + +# --------------------------------------------------------------------------- +# Source-level wiring guards — both fallback sites must call the helper +# --------------------------------------------------------------------------- + + +class TestThreadNotFoundFallbackSitesPruneBinding: + """Pin that the two ``Thread not found`` warning sites in the + Telegram adapter actually invoke ``_prune_stale_dm_topic_binding``. + These guards stop a future refactor from quietly losing the + cleanup wire — re-opening #31501. + """ + + def test_streaming_send_fallback_calls_prune(self): + from plugins.platforms.telegram import adapter as telegram_mod + + src = inspect.getsource(telegram_mod.TelegramAdapter.send) + # Locate the second-failure branch (the one that flips + # ``used_thread_fallback``). It must invoke the prune + # helper before flipping the flag. + marker = "retrying without message_thread_id" + idx = src.find(marker) + assert idx != -1, ( + "Streaming send must keep its 'thread not found' " + "fallback log line — the prune wiring is anchored " + "next to it." + ) + # 600 char window is enough to cover the warning, the + # prune call, and the ``used_thread_fallback = True`` + # assignment that follows. + window = src[idx:idx + 600] + assert "_prune_stale_dm_topic_binding" in window, ( + "Streaming send 'Thread not found' fallback must call " + "_prune_stale_dm_topic_binding so the stale row in " + "telegram_dm_topic_bindings doesn't keep redirecting " + "future inbound messages to the deleted topic (#31501)." + ) + + def test_control_message_helper_calls_prune(self): + from plugins.platforms.telegram import adapter as telegram_mod + + src = inspect.getsource( + telegram_mod.TelegramAdapter._send_message_with_thread_fallback + ) + # The helper has a single retry path; the prune call + # must sit inside it, not in dead code outside the + # ``if message_thread_id is not None and …`` guard. + assert "_prune_stale_dm_topic_binding" in src, ( + "_send_message_with_thread_fallback must call " + "_prune_stale_dm_topic_binding when Telegram returns " + "BadRequest('Thread not found') for a control message " + "(#31501)." + ) + # Belt-and-braces: the call must precede the retry + # ``send_message`` so the prune happens whether or not + # the retry itself succeeds. + prune_idx = src.find("_prune_stale_dm_topic_binding") + retry_idx = src.find("send_message(**retry_kwargs)") + assert 0 <= prune_idx < retry_idx, ( + "_prune_stale_dm_topic_binding must run before the " + "fallback send_message retry." + ) + + +# --------------------------------------------------------------------------- +# End-to-end semantic — prune + recovery returns None for deleted topic +# --------------------------------------------------------------------------- + + +class TestRecoveryAfterPrune: + """The whole point of the fix: once a topic is pruned, the + GatewayRunner's ``_recover_telegram_topic_thread_id`` must no + longer steer future inbound messages to it. + """ + + def test_recovery_no_longer_returns_pruned_topic(self, tmp_path): + # Build the same fixture used elsewhere: two topic bindings + # for the same user, then prune the most-recent one. + # ``_recover_telegram_topic_thread_id`` walks bindings + # newest-first, so without the prune it would pick the + # one we just removed. + from gateway.config import GatewayConfig, Platform, PlatformConfig + from gateway.run import GatewayRunner + from gateway.session import SessionSource, build_session_key + + db = SessionDB(db_path=tmp_path / "state.db") + db.enable_telegram_topic_mode( + chat_id="5595856929", user_id="5595856929", + ) + + for sid, thread in (("sess-A", "111"), ("sess-B", "222")): + db.create_session( + session_id=sid, source="telegram", + user_id="5595856929", + ) + db.bind_telegram_topic( + chat_id="5595856929", + thread_id=thread, + user_id="5595856929", + session_key=build_session_key(SessionSource( + platform=Platform.TELEGRAM, + user_id="5595856929", + chat_id="5595856929", + user_name="tester", + chat_type="dm", + thread_id=thread, + )), + session_id=sid, + ) + + runner = object.__new__(GatewayRunner) + runner.config = GatewayConfig( + platforms={ + Platform.TELEGRAM: PlatformConfig(enabled=True, token="***"), + } + ) + runner._session_db = db + runner._telegram_topic_mode_enabled = lambda _src: True + + # Sanity: before the prune, recovery picks "222" (newest). + # Recovery only fires for a lobby-shaped inbound (omitted + # message_thread_id or General topic "1"); a non-lobby + # unknown thread is preserved as a brand-new topic. Use the + # General topic id so the recovery walk actually runs. + before = runner._recover_telegram_topic_thread_id(SessionSource( + platform=Platform.TELEGRAM, + user_id="5595856929", + chat_id="5595856929", + user_name="tester", + chat_type="dm", + thread_id="1", # General/stripped reply — triggers recovery + )) + assert before == "222" + + # User deletes topic 222 in Telegram → adapter prunes. + db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="222", + ) + + # Now recovery falls back to topic 111 (the surviving + # binding) instead of the dead one. This is the exact + # behaviour change the bug report asks for. + after = runner._recover_telegram_topic_thread_id(SessionSource( + platform=Platform.TELEGRAM, + user_id="5595856929", + chat_id="5595856929", + user_name="tester", + chat_type="dm", + thread_id="1", + )) + assert after == "111" + db.close() From 6681f28d5b14ac38e444d3578c9170fffa5363d9 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:17:20 -0700 Subject: [PATCH 04/26] fix(telegram): disable DM topic mode when last binding is pruned Follow-up to #31501. When the send-fallback prune removes a chat's final telegram_dm_topic_bindings row, also flip telegram_dm_topic_mode.enabled to 0 in the same transaction. Without this, a user who turns topics off in the Telegram client (rather than via /topic off) leaves enabled=1 with zero lanes: _recover_telegram_topic_thread_id keeps treating the chat as topic-enabled and lobby messages keep hunting for bindings that no longer exist. Clearing the flag makes recovery fully stand down once the dead topics are gone. Adds 3 regression tests covering the last-binding clear, the multi-binding no-op, and the unmatched-prune no-op. --- hermes_state.py | 38 ++++++++++- ...elegram_prune_stale_topic_binding_31501.py | 65 +++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/hermes_state.py b/hermes_state.py index d307db7a735..cfb63bd165b 100644 --- a/hermes_state.py +++ b/hermes_state.py @@ -4615,8 +4615,19 @@ class SessionDB: topic, causing tool progress, approvals, and replies to land in the wrong place. Issue #31501. - Returns the number of rows deleted (0 when the binding was - already absent or the topic-mode tables haven't been + When this prune removes the chat's *last* remaining binding, + the chat's row in ``telegram_dm_topic_mode`` is also flipped to + ``enabled = 0`` in the same transaction. Otherwise the chat + would be left in topic mode with zero lanes — and + ``gateway.run._recover_telegram_topic_thread_id`` keeps treating + the chat as topic-enabled, lobby messages keep hunting for a + binding that no longer exists, and a user who disabled topics in + the Telegram client (rather than via ``/topic off``) stays stuck + until the next send happens to fail. Clearing the flag makes + recovery fully stand down once the dead topics are gone. + + Returns the number of binding rows deleted (0 when the binding + was already absent or the topic-mode tables haven't been migrated yet — both are silent no-ops; we never raise from a cleanup hot path). """ @@ -4637,6 +4648,29 @@ class SessionDB: except sqlite3.OperationalError: # Tables don't exist yet — nothing to prune. deleted["count"] = 0 + return + if not deleted["count"]: + return + # If that was the chat's last binding, disable topic mode for + # the chat so recovery stops steering lobby messages at a now + # empty lane set. Same transaction → no read-after-prune race. + try: + remaining = conn.execute( + """ + SELECT 1 FROM telegram_dm_topic_bindings + WHERE chat_id = ? LIMIT 1 + """, + (chat_id,), + ).fetchone() + if remaining is None: + conn.execute( + "UPDATE telegram_dm_topic_mode " + "SET enabled = 0, updated_at = ? WHERE chat_id = ?", + (time.time(), chat_id), + ) + except sqlite3.OperationalError: + # telegram_dm_topic_mode absent — binding prune still stands. + pass self._execute_write(_do) return deleted["count"] diff --git a/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py b/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py index 349ae856904..d93d6589689 100644 --- a/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py +++ b/tests/gateway/test_telegram_prune_stale_topic_binding_31501.py @@ -155,6 +155,71 @@ class TestDeleteTelegramTopicBinding: db.close() +class TestPruneClearsTopicModeWhenLastBindingGone: + """Proactive cleanup (#31501 follow-up): pruning the chat's final + binding must also flip ``telegram_dm_topic_mode.enabled`` to 0 so + recovery fully stands down — covers the user who disabled topics in + the Telegram client without ever running ``/topic off``.""" + + def test_clears_enabled_when_last_binding_pruned(self, tmp_path): + db = SessionDB(db_path=tmp_path / "state.db") + db.enable_telegram_topic_mode( + chat_id="5595856929", user_id="5595856929", + ) + _seed_binding(db, thread_id="15287") + assert db.is_telegram_topic_mode_enabled( + chat_id="5595856929", user_id="5595856929", + ) is True + + removed = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) + + assert removed == 1 + assert db.is_telegram_topic_mode_enabled( + chat_id="5595856929", user_id="5595856929", + ) is False + db.close() + + def test_keeps_enabled_while_other_bindings_remain(self, tmp_path): + # Deleting one of several topics must NOT disable topic mode — + # the chat still has healthy lanes that recovery should serve. + db = SessionDB(db_path=tmp_path / "state.db") + db.enable_telegram_topic_mode( + chat_id="5595856929", user_id="5595856929", + ) + _seed_binding(db, thread_id="15287", session_id="sess-stale") + _seed_binding(db, thread_id="15418", session_id="sess-fresh") + + db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="15287", + ) + + assert db.is_telegram_topic_mode_enabled( + chat_id="5595856929", user_id="5595856929", + ) is True + db.close() + + def test_noop_prune_leaves_enabled_untouched(self, tmp_path): + # A prune that matches no row must not flip the flag — there's + # still a live binding the (wrong) thread_id didn't match. + db = SessionDB(db_path=tmp_path / "state.db") + db.enable_telegram_topic_mode( + chat_id="5595856929", user_id="5595856929", + ) + _seed_binding(db, thread_id="15287") + + removed = db.delete_telegram_topic_binding( + chat_id="5595856929", thread_id="99999", + ) + + assert removed == 0 + assert db.is_telegram_topic_mode_enabled( + chat_id="5595856929", user_id="5595856929", + ) is True + db.close() + + # --------------------------------------------------------------------------- # Adapter glue — _prune_stale_dm_topic_binding # --------------------------------------------------------------------------- From 2a58fee1a1bcae25c4159c49db213c87ff0709de Mon Sep 17 00:00:00 2001 From: Austin Pickett Date: Mon, 22 Jun 2026 15:55:33 -0400 Subject: [PATCH 05/26] fix(api): allow dashboard updates for git checkouts in containers (#51005) Salvages #50469 by @libre-7. _dashboard_local_update_managed_externally() previously blocked every containerized dashboard from the local update API, even when the running install was a bind-mounted git checkout that can be updated with hermes update. Allow the dashboard updater only for git installs inside containers, while keeping hosted /opt/data, docker, and pip installs managed externally. Pip remains blocked because its apply path mutates the running container filesystem and is not the self-managed checkout case. Adds regression coverage for docker, git, and pip install-method handling inside containers, and maps the contributor email for release attribution. Co-authored-by: libre-7 --- hermes_cli/web_server.py | 24 +++++++++++++++++++++++- scripts/release.py | 1 + tests/hermes_cli/test_web_server.py | 25 +++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index eb24b9f50eb..997803b8f0a 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -1322,13 +1322,35 @@ def _dashboard_local_update_managed_externally() -> bool: in-browser local update action. Keep this dashboard capability separate from install-method detection: manual git/pip installs inside containers can still behave like their actual install method in the CLI. + + However, when the install method is ``git`` (a bind-mounted checkout inside + a container — e.g. the hermes-webui image sharing the Hermes source tree), + the dashboard's ``hermes update`` button is the correct update path and + should not be suppressed. Other containerized install methods remain + externally managed unless their apply path is proven safe inside the + running container filesystem. """ + if _default_hermes_root_is_opt_data(): + return True try: from hermes_constants import is_container - return is_container() + if not is_container(): + return False except Exception: return False + # We are inside a container, but the install may still be self-managed. + # If the install method is git, the dashboard update button works against + # the mounted checkout and should be offered. Keep pip blocked inside + # containers: its apply path mutates the running container filesystem and + # is not the bind-mounted checkout case this gate is meant to recover. + try: + method = detect_install_method(PROJECT_ROOT) + if method == "git": + return False + except Exception: + pass + return True def _managed_files_policy(request: Request, *, create_root: bool = True) -> ManagedFilesPolicy: diff --git a/scripts/release.py b/scripts/release.py index 7cea21ce9b6..2c781838fc8 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -107,6 +107,7 @@ AUTHOR_MAP = { "804436395@qq.com": "LaPhilosophie", "maxmitcham@mac.home": "maxtrigify", "ccook@nvms.com": "ccook1963", + "libre-7@users.noreply.github.com": "libre-7", "kristian@agrointel.no": "kristianvast", "thomas.paquette@gmail.com": "RyTsYdUp", "techxacm@gmail.com": "ProgramCaiCai", diff --git a/tests/hermes_cli/test_web_server.py b/tests/hermes_cli/test_web_server.py index 0618221a301..76ba0e5f488 100644 --- a/tests/hermes_cli/test_web_server.py +++ b/tests/hermes_cli/test_web_server.py @@ -263,6 +263,29 @@ class TestWebServerEndpoints: import hermes_cli.web_server as web_server monkeypatch.setattr(hermes_constants, "is_container", lambda: True) + # A docker install inside a container should be managed externally. + monkeypatch.setattr(web_server, "detect_install_method", lambda _root: "docker") + + assert web_server._dashboard_local_update_managed_externally() is True + + def test_dashboard_update_capability_allows_git_in_container(self, monkeypatch): + """A git checkout inside a container (e.g. bind-mounted in hermes-webui) + should still offer dashboard updates — the checkout is self-managed.""" + import hermes_constants + import hermes_cli.web_server as web_server + + monkeypatch.setattr(hermes_constants, "is_container", lambda: True) + monkeypatch.setattr(web_server, "detect_install_method", lambda _root: "git") + + assert web_server._dashboard_local_update_managed_externally() is False + + def test_dashboard_update_capability_blocks_pip_in_container(self, monkeypatch): + """A pip install inside a container is still managed externally.""" + import hermes_constants + import hermes_cli.web_server as web_server + + monkeypatch.setattr(hermes_constants, "is_container", lambda: True) + monkeypatch.setattr(web_server, "detect_install_method", lambda _root: "pip") assert web_server._dashboard_local_update_managed_externally() is True @@ -1011,6 +1034,8 @@ class TestWebServerEndpoints: spawned = True raise AssertionError("docker update guard should not spawn hermes update") + # Bypass the managed-externally gate so we reach the docker install check. + monkeypatch.setattr(web_server, "_dashboard_local_update_managed_externally", lambda: False) monkeypatch.setattr(web_server, "detect_install_method", lambda _root: "docker") monkeypatch.setattr(web_server, "_spawn_hermes_action", fail_spawn) web_server._ACTION_PROCS.pop("hermes-update", None) From f721d2cda9f25fecd782525d8ea1312cfebec879 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 13:40:42 -0700 Subject: [PATCH 06/26] fix(image/video gen): make schema delivery instruction platform-neutral (#51031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: re-trigger CI (workflows did not dispatch on prior head) * fix(image/video gen): make schema delivery instruction platform-neutral The image_generate and video_generate tool schema descriptions hardcoded a gateway-only delivery instruction ('display it with markdown ![description](url-or-path) and the gateway will deliver it'). That schema is sent on every platform, so on CLI it directly contradicted the CLI platform hint ('Do NOT emit MEDIA:/path tags ... state its absolute path in plain text'), and on messaging platforms it was also wrong about the mechanism (local file paths are delivered via MEDIA: tags, not markdown image syntax — markdown ![]() only works for URLs). The per-platform file-delivery convention is already owned correctly by the platform hints in prompt_builder.py. The tool schema now just describes the result shape (URL or absolute path in the image/video field) and defers 'how to deliver' to the active platform's guidance. Provider/model injection already works via _build_dynamic_image_schema() (the 'Active backend: · model: ' line); no change there. --- tools/image_generation_tool.py | 12 +++++++----- tools/video_generation_tool.py | 8 +++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/image_generation_tool.py b/tools/image_generation_tool.py index 101b000db2a..81c6491f9d9 100644 --- a/tools/image_generation_tool.py +++ b/tools/image_generation_tool.py @@ -1184,11 +1184,13 @@ IMAGE_GENERATE_SCHEMA = { "`reference_image_urls` for style/composition references; omit both " "for text-to-image. The underlying backend (FAL, OpenAI, xAI, etc.) " "and model are user-configured and not selectable by the agent. " - "Returns either a URL or an absolute file path in the `image` field; " - "display it with markdown ![description](url-or-path) and the gateway " - "will deliver it. When the active terminal backend has a different " - "filesystem, successful local-file results may also include " - "`agent_visible_image` for follow-up terminal/file operations." + "Returns the result in the `image` field — either a URL or an absolute " + "file path. To show it to the user, reference that path/URL in your " + "response using the file-delivery convention for the current platform " + "(your platform guidance describes how files are delivered here). When " + "the active terminal backend has a different filesystem, successful " + "local-file results may also include `agent_visible_image` for " + "follow-up terminal/file operations." ), "parameters": { "type": "object", diff --git a/tools/video_generation_tool.py b/tools/video_generation_tool.py index 2465199f3d1..789ead6a054 100644 --- a/tools/video_generation_tool.py +++ b/tools/video_generation_tool.py @@ -419,9 +419,11 @@ _GENERIC_DESCRIPTION = ( "endpoint. The backend and model family are user-configured via " "`hermes tools` → Video Generation; the agent does not pick them. " "Long-running generations may take 30 seconds to several minutes — " - "the call blocks until the video is ready. Returns either an HTTP " - "URL or an absolute file path in the `video` field; display it with " - "markdown ![description](url-or-path) and the gateway will deliver it." + "the call blocks until the video is ready. Returns the result in the " + "`video` field — either an HTTP URL or an absolute file path. To show " + "it to the user, reference that path/URL in your response using the " + "file-delivery convention for the current platform (your platform " + "guidance describes how files are delivered here)." ) From 5f1d23cfb2c5bae3c76bd36981df0e932940cf06 Mon Sep 17 00:00:00 2001 From: Francesco Bonacci Date: Mon, 22 Jun 2026 07:24:37 -0700 Subject: [PATCH 07/26] fix(computer-use): delete broken pre-install asset probe; trust the upstream installer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `hermes computer-use install` refused to install on Linux, Windows, and macOS x86_64 because the pre-install asset probe was hitting the wrong GitHub endpoint AND duplicating tag-resolution logic the upstream installer already does correctly. `_check_cua_driver_asset_for_arch()` queried `https://api.github.com/repos/trycua/cua/releases/latest`. On trycua/cua: - cua-driver-rs releases (the binary the installer fetches) are marked **prerelease** on every cut. GitHub's `/releases/latest` explicitly skips prereleases. - The Python package releases (`cua-agent`, `cua-computer`, `cua-train`) are non-prerelease and end up as the "latest" instead. Live API check today: $ curl -sf https://api.github.com/repos/trycua/cua/releases/latest \ | jq '{tag:.tag_name, asset_count: (.assets|length)}' { "tag": "agent-v0.8.3", "asset_count": 0 } The probe sees zero assets, prints "Latest CUA release has no Linux x86_64 asset", and skips install on every Linux / Windows / macOS-x86_64 host — even though the cua-driver-rs-v0.6.0 release ships 19 binary assets covering all those platforms. Filtering `/releases?per_page=N` for the `cua-driver-rs-v*` prefix fixes the bug, but it duplicates tag-resolution logic the upstream `_install-rust.sh` already does correctly via `CUA_DRIVER_RS_BAKED_VERSION` (auto-baked by CD on every release, with a `/releases?per_page=N` API fallback for dev checkouts). The right answer is to trust that contract instead of mirroring it in Python where it can drift. Two paths get the same outcome without the probe: 1. **Fresh install**: run `install.sh` directly. It has the baked release tag, fetches the right asset, and errors with a clear message on missing-arch downloads. No preflight needed. 2. **Upgrade path**: `cua_driver_update_check()` (separately added) shells `cua-driver check-update --json` against the installed binary, which returns the canonical update answer from the same source the installer uses. - `hermes_cli/tools_config.py`: delete `_check_cua_driver_asset_for_arch` and its two call sites in `install_cua_driver`. Replace with an inline comment near the top of the module explaining the rationale. - `tests/hermes_cli/test_install_cua_driver.py`: drop the `TestCheckCuaDriverAssetForArch` block. Add `TestArchProbeRemoval` with three regressions: - `test_probe_function_is_gone` — asserts the deleted helpers stay deleted. - `test_fresh_install_does_not_call_github_api` — asserts the install path doesn't hit GitHub directly from Python anymore. - `test_upgrade_with_binary_does_not_call_github_api_directly` — same for the upgrade path. All 9 `test_install_cua_driver` tests pass. Reported by @teknium1 while testing on a headed Ubuntu host. --- hermes_cli/tools_config.py | 132 ++----- tests/hermes_cli/test_install_cua_driver.py | 417 +++----------------- 2 files changed, 97 insertions(+), 452 deletions(-) diff --git a/hermes_cli/tools_config.py b/hermes_cli/tools_config.py index 741dbb267dd..dfd7c60e744 100644 --- a/hermes_cli/tools_config.py +++ b/hermes_cli/tools_config.py @@ -667,102 +667,31 @@ def _pip_install( -def _check_cua_driver_asset_for_arch() -> bool: - """Check whether the latest CUA release ships an asset for this OS+arch. - - Returns True if the asset likely exists (or if we cannot determine it). - Returns False and prints a warning when the asset is confirmed missing, - so callers can skip the install attempt and avoid a raw 404. - - Recognizes release-asset names across all supported platforms: - - * macOS (``Darwin``) — arm64 always ships; x86_64/amd64 probed. - * Windows (``AMD64``/``ARM64``) — amd64/x86_64 and arm64 probed. - * Linux (``x86_64``/``aarch64``) — x86_64/amd64 and aarch64/arm64 probed. - """ - import platform as _plat - import urllib.request - - system = _plat.system() - machine = _plat.machine().lower() # e.g. "x86_64", "arm64", "amd64", "aarch64" - - # arm64 (Apple Silicon) macOS assets are always published — short-circuit - # to preserve the original fail-open behaviour and avoid a network call. - if system == "Darwin" and machine == "arm64": - return True - - # Map this host's arch to the set of asset-name substrings we'll accept. - # Asset names vary by OS (darwin-x86_64, windows-amd64, linux-aarch64, …), - # so we match on the architecture token only and let any of the common - # aliases satisfy the probe. - if machine in {"x86_64", "amd64", "x64"}: - arch_names = {"x86_64", "amd64", "x64"} - arch_label = "x86_64/amd64" - elif machine in {"arm64", "aarch64"}: - arch_names = {"arm64", "aarch64"} - arch_label = "arm64/aarch64" - else: - # Unknown arch — fail open and let the installer surface the error. - return True - - # Probe the cua-driver release for an OS+arch asset before falling through - # to the upstream installer. - # - # The cua-driver-rs binaries are published to the trycua/cua monorepo under - # tag prefix ``cua-driver-rs-v*``. The repo's ``releases/latest`` is NOT - # that — it floats across the monorepo's other components (agent-*, - # computer-*, lume-*, train-*), most of which ship zero binary assets. So - # we list releases and pick the newest ``cua-driver-rs-v*`` tag, matching - # what the upstream install.sh does. Failing to find one => fail open and - # let the installer (which resolves the tag itself) be the source of truth. - driver_tag_prefix = "cua-driver-rs-v" - api_url = ( - "https://api.github.com/repos/trycua/cua/releases?per_page=100" - ) - try: - req = urllib.request.Request(api_url, headers={"Accept": "application/vnd.github+json"}) - with urllib.request.urlopen(req, timeout=10) as resp: - releases = _json.loads(resp.read().decode()) - if not isinstance(releases, list): - return True - # GitHub returns releases newest-first; take the first cua-driver-rs tag. - driver_release = next( - ( - r for r in releases - if str(r.get("tag_name", "")).startswith(driver_tag_prefix) - ), - None, - ) - if driver_release is None: - # No cua-driver-rs release surfaced (API hiccup / unexpected shape). - # Fail open — the installer resolves the tag on its own. - return True - tag = driver_release.get("tag_name", "") - assets = driver_release.get("assets", []) - # OS token gates the asset alongside arch so a darwin asset can't - # satisfy a Linux probe (every cua-driver-rs release ships all three - # OSes, so the arch token alone would always match). - os_token = {"Darwin": "darwin", "Windows": "windows", "Linux": "linux"}.get(system, "") - has_asset = any( - os_token in (name := a_info.get("name", "").lower()) - and any(a in name for a in arch_names) - for a_info in assets - ) - if not has_asset: - _print_warning( - f" Latest cua-driver release ({tag}) has no {system} {arch_label} asset." - ) - _print_info( - " CUA Driver may not yet ship a build for this platform." - ) - _print_info( - " See: https://github.com/trycua/cua/releases" - ) - return False - except Exception: - # Network / API failure — proceed and let the installer handle it. - pass - return True +# The asset-probe that lived here used to hit `/releases/latest` on +# trycua/cua and inspect the release's asset list before piping the +# installer to bash. It was broken in two places: +# +# 1. cua-driver-rs releases are marked **prerelease** on every cut, +# and GitHub's `/releases/latest` endpoint explicitly skips +# prereleases. On the live trycua/cua repo today, `/releases/latest` +# returns the Python `cua-agent v0.8.3` package (zero binary +# assets) instead of `cua-driver-rs-v0.6.0` (19 binary assets). +# The probe then reported "no asset for this arch" and skipped the +# install on every non-arm64 host — Linux x86_64, Windows, macOS +# Intel, Linux arm64 — even when the upstream installer would have +# succeeded. +# 2. Even with the right endpoint, we'd be duplicating tag-resolution +# logic the upstream installer already does correctly via +# `CUA_DRIVER_RS_BAKED_VERSION` (auto-baked by CD on every release, +# with an API fallback). Drift between our probe and theirs is a +# maintenance hazard. +# +# Resolution: trust the upstream installer. For fresh installs, run +# install.sh directly — it errors clean if the target arch has no +# asset. For the upgrade path, `cua_driver_update_check()` (which calls +# `cua-driver check-update --json`) gives us the canonical update +# answer from the binary itself — same tag-resolution as the installer, +# no Python-side duplication. def install_cua_driver(upgrade: bool = False) -> bool: @@ -811,8 +740,9 @@ def install_cua_driver(upgrade: bool = False) -> bool: _print_warning(f" {fetch_tool} not found — install manually:") _print_info(" https://github.com/trycua/cua/blob/main/libs/cua-driver/README.md") return False - if not _check_cua_driver_asset_for_arch(): - return False + # Pre-install asset probe deleted — see comment near the top of + # tools_config.py for why. install.sh has CUA_DRIVER_RS_BAKED_VERSION + # baked in by CD and errors cleanly on missing-arch assets. return _run_cua_driver_installer(label="Installing") # Already installed and caller didn't ask to upgrade → just confirm. @@ -841,8 +771,10 @@ def install_cua_driver(upgrade: bool = False) -> bool: _print_warning(f" {fetch_tool} not found — cannot refresh cua-driver.") return bool(binary) - if not _check_cua_driver_asset_for_arch(): - return bool(binary) + # Pre-install asset probe deleted (see top-of-file comment). The + # `cua_driver_update_check()` call further down asks the installed + # cua-driver binary itself whether an update exists — same + # tag-resolution as the installer, no duplication. # Skip the (network) re-install when the driver itself reports it's already # on the latest release. Best-effort: an older driver (no check-update diff --git a/tests/hermes_cli/test_install_cua_driver.py b/tests/hermes_cli/test_install_cua_driver.py index 27da8d22e06..e05dd42627c 100644 --- a/tests/hermes_cli/test_install_cua_driver.py +++ b/tests/hermes_cli/test_install_cua_driver.py @@ -1,42 +1,43 @@ -"""Tests for ``install_cua_driver`` upgrade semantics and architecture pre-check. +"""Tests for ``install_cua_driver`` upgrade semantics. The cua-driver upstream installer always pulls the latest release tag, so re-running it is the canonical upgrade path. ``install_cua_driver(upgrade=True)`` must: -* Be cross-platform — run on macOS, Windows, and Linux. Only genuinely - unsupported platforms no-op silently on upgrade so ``hermes update`` can - call it unconditionally without warning those users. -* Choose the right installer per OS: ``install.sh`` via ``curl | bash`` on - macOS/Linux, ``install.ps1`` via PowerShell ``irm | iex`` on Windows. +* Be macOS-only — no-op silently on Linux/Windows so ``hermes update`` can + call it unconditionally without warning every non-macOS user. * Re-run the installer even when the binary is already on PATH (this is the fix for the "we only pulled cua-driver once on enable" complaint). * Preserve original ``upgrade=False`` behaviour for the toolset-enable flow: - skip if installed, install otherwise, warn on unsupported platforms. -* Pre-check architecture compatibility before downloading to avoid raw 404 - errors when the upstream release lacks an asset for this OS+arch. + skip if installed, install otherwise, warn on non-macOS. + +The pre-install arch probe that used to live alongside this function was +deleted (see top-of-file comment in tools_config.py) — the upstream +installer has CUA_DRIVER_RS_BAKED_VERSION baked in by CD and errors +cleanly on missing-arch assets, and the upgrade path uses +``cua_driver_update_check()`` (which shells `cua-driver check-update +--json` against the already-installed binary). """ from __future__ import annotations -import json -from unittest.mock import MagicMock, patch +from unittest.mock import patch class TestInstallCuaDriverUpgrade: - def test_upgrade_on_unsupported_platform_is_silent_noop(self): + def test_upgrade_on_non_macos_is_silent_noop(self): from hermes_cli import tools_config with patch.object(tools_config, "_print_warning") as warn, \ - patch("platform.system", return_value="FreeBSD"): + patch("platform.system", return_value="Linux"): assert tools_config.install_cua_driver(upgrade=True) is False warn.assert_not_called() - def test_non_upgrade_on_unsupported_platform_warns(self): + def test_non_upgrade_on_non_macos_warns(self): from hermes_cli import tools_config with patch.object(tools_config, "_print_warning") as warn, \ - patch("platform.system", return_value="FreeBSD"): + patch("platform.system", return_value="Linux"): assert tools_config.install_cua_driver(upgrade=False) is False warn.assert_called() @@ -47,8 +48,6 @@ class TestInstallCuaDriverUpgrade: patch.object(tools_config.shutil, "which", side_effect=lambda n: "/usr/local/bin/" + n if n in {"cua-driver", "curl"} else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ patch.object(tools_config, "_run_cua_driver_installer", return_value=True) as runner, \ patch("subprocess.run"): @@ -63,8 +62,6 @@ class TestInstallCuaDriverUpgrade: with patch("platform.system", return_value="Darwin"), \ patch.object(tools_config.shutil, "which", side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ patch.object(tools_config, "_run_cua_driver_installer", return_value=True) as runner: assert tools_config.install_cua_driver(upgrade=True) is True @@ -88,359 +85,75 @@ class TestInstallCuaDriverUpgrade: with patch("platform.system", return_value="Darwin"), \ patch.object(tools_config.shutil, "which", side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ patch.object(tools_config, "_run_cua_driver_installer", return_value=True) as runner: assert tools_config.install_cua_driver(upgrade=False) is True + runner.assert_called_once() -class TestCheckCuaDriverAssetForArch: - def test_arm64_macos_always_returns_true(self): +class TestArchProbeRemoval: + """Regression tests for the deletion of `_check_cua_driver_asset_for_arch`. + + The old probe queried ``/releases/latest`` on trycua/cua and inspected + asset names. That was wrong in two ways: + + 1. cua-driver-rs releases are marked **prerelease** on every cut, so + ``/releases/latest`` returns the Python ``cua-agent`` / ``cua-computer`` + package instead — a release with zero binary assets. The probe then + reported "no asset for $arch" on Linux x86_64, Windows, macOS Intel, + Linux arm64 — every non-Apple-Silicon host. + 2. Even with the right endpoint, it duplicated tag-resolution the upstream + installer already does correctly via ``CUA_DRIVER_RS_BAKED_VERSION`` + (auto-baked by CD on every release). + + The fix: stop probing. Trust the upstream installer for fresh installs + (it has the baked version + correct API fallback) and the + ``cua-driver check-update --json`` MCP-binary native command for the + upgrade path. + """ + + def test_probe_function_is_gone(self): from hermes_cli import tools_config + assert not hasattr(tools_config, "_check_cua_driver_asset_for_arch") + assert not hasattr(tools_config, "_latest_cua_driver_rs_release") - # Apple Silicon assets are always published — short-circuits without - # a network probe. - with patch("platform.system", return_value="Darwin"), \ - patch("platform.machine", return_value="arm64"): - assert tools_config._check_cua_driver_asset_for_arch() is True - - def test_x86_64_with_asset_returns_true(self): + def test_fresh_install_does_not_call_github_api(self): + """Pre-install no longer probes the GitHub API — the upstream + ``install.sh`` resolves the tag from its baked CUA_DRIVER_RS_BAKED_VERSION + line. install.sh errors cleanly when the arch has no asset, so the + probe was duplicate gatekeeping. + """ from hermes_cli import tools_config - releases = [{ - "tag_name": "cua-driver-rs-v0.1.6", - "assets": [ - {"name": "cua-driver-rs-0.1.6-darwin-arm64.tar.gz"}, - {"name": "cua-driver-rs-0.1.6-darwin-x86_64.tar.gz"}, - ], - }] - mock_resp = MagicMock() - mock_resp.read.return_value = json.dumps(releases).encode() - mock_resp.__enter__ = lambda s: s - mock_resp.__exit__ = MagicMock(return_value=False) - - with patch("platform.system", return_value="Darwin"), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=mock_resp): - assert tools_config._check_cua_driver_asset_for_arch() is True - - def test_x86_64_without_asset_returns_false(self): - from hermes_cli import tools_config - - releases = [{ - "tag_name": "cua-driver-rs-v0.1.6", - "assets": [ - {"name": "cua-driver-rs-0.1.6-darwin-arm64.tar.gz"}, - {"name": "cua-driver-rs.tar.gz"}, - ], - }] - mock_resp = MagicMock() - mock_resp.read.return_value = json.dumps(releases).encode() - mock_resp.__enter__ = lambda s: s - mock_resp.__exit__ = MagicMock(return_value=False) - - with patch("platform.system", return_value="Darwin"), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=mock_resp), \ - patch.object(tools_config, "_print_warning") as warn, \ - patch.object(tools_config, "_print_info"): - assert tools_config._check_cua_driver_asset_for_arch() is False - warn.assert_called_once() - assert "no Intel" in warn.call_args[0][0].lower() or "x86_64" in warn.call_args[0][0] - - def test_x86_64_api_failure_returns_true(self): - """Network failure should fail open — let the installer handle it.""" - from hermes_cli import tools_config - - with patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", side_effect=Exception("timeout")): - assert tools_config._check_cua_driver_asset_for_arch() is True - - def test_fresh_install_x86_64_no_asset_skips_installer(self): - """When the latest release has no Intel asset, skip the installer.""" - from hermes_cli import tools_config - - releases = [{ - "tag_name": "cua-driver-rs-v0.1.6", - "assets": [{"name": "cua-driver-rs-0.1.6-darwin-arm64.tar.gz"}], - }] - mock_resp = MagicMock() - mock_resp.read.return_value = json.dumps(releases).encode() - mock_resp.__enter__ = lambda s: s - mock_resp.__exit__ = MagicMock(return_value=False) - with patch("platform.system", return_value="Darwin"), \ patch.object(tools_config.shutil, "which", side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=mock_resp), \ - patch.object(tools_config, "_print_warning"), \ - patch.object(tools_config, "_print_info"), \ - patch.object(tools_config, "_run_cua_driver_installer") as runner: - assert tools_config.install_cua_driver(upgrade=False) is False - runner.assert_not_called() + patch("urllib.request.urlopen") as urlopen, \ + patch.object(tools_config, "_run_cua_driver_installer", + return_value=True) as runner: + assert tools_config.install_cua_driver(upgrade=False) is True + runner.assert_called_once() + urlopen.assert_not_called() - def test_upgrade_x86_64_no_asset_returns_existing_status(self): - """On upgrade with no Intel asset, return whether binary existed.""" + def test_upgrade_with_binary_does_not_call_github_api_directly(self): + """The upgrade path no longer hits GitHub from Python — it delegates + to the upstream ``install.sh`` (which has the baked release tag and + the proper API fallback). When cua-driver is already installed, + ``cua_driver_update_check()`` (added in a separate change) further + short-circuits the network re-install via the binary's native + ``check-update --json`` verb. + """ from hermes_cli import tools_config - releases = [{ - "tag_name": "cua-driver-rs-v0.1.6", - "assets": [{"name": "cua-driver-rs-0.1.6-darwin-arm64.tar.gz"}], - }] - mock_resp = MagicMock() - mock_resp.read.return_value = json.dumps(releases).encode() - mock_resp.__enter__ = lambda s: s - mock_resp.__exit__ = MagicMock(return_value=False) - - # With binary installed — returns True (binary exists) with patch("platform.system", return_value="Darwin"), \ patch.object(tools_config.shutil, "which", side_effect=lambda n: "/usr/local/bin/" + n if n in ("cua-driver", "curl") else None), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=mock_resp), \ - patch.object(tools_config, "_print_warning"), \ - patch.object(tools_config, "_print_info"), \ - patch.object(tools_config, "_run_cua_driver_installer") as runner: - assert tools_config.install_cua_driver(upgrade=True) is True - runner.assert_not_called() - - # Without binary — returns False - with patch("platform.system", return_value="Darwin"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=mock_resp), \ - patch.object(tools_config, "_print_warning"), \ - patch.object(tools_config, "_print_info"), \ - patch.object(tools_config, "_run_cua_driver_installer") as runner: - assert tools_config.install_cua_driver(upgrade=True) is False - runner.assert_not_called() - - -class TestInstallCuaDriverWindows: - """install_cua_driver dispatch on Windows hosts.""" - - def test_fresh_install_runs_installer(self): - from hermes_cli import tools_config - - # PowerShell present, cua-driver not yet installed. - with patch("platform.system", return_value="Windows"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: r"C:\\Windows\\powershell.exe" - if n == "powershell" else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ + patch("urllib.request.urlopen") as urlopen, \ + patch("subprocess.run"), \ patch.object(tools_config, "_run_cua_driver_installer", return_value=True) as runner: - assert tools_config.install_cua_driver(upgrade=False) is True - runner.assert_called_once() - - def test_fresh_install_without_powershell_fails(self): - from hermes_cli import tools_config - - with patch("platform.system", return_value="Windows"), \ - patch.object(tools_config.shutil, "which", lambda n: None), \ - patch.object(tools_config, "_print_warning") as warn, \ - patch.object(tools_config, "_print_info"), \ - patch.object(tools_config, "_run_cua_driver_installer") as runner: - assert tools_config.install_cua_driver(upgrade=False) is False - runner.assert_not_called() - # The warning should name the missing fetch tool (powershell). - assert "powershell" in warn.call_args[0][0].lower() - - def test_upgrade_with_binary_runs_installer(self): - from hermes_cli import tools_config - - with patch("platform.system", return_value="Windows"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: r"C:\\bin\\" + n - if n in {"cua-driver", "powershell"} else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ - patch.object(tools_config, "_run_cua_driver_installer", - return_value=True) as runner, \ - patch("subprocess.run"): assert tools_config.install_cua_driver(upgrade=True) is True runner.assert_called_once() - assert runner.call_args.kwargs.get("verbose") is False - - def test_installer_uses_powershell_irm_command(self): - """_run_cua_driver_installer must shell out to PowerShell irm|iex.""" - from hermes_cli import tools_config - - completed = MagicMock(returncode=0) - with patch("platform.system", return_value="Windows"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: r"C:\\bin\\" + n - if n == "cua-driver" else None), \ - patch("subprocess.run", return_value=completed) as run, \ - patch.object(tools_config, "_print_info"), \ - patch.object(tools_config, "_print_success"), \ - patch.object(tools_config, "_print_warning"): - assert tools_config._run_cua_driver_installer() is True - cmd = run.call_args[0][0] - # Argument list (shell=False), not a string. - assert isinstance(cmd, list) - assert cmd[0] == "powershell" - assert run.call_args.kwargs.get("shell") is False - joined = " ".join(cmd) - assert "install.ps1" in joined - assert "iex" in joined - - -class TestInstallCuaDriverLinux: - """install_cua_driver dispatch on Linux hosts (alpha).""" - - def test_fresh_install_runs_installer(self): - from hermes_cli import tools_config - - with patch("platform.system", return_value="Linux"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: "/usr/bin/curl" if n == "curl" else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ - patch.object(tools_config, "_run_cua_driver_installer", - return_value=True) as runner: - assert tools_config.install_cua_driver(upgrade=False) is True - runner.assert_called_once() - - def test_upgrade_with_binary_runs_installer(self): - from hermes_cli import tools_config - - with patch("platform.system", return_value="Linux"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: "/usr/local/bin/" + n - if n in {"cua-driver", "curl"} else None), \ - patch.object(tools_config, "_check_cua_driver_asset_for_arch", - return_value=True), \ - patch.object(tools_config, "_run_cua_driver_installer", - return_value=True) as runner, \ - patch("subprocess.run"): - assert tools_config.install_cua_driver(upgrade=True) is True - runner.assert_called_once() - - def test_installer_uses_curl_bash_command(self): - """_run_cua_driver_installer must shell out to curl | bash install.sh.""" - from hermes_cli import tools_config - - completed = MagicMock(returncode=0) - with patch("platform.system", return_value="Linux"), \ - patch.object(tools_config.shutil, "which", - side_effect=lambda n: "/usr/local/bin/" + n - if n == "cua-driver" else None), \ - patch("subprocess.run", return_value=completed) as run, \ - patch.object(tools_config, "_print_info"), \ - patch.object(tools_config, "_print_success"), \ - patch.object(tools_config, "_print_warning"): - assert tools_config._run_cua_driver_installer() is True - cmd = run.call_args[0][0] - assert isinstance(cmd, str) # shell string on POSIX - assert run.call_args.kwargs.get("shell") is True - assert "install.sh" in cmd - assert "curl" in cmd - - -class TestCheckCuaDriverAssetCrossPlatform: - """_check_cua_driver_asset_for_arch recognizes Windows/Linux asset names.""" - - @staticmethod - def _mock_release(asset_names): - # The probe lists /releases and picks the newest cua-driver-rs-v* tag, - # so the mock returns a LIST of releases with that tag prefix. - releases = [{"tag_name": "cua-driver-rs-v0.5.0", - "assets": [{"name": n} for n in asset_names]}] - resp = MagicMock() - resp.read.return_value = json.dumps(releases).encode() - resp.__enter__ = lambda s: s - resp.__exit__ = MagicMock(return_value=False) - return resp - - def test_windows_amd64_with_asset_returns_true(self): - from hermes_cli import tools_config - - resp = self._mock_release([ - "cua-driver-rs-0.5.0-windows-x86_64.zip", - "cua-driver-rs-0.5.0-darwin-arm64.tar.gz", - ]) - with patch("platform.system", return_value="Windows"), \ - patch("platform.machine", return_value="AMD64"), \ - patch("urllib.request.urlopen", return_value=resp): - assert tools_config._check_cua_driver_asset_for_arch() is True - - def test_windows_arm64_without_asset_returns_false(self): - from hermes_cli import tools_config - - resp = self._mock_release([ - "cua-driver-rs-0.5.0-windows-x86_64.zip", - ]) - with patch("platform.system", return_value="Windows"), \ - patch("platform.machine", return_value="ARM64"), \ - patch("urllib.request.urlopen", return_value=resp), \ - patch.object(tools_config, "_print_warning") as warn, \ - patch.object(tools_config, "_print_info"): - assert tools_config._check_cua_driver_asset_for_arch() is False - warn.assert_called_once() - assert "arm64" in warn.call_args[0][0].lower() - - def test_linux_x86_64_with_asset_returns_true(self): - from hermes_cli import tools_config - - resp = self._mock_release([ - "cua-driver-rs-0.5.0-linux-x86_64.tar.gz", - ]) - with patch("platform.system", return_value="Linux"), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=resp): - assert tools_config._check_cua_driver_asset_for_arch() is True - - def test_linux_aarch64_with_asset_returns_true(self): - from hermes_cli import tools_config - - resp = self._mock_release([ - "cua-driver-rs-0.5.0-linux-arm64.tar.gz", - ]) - with patch("platform.system", return_value="Linux"), \ - patch("platform.machine", return_value="aarch64"), \ - patch("urllib.request.urlopen", return_value=resp): - assert tools_config._check_cua_driver_asset_for_arch() is True - - def test_linux_aarch64_without_asset_returns_false(self): - from hermes_cli import tools_config - - resp = self._mock_release([ - "cua-driver-rs-0.5.0-linux-x86_64.tar.gz", - ]) - with patch("platform.system", return_value="Linux"), \ - patch("platform.machine", return_value="aarch64"), \ - patch("urllib.request.urlopen", return_value=resp), \ - patch.object(tools_config, "_print_warning") as warn, \ - patch.object(tools_config, "_print_info"): - assert tools_config._check_cua_driver_asset_for_arch() is False - warn.assert_called_once() - - def test_releases_latest_tag_ignored_picks_driver_rs_tag(self): - """A non-driver tag at the head of the list must not gate the probe. - - Regression guard: the monorepo's newest release is often a Python - component (agent-*, computer-*) with zero binary assets. The probe - must skip past it to the newest cua-driver-rs-v* release. - """ - from hermes_cli import tools_config - - releases = [ - {"tag_name": "agent-v0.8.3", "assets": []}, - {"tag_name": "computer-v0.5.19", "assets": []}, - {"tag_name": "cua-driver-rs-v0.6.0", - "assets": [{"name": "cua-driver-rs-0.6.0-linux-x86_64-binary.tar.gz"}]}, - ] - resp = MagicMock() - resp.read.return_value = json.dumps(releases).encode() - resp.__enter__ = lambda s: s - resp.__exit__ = MagicMock(return_value=False) - with patch("platform.system", return_value="Linux"), \ - patch("platform.machine", return_value="x86_64"), \ - patch("urllib.request.urlopen", return_value=resp): - assert tools_config._check_cua_driver_asset_for_arch() is True + # Probe deleted — no direct GitHub API call from Python. + urlopen.assert_not_called() From 0f741cef285aec8014cbf5e00c5df950bc2a4d8a Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:31:25 -0700 Subject: [PATCH 08/26] fix(tests): update cua install tests for cross-platform support f-trycua's #50855 test file predated the cross-platform PR (#50552) and reintroduced two stale tests asserting Linux is unsupported (test_*_non_macos_*, patching platform.system="Linux" and expecting a no-op/warn). Linux + Windows are supported now, so install proceeds on those platforms. Restore main's cross-platform-correct versions: test_*_on_unsupported_platform_* using FreeBSD as the genuinely unsupported case. --- tests/hermes_cli/test_install_cua_driver.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/hermes_cli/test_install_cua_driver.py b/tests/hermes_cli/test_install_cua_driver.py index e05dd42627c..d12eacca264 100644 --- a/tests/hermes_cli/test_install_cua_driver.py +++ b/tests/hermes_cli/test_install_cua_driver.py @@ -25,19 +25,19 @@ from unittest.mock import patch class TestInstallCuaDriverUpgrade: - def test_upgrade_on_non_macos_is_silent_noop(self): + def test_upgrade_on_unsupported_platform_is_silent_noop(self): from hermes_cli import tools_config with patch.object(tools_config, "_print_warning") as warn, \ - patch("platform.system", return_value="Linux"): + patch("platform.system", return_value="FreeBSD"): assert tools_config.install_cua_driver(upgrade=True) is False warn.assert_not_called() - def test_non_upgrade_on_non_macos_warns(self): + def test_non_upgrade_on_unsupported_platform_warns(self): from hermes_cli import tools_config with patch.object(tools_config, "_print_warning") as warn, \ - patch("platform.system", return_value="Linux"): + patch("platform.system", return_value="FreeBSD"): assert tools_config.install_cua_driver(upgrade=False) is False warn.assert_called() From 39727014246c3db2d6748ad2584191b622882ca3 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:05:15 -0600 Subject: [PATCH 09/26] fix(agent): complete final text on last turn --- agent/turn_finalizer.py | 6 ++++- .../test_turn_finalizer_cleanup_guard.py | 27 ++++++++++++++++--- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/agent/turn_finalizer.py b/agent/turn_finalizer.py index 91496d72040..3a013503110 100644 --- a/agent/turn_finalizer.py +++ b/agent/turn_finalizer.py @@ -122,10 +122,14 @@ def finalize_turn( ) # Determine if conversation completed successfully + normal_text_response = str(_turn_exit_reason).startswith("text_response(") completed = ( final_response is not None - and api_call_count < agent.max_iterations and not failed + and ( + api_call_count < agent.max_iterations + or normal_text_response + ) ) # Post-loop cleanup must never lose the response. Trajectory save, diff --git a/tests/agent/test_turn_finalizer_cleanup_guard.py b/tests/agent/test_turn_finalizer_cleanup_guard.py index e988501dc8e..f4c992fd26e 100644 --- a/tests/agent/test_turn_finalizer_cleanup_guard.py +++ b/tests/agent/test_turn_finalizer_cleanup_guard.py @@ -100,7 +100,13 @@ class _StubAgent: pass -def _run(agent): +def _run( + agent, + *, + final_response=None, + api_call_count=3, + turn_exit_reason="unknown", +): messages = [ {"role": "user", "content": "do a thing"}, { @@ -114,8 +120,8 @@ def _run(agent): ] return finalize_turn( agent, - final_response=None, # forces the max-iterations summary path - api_call_count=3, + final_response=final_response, + api_call_count=api_call_count, interrupted=False, failed=False, messages=messages, @@ -125,7 +131,7 @@ def _run(agent): user_message="do a thing", original_user_message="do a thing", _should_review_memory=False, - _turn_exit_reason="unknown", + _turn_exit_reason=turn_exit_reason, ) @@ -162,4 +168,17 @@ def test_clean_turn_has_no_cleanup_errors_key(): agent = _StubAgent(raise_in=()) result = _run(agent) assert result["final_response"] == "PARTIAL SUMMARY FROM MODEL" + assert result["completed"] is False assert "cleanup_errors" not in result + + +def test_text_response_on_last_allowed_call_is_completed(): + agent = _StubAgent(raise_in=()) + result = _run( + agent, + final_response="final report", + api_call_count=agent.max_iterations, + turn_exit_reason="text_response(finish_reason=stop)", + ) + assert result["final_response"] == "final report" + assert result["completed"] is True From ae7e857420bde96875c4889c8332ba08e9bf5e82 Mon Sep 17 00:00:00 2001 From: helix4u <4317663+helix4u@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:49:23 -0600 Subject: [PATCH 10/26] fix(cron): deliver max-iteration fallback reports --- cron/scheduler.py | 18 ++++++++++++-- tests/cron/test_scheduler.py | 46 ++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/cron/scheduler.py b/cron/scheduler.py index 99f910d8630..c48935c84a6 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -2189,13 +2189,27 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: # would otherwise be delivered as if it were the agent's reply and the # job's `last_status` set to "ok". Raise so the except handler below # builds the proper failure tuple. (issue #17855) - if result.get("failed") is True or result.get("completed") is False: + turn_exit_reason = str(result.get("turn_exit_reason") or "") + final_response_text = (result.get("final_response") or "").strip() + max_iteration_summary = ( + result.get("failed") is not True + and result.get("completed") is False + and turn_exit_reason.startswith("max_iterations_reached(") + and bool(final_response_text) + ) + if result.get("failed") is True or (result.get("completed") is False and not max_iteration_summary): _err_text = ( result.get("error") - or (result.get("final_response") or "").strip() + or final_response_text or "agent reported failure" ) raise RuntimeError(_err_text) + if max_iteration_summary: + logger.warning( + "Job '%s' reached the iteration limit but produced a final fallback response; " + "delivering the response instead of failing the cron run", + job_name, + ) final_response = result.get("final_response", "") or "" # Strip leaked placeholder text that upstream may inject on empty completions. diff --git a/tests/cron/test_scheduler.py b/tests/cron/test_scheduler.py index a3c17048bb6..f766d4474f3 100644 --- a/tests/cron/test_scheduler.py +++ b/tests/cron/test_scheduler.py @@ -1394,6 +1394,52 @@ class TestRunJobSessionPersistence: assert error is None assert final_response == "all good" + def test_run_job_delivers_max_iteration_fallback_summary(self, tmp_path): + """Cron should deliver a usable max-iteration fallback summary. + + A cron run can exhaust the iteration budget, get a final text summary + from the no-tools fallback call, and still have ``completed=False`` in + the generic agent result. That should not make cron raise the report + text as a RuntimeError. + """ + job = { + "id": "summary-job", + "name": "summary", + "prompt": "finish the report", + } + fake_db = MagicMock() + + with patch("cron.scheduler._hermes_home", tmp_path), \ + patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=fake_db), \ + patch( + "hermes_cli.runtime_provider.resolve_runtime_provider", + return_value={ + "api_key": "***", + "base_url": "https://example.invalid/v1", + "provider": "openrouter", + "api_mode": "chat_completions", + }, + ), \ + patch("run_agent.AIAgent") as mock_agent_cls: + mock_agent = MagicMock() + mock_agent.run_conversation.return_value = { + "final_response": "final fallback report", + "completed": False, + "failed": False, + "turn_exit_reason": "max_iterations_reached(60/60)", + } + mock_agent_cls.return_value = mock_agent + + success, output, final_response, error = run_job(job) + + assert success is True + assert error is None + assert final_response == "final fallback report" + assert "final fallback report" in output + assert "(FAILED)" not in output + def test_tick_marks_empty_response_as_error(self, tmp_path): """When run_job returns success=True but final_response is empty, tick() should mark the job as error so last_status != 'ok'. From 91c465f6e79accf9daf44c86daa5c6058d41546a Mon Sep 17 00:00:00 2001 From: infinitycrew39 Date: Mon, 22 Jun 2026 22:51:50 +0700 Subject: [PATCH 11/26] test(discord): add regression test for 100-command sync limit Add a test to verify that _safe_sync_slash_commands deletes obsolete commands before creating new ones. This ensures we never temporarily exceed Discord's 100-command limit during sync, which would trigger error 30032 and break all slash commands. This test guards against the regression where sync could fail even though the registration cap was properly enforced. --- tests/gateway/test_discord_sync_limit.py | 140 +++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/gateway/test_discord_sync_limit.py diff --git a/tests/gateway/test_discord_sync_limit.py b/tests/gateway/test_discord_sync_limit.py new file mode 100644 index 00000000000..ca8f298f80f --- /dev/null +++ b/tests/gateway/test_discord_sync_limit.py @@ -0,0 +1,140 @@ +"""Test Discord slash command sync respects the 100-command hard limit.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch +import sys + +import pytest + +from gateway.config import PlatformConfig + + +def _ensure_discord_mock(): + if "discord" in sys.modules and hasattr(sys.modules["discord"], "__file__"): + return + if sys.modules.get("discord") is None: + discord_mod = MagicMock() + discord_mod.Intents.default.return_value = MagicMock() + sys.modules["discord"] = discord_mod + sys.modules["discord.ext"] = MagicMock() + sys.modules["discord.ext.commands"] = MagicMock() + + +_ensure_discord_mock() + +from plugins.platforms.discord.adapter import DiscordAdapter + + +class _FakeTreeCommand: + """Minimal command stub matching discord.py tree command API.""" + + def __init__(self, name: str, command_type: int = 1): + self.name = name + self.type = command_type + + def to_dict(self, _tree): + return {"name": self.name, "type": self.type} + + +@pytest.fixture +def adapter(): + """Create a Discord adapter with mocked Discord client.""" + _ensure_discord_mock() + config = PlatformConfig(enabled=True, token="fake-token") + adapter = DiscordAdapter(config) + + # Mock the Discord client and tree + adapter._client = MagicMock() + adapter._client.tree = MagicMock() + adapter._client.http = AsyncMock() + adapter._client.application_id = "test_app_id" + + adapter._sleep_between_command_sync_mutations = AsyncMock() + adapter._existing_command_to_payload = MagicMock(side_effect=lambda cmd: {"name": cmd.name}) + adapter._canonicalize_app_command_payload = MagicMock(side_effect=lambda p: p) + adapter._patchable_app_command_payload = MagicMock(side_effect=lambda p: p) + + return adapter + + +@pytest.mark.asyncio +async def test_safe_sync_deletes_before_creating(): + """Sync must delete obsolete commands BEFORE creating new ones. + + Discord's 100-command limit is enforced when trying to upsert. If we + have 100 commands on Discord, try to add 1 new one, and haven't deleted + any yet, Discord rejects with error 30032. + + The fix: identify and delete obsolete commands first, then create/update. + This ensures we never temporarily exceed 100 during the sync operation. + + This is a regression guard for the samuraiheart bug where sync would fail + with error 30032 even though the registration code properly capped at 100. + """ + _ensure_discord_mock() + config = PlatformConfig(enabled=True, token="fake-token") + adapter = DiscordAdapter(config) + + adapter._client = MagicMock() + adapter._client.tree = MagicMock() + adapter._client.http = AsyncMock() + adapter._client.application_id = "test_app_id" + adapter._sleep_between_command_sync_mutations = AsyncMock() + adapter._existing_command_to_payload = MagicMock(side_effect=lambda cmd: {"name": cmd.name}) + adapter._canonicalize_app_command_payload = MagicMock(side_effect=lambda p: p) + adapter._patchable_app_command_payload = MagicMock(side_effect=lambda p: p) + + # Simulate having 100 commands on Discord, with 1 that's no longer desired + # and 1 new command that should be created. + # Existing on Discord: cmd_0, cmd_1, ..., cmd_99 (100 total) + # Desired locally: cmd_1, cmd_2, ..., cmd_99, cmd_new (100 total) + # So: delete cmd_0 (1 deletion), create cmd_new (1 creation) + + existing_commands = [ + SimpleNamespace(id=f"id_{i}", name=f"cmd_{i}", type=1) + for i in range(100) + ] + adapter._client.tree.fetch_commands = AsyncMock(return_value=existing_commands) + + adapter._client.tree.get_commands = MagicMock( + return_value=[ + _FakeTreeCommand(name=f"cmd_{i}", command_type=1) + for i in range(1, 100) + ] + [_FakeTreeCommand(name="cmd_new", command_type=1)] + ) + + # Track the order of mutations + mutation_log = [] + + async def mock_delete(*args): + mutation_log.append(("delete", args[-1])) + + async def mock_upsert(*args): + mutation_log.append(("create", args[-1].get("name"))) + + adapter._client.http.delete_global_command = mock_delete + adapter._client.http.upsert_global_command = mock_upsert + adapter._client.http.edit_global_command = AsyncMock() + + # Call sync + await adapter._safe_sync_slash_commands() + + # Verify that: + # 1. A deletion happened (cmd_0) + # 2. It happened BEFORE any creation + # 3. The creation of cmd_new happened AFTER deletion + deletes = [m for m in mutation_log if m[0] == "delete"] + creates = [m for m in mutation_log if m[0] == "create"] + + assert len(deletes) >= 1, "At least one command should be deleted" + assert len(creates) >= 1, "At least one command should be created" + + # The key assertion: all deletions should come before all creations. + # Find the index of the last delete and the first create. + last_delete_idx = max(i for i, m in enumerate(mutation_log) if m[0] == "delete") + first_create_idx = min(i for i, m in enumerate(mutation_log) if m[0] == "create") + + assert last_delete_idx < first_create_idx, ( + f"Deletions must happen before creations to avoid exceeding 100-command limit. " + f"Last delete at index {last_delete_idx}, first create at index {first_create_idx}" + ) From e9b86f352fc73db5ca3de6e3fb50ef57d774f8f9 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 12:20:28 -0700 Subject: [PATCH 12/26] fix(discord): delete obsolete slash commands before creating new ones Discord enforces a hard 100-command limit per app and rejects an upsert that would push the live total over 100 (error 30032), which silently breaks ALL slash commands. The sync deleted obsolete commands AFTER creating new ones, so an app already at the cap momentarily exceeded it and the whole sync failed. Reorder: delete no-longer-desired commands up front, then create/update. Removes the now-redundant trailing delete loop. Adapts @infinitycrew39 PR #50890 to current main (the original adapter diff no longer applied after the platform refactor); test commit cherry-picked with authorship preserved. --- plugins/platforms/discord/adapter.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/plugins/platforms/discord/adapter.py b/plugins/platforms/discord/adapter.py index e64f4acd701..7d14adfcc70 100644 --- a/plugins/platforms/discord/adapter.py +++ b/plugins/platforms/discord/adapter.py @@ -1590,6 +1590,19 @@ class DiscordAdapter(BasePlatformAdapter): mutation_count += 1 return result + # Delete obsolete commands FIRST to stay under Discord's 100-command + # limit. Discord rejects an upsert that would push the live total over + # 100 (error 30032), which silently breaks ALL slash commands. If a new + # command is created before the obsolete ones are removed, an app that + # is already at the cap momentarily exceeds it and the whole sync fails. + # Removing the no-longer-desired commands up front guarantees the live + # total never rises above the cap mid-sync. + obsolete_keys = set(existing_by_key.keys()) - set(desired_by_key.keys()) + for key in obsolete_keys: + current = existing_by_key.pop(key) + await mutate(http.delete_global_command, app_id, current.id) + deleted += 1 + for key, desired in desired_by_key.items(): current = existing_by_key.pop(key, None) if current is None: @@ -1613,10 +1626,6 @@ class DiscordAdapter(BasePlatformAdapter): await mutate(http.edit_global_command, app_id, current.id, desired) updated += 1 - for current in existing_by_key.values(): - await mutate(http.delete_global_command, app_id, current.id) - deleted += 1 - return { "total": len(desired_payloads), "unchanged": unchanged, From 100e7be20ed88d8b78adb6664b41c8821052d592 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Tue, 23 Jun 2026 02:51:00 +0530 Subject: [PATCH 13/26] fix(security): deny root-level credential stores in media delivery The media-delivery denylist in gateway/platforms/base.py enumerated only .env/auth.json/credentials/config.yaml under HERMES_HOME, so other credential stores that live at the root fell through and could be auto-attached to chat replies. The reported case: the Google Workspace skill's google_token.json refreshes every turn, bumping its mtime to 'now', which kept passing the strict-mode recency window and re-sent the OAuth token on every reply. Extend the explicit per-file denylist to mirror the canonical credential set already enforced by the read/write guards in agent/file_safety.py: google_token.json, google_oauth_pending.json, auth/google_oauth.json, .anthropic_oauth.json, webhook_subscriptions.json, cache/bws_cache.json, auth.lock, and the pairing/ token directory. Targeted per-file additions (not a blanket ~/.hermes deny, which was declined in #32090/#34425 because it would block skills/, logs/, and ad-hoc agent-written deliverables). mcp-tokens/ (#37222) and state.db/kanban.db (#41071) are left to their sibling targeted PRs. Reported-by: xxxigm (#50912) --- gateway/platforms/base.py | 55 +++++++++++++--- tests/gateway/test_platform_base.py | 99 +++++++++++++++++++++++++++++ 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 085ea1d20e0..55f74f88f0c 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -1066,12 +1066,48 @@ def _media_delivery_denied_paths() -> List[Path]: denied.append(home / sub) # The active Hermes profile and shared Hermes root both contain control # files and credentials. Only cache subdirectories under them are - # explicitly allowlisted above. + # explicitly allowlisted above (matched BEFORE this denylist in + # validate_media_delivery_path, so generated media still delivers). + # + # These are the per-file credential / secret stores that live at the + # HERMES_HOME root. The set mirrors the canonical read guard in + # agent/file_safety.py (get_read_block_error / build_write_denied_*) so the + # delivery (read/exfil) side can't trail the write side: a credential the + # agent is forbidden to write or read must also never be auto-attached to a + # chat reply. Enumerated explicitly per-file rather than denying the whole + # tree, so skills/, logs/, and ad-hoc agent-written files under ~/.hermes + # stay deliverable (see #32090, #34425). + _ROOT_CREDENTIAL_FILES = ( + ".env", + "auth.json", + "auth.lock", + "credentials", + "config.yaml", + # Anthropic PKCE / OAuth refresh credential store. + ".anthropic_oauth.json", + # Google Workspace skill: auto-refreshing OAuth token (mtime bumps + # every turn, which defeated the strict-mode recency window) plus the + # pending-exchange session/verifier file. + "google_token.json", + "google_oauth_pending.json", + os.path.join("auth", "google_oauth.json"), + # Webhook subscription HMAC secrets. + "webhook_subscriptions.json", + # Bitwarden Secrets Manager plaintext disk cache. + os.path.join("cache", "bws_cache.json"), + ) + # Directory trees whose every child is credential material. (MCP OAuth + # tokens under mcp-tokens/ are handled by the sibling targeted PR #37222; + # session/kanban SQLite stores by #41071 — kept out of this diff to avoid + # overlap.) + _ROOT_CREDENTIAL_DIRS = ( + "pairing", + ) for hermes_root in (_HERMES_HOME, _HERMES_ROOT): - denied.append(hermes_root / ".env") - denied.append(hermes_root / "auth.json") - denied.append(hermes_root / "credentials") - denied.append(hermes_root / "config.yaml") + for rel in _ROOT_CREDENTIAL_FILES: + denied.append(hermes_root / rel) + for rel in _ROOT_CREDENTIAL_DIRS: + denied.append(hermes_root / rel) return denied @@ -1190,9 +1226,12 @@ def validate_media_delivery_path(path: str) -> Optional[str]: return str(resolved) # Non-strict mode (default): accept anything not on the denylist. - # The denylist still blocks /etc, /proc, ~/.ssh, ~/.aws, ~/.hermes/.env, - # ~/.hermes/auth.json, etc. — so the obvious prompt-injection sites - # (``MEDIA:/etc/passwd``, ``MEDIA:~/.ssh/id_rsa``) remain rejected. + # The denylist still blocks /etc, /proc, ~/.ssh, ~/.aws, and the + # credential/secret stores under the Hermes root (~/.hermes/.env, + # auth.json, .anthropic_oauth.json, google_token.json, pairing/, ...) — + # so the obvious prompt-injection / credential-exfil sites + # (``MEDIA:/etc/passwd``, ``MEDIA:~/.ssh/id_rsa``, + # ``MEDIA:~/.hermes/google_token.json``) remain rejected. if not _media_delivery_strict_mode(): if _path_under_denied_prefix(resolved): return None diff --git a/tests/gateway/test_platform_base.py b/tests/gateway/test_platform_base.py index 3a4f85a5e41..60b69e000be 100644 --- a/tests/gateway/test_platform_base.py +++ b/tests/gateway/test_platform_base.py @@ -967,6 +967,105 @@ class TestMediaDeliveryDefaultMode: assert BasePlatformAdapter.validate_media_delivery_path(str(config_file)) is None + def test_denylist_blocks_google_token_default_mode(self, tmp_path, monkeypatch): + """Integration credentials at the HERMES_HOME root (google_token.json) + must never be deliverable, even though they aren't the historically + enumerated .env/auth.json/config.yaml files. Regression for a + refreshed google_token.json being auto-attached to a Slack reply + (#50912). + """ + self._patch_roots(monkeypatch) + + fake_home = tmp_path / "home" + hermes_dir = fake_home / ".hermes" + hermes_dir.mkdir(parents=True) + token = hermes_dir / "google_token.json" + token.write_text('{"access_token": "***", "refresh_token": "***"}') + monkeypatch.setenv("HOME", str(fake_home)) + monkeypatch.setattr("gateway.platforms.base._HERMES_HOME", hermes_dir) + monkeypatch.setattr("gateway.platforms.base._HERMES_ROOT", hermes_dir) + + assert BasePlatformAdapter.validate_media_delivery_path(str(token)) is None + + def test_denylist_blocks_google_token_even_when_freshly_refreshed(self, tmp_path, monkeypatch): + """The exploit was that the Google integration rewrites + google_token.json every turn, bumping its mtime to ~now, so the + strict-mode recency window (trust_recent_files) kept re-trusting it + and it re-sent on every reply. An explicit denylist entry must win + over recency trust. + """ + self._patch_roots(monkeypatch) # zero cache allowlist, strict mode on + monkeypatch.setenv("HERMES_MEDIA_TRUST_RECENT_FILES", "1") + monkeypatch.setenv("HERMES_MEDIA_TRUST_RECENT_SECONDS", "600") + + fake_home = tmp_path / "home" + hermes_dir = fake_home / ".hermes" + hermes_dir.mkdir(parents=True) + token = hermes_dir / "google_token.json" + token.write_text('{"access_token": "***"}') # mtime = now → "recent" + monkeypatch.setenv("HOME", str(fake_home)) + monkeypatch.setattr("gateway.platforms.base._HERMES_HOME", hermes_dir) + monkeypatch.setattr("gateway.platforms.base._HERMES_ROOT", hermes_dir) + + assert BasePlatformAdapter.validate_media_delivery_path(str(token)) is None + + def test_denylist_blocks_pairing_directory_contents(self, tmp_path, monkeypatch): + """Files under ~/.hermes/pairing/ (platform pairing tokens) are + credential material and must not be deliverable. + """ + self._patch_roots(monkeypatch) + + fake_home = tmp_path / "home" + hermes_dir = fake_home / ".hermes" + pairing = hermes_dir / "pairing" + pairing.mkdir(parents=True) + token = pairing / "telegram-approved.json" + token.write_text('{"approved": ["123"]}') + monkeypatch.setenv("HOME", str(fake_home)) + monkeypatch.setattr("gateway.platforms.base._HERMES_HOME", hermes_dir) + monkeypatch.setattr("gateway.platforms.base._HERMES_ROOT", hermes_dir) + + assert BasePlatformAdapter.validate_media_delivery_path(str(token)) is None + + def test_hermes_cache_still_delivers_under_denied_home(self, tmp_path, monkeypatch): + """The targeted credential denylist must not break legitimate cache + deliveries: a generated artifact under the allowlisted cache root is + matched before the denylist and still delivers. + """ + fake_home = tmp_path / "home" + hermes_dir = fake_home / ".hermes" + cache_dir = hermes_dir / "cache" / "documents" + cache_dir.mkdir(parents=True) + artifact = cache_dir / "report.pdf" + artifact.write_bytes(b"%PDF-1.4") + self._patch_roots(monkeypatch, cache_dir) + monkeypatch.setenv("HOME", str(fake_home)) + monkeypatch.setattr("gateway.platforms.base._HERMES_HOME", hermes_dir) + monkeypatch.setattr("gateway.platforms.base._HERMES_ROOT", hermes_dir) + + assert BasePlatformAdapter.validate_media_delivery_path(str(artifact)) == str(artifact.resolve()) + + def test_denylist_blocks_non_cache_file_under_hermes_home(self, tmp_path, monkeypatch): + """A non-credential file the agent wrote directly under ~/.hermes + (not in a cache subdir) is still deliverable via recency trust — we + did NOT blanket-deny the tree (per #32090/#34425). This guards against + accidentally re-introducing the rejected whole-tree deny. + """ + self._patch_roots(monkeypatch) # strict mode on + monkeypatch.setenv("HERMES_MEDIA_TRUST_RECENT_FILES", "1") + monkeypatch.setenv("HERMES_MEDIA_TRUST_RECENT_SECONDS", "600") + + fake_home = tmp_path / "home" + hermes_dir = fake_home / ".hermes" + hermes_dir.mkdir(parents=True) + artifact = hermes_dir / "adhoc_report.pdf" + artifact.write_bytes(b"%PDF-1.4") # fresh mtime + monkeypatch.setenv("HOME", str(fake_home)) + monkeypatch.setattr("gateway.platforms.base._HERMES_HOME", hermes_dir) + monkeypatch.setattr("gateway.platforms.base._HERMES_ROOT", hermes_dir) + + assert BasePlatformAdapter.validate_media_delivery_path(str(artifact)) == str(artifact.resolve()) + def test_strict_mode_envvar_restores_legacy_behavior(self, tmp_path, monkeypatch): """Setting HERMES_MEDIA_DELIVERY_STRICT=1 reactivates the older allowlist+recency logic. A stale file outside the allowlist is From 3147cbb1363554a404e6941f1862981326348d1b Mon Sep 17 00:00:00 2001 From: Max Hsu Date: Tue, 16 Jun 2026 07:58:56 +0800 Subject: [PATCH 14/26] fix(memory): apply /memory approve against a fresh store when no live agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI /memory slash handler (cli_commands_mixin._handle_memory_command) passed self.agent._memory_store straight through, which is None when the command runs without a live agent — e.g. /memory approve from the Desktop GUI. The shared write-approval handler then returns "memory store unavailable" and applies nothing, even with built-in memory enabled and pending writes present. Fall back to a freshly loaded on-disk MemoryStore when no live store is available, mirroring the gateway path (gateway/slash_commands.py). It persists to the same MEMORY/USER.md and creates MEMORY.md on the first approved write. Fixes #46783 Co-Authored-By: Claude Opus 4.8 (1M context) --- hermes_cli/cli_commands_mixin.py | 10 ++++++++++ tests/tools/test_write_approval.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/hermes_cli/cli_commands_mixin.py b/hermes_cli/cli_commands_mixin.py index d8df27a5df4..b645900d4f9 100644 --- a/hermes_cli/cli_commands_mixin.py +++ b/hermes_cli/cli_commands_mixin.py @@ -1361,6 +1361,16 @@ class CLICommandsMixin: parts = cmd.strip().split() args = parts[1:] if len(parts) > 1 else [] store = getattr(self.agent, "_memory_store", None) if getattr(self, "agent", None) else None + if store is None: + # No live agent store (e.g. /memory approve invoked from the Desktop + # GUI, or any context without an active agent). Apply against a freshly + # loaded on-disk store, mirroring the gateway path + # (gateway/slash_commands.py): it persists to the same MEMORY/USER.md + # and creates MEMORY.md on the first approved write. Without this the + # shared handler returns "memory store unavailable". See #46783. + from tools.memory_tool import MemoryStore + store = MemoryStore() + store.load_from_disk() out = handle_pending_subcommand( wa.MEMORY, args, memory_store=store, diff --git a/tests/tools/test_write_approval.py b/tests/tools/test_write_approval.py index fbfa804fbb9..7b65978f0ac 100644 --- a/tests/tools/test_write_approval.py +++ b/tests/tools/test_write_approval.py @@ -107,6 +107,36 @@ def test_memory_gate_on_then_apply(hermes_home): assert "approved entry" in store.user_entries[0] +def test_cli_memory_approve_without_live_agent_uses_fresh_store(hermes_home, capsys): + """#46783: ``/memory approve`` from a context with no live agent (e.g. the + Desktop GUI) passed ``memory_store=None`` into the shared handler, which + returned "memory store unavailable" and applied nothing. The CLI handler must + fall back to a freshly loaded on-disk store, like the gateway path does.""" + import json + from tools.memory_tool import memory_tool, MemoryStore + from tools import write_approval as wa + from hermes_cli.cli_commands_mixin import CLICommandsMixin + + _set_approval("memory", True) + staging = MemoryStore(); staging.load_from_disk() + r = json.loads(memory_tool("add", "memory", "remember the launch date", store=staging)) + assert r.get("pending_id"), r + assert wa.pending_count("memory") == 1 + + # Bare CLI handler with no live agent → store resolves to None pre-fix. + handler = CLICommandsMixin.__new__(CLICommandsMixin) + handler.agent = None + handler._handle_memory_command("/memory approve all") + + out = capsys.readouterr().out + assert "memory store unavailable" not in out, out + assert "Approved 1" in out, out + assert wa.pending_count("memory") == 0 + # The approved write landed in a freshly loaded on-disk store (MEMORY.md). + reloaded = MemoryStore(); reloaded.load_from_disk() + assert any("remember the launch date" in e for e in reloaded.memory_entries) + + # --------------------------------------------------------------------------- # Skill gate # --------------------------------------------------------------------------- From 0e69cd4b37aa3f218ada018d5f0456660e0b726b Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Tue, 23 Jun 2026 03:05:31 +0530 Subject: [PATCH 15/26] fix(memory): honor configured char limits in the no-agent on-disk store Follow-up to the /memory approve fresh-store fix. Both the CLI fallback and the messaging-gateway handler built a bare MemoryStore() with the hardcoded default char limits (2200/1375), ignoring the user's configured memory.memory_char_limit / user_char_limit. A live agent honors those overrides (agent/agent_init.py), so an approval applied without a live agent could accept a write the user's lower cap would reject, or vice versa. Extract a shared tools.memory_tool.load_on_disk_store() factory that reads the configured limits (falling back to defaults if config can't load) and wire both the CLI and gateway handlers to it, closing the gap on both surfaces and de-duplicating the construction block. --- gateway/slash_commands.py | 6 +++--- hermes_cli/cli_commands_mixin.py | 7 ++++--- tests/tools/test_write_approval.py | 27 +++++++++++++++++++++++++ tools/memory_tool.py | 32 ++++++++++++++++++++++++++++++ 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py index f35682f8603..ab9ea9759bd 100644 --- a/gateway/slash_commands.py +++ b/gateway/slash_commands.py @@ -2343,7 +2343,7 @@ class GatewaySlashCommandsMixin: from gateway.run import _hermes_home from hermes_cli.write_approval_commands import handle_pending_subcommand from tools import write_approval as wa - from tools.memory_tool import MemoryStore + from tools.memory_tool import load_on_disk_store raw_args = event.get_command_args().strip() args = raw_args.split() if raw_args else [] @@ -2363,8 +2363,8 @@ class GatewaySlashCommandsMixin: # Apply approved writes against a fresh on-disk store (the gateway has # no long-lived agent; the store persists to the same MEMORY/USER.md). - store = MemoryStore() - store.load_from_disk() + # load_on_disk_store() honors the user's configured char limits. + store = load_on_disk_store() out = handle_pending_subcommand( wa.MEMORY, args, memory_store=store, set_mode_fn=_set_approval, diff --git a/hermes_cli/cli_commands_mixin.py b/hermes_cli/cli_commands_mixin.py index b645900d4f9..95292314c5a 100644 --- a/hermes_cli/cli_commands_mixin.py +++ b/hermes_cli/cli_commands_mixin.py @@ -1368,9 +1368,10 @@ class CLICommandsMixin: # (gateway/slash_commands.py): it persists to the same MEMORY/USER.md # and creates MEMORY.md on the first approved write. Without this the # shared handler returns "memory store unavailable". See #46783. - from tools.memory_tool import MemoryStore - store = MemoryStore() - store.load_from_disk() + # load_on_disk_store() honors the user's configured char limits, so + # an approval here enforces the same caps as the live agent would. + from tools.memory_tool import load_on_disk_store + store = load_on_disk_store() out = handle_pending_subcommand( wa.MEMORY, args, memory_store=store, diff --git a/tests/tools/test_write_approval.py b/tests/tools/test_write_approval.py index 7b65978f0ac..73ea119e0e5 100644 --- a/tests/tools/test_write_approval.py +++ b/tests/tools/test_write_approval.py @@ -137,6 +137,33 @@ def test_cli_memory_approve_without_live_agent_uses_fresh_store(hermes_home, cap assert any("remember the launch date" in e for e in reloaded.memory_entries) +def test_load_on_disk_store_honors_configured_char_limits(hermes_home, monkeypatch): + """load_on_disk_store() must read memory.memory_char_limit / + user_char_limit from config so approvals applied without a live agent + enforce the SAME caps as the live agent (agent_init.py). Falls back to + defaults when config can't be loaded. + """ + from tools.memory_tool import load_on_disk_store + + # Config override path: helper picks up the configured limits. + monkeypatch.setattr( + "hermes_cli.config.load_config", + lambda: {"memory": {"memory_char_limit": 999, "user_char_limit": 444}}, + ) + store = load_on_disk_store() + assert store.memory_char_limit == 999 + assert store.user_char_limit == 444 + + # Failure path: config raises → defaults, never blows up. + def _boom(): + raise RuntimeError("no config") + + monkeypatch.setattr("hermes_cli.config.load_config", _boom) + fallback = load_on_disk_store() + assert fallback.memory_char_limit == 2200 + assert fallback.user_char_limit == 1375 + + # --------------------------------------------------------------------------- # Skill gate # --------------------------------------------------------------------------- diff --git a/tools/memory_tool.py b/tools/memory_tool.py index 33d6ffff5e5..47d9d2c9922 100644 --- a/tools/memory_tool.py +++ b/tools/memory_tool.py @@ -731,6 +731,38 @@ class MemoryStore: raise RuntimeError(f"Failed to write memory file {path}: {e}") +def load_on_disk_store() -> "MemoryStore": + """Build a fresh on-disk :class:`MemoryStore`, honoring configured char limits. + + Use this from any context that has no live agent (the messaging gateway, the + Desktop GUI, the bare CLI ``/memory`` handler) but still needs to read or + apply approved memory writes. Mirrors how the live agent constructs its store + in ``agent/agent_init.py`` — including the user's ``memory.memory_char_limit`` + / ``memory.user_char_limit`` overrides — so an approval applied without a live + agent enforces the SAME caps as one applied with one. + + Falls back to the built-in defaults if config can't be loaded, so this can + never raise on a missing/unreadable config. + """ + memory_char_limit = 2200 + user_char_limit = 1375 + try: + from hermes_cli.config import load_config + + mem_cfg = (load_config() or {}).get("memory", {}) or {} + memory_char_limit = int(mem_cfg.get("memory_char_limit", memory_char_limit)) + user_char_limit = int(mem_cfg.get("user_char_limit", user_char_limit)) + except Exception: + pass # config optional — fall back to defaults rather than break /memory + + store = MemoryStore( + memory_char_limit=memory_char_limit, + user_char_limit=user_char_limit, + ) + store.load_from_disk() + return store + + def _apply_write_gate(action: str, target: str, content: Optional[str], old_text: Optional[str]) -> Optional[str]: """Evaluate the memory write gate. Returns a JSON tool-result string when From c080b2dc3ee672251cce6de4d002632f4027f9f8 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Mon, 22 Jun 2026 23:06:11 +0530 Subject: [PATCH 16/26] fix(gateway): redact credentials from TUI approval prompts (#48456) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #50767, which redacted the chat-platform (_approval_notify_sync) and SSE/API (_approval_notify) approval transports. The TUI JSON-RPC transport is the third egress and was missed: three register_gateway_notify callbacks in tui_gateway/server.py emitted the raw approval_data — including the unredacted command Tirith flagged — straight to the TUI client via _emit. Route all three registrations through a new module-level _emit_approval_request() helper that redacts payload['command'] via the shared gateway.run._redact_approval_command seam before emitting, matching the pattern used for the other two transports. Completes the whole-bug-class fix for #48456. Tests: assert the helper emits a redacted command (real credential pattern), handles missing/None command, and a wiring guard that no registration emits the raw payload directly (only the helper may). Both mutation-checked. The #48456 fix series originated from @liuhao1024's #48462 — credit to them for the original report and chat-platform fix; this completes the remaining transport. Co-authored-by: liuhao1024 --- tests/gateway/test_tui_approval_redaction.py | 66 ++++++++++++++++++++ tui_gateway/server.py | 21 ++++++- 2 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 tests/gateway/test_tui_approval_redaction.py diff --git a/tests/gateway/test_tui_approval_redaction.py b/tests/gateway/test_tui_approval_redaction.py new file mode 100644 index 00000000000..04716222e78 --- /dev/null +++ b/tests/gateway/test_tui_approval_redaction.py @@ -0,0 +1,66 @@ +"""Regression test for TUI approval-prompt credential redaction (#48456). + +Follow-up to #50767, which redacted the chat-platform and SSE/API approval +transports. The TUI JSON-RPC transport is the third egress: three +`register_gateway_notify` callbacks in `tui_gateway/server.py` emit the raw +`approval_data` (with an unredacted `command`) to the TUI client. They now +route through the module-level `_emit_approval_request` helper, which redacts +`payload["command"]` via the shared `gateway.run._redact_approval_command` seam +before emitting. +""" + +import inspect + +import pytest + + +class TestTuiApprovalEmitRedaction: + def test_emit_approval_request_redacts_command_in_payload(self, monkeypatch): + from tui_gateway import server as tui_server + + emitted = {} + monkeypatch.setattr( + tui_server, "_emit", + lambda event, sid, payload=None: emitted.update( + {"event": event, "sid": sid, "payload": payload} + ), + ) + raw = "curl -H 'Authorization: token ghp_01...6789' https://api.github.com" + tui_server._emit_approval_request("sess-1", {"command": raw, "description": "x"}) + + assert emitted["event"] == "approval.request" + # credential removed, non-command field + command structure preserved + assert "ghp_01...6789" not in emitted["payload"]["command"] + assert emitted["payload"]["description"] == "x" + assert "github.com" in emitted["payload"]["command"] + + def test_emit_approval_request_handles_missing_command(self, monkeypatch): + from tui_gateway import server as tui_server + + emitted = {} + monkeypatch.setattr( + tui_server, "_emit", + lambda event, sid, payload=None: emitted.update({"payload": payload}), + ) + tui_server._emit_approval_request("s", {"description": "no command here"}) + assert emitted["payload"] == {"description": "no command here"} + tui_server._emit_approval_request("s", None) + assert emitted["payload"] == {} + + def test_no_raw_command_emit_in_approval_registrations(self): + """Every register_gateway_notify approval callback must route through the + redacting `_emit_approval_request` helper — no registration may emit the + raw payload via `_emit("approval.request", ...)` directly. The ONLY + allowed raw emit is inside the helper itself.""" + from tui_gateway import server as tui_server + + src = inspect.getsource(tui_server) + raw_emits = src.count('_emit("approval.request"') + assert raw_emits == 1, ( + f'expected exactly 1 raw _emit("approval.request") (inside the ' + f"redacting helper), found {raw_emits} — a registration may be " + f"emitting the unredacted command" + ) + assert "_emit_approval_request(sid, data)" in src, ( + "registration lambdas must route through _emit_approval_request" + ) diff --git a/tui_gateway/server.py b/tui_gateway/server.py index e8accfa8ba2..6bb4743dc9f 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -806,6 +806,21 @@ def _emit(event: str, sid: str, payload: dict | None = None): write_json({"jsonrpc": "2.0", "method": "event", "params": params}) +def _emit_approval_request(sid: str, data: dict | None) -> None: + """Emit an ``approval.request`` event to the TUI client with the command + redacted. The approval payload is built from the RAW command string, so a + credential-shaped value Tirith flagged would otherwise be echoed verbatim + to the TUI client (#48456 — third egress transport alongside the chat + platforms and the SSE/API stream fixed in #50767). Reuse the shared gateway + seam so all approval transports redact consistently.""" + payload = dict(data or {}) + if "command" in payload: + from gateway.run import _redact_approval_command + + payload["command"] = _redact_approval_command(payload.get("command")) + _emit("approval.request", sid, payload) + + def _status_update(sid: str, kind: str, text: str | None = None): body = (text if text is not None else kind).strip() if not body: @@ -1040,7 +1055,7 @@ def _start_agent_build(sid: str, session: dict) -> None: ) register_gateway_notify( - key, lambda data: _emit("approval.request", sid, data) + key, lambda data: _emit_approval_request(sid, data) ) notify_registered = True load_permanent_allowlist() @@ -2554,7 +2569,7 @@ def _sync_session_key_after_compress( try: register_gateway_notify( new_session_id, - lambda data: _emit("approval.request", sid, data), + lambda data: _emit_approval_request(sid, data), ) except Exception: pass @@ -3916,7 +3931,7 @@ def _init_session( try: from tools.approval import register_gateway_notify, load_permanent_allowlist - register_gateway_notify(key, lambda data: _emit("approval.request", sid, data)) + register_gateway_notify(key, lambda data: _emit_approval_request(sid, data)) load_permanent_allowlist() except Exception: pass From 15880da8bbd5c9a48c3bc5f6955bea86fba54965 Mon Sep 17 00:00:00 2001 From: Tranquil-Flow <66773372+Tranquil-Flow@users.noreply.github.com> Date: Fri, 19 Jun 2026 22:15:26 +0200 Subject: [PATCH 17/26] fix(file_tools): resolve tilde using profile home for file operations (#48552) File tools (read_file, write_file, patch, list_directory, etc.) used os.path.expanduser() which reads the gateway process HOME env var. In Docker/systemd/s6 deployments where the gateway HOME differs from interactive sessions, tilde expanded to the wrong directory. Add _expand_tilde() helper that delegates to get_subprocess_home() when available, falling back to os.path.expanduser(). Replace all 9 expanduser() call sites in file_tools.py with _expand_tilde(). --- tests/tools/test_file_tools_tilde_profile.py | 109 +++++++++++++++++++ tools/file_tools.py | 41 +++++-- 2 files changed, 141 insertions(+), 9 deletions(-) create mode 100644 tests/tools/test_file_tools_tilde_profile.py diff --git a/tests/tools/test_file_tools_tilde_profile.py b/tests/tools/test_file_tools_tilde_profile.py new file mode 100644 index 00000000000..fc3dadef45c --- /dev/null +++ b/tests/tools/test_file_tools_tilde_profile.py @@ -0,0 +1,109 @@ +"""Regression tests for profile-aware tilde expansion in file tools. + +The bug (#48552): in-process file tools (write_file, read_file, patch, +search_files) resolved ``~`` via ``os.path.expanduser()``, which reads the +gateway process's ``HOME``. In profile mode (Docker, systemd, s6) the gateway +``HOME`` differs from the profile ``HOME`` that interactive sessions use, so +``~`` expanded to the wrong directory and file operations failed with +"no such file or directory". + +The fix adds ``_expand_tilde()`` which delegates to +``hermes_constants.get_subprocess_home()`` — the same policy the terminal tool +uses for subprocess environments. + +See: https://github.com/NousResearch/hermes-agent/issues/48552 +""" + +import os +from pathlib import Path +from unittest.mock import patch + +import pytest + +import tools.file_tools as ft + + +# --------------------------------------------------------------------------- +# _expand_tilde() unit tests +# --------------------------------------------------------------------------- + +class TestExpandTilde: + """Verify the _expand_tilde() helper resolves ~ to the profile home.""" + + def test_tilde_expands_to_profile_home(self): + """When get_subprocess_home returns a value, ~/path uses it.""" + with patch("hermes_constants.get_subprocess_home", return_value="/opt/data/profiles/coder/home"): + result = ft._expand_tilde("~/scratch/file.txt") + assert result == "/opt/data/profiles/coder/home/scratch/file.txt" + + def test_bare_tilde_expands_to_profile_home(self): + """Bare ~ expands to the profile home.""" + with patch("hermes_constants.get_subprocess_home", return_value="/opt/data/profiles/coder/home"): + result = ft._expand_tilde("~") + assert result == "/opt/data/profiles/coder/home" + + def test_falls_back_when_no_profile_home(self): + """When get_subprocess_home returns None, use os.path.expanduser.""" + with patch("hermes_constants.get_subprocess_home", return_value=None): + result = ft._expand_tilde("~/Documents") + assert result == os.path.expanduser("~/Documents") + + def test_other_user_tilde_not_overridden(self): + """~user/path must NOT use the profile home — it's a different user.""" + with patch("hermes_constants.get_subprocess_home", return_value="/opt/data/profiles/coder/home"): + result = ft._expand_tilde("~root/file.txt") + # Should use os.path.expanduser, not the profile home + assert "/opt/data/profiles/coder/home" not in result + + def test_no_tilde_unchanged(self): + """Paths without ~ are returned unchanged (modulo expanduser).""" + with patch("hermes_constants.get_subprocess_home", return_value="/opt/data/profiles/coder/home"): + result = ft._expand_tilde("/etc/passwd") + assert result == "/etc/passwd" + + def test_empty_path_unchanged(self): + """Empty string returns empty.""" + with patch("hermes_constants.get_subprocess_home", return_value="/opt/data/profiles/coder/home"): + assert ft._expand_tilde("") == "" + + +# --------------------------------------------------------------------------- +# Integration: _resolve_path_for_task uses profile home +# --------------------------------------------------------------------------- + +class TestResolvePathUsesProfileHome: + """Verify _resolve_path_for_task resolves ~ to the profile home.""" + + def test_relative_tilde_resolves_to_profile_home(self, tmp_path, monkeypatch): + """A ~/path argument resolves under the profile home, not process HOME.""" + profile_home = tmp_path / "profile_home" + profile_home.mkdir() + process_home = tmp_path / "process_home" + process_home.mkdir() + + monkeypatch.setenv("HOME", str(process_home)) + monkeypatch.setattr(ft, "_get_live_tracking_cwd", lambda task_id="default": None) + + with patch("hermes_constants.get_subprocess_home", return_value=str(profile_home)): + resolved = ft._resolve_path_for_task("~/test_file.txt", task_id="test") + + assert str(resolved).startswith(str(profile_home)) + assert "process_home" not in str(resolved) + + def test_absolute_tilde_in_workspace_root(self, tmp_path, monkeypatch): + """A workspace root specified with ~ resolves to profile home.""" + profile_home = tmp_path / "profile_home" + profile_home.mkdir() + process_home = tmp_path / "process_home" + process_home.mkdir() + + monkeypatch.setenv("HOME", str(process_home)) + monkeypatch.setattr(ft, "_get_live_tracking_cwd", lambda task_id="default": None) + + with patch("hermes_constants.get_subprocess_home", return_value=str(profile_home)): + # _resolve_base_dir uses the workspace root from config; if it contains ~, + # it should resolve to profile home + resolved = ft._resolve_path_for_task("~/data/config.json", task_id="test") + + assert str(profile_home) in str(resolved) + assert str(process_home) not in str(resolved) diff --git a/tools/file_tools.py b/tools/file_tools.py index a28c057e63a..ffae69a6012 100644 --- a/tools/file_tools.py +++ b/tools/file_tools.py @@ -23,6 +23,29 @@ logger = logging.getLogger(__name__) _EXPECTED_WRITE_ERRNOS = {errno.EACCES, errno.EPERM, errno.EROFS} + +def _expand_tilde(path: str) -> str: + """Expand ``~`` using the effective profile home when available. + + In-process file tools share the gateway process's HOME, which may differ + from the profile-specific HOME that interactive CLI sessions use. This + mirrors ``hermes_constants.get_subprocess_home()`` so that ``~`` resolves + consistently regardless of whether the tool runs interactively or inside a + gateway-driven cron job (#48552). + """ + if not path or "~" not in path: + return path + try: + from hermes_constants import get_subprocess_home + + home = get_subprocess_home() + except Exception: + home = None + if home and (path == "~" or path.startswith("~/")): + return home if path == "~" else os.path.join(home, path[2:]) + return os.path.expanduser(path) + + # --------------------------------------------------------------------------- # Read-size guard: cap the character count returned to the model. # We're model-agnostic so we can't count tokens; characters are a safe proxy. @@ -107,7 +130,7 @@ def _sentinel_free_abs_cwd(raw: str | None) -> str | None: raw = str(raw or "").strip() if raw.lower() in _TERMINAL_CWD_SENTINELS: return None - expanded = os.path.expanduser(raw) + expanded = _expand_tilde(raw) if not os.path.isabs(expanded): return None return expanded @@ -222,7 +245,7 @@ def _resolve_base_dir(task_id: str = "default") -> Path: """ root = _authoritative_workspace_root(task_id) if root: - base = Path(root).expanduser() + base = Path(_expand_tilde(root)) else: base = Path(os.getcwd()) if not base.is_absolute(): @@ -239,7 +262,7 @@ def _resolve_path_for_task(filepath: str, task_id: str = "default") -> Path: See :func:`_resolve_base_dir` for how the base is chosen. Absolute input paths are returned resolved-but-unanchored. """ - p = Path(filepath).expanduser() + p = Path(_expand_tilde(filepath)) if p.is_absolute(): return p.resolve() return (_resolve_base_dir(task_id) / p).resolve() @@ -261,12 +284,12 @@ def _path_resolution_warning(filepath: str, resolved: Path, task_id: str = "defa (no ``cd`` run yet) is warned on the very first write. """ try: - if Path(filepath).expanduser().is_absolute(): + if Path(_expand_tilde(filepath)).is_absolute(): return None workspace_root = _authoritative_workspace_root(task_id) if not workspace_root: return None # No authoritative workspace root to compare against. - root = Path(workspace_root).expanduser().resolve() + root = Path(_expand_tilde(workspace_root)).resolve() # Is `resolved` inside `root`? try: resolved.relative_to(root) @@ -285,7 +308,7 @@ def _path_resolution_warning(filepath: str, resolved: Path, task_id: str = "defa def _is_blocked_device_path(path: str) -> bool: """Return True for concrete device/fd paths that can hang reads.""" - normalized = os.path.normpath(os.path.expanduser(path)) + normalized = os.path.normpath(_expand_tilde(path)) if normalized in _BLOCKED_DEVICE_PATHS: return True # /proc/self/fd/0-2 and /proc//fd/0-2 are Linux aliases for stdio @@ -309,7 +332,7 @@ def _is_blocked_device(filepath: str, base_dir: str | Path | None = None) -> boo they resolve to terminal-specific paths. Then check each symlink hop before the final resolved path so aliases to devices cannot bypass the guard. """ - expanded = os.path.expanduser(filepath) + expanded = _expand_tilde(filepath) if base_dir is not None and not os.path.isabs(expanded): expanded = os.path.join(os.fspath(base_dir), expanded) normalized = os.path.normpath(expanded) @@ -365,7 +388,7 @@ def _get_hermes_config_resolved() -> str | None: _hermes_config_resolved = str(get_config_path().resolve()) except Exception: try: - _hermes_config_resolved = str(Path("~/.hermes/config.yaml").expanduser().resolve()) + _hermes_config_resolved = str(Path(_expand_tilde("~/.hermes/config.yaml")).resolve()) except Exception: _hermes_config_resolved = None return _hermes_config_resolved @@ -377,7 +400,7 @@ def _check_sensitive_path(filepath: str, task_id: str = "default") -> str | None resolved = str(_resolve_path_for_task(filepath, task_id)) except (OSError, ValueError): resolved = filepath - normalized = os.path.normpath(os.path.expanduser(filepath)) + normalized = os.path.normpath(_expand_tilde(filepath)) _err = ( f"Refusing to write to sensitive system path: {filepath}\n" "Use the terminal tool with sudo if you need to modify system files." From 660e36f097e8bc0c2dc2a9e22d203eb6a9d9361c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:54:28 -0700 Subject: [PATCH 18/26] fix(cron): scope job execution to its owning profile (#32091 follow-up) (#50993) The #32091 fix moved every profile's cron jobs into one shared root store, but never wired the execution-scoping half it recommended: a job still ran under whichever profile's ticker picked it up, not its owning profile. So a job created under `hermes -p donna` could execute with the root profile's .env / config.yaml / credentials. - jobs.py: create_job auto-captures the active profile (explicit profile= override available) and stores it on the job; resolve_profile_home() maps a profile name to its HERMES_HOME; legacy jobs backfill to 'default'. - scheduler.py: run_job applies the job's profile via a scoped HERMES_HOME override (env var + in-process ContextVar) before any .env/config/script load, restored in finally. tick() routes profile-mismatched jobs to the single-worker sequential pool so the env mutation can't race. - cronjob tool threads profile through (NOT exposed in the model schema, to avoid cross-profile privilege escalation); hermes cron add gains --profile. E2E verified against a temp HERMES_HOME with a real profile dir: a root-profile ticker runs a profile='donna' job with HERMES_HOME=donna during execution and restores the ticker env afterward. --- cron/jobs.py | 57 ++++++++++ cron/scheduler.py | 65 +++++++++-- hermes_cli/cron.py | 7 ++ hermes_cli/subcommands/cron.py | 4 + tests/cron/test_cron_profile_storage.py | 136 ++++++++++++++++++++++++ tools/cronjob_tools.py | 2 + 6 files changed, 265 insertions(+), 6 deletions(-) diff --git a/cron/jobs.py b/cron/jobs.py index 6ec6d5be123..7a117c37775 100644 --- a/cron/jobs.py +++ b/cron/jobs.py @@ -248,6 +248,12 @@ def _normalize_job_record(job: Dict[str, Any]) -> Dict[str, Any]: state = "scheduled" if normalized.get("enabled", True) else "paused" normalized["state"] = state + # Legacy jobs (created before per-job profile scoping) have no profile + # field. Default them to "default" so the scheduler treats them as + # root-profile jobs — matching their pre-existing behaviour. + prof = normalized.get("profile") + normalized["profile"] = (str(prof).strip() if isinstance(prof, str) and prof.strip() else "default") + return normalized @@ -268,6 +274,43 @@ def _secure_file(path: Path): pass +def current_profile_name() -> str: + """Return the active profile name for the process creating a job. + + ``~/.hermes`` -> ``"default"`` + ``~/.hermes/profiles/X`` -> ``"X"`` + + Used at create time to tag a job with the profile whose environment + (.env / config.yaml / credentials) it should execute under, so the + job runs as its owning profile regardless of which profile's ticker + picks it up from the shared root store (#32091). + """ + try: + from agent.file_safety import _resolve_active_profile_name + return _resolve_active_profile_name() or "default" + except Exception: + return "default" + + +def resolve_profile_home(profile_name: Optional[str]) -> Optional[Path]: + """Map a job's ``profile`` name to the HERMES_HOME it should run under. + + ``"default"`` / empty / ``None`` -> the root home (``get_default_hermes_root()``). + ``""`` -> ``/profiles/``. + + Returns ``None`` when the named profile directory does not exist, so the + scheduler can fall back to the ticker's own home and log a warning rather + than pointing a job at a missing profile. + """ + name = (profile_name or "").strip() + if not name or name == "default": + return get_default_hermes_root().resolve() + candidate = (get_default_hermes_root() / "profiles" / name).resolve() + if candidate.is_dir(): + return candidate + return None + + def ensure_dirs(): """Ensure cron directories exist with secure permissions.""" CRON_DIR.mkdir(parents=True, exist_ok=True) @@ -772,6 +815,7 @@ def create_job( enabled_toolsets: Optional[List[str]] = None, workdir: Optional[str] = None, no_agent: bool = False, + profile: Optional[str] = None, ) -> Dict[str, Any]: """ Create a new cron job. @@ -816,6 +860,13 @@ def create_job( and deliver its stdout directly. Empty stdout = silent (no delivery). Requires ``script`` to be set. Ideal for classic watchdogs and periodic alerts that don't need LLM reasoning. + profile: Optional Hermes profile name the job should EXECUTE under + (its .env / config.yaml / credentials). Defaults to the active + profile of the session creating the job. The shared root store + holds every profile's jobs (#32091); this field is what scopes + a job's runtime environment to its owning profile so it runs + with that profile's permissions regardless of which ticker + picks it up. Returns: The created job dict @@ -850,6 +901,11 @@ def create_job( normalized_toolsets = normalized_toolsets or None normalized_workdir = _normalize_workdir(workdir) normalized_no_agent = bool(no_agent) + # Tag the job with the profile whose environment it should execute under. + # When the caller does not pass one explicitly, capture the active profile + # of the session creating the job so a job created under `hermes -p donna` + # runs as donna even though it now lives in the shared root store (#32091). + normalized_profile = (str(profile).strip() if isinstance(profile, str) else "") or current_profile_name() # no_agent jobs are meaningless without a script — the script IS the job. # Surface this as a clear ValueError at create time so bad configs never @@ -903,6 +959,7 @@ def create_job( "origin": origin, # Tracks where job was created for "origin" delivery "enabled_toolsets": normalized_toolsets, "workdir": normalized_workdir, + "profile": normalized_profile, } with _jobs_lock(): diff --git a/cron/scheduler.py b/cron/scheduler.py index c48935c84a6..eee3bc1656f 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -1857,6 +1857,32 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: os.environ["TERMINAL_CWD"] = _job_workdir logger.info("Job '%s': using workdir %s", job_id, _job_workdir) + # Scope this job's execution to its owning profile's HERMES_HOME (#32091). + # The shared root store holds every profile's jobs, but a job must run with + # the .env / config.yaml / credentials of the profile that created it — not + # whichever profile's ticker happened to pick it up. We set both the + # in-process ContextVar override (consumed by _get_hermes_home() for the + # config/.env/script loads below) AND os.environ["HERMES_HOME"] (inherited + # by any child subprocess the agent spawns). tick() routes profile-scoped + # jobs to the single-worker sequential pool, so mutating os.environ here is + # safe — they never overlap. Restored in the finally block. + from cron.jobs import resolve_profile_home + from hermes_constants import set_hermes_home_override + _job_profile = (job.get("profile") or "default").strip() or "default" + _profile_home = resolve_profile_home(_job_profile) + _prior_hermes_home = os.environ.get("HERMES_HOME", "_UNSET_") + _hermes_home_token = None + if _profile_home is not None and _profile_home != _get_hermes_home().resolve(): + os.environ["HERMES_HOME"] = str(_profile_home) + _hermes_home_token = set_hermes_home_override(str(_profile_home)) + logger.info("Job '%s': executing under profile %r (HERMES_HOME=%s)", + job_id, _job_profile, _profile_home) + elif _profile_home is None and _job_profile != "default": + logger.warning( + "Job '%s': profile %r no longer exists — running under the " + "ticker's profile instead", job_id, _job_profile, + ) + try: # Re-read .env and config.yaml fresh every run so provider/key # changes take effect without a gateway restart. @@ -2268,6 +2294,19 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: os.environ.pop("TERMINAL_CWD", None) else: os.environ["TERMINAL_CWD"] = _prior_terminal_cwd + # Restore HERMES_HOME to the ticker's value when this job overrode it + # for profile-scoped execution (#32091). Mirrors the TERMINAL_CWD + # restore above; the sequential pool guarantees no overlap. + if _hermes_home_token is not None: + try: + from hermes_constants import reset_hermes_home_override + reset_hermes_home_override(_hermes_home_token) + except Exception: + pass + if _prior_hermes_home == "_UNSET_": + os.environ.pop("HERMES_HOME", None) + else: + os.environ["HERMES_HOME"] = _prior_hermes_home # Clean up ContextVar session/delivery state for this job. clear_session_vars(_ctx_tokens) for _var_name in _cron_delivery_vars: @@ -2473,12 +2512,26 @@ def tick(verbose: bool = True, adapters=None, loop=None, sync: bool = True) -> i body.""" return run_one_job(job, adapters=adapters, loop=loop, verbose=verbose) - # Partition due jobs: those with a per-job workdir mutate - # os.environ["TERMINAL_CWD"] inside run_job, which is process-global — - # so they MUST run sequentially to avoid corrupting each other. Jobs - # without a workdir leave env untouched and stay parallel-safe. - sequential_jobs = [j for j in due_jobs if (j.get("workdir") or "").strip()] - parallel_jobs = [j for j in due_jobs if not (j.get("workdir") or "").strip()] + # Partition due jobs: those that mutate process-global os.environ + # inside run_job MUST run sequentially to avoid corrupting each other. + # Two cases mutate env: + # - a per-job workdir sets os.environ["TERMINAL_CWD"]. + # - a per-job profile whose HERMES_HOME differs from the ticker's + # sets os.environ["HERMES_HOME"] to scope execution (#32091). + # Jobs that need neither leave env untouched and stay parallel-safe. + def _needs_sequential(j: dict) -> bool: + if (j.get("workdir") or "").strip(): + return True + prof = (j.get("profile") or "default").strip() or "default" + try: + from cron.jobs import resolve_profile_home + phome = resolve_profile_home(prof) + except Exception: + phome = None + return phome is not None and phome != _get_hermes_home().resolve() + + sequential_jobs = [j for j in due_jobs if _needs_sequential(j)] + parallel_jobs = [j for j in due_jobs if not _needs_sequential(j)] _results: list = [] _all_futures: list = [] diff --git a/hermes_cli/cron.py b/hermes_cli/cron.py index 3c3116970a7..44792fa630c 100644 --- a/hermes_cli/cron.py +++ b/hermes_cli/cron.py @@ -120,6 +120,9 @@ def cron_list(show_all: bool = False): workdir = job.get("workdir") if workdir: print(f" Workdir: {workdir}") + _prof = job.get("profile") + if _prof and _prof != "default": + print(f" Profile: {_prof}") # Execution history last_status = job.get("last_status") @@ -259,6 +262,7 @@ def cron_create(args): script=getattr(args, "script", None), workdir=getattr(args, "workdir", None), no_agent=getattr(args, "no_agent", False) or None, + profile=getattr(args, "profile", None), ) if not result.get("success"): print(color(f"Failed to create job: {result.get('error', 'unknown error')}", Colors.RED)) @@ -275,6 +279,9 @@ def cron_create(args): print(" Mode: no-agent (script stdout delivered directly)") if job_data.get("workdir"): print(f" Workdir: {job_data['workdir']}") + _prof = job_data.get("profile") + if _prof and _prof != "default": + print(f" Profile: {_prof}") print(f" Next run: {result['next_run_at']}") return 0 diff --git a/hermes_cli/subcommands/cron.py b/hermes_cli/subcommands/cron.py index c50b3401462..7ceea3a0f58 100644 --- a/hermes_cli/subcommands/cron.py +++ b/hermes_cli/subcommands/cron.py @@ -70,6 +70,10 @@ def build_cron_parser(subparsers, *, cmd_cron: Callable) -> None: "--workdir", help="Absolute path for the job to run from. Injects AGENTS.md / CLAUDE.md / .cursorrules from that directory and uses it as the cwd for terminal/file/code_exec tools. Omit to preserve old behaviour (no project context files).", ) + cron_create.add_argument( + "--profile", + help="Hermes profile the job should EXECUTE under (its .env / config.yaml / credentials). Defaults to the profile that created the job. Jobs live in one shared root store (#32091); this scopes a job's runtime environment to the named profile so it runs with that profile's permissions.", + ) # cron edit cron_edit = cron_subparsers.add_parser( diff --git a/tests/cron/test_cron_profile_storage.py b/tests/cron/test_cron_profile_storage.py index e13a1333d2f..53d0feec912 100644 --- a/tests/cron/test_cron_profile_storage.py +++ b/tests/cron/test_cron_profile_storage.py @@ -103,3 +103,139 @@ def test_get_default_hermes_root_docker_layouts(tmp_path, monkeypatch): # Docker profile layout: /profiles/ -> . monkeypatch.setenv("HERMES_HOME", "/opt/data/profiles/coder") assert hermes_constants.get_default_hermes_root() == Path("/opt/data") + + +# --------------------------------------------------------------------------- +# Per-job profile EXECUTION scoping (#32091 follow-up). +# +# The storage half of #32091 (above) moved every profile's jobs into one shared +# root store. But a job must still EXECUTE under its owning profile's +# environment (.env / config.yaml / credentials) — not whichever profile's +# ticker picks it up. These tests cover the execution-scoping half. +# --------------------------------------------------------------------------- + + +def _profile_env(tmp_path, monkeypatch, active="default"): + """Set up a root home with a 'donna' profile dir and point the platform + default at it. Returns (root, donna_home). ``active`` selects which + HERMES_HOME the process runs under.""" + root = tmp_path / "hermes_home" + (root / "cron").mkdir(parents=True) + donna_home = root / "profiles" / "donna" + (donna_home / "cron").mkdir(parents=True) + import hermes_constants + monkeypatch.setattr(hermes_constants, "_get_platform_default_hermes_home", + lambda: root) + monkeypatch.setenv("HERMES_HOME", str(root if active == "default" else donna_home)) + return root, donna_home + + +def test_create_job_autocaptures_active_profile(tmp_path, monkeypatch): + """A job created from inside a profile session is tagged with that profile, + so the scheduler can later scope its execution back to it.""" + root, donna_home = _profile_env(tmp_path, monkeypatch, active="donna") + import cron.jobs as jobs + importlib.reload(jobs) + try: + job = jobs.create_job(prompt="audit", schedule="every 1h", name="a") + # auto-captured from the active (donna) session + assert job["profile"] == "donna" + # and it landed in the SHARED ROOT store, not donna's profile-local one + assert jobs.JOBS_FILE.resolve() == (root / "cron" / "jobs.json").resolve() + assert jobs.JOBS_FILE.exists() + assert not (donna_home / "cron" / "jobs.json").exists() + finally: + monkeypatch.undo() + importlib.reload(jobs) + + +def test_create_job_explicit_profile_override(tmp_path, monkeypatch): + """An explicit profile= wins over the auto-captured active profile.""" + root, donna_home = _profile_env(tmp_path, monkeypatch, active="default") + (root / "profiles" / "ops" / "cron").mkdir(parents=True) + import cron.jobs as jobs + importlib.reload(jobs) + try: + job = jobs.create_job(prompt="x", schedule="every 2h", profile="ops") + assert job["profile"] == "ops" + finally: + monkeypatch.undo() + importlib.reload(jobs) + + +def test_resolve_profile_home_maps_names(tmp_path, monkeypatch): + """resolve_profile_home maps default/named profiles to homes and returns + None for a missing profile.""" + root, donna_home = _profile_env(tmp_path, monkeypatch, active="default") + import cron.jobs as jobs + importlib.reload(jobs) + try: + assert jobs.resolve_profile_home("default").resolve() == root.resolve() + assert jobs.resolve_profile_home("").resolve() == root.resolve() + assert jobs.resolve_profile_home("donna").resolve() == donna_home.resolve() + assert jobs.resolve_profile_home("ghost") is None + finally: + monkeypatch.undo() + importlib.reload(jobs) + + +def test_normalize_backfills_legacy_profile_to_default(tmp_path, monkeypatch): + """A pre-feature job with no profile field reads back as 'default'.""" + import cron.jobs as jobs + legacy = {"id": "l1", "name": "old", "prompt": "x", + "schedule": {"kind": "interval", "minutes": 60}} + assert jobs._normalize_job_record(legacy)["profile"] == "default" + + +def test_run_job_scopes_execution_to_job_profile(tmp_path, monkeypatch): + """The decisive test: a ticker running as the ROOT profile executes a + job tagged profile='donna' with HERMES_HOME pointed at donna's home + (both the env var and the in-process override), then restores the + ticker's env afterward.""" + from unittest.mock import MagicMock, patch + root, donna_home = _profile_env(tmp_path, monkeypatch, active="default") + (donna_home / "config.yaml").write_text("model:\n default: openrouter/test\n") + + import hermes_constants + import cron.jobs as jobs + import cron.scheduler as sched + importlib.reload(jobs) + importlib.reload(sched) + + captured = {} + + def fake_run_conversation(prompt, *a, **k): + captured["env"] = os.environ.get("HERMES_HOME") + captured["override"] = hermes_constants.get_hermes_home_override() + captured["resolved"] = str(hermes_constants.get_hermes_home()) + return {"final_response": "done", "completed": True, "failed": False, + "turn_exit_reason": "text_response(finish_reason=stop)"} + + job = {"id": "j-donna", "name": "donna-audit", "prompt": "audit", + "profile": "donna", "schedule": {"kind": "interval", "minutes": 60}, + "deliver": "local", "model": "openrouter/test"} + + before = os.environ.get("HERMES_HOME") + try: + fake_agent = MagicMock() + fake_agent.run_conversation.side_effect = fake_run_conversation + with patch("cron.scheduler._resolve_origin", return_value=None), \ + patch("dotenv.load_dotenv"), \ + patch("hermes_state.SessionDB", return_value=MagicMock()), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + return_value={"api_key": "k", "base_url": "https://x/v1", + "provider": "openrouter", "api_mode": "chat_completions"}), \ + patch("run_agent.AIAgent", return_value=fake_agent): + success, output, final, err = sched.run_job(job) + + assert success is True, (success, err) + # During execution the job ran AS donna: + assert captured["env"] == str(donna_home) + assert captured["override"] == str(donna_home) + assert captured["resolved"] == str(donna_home) + # After the job, the ticker's HERMES_HOME is restored (no leak): + assert os.environ.get("HERMES_HOME") == before + finally: + monkeypatch.undo() + importlib.reload(jobs) + importlib.reload(sched) diff --git a/tools/cronjob_tools.py b/tools/cronjob_tools.py index 3339b823941..62f677bc912 100644 --- a/tools/cronjob_tools.py +++ b/tools/cronjob_tools.py @@ -539,6 +539,7 @@ def cronjob( enabled_toolsets: Optional[List[str]] = None, workdir: Optional[str] = None, no_agent: Optional[bool] = None, + profile: Optional[str] = None, task_id: str = None, ) -> str: """Unified cron job management tool.""" @@ -605,6 +606,7 @@ def cronjob( enabled_toolsets=enabled_toolsets or None, workdir=_normalize_optional_job_value(workdir), no_agent=_no_agent, + profile=_normalize_optional_job_value(profile), ) _notify_provider_jobs_changed_safe() return json.dumps( From 87c4a5ebb8a9f8122197a908288cc0abc7cef6b0 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 22 Jun 2026 14:54:53 -0700 Subject: [PATCH 19/26] feat(background-review): aux-model selector for the self-improvement review (#49252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds auxiliary.background_review.{provider,model} (default auto = main chat model — unchanged). Set it to a different, cheaper model and the post-turn self-improvement review runs there for ~3-5x lower cost. Cache-aware by design: the main chat is warm in the prompt cache, so the default full-history replay on the main model is cheap cache reads — left exactly as-is. A different model can't reuse that cache (different key), so when (and only when) routed to a different model the fork replays a compact digest instead of the full transcript, minimising what it cold-writes on the aux model. Same model -> full replay; different model -> digest. Quality holds in benchmarks: memory capture identical, skill near-identical. Nothing changes unless you opt in by naming a different model. Co-authored-by: Hermes Agent --- agent/background_review.py | 186 +++++++++++++++--- hermes_cli/config.py | 19 ++ .../test_background_review_cost_controls.py | 138 +++++++++++++ website/docs/user-guide/features/memory.md | 25 +++ 4 files changed, 341 insertions(+), 27 deletions(-) create mode 100644 tests/run_agent/test_background_review_cost_controls.py diff --git a/agent/background_review.py b/agent/background_review.py index fa4de508e19..564c5441996 100644 --- a/agent/background_review.py +++ b/agent/background_review.py @@ -27,6 +27,131 @@ from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Background-review aux-model selector + routed digest. +# +# The review fork runs on the MAIN model by default ("auto"), replaying the +# full conversation — already warm in the prompt cache, so cheap cache reads. +# Optimal and unchanged. A user can route the review to a different, cheaper +# model via auxiliary.background_review.{provider,model}. A different model +# cannot reuse the parent's cache (different key), so the fork is cold +# regardless — replaying the full transcript would just cold-write it. So when +# (and only when) routed to a different model, we replay a compact DIGEST to +# minimise cold-written tokens. Same model -> full replay; different model -> +# digest. That's the whole policy. +# --------------------------------------------------------------------------- + + +def _resolve_review_runtime(agent: Any) -> Dict[str, Any]: + """Resolve provider/model/credentials for the review fork. + + Default (auto / unset / same as parent): inherit the parent's live runtime + (with codex_app_server -> codex_responses downgrade). ``routed`` is False — + the fork uses the main model and the warm cache, exactly as before. When + ``auxiliary.background_review.{provider,model}`` names a concrete model + different from the parent's, resolve that runtime and set ``routed=True``. + """ + parent_runtime = agent._current_main_runtime() + parent_api_mode = parent_runtime.get("api_mode") or None + if parent_api_mode == "codex_app_server": + parent_api_mode = "codex_responses" + parent = { + "provider": agent.provider, + "model": agent.model, + "api_key": parent_runtime.get("api_key") or None, + "base_url": parent_runtime.get("base_url") or None, + "api_mode": parent_api_mode, + "routed": False, + } + try: + from hermes_cli.config import load_config + cfg = load_config() + except Exception: + return parent + aux = cfg.get("auxiliary", {}) if isinstance(cfg.get("auxiliary"), dict) else {} + task = aux.get("background_review", {}) if isinstance(aux.get("background_review"), dict) else {} + task_provider = (str(task.get("provider", "")).strip() or None) + task_model = (str(task.get("model", "")).strip() or None) + task_base_url = (str(task.get("base_url", "")).strip() or None) + task_api_key = (str(task.get("api_key", "")).strip() or None) + if not (task_provider and task_provider != "auto" and task_model): + return parent + if task_provider == (agent.provider or "") and task_model == (agent.model or ""): + return parent # same model/provider as parent -> not routed + try: + from hermes_cli.runtime_provider import resolve_runtime_provider + rp = resolve_runtime_provider( + requested=task_provider, + target_model=task_model, + explicit_api_key=task_api_key, + explicit_base_url=task_base_url, + ) + return { + "provider": rp.get("provider") or task_provider, + "model": task_model, + "api_key": rp.get("api_key"), + "base_url": rp.get("base_url"), + "api_mode": rp.get("api_mode"), + "routed": True, + } + except Exception as e: + logger.debug("background-review aux routing failed (%s); using main model", e) + return parent + + +def _msg_text(m: Dict) -> str: + c = m.get("content") + if isinstance(c, str): + return c.strip() + if isinstance(c, list): + return " ".join(b.get("text", "") for b in c if isinstance(b, dict)).strip() + return "" + + +def _digest_history(messages_snapshot: List[Dict], tail: int = 24) -> List[Dict]: + """Compact replay for the routed (different-model) path only. + + Keeps the recent ``tail`` messages verbatim, collapses older turns into one + synthetic user-role digest, preserving role alternation. Used ONLY when + routed to a different model (cache cold regardless, so fewer cold-written + tokens is a pure win). Never on the main-model path (full replay stays warm). + """ + msgs = list(messages_snapshot or []) + if len(msgs) <= tail: + return msgs + keep = msgs[-tail:] + while keep and isinstance(keep[0], dict) and keep[0].get("role") == "tool": + tail += 1 + if len(msgs) <= tail: + return msgs + keep = msgs[-tail:] + old = msgs[:-len(keep)] + lines: List[str] = [] + for m in old: + if not isinstance(m, dict): + continue + role = m.get("role") + text = _msg_text(m).replace("\n", " ") + if role == "user" and text: + lines.append(f"USER: {text[:300]}") + elif role == "assistant": + tcs = m.get("tool_calls") or [] + if tcs: + names = [(tc.get("function") or {}).get("name", "?") for tc in tcs if isinstance(tc, dict)] + lines.append(f"ASSISTANT[tools: {', '.join(names)}]") + if text: + lines.append(f"ASSISTANT: {text[:200]}") + digest = { + "role": "user", + "content": ( + "[Earlier conversation digest — older turns summarised to bound the " + "review's cold-write cost on the routed aux model. Recent turns " + "follow verbatim below.]\n" + "\n".join(lines) + ), + } + return [digest] + keep + + # Review-prompt strings — used by ``spawn_background_review_thread`` to build # the user-message that the forked review agent receives. AIAgent exposes # them as class attributes (``_MEMORY_REVIEW_PROMPT`` etc.) for back-compat; @@ -488,18 +613,13 @@ def _run_review_in_thread( # creds, or credential-pool setups where the resolver can't # reconstruct auth from scratch -- producing the spurious # "No LLM provider configured" warning at end of turn. - _parent_runtime = agent._current_main_runtime() - _parent_api_mode = _parent_runtime.get("api_mode") or None - # The review fork needs to call agent-loop tools (memory, - # skill_manage). Those tools require Hermes' own dispatch, - # which the codex_app_server runtime bypasses entirely - # (it runs the turn inside codex's subprocess). So when - # the parent is on codex_app_server, downgrade the review - # fork to codex_responses — same auth/credentials, but - # talks to the OpenAI Responses API directly so Hermes - # owns the loop and the agent-loop tools dispatch. - if _parent_api_mode == "codex_app_server": - _parent_api_mode = "codex_responses" + # _resolve_review_runtime() returns the parent's live runtime by + # default (routed=False; main model, warm cache), or — when the user + # set auxiliary.background_review.{provider,model} to a different + # model — that model's runtime (routed=True). The codex_app_server + # -> codex_responses downgrade is applied inside the resolver. + _rt = _resolve_review_runtime(agent) + _routed = bool(_rt.get("routed")) # skip_memory=True keeps the review fork from # touching external memory plugins (honcho, mem0, # supermemory, etc.). Without it, the fork's @@ -519,14 +639,14 @@ def _run_review_in_thread( # in the request body — Anthropic's cache key includes it. # (The runtime whitelist below still restricts dispatch.) review_agent = AIAgent( - model=agent.model, + model=_rt.get("model") or agent.model, max_iterations=16, quiet_mode=True, platform=agent.platform, - provider=agent.provider, - api_mode=_parent_api_mode, - base_url=_parent_runtime.get("base_url") or None, - api_key=_parent_runtime.get("api_key") or None, + provider=_rt.get("provider") or agent.provider, + api_mode=_rt.get("api_mode"), + base_url=_rt.get("base_url") or None, + api_key=_rt.get("api_key") or None, credential_pool=getattr(agent, "_credential_pool", None), parent_session_id=agent.session_id, enabled_toolsets=getattr(agent, "enabled_toolsets", None), @@ -565,15 +685,20 @@ def _run_review_in_thread( # issue #25322 and PR #17276 for the full analysis + # measured impact (~26% end-to-end cost reduction on # Sonnet 4.5). - review_agent._cached_system_prompt = agent._cached_system_prompt - # Defensive: pin session_start + session_id to the - # parent's so any code path that re-renders parts of - # the system prompt (compression, plugin hooks) still - # produces byte-identical output. The cached-prompt - # assignment above already short-circuits the normal - # rebuild path, but these pins guarantee parity even - # if a future code path bypasses the cache. - review_agent.session_start = agent.session_start + # Share the parent's warm cached system prompt ONLY when the review + # runs on the SAME model (not routed). When routed to a different + # model the parent's cached prompt is for the wrong model/cache key + # and would miss anyway, so let the routed fork build its own. + if not _routed: + review_agent._cached_system_prompt = agent._cached_system_prompt + # Defensive: pin session_start + session_id to the + # parent's so any code path that re-renders parts of + # the system prompt (compression, plugin hooks) still + # produces byte-identical output. The cached-prompt + # assignment above already short-circuits the normal + # rebuild path, but these pins guarantee parity even + # if a future code path bypasses the cache. + review_agent.session_start = agent.session_start review_agent.session_id = agent.session_id # The fork shares the parent's live session_id (pinned above for # prefix-cache parity). It is single-lifecycle and calls close() @@ -615,6 +740,13 @@ def _run_review_in_thread( ), ) try: + # Routed to a different model -> replay a digest (cache is cold + # on that model anyway, so minimise cold-written tokens). Same + # model -> replay the full snapshot (warm cache reads). + _review_history = ( + _digest_history(messages_snapshot) if _routed + else messages_snapshot + ) review_agent.run_conversation( user_message=( prompt @@ -622,7 +754,7 @@ def _run_review_in_thread( "management tools. Other tools will be denied " "at runtime — do not attempt them." ), - conversation_history=messages_snapshot, + conversation_history=_review_history, ) finally: clear_thread_tool_whitelist() diff --git a/hermes_cli/config.py b/hermes_cli/config.py index ce8ec7d6693..34923375984 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1535,6 +1535,25 @@ DEFAULT_CONFIG = { "timeout": 60, "extra_body": {}, }, + # Background review — the post-turn self-improvement fork that decides + # whether to save a memory / patch a skill. "auto" (default) = run on + # the main chat model, replaying the full conversation, which is already + # warm in the prompt cache (cheap cache reads) — unchanged, optimal. + # Set provider/model to a cheaper model (e.g. openrouter + # google/gemini-3-flash-preview) to run the review there for ~3-5x lower + # cost. A different model can't reuse the main prompt cache anyway, so + # the fork automatically replays a compact digest instead of the full + # transcript when routed (minimises the cold-write). Same model = full + # replay; different model = digest. Quality holds (memory capture + # identical, skill near-identical in benchmarks). + "background_review": { + "provider": "auto", + "model": "", + "base_url": "", + "api_key": "", + "timeout": 120, + "extra_body": {}, + }, }, "display": { diff --git a/tests/run_agent/test_background_review_cost_controls.py b/tests/run_agent/test_background_review_cost_controls.py new file mode 100644 index 00000000000..5ca47b2a0f9 --- /dev/null +++ b/tests/run_agent/test_background_review_cost_controls.py @@ -0,0 +1,138 @@ +"""Unit coverage for the background-review aux-model selector + routed digest. + +Covers the two behaviors this change adds: + • _resolve_review_runtime — auto/same-model → not routed (main model, warm + cache); a configured different model → routed with resolved credentials. + • _digest_history — compact replay used ONLY on the routed path (recent tail + verbatim + a digest of older turns), preserving role alternation. + +Pure-function / config-driven; no live model calls. +""" +from unittest.mock import patch + +from agent import background_review as br + + +def _msg(role, content, tool_calls=None): + m = {"role": role, "content": content} + if tool_calls: + m["tool_calls"] = tool_calls + return m + + +# --------------------------------------------------------------------------- +# _resolve_review_runtime — the aux-model selector +# --------------------------------------------------------------------------- + +class _FakeAgent: + def __init__(self, provider="openai-codex", model="gpt-5.5"): + self.provider = provider + self.model = model + + def _current_main_runtime(self): + return { + "api_key": "parent-key", + "base_url": "https://chatgpt.com/backend-api/codex", + "api_mode": "codex_app_server", + } + + +def test_routing_auto_inherits_parent_and_downgrades_codex_app_server(): + agent = _FakeAgent() + cfg = {"auxiliary": {"background_review": {"provider": "auto", "model": ""}}} + with patch("hermes_cli.config.load_config", return_value=cfg): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is False + assert rt["provider"] == "openai-codex" + assert rt["model"] == "gpt-5.5" + assert rt["api_mode"] == "codex_responses" # downgraded so agent-loop tools dispatch + + +def test_routing_to_different_model_marks_routed_and_resolves_credentials(): + agent = _FakeAgent() + cfg = {"auxiliary": {"background_review": { + "provider": "openrouter", "model": "google/gemini-3-flash-preview", + }}} + fake_rp = { + "provider": "openrouter", "api_key": "or-key", + "base_url": "https://openrouter.ai/api/v1", "api_mode": "chat_completions", + } + with patch("hermes_cli.config.load_config", return_value=cfg), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", return_value=fake_rp): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is True + assert rt["provider"] == "openrouter" + assert rt["model"] == "google/gemini-3-flash-preview" + assert rt["api_key"] == "or-key" + + +def test_routing_same_model_as_parent_is_not_routed(): + agent = _FakeAgent(provider="openrouter", model="anthropic/claude-opus-4.8") + cfg = {"auxiliary": {"background_review": { + "provider": "openrouter", "model": "anthropic/claude-opus-4.8", + }}} + with patch("hermes_cli.config.load_config", return_value=cfg): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is False # same model/provider → keep full-replay path + + +def test_routing_resolution_failure_falls_back_to_parent(): + agent = _FakeAgent() + cfg = {"auxiliary": {"background_review": { + "provider": "openrouter", "model": "google/gemini-3-flash-preview", + }}} + with patch("hermes_cli.config.load_config", return_value=cfg), \ + patch("hermes_cli.runtime_provider.resolve_runtime_provider", + side_effect=RuntimeError("boom")): + rt = br._resolve_review_runtime(agent) + assert rt["routed"] is False + assert rt["provider"] == "openai-codex" + + +# --------------------------------------------------------------------------- +# _digest_history — routed-path compact replay +# --------------------------------------------------------------------------- + +def test_digest_under_tail_returns_full(): + msgs = [_msg("user", "hi"), _msg("assistant", "hello")] + assert br._digest_history(msgs, tail=24) == msgs + + +def test_digest_collapses_old_keeps_tail_verbatim(): + msgs = [] + for i in range(60): + msgs.append(_msg("user", f"u{i} " + "x" * 50)) + msgs.append(_msg("assistant", f"a{i} " + "y" * 50)) + out = br._digest_history(msgs, tail=10) + # First message is the synthetic digest (user role → alternation preserved). + assert out[0]["role"] == "user" + assert out[0]["content"].startswith("[Earlier conversation digest") + # Recent tail preserved verbatim. + assert out[-1] == msgs[-1] + assert len(out) == 11 # 1 digest + 10 tail + + +def test_digest_does_not_open_tail_on_a_tool_message(): + msgs = [] + for i in range(40): + msgs.append(_msg("user", "u" + "x" * 50)) + msgs.append(_msg("assistant", "", tool_calls=[ + {"function": {"name": "terminal", "arguments": "{}"}}])) + msgs.append({"role": "tool", "content": "result " + "w" * 50}) + out = br._digest_history(msgs, tail=2) + # The verbatim tail (after the digest) must not begin on a bare tool message. + assert out[1]["role"] != "tool" + + +def test_digest_records_tool_names_in_arc(): + old = [ + _msg("user", "do the thing"), + _msg("assistant", "", tool_calls=[ + {"function": {"name": "skill_view", "arguments": "{}"}}, + {"function": {"name": "patch", "arguments": "{}"}}]), + ] + msgs = old + [_msg("user", f"tail{i}") for i in range(30)] + out = br._digest_history(msgs, tail=10) + digest = out[0]["content"] + assert "USER: do the thing" in digest + assert "tools: skill_view, patch" in digest diff --git a/website/docs/user-guide/features/memory.md b/website/docs/user-guide/features/memory.md index 41efc92285c..20c37afa12f 100644 --- a/website/docs/user-guide/features/memory.md +++ b/website/docs/user-guide/features/memory.md @@ -270,6 +270,31 @@ display: > writes to your memory/skill stores, are unaffected by this setting. Set it > per-platform via `display.platforms..memory_notifications`. +## Running the review on a cheaper model (`auxiliary.background_review`) + +The review runs on your **main chat model** by default, replaying the +conversation — which is already warm in the prompt cache, so it's cheap cache +reads. On an expensive main model you can run the review on a cheaper model +instead: + +```yaml +auxiliary: + background_review: + provider: openrouter + model: google/gemini-3-flash-preview # auto (default) = main chat model +``` + +When you point it at a model **different** from your main one, the review runs +there for substantially lower cost (~3–5× in benchmarks). Because a different +model can't reuse your main model's prompt cache anyway, the fork automatically +replays a compact **digest** of the conversation (recent turns verbatim + a +summary of older ones) rather than the full transcript — minimizing what it +writes to the new cache. Capture holds: in testing, memory capture was +identical and skill capture near-identical to the main-model review. + +Leave it at `auto` (or set it to your main model) and nothing changes — the +review keeps running on the main model with the full warm-cache replay. + ## Controlling skill writes (`skills.write_approval`) Skills use the same on/off gate, but the review UX differs because a From 0223ea5f590aec3697ebad6b7f533b5e5df2cc83 Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 17:33:52 -0500 Subject: [PATCH 20/26] feat(computer-use): surface macOS permission preflight in the desktop Computer Use already worked through the desktop backend (the cua-driver toolset enables + installs via Settings -> Skills & Tools), but there was no in-app way to see or grant the two macOS permissions it needs, so "give a model my Mac" was tribal knowledge. The grants attach to cua-driver's OWN TCC identity (com.trycua.driver / the installed CuaDriver.app), not Hermes -- so no app entitlement is involved. cua-driver 0.5+ exposes `permissions status/grant`, which we wrap: - tools/computer_use/permissions.py: thin client over the two subcommands - hermes computer-use permissions {status,grant}: CLI parity - GET /api/tools/computer-use/status, POST .../permissions/grant: desktop REST - ComputerUsePanel: live Accessibility + Screen Recording state with a Grant button (dialog attributed to CuaDriver), shown in the expanded Computer Use toolset row. Binary install stays in the existing provider post-setup runner. Follow-ups: i18n the card copy; a "Stop driver" control (cua-driver stop) for the runaway-`serve` case. --- .../src/app/settings/computer-use-panel.tsx | 204 ++++++++++++++++++ apps/desktop/src/app/skills/index.tsx | 4 + apps/desktop/src/hermes.ts | 18 ++ apps/desktop/src/types/hermes.ts | 30 +++ hermes_cli/main.py | 57 +++++ hermes_cli/web_server.py | 56 +++++ tools/computer_use/permissions.py | 136 ++++++++++++ 7 files changed, 505 insertions(+) create mode 100644 apps/desktop/src/app/settings/computer-use-panel.tsx create mode 100644 tools/computer_use/permissions.py diff --git a/apps/desktop/src/app/settings/computer-use-panel.tsx b/apps/desktop/src/app/settings/computer-use-panel.tsx new file mode 100644 index 00000000000..826ce80ae62 --- /dev/null +++ b/apps/desktop/src/app/settings/computer-use-panel.tsx @@ -0,0 +1,204 @@ +import { useCallback, useEffect, useRef, useState } from 'react' + +import { Button } from '@/components/ui/button' +import { getActionStatus, getComputerUseStatus, grantComputerUsePermissions } from '@/hermes' +import { AlertTriangle, Check, ExternalLink, Loader2, RefreshCw, X } from '@/lib/icons' +import { upsertDesktopActionTask } from '@/store/activity' +import { notify, notifyError } from '@/store/notifications' +import type { ComputerUseStatus } from '@/types/hermes' + +import { Pill } from './primitives' + +interface ComputerUsePanelProps { + /** Re-read the parent toolset list after a permission/install change so the + * "Configured / Needs keys" pill stays in sync. */ + onConfiguredChange?: () => void +} + +function PermissionRow({ granted, label, hint }: { granted: boolean | null; label: string; hint: string }) { + const tone = granted === true ? 'primary' : 'muted' + const Icon = granted === true ? Check : granted === false ? X : AlertTriangle + + return ( +
+
+ {label} +

{hint}

+
+ + + {granted === true ? 'Granted' : granted === false ? 'Not granted' : 'Unknown'} + +
+ ) +} + +/** + * Computer Use preflight card. + * + * Computer Use drives the Mac through cua-driver, whose Accessibility + + * Screen Recording grants attach to cua-driver's OWN TCC identity + * (`com.trycua.driver` / the installed CuaDriver.app) — not the Hermes + * desktop app. So this card reflects the driver's real grant state and + * triggers a grant via `cua-driver permissions grant`, which launches + * CuaDriver via LaunchServices so the macOS dialog is attributed correctly. + * + * Binary install/upgrade still lives in the cua-driver provider's post-setup + * runner below this card (the generic ToolsetConfigPanel). + */ +export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) { + const [status, setStatus] = useState(null) + const [loading, setLoading] = useState(true) + const [granting, setGranting] = useState(false) + const activeRef = useRef(false) + + const refresh = useCallback(async () => { + try { + const next = await getComputerUseStatus() + setStatus(next) + } catch (err) { + notifyError(err, 'Could not read Computer Use status') + } finally { + setLoading(false) + } + }, []) + + useEffect(() => { + activeRef.current = true + void refresh() + + return () => { + activeRef.current = false + } + }, [refresh]) + + const grant = useCallback(async () => { + setGranting(true) + + try { + const started = await grantComputerUsePermissions() + + if (!started.ok) { + notifyError(new Error('spawn failed'), 'Could not request permissions') + + return + } + + notify({ + kind: 'info', + title: 'Approve in System Settings', + message: 'macOS will show a permission dialog attributed to CuaDriver. Approve it, then return here.' + }) + + // Poll the grant action until it exits (the driver waits for the user to + // flip the switch), then re-read the live permission state. + for (let attempt = 0; attempt < 150 && activeRef.current; attempt += 1) { + await new Promise(resolve => window.setTimeout(resolve, 1500)) + + if (!activeRef.current) { + break + } + + const polled = await getActionStatus(started.name, 200) + upsertDesktopActionTask(polled) + + if (!polled.running) { + break + } + } + + if (activeRef.current) { + await refresh() + onConfiguredChange?.() + } + } catch (err) { + if (activeRef.current) { + notifyError(err, 'Could not request permissions') + } + } finally { + if (activeRef.current) { + setGranting(false) + } + } + }, [onConfiguredChange, refresh]) + + if (loading) { + return ( +
+ + Checking Computer Use status… +
+ ) + } + + if (!status) { + return null + } + + if (!status.platform_supported) { + return ( +

+ Computer Use permissions are managed on macOS. On this platform, enable the cua-driver provider below. +

+ ) + } + + if (!status.installed) { + return ( +

+ Install the cua-driver backend below to drive macOS. After installing, grant Accessibility and Screen + Recording here. +

+ ) + } + + const allGranted = status.accessibility === true && status.screen_recording === true + + return ( +
+
+
+

+ Grants attach to CuaDriver's own identity (com.trycua.driver), not Hermes — so the dialog is + attributed to the process that drives your Mac. +

+ {status.version &&

{status.version}

} +
+ +
+ + + + + {status.error && ( +

+ + {status.error} +

+ )} + + {allGranted ? ( +
+ + Computer Use is ready. Ask the agent to capture an app and click around. +
+ ) : ( + + )} +
+ ) +} diff --git a/apps/desktop/src/app/skills/index.tsx b/apps/desktop/src/app/skills/index.tsx index 716f0181f12..90aa4a24357 100644 --- a/apps/desktop/src/app/skills/index.tsx +++ b/apps/desktop/src/app/skills/index.tsx @@ -17,6 +17,7 @@ import { useRefreshHotkey } from '../hooks/use-refresh-hotkey' import { useRouteEnumParam } from '../hooks/use-route-enum-param' import { PAGE_INSET_X } from '../layout-constants' import { PageSearchShell } from '../page-search-shell' +import { ComputerUsePanel } from '../settings/computer-use-panel' import { asText, includesQuery, prettyName, toolNames, toolsetDisplayLabel } from '../settings/helpers' import { ToolsetConfigPanel } from '../settings/toolset-config-panel' import type { SetStatusbarItemGroup } from '../shell/statusbar-controls' @@ -334,6 +335,9 @@ export function SkillsView({ setStatusbarItemGroup: _setStatusbarItemGroup, ...p ))} )} + {expanded && toolset.name === 'computer_use' && ( + + )} {expanded && } ) diff --git a/apps/desktop/src/hermes.ts b/apps/desktop/src/hermes.ts index 197e24611ab..04340b0a549 100644 --- a/apps/desktop/src/hermes.ts +++ b/apps/desktop/src/hermes.ts @@ -8,6 +8,7 @@ import type { AudioTranscriptionResponse, AuxiliaryModelsResponse, BackendUpdateCheckResponse, + ComputerUseStatus, ConfigSchemaResponse, CronJob, CronJobCreatePayload, @@ -59,6 +60,8 @@ export type { AudioTranscriptionResponse, AuxiliaryModelsResponse, BackendUpdateCheckResponse, + ComputerUsePermissionSource, + ComputerUseStatus, ConfigFieldSchema, ConfigSchemaResponse, CronJob, @@ -516,6 +519,21 @@ export function runToolsetPostSetup(name: string, key: string): Promise { + return window.hermesDesktop.api({ + ...profileScoped(), + path: '/api/tools/computer-use/status' + }) +} + +export function grantComputerUsePermissions(): Promise { + return window.hermesDesktop.api({ + ...profileScoped(), + path: '/api/tools/computer-use/permissions/grant', + method: 'POST' + }) +} + export function getMessagingPlatforms(): Promise { return window.hermesDesktop.api({ path: '/api/messaging/platforms' diff --git a/apps/desktop/src/types/hermes.ts b/apps/desktop/src/types/hermes.ts index b67cc3041a7..b860ea8e89d 100644 --- a/apps/desktop/src/types/hermes.ts +++ b/apps/desktop/src/types/hermes.ts @@ -579,6 +579,36 @@ export interface ToolsetConfig { active_provider: string | null } +/** Shape of `GET /api/tools/computer-use/status`. + * + * Computer Use drives the Mac through cua-driver, whose Accessibility + + * Screen Recording grants attach to cua-driver's OWN TCC identity + * (`com.trycua.driver`), not the Hermes app. Permission booleans are + * `null` when unknown (binary missing, or no CuaDriver daemon running to + * answer for its own identity). */ +export interface ComputerUsePermissionSource { + attribution?: string + executable?: string + note?: string + pid?: number + responsible_ppid?: number +} + +export interface ComputerUseStatus { + /** macOS is the only platform with the TCC permission model cua-driver gates. */ + platform_supported: boolean + /** cua-driver binary resolved on PATH. */ + installed: boolean + /** e.g. "cua-driver 0.5.1", or null when unknown. */ + version: string | null + accessibility: boolean | null + screen_recording: boolean | null + screen_recording_capturable: boolean | null + source: ComputerUsePermissionSource | null + /** Populated when the status probe itself failed. */ + error: string | null +} + export interface SessionSearchResult { /** Lineage root of the matched conversation. Stable across compression and * used as the durable pin id; falls back to session_id when absent. */ diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 4b1a3f64db2..906497055c8 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -12507,6 +12507,33 @@ def main(): action="store_true", help="Emit the raw structured payload as JSON (same shape as `tools/call`).", ) + computer_use_perms = computer_use_sub.add_parser( + "permissions", + help="Check or grant macOS Accessibility + Screen Recording (macOS)", + description=( + "Computer Use drives the Mac through cua-driver, whose TCC grants\n" + "attach to cua-driver's own identity (com.trycua.driver) — not the\n" + "terminal or the Hermes app. `status` reports the driver's grant\n" + "state; `grant` launches CuaDriver via LaunchServices so the macOS\n" + "permission dialog is attributed to the process that does the work." + ), + ) + computer_use_perms_sub = computer_use_perms.add_subparsers( + dest="computer_use_perms_action" + ) + computer_use_perms_status = computer_use_perms_sub.add_parser( + "status", + help="Report Accessibility + Screen Recording grant state (read-only)", + ) + computer_use_perms_status.add_argument( + "--json", + action="store_true", + help="Emit the normalized permission payload as JSON.", + ) + computer_use_perms_sub.add_parser( + "grant", + help="Request the grants (opens the dialog attributed to CuaDriver)", + ) def cmd_computer_use(args): action = getattr(args, "computer_use_action", None) @@ -12564,6 +12591,36 @@ def main(): json_output=bool(getattr(args, "json", False)), ) sys.exit(code) + if action == "permissions": + perms_action = getattr(args, "computer_use_perms_action", None) + if perms_action == "grant": + from tools.computer_use.permissions import request_permissions_grant + sys.exit(request_permissions_grant()) + if perms_action == "status": + import json as _json + from tools.computer_use.permissions import permissions_status + st = permissions_status() + if bool(getattr(args, "json", False)): + print(_json.dumps(st, indent=2, sort_keys=True)) + else: + if not st["installed"]: + print("cua-driver: not installed") + print(" Run: hermes computer-use install") + elif not st["platform_supported"]: + print("Computer Use permissions are managed on macOS only.") + else: + def _glyph(v): + return "✅" if v is True else ("❌" if v is False else "•") + print(f"cua-driver: {st.get('version') or 'installed'}") + print(f" {_glyph(st['accessibility'])} Accessibility") + print(f" {_glyph(st['screen_recording'])} Screen Recording") + if st.get("error"): + print(f" ⚠ {st['error']}") + if st["accessibility"] is not True or st["screen_recording"] is not True: + print(" Grant: hermes computer-use permissions grant") + sys.exit(0 if st.get("accessibility") and st.get("screen_recording") else 1) + computer_use_perms.print_help() + return # No subcommand → show help computer_use_parser.print_help() diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 997803b8f0a..5a6b764e00f 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -8349,6 +8349,7 @@ async def install_mcp_catalog_entry(body: MCPCatalogInstall, profile: Optional[s # Register the mcp-install action log so /api/actions/mcp-install/status works. _ACTION_LOG_FILES.setdefault("mcp-install", "action-mcp-install.log") +_ACTION_LOG_FILES.setdefault("computer-use-grant", "action-computer-use-grant.log") # --------------------------------------------------------------------------- @@ -10671,6 +10672,61 @@ async def run_toolset_post_setup( return {"ok": True, "pid": proc.pid, "name": "tools-post-setup", "key": body.key} +# --------------------------------------------------------------------------- +# Computer Use (cua-driver) — install + macOS permission state +# +# Computer Use drives the Mac through cua-driver, whose Accessibility + +# Screen Recording grants attach to cua-driver's OWN TCC identity +# (com.trycua.driver / the installed CuaDriver.app) — not the Hermes desktop +# app or this server. The desktop's Computer Use card reflects that state and +# triggers a grant via the same `cua-driver permissions grant` flow the CLI +# uses, so no Hermes-side entitlement is involved. +# --------------------------------------------------------------------------- + + +@app.get("/api/tools/computer-use/status") +async def get_computer_use_status(profile: Optional[str] = None): + """Report cua-driver install + macOS permission state for the desktop card. + + See ``tools.computer_use.permissions.permissions_status`` for the payload + shape. Read-only and fast (shells ``cua-driver permissions status``). + """ + from tools.computer_use.permissions import permissions_status + + with _profile_scope(profile): + return permissions_status() + + +@app.post("/api/tools/computer-use/permissions/grant") +async def grant_computer_use_permissions(profile: Optional[str] = None): + """Spawn ``hermes computer-use permissions grant`` as a background action. + + ``cua-driver permissions grant`` launches CuaDriver via LaunchServices so + the macOS TCC dialog is attributed to com.trycua.driver, then waits for + the user to approve. The frontend polls ``GET /api/actions/computer-use- + grant/status`` for progress and re-reads ``/status`` once it exits. + """ + if sys.platform != "darwin": + raise HTTPException( + status_code=400, + detail="Computer Use permissions are managed on macOS only.", + ) + try: + proc = _spawn_hermes_action( + _profile_cli_args(profile) + + ["computer-use", "permissions", "grant"], + "computer-use-grant", + ) + except HTTPException: + raise + except Exception as exc: + _log.exception("Failed to spawn computer-use permissions grant") + raise HTTPException( + status_code=500, detail=f"Failed to request permissions: {exc}" + ) + return {"ok": True, "pid": proc.pid, "name": "computer-use-grant"} + + # --------------------------------------------------------------------------- # Raw YAML config endpoint # --------------------------------------------------------------------------- diff --git a/tools/computer_use/permissions.py b/tools/computer_use/permissions.py new file mode 100644 index 00000000000..45a6ac2534d --- /dev/null +++ b/tools/computer_use/permissions.py @@ -0,0 +1,136 @@ +""" +macOS Accessibility + Screen Recording permission helpers for Computer Use. + +cua-driver 0.5+ owns the permission model. Crucially, the grants attach to +cua-driver's OWN TCC identity (``com.trycua.driver`` — the installed +``CuaDriver.app``), NOT the terminal, the Hermes CLI, or the Hermes desktop +app. So: + + * ``cua-driver permissions status --json`` reports the driver daemon's real + grant state, independent of who asks. + * ``cua-driver permissions grant`` launches CuaDriver via LaunchServices so + the macOS dialog is attributed to ``com.trycua.driver`` — the process that + actually does the work. + +Because the permission lives with the cua-driver binary, the Hermes desktop +app needs no Accessibility / Screen Recording entitlements of its own. This is +a thin, testable client driven by the ``hermes computer-use permissions`` CLI +and the desktop ``/api/tools/computer-use/status`` endpoint. +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import sys +from typing import Any, Dict, Optional + +_BOOLS = ("accessibility", "screen_recording", "screen_recording_capturable") + + +def _driver_cmd(override: Optional[str]) -> str: + if override: + return override + try: + from hermes_cli.tools_config import _cua_driver_cmd + + return _cua_driver_cmd() + except Exception: + return os.environ.get("HERMES_CUA_DRIVER_CMD", "").strip() or "cua-driver" + + +def _child_env() -> Dict[str, str]: + """cua-driver child env honoring the Hermes telemetry opt-in policy.""" + try: + from tools.computer_use.cua_backend import cua_driver_child_env + + return cua_driver_child_env() + except Exception: + return dict(os.environ) + + +def _run(binary: str, *args: str, timeout: float) -> subprocess.CompletedProcess: + return subprocess.run( + [binary, *args], + capture_output=True, + text=True, + timeout=timeout, + env=_child_env(), + ) + + +def permissions_status(driver_cmd: Optional[str] = None) -> Dict[str, Any]: + """Computer Use install + macOS permission state for the desktop card. + + ``None`` permission values mean "unknown" — the driver binary is missing, + the platform has no TCC model, or no CuaDriver daemon is running to answer + for its own identity yet. + """ + binary = shutil.which(_driver_cmd(driver_cmd)) + out: Dict[str, Any] = { + "platform_supported": sys.platform == "darwin", + "installed": bool(binary), + "version": None, + "source": None, + "error": None, + **{k: None for k in _BOOLS}, + } + if not binary: + return out + + try: + out["version"] = (_run(binary, "--version", timeout=5).stdout or "").strip() or None + except Exception: + pass + + # Permissions are a macOS concept; cua-driver only exposes the subcommand there. + if sys.platform != "darwin": + return out + + try: + raw = (_run(binary, "permissions", "status", "--json", timeout=10).stdout or "").strip() + data = json.loads(raw) if raw else {} + except subprocess.TimeoutExpired: + out["error"] = "cua-driver permissions status timed out" + return out + except Exception as exc: # spawn failure or malformed JSON + out["error"] = f"cua-driver permissions status failed: {exc}" + return out + + if isinstance(data, dict): + out.update({k: data[k] for k in _BOOLS if isinstance(data.get(k), bool)}) + if isinstance(data.get("source"), dict): + out["source"] = data["source"] + return out + + +def request_permissions_grant(driver_cmd: Optional[str] = None) -> int: + """Run ``cua-driver permissions grant`` (macOS); stream its output. + + Launches CuaDriver via LaunchServices so the TCC dialog is attributed to + ``com.trycua.driver``, then waits for the grant. Returns the driver's exit + code (0 ok), 2 if the binary is missing, 64 on an unsupported platform. + """ + if sys.platform != "darwin": + print("Computer Use permissions are managed on macOS only.") + return 64 + + binary = shutil.which(_driver_cmd(driver_cmd)) + if not binary: + print("cua-driver: not installed. Run: hermes computer-use install") + return 2 + + print( + "Requesting Accessibility + Screen Recording for CuaDriver.\n" + "macOS will show a dialog attributed to CuaDriver (com.trycua.driver) — " + "approve it, then return here." + ) + try: + return int(subprocess.run([binary, "permissions", "grant"], env=_child_env()).returncode) + except KeyboardInterrupt: # pragma: no cover - interactive + return 130 + except Exception as exc: # pragma: no cover - defensive + print(f"cua-driver permissions grant failed: {exc}", file=sys.stderr) + return 2 From 807b69629532366530b386b24c4d575df3fb8f1e Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 17:38:47 -0500 Subject: [PATCH 21/26] fix(computer-use): vision capture returns an image on cua-driver >=0.5.x Vision mode called a `screenshot` MCP tool that cua-driver dropped in 0.5.x (full-window PNG capture was folded into `get_window_state`). The driver replied "Unknown tool: screenshot", so `images` came back empty, `png_b64` stayed None, and capture returned a 0x0 result with no image on every call. `som`/`ax` were unaffected because they already use `get_window_state`, which masked the regression. Route vision by capability: - driver advertises `screenshot` (older builds) -> use it (no AX walk) - otherwise -> call `get_window_state` but discard the AX tree/elements, returning only the PNG so vision stays free of element noise - capabilities not yet discovered -> try `screenshot`, fall back to `get_window_state` on an empty image, so the path self-heals Add `_image_from_tool_result` to pull the PNG from either an MCP image content-part or `structuredContent.screenshot_png_b64`, and use it on the som path too so the image won't silently drop on driver builds that deliver it via structuredContent instead of a content part. Verified live (vision: 1568x954, 0 elements; som: image + 527 elements) and with unit coverage of all four routing cases. --- tools/computer_use/cua_backend.py | 139 +++++++++++++++++++++++++----- 1 file changed, 118 insertions(+), 21 deletions(-) diff --git a/tools/computer_use/cua_backend.py b/tools/computer_use/cua_backend.py index b46785d2e95..5acf28faf98 100644 --- a/tools/computer_use/cua_backend.py +++ b/tools/computer_use/cua_backend.py @@ -723,6 +723,28 @@ class _CuaDriverSession: return capability in self._capabilities.get(tool, set()) return any(capability in caps for caps in self._capabilities.values()) + def _has_tool(self, name: str) -> bool: + """Return True when ``tools/list`` advertised a tool by this name. + + Used to route capture(): cua-driver dropped the standalone + ``screenshot`` tool and folded full-window PNG capture into + ``get_window_state`` (whose own description notes it "Also captures + a PNG screenshot of the specified window"). Older drivers that still + expose ``screenshot`` keep using it; newer ones fall through to + ``get_window_state``. + + Returns False when discovery hasn't populated the map yet — callers + treat that as "unknown" and probe defensively rather than trusting it. + """ + return name in self._capabilities + + @property + def capabilities_discovered(self) -> bool: + """True once ``tools/list`` populated the per-tool map. When False, + ``_has_tool`` answers are not trustworthy (discovery failed or the + session hasn't started) and capture() should probe defensively.""" + return bool(self._capabilities) + @property def capability_version(self) -> str: """Driver-advertised capability vocabulary version (empty string @@ -825,6 +847,45 @@ def _extract_tool_result(mcp_result: Any) -> Dict[str, Any]: } +def _image_from_tool_result(out: Dict[str, Any]) -> tuple[Optional[str], Optional[str]]: + """Pull a (png_b64, mime_type) pair out of a flattened tool result. + + cua-driver delivers window screenshots in two shapes depending on tool + + transport: + + * As an MCP ``image`` content part — surfaced by ``_extract_tool_result`` + in ``out["images"]`` with a parallel ``image_mime_types`` entry. This + is what ``get_window_state`` emits over the stdio MCP transport. + * As a base64 field inside ``structuredContent`` — + ``screenshot_png_b64`` (+ ``screenshot_mime_type``). This is what + ``get_window_state`` returns when its structured payload carries the + image instead of a content part (newer driver builds; also the shape + seen via the ``cua-driver call`` CLI surface). + + Checking both makes capture() robust to either delivery shape, so the + image never silently drops just because the driver moved it between the + content list and structuredContent. Returns ``(None, None)`` when neither + location carries an image. + """ + images = out.get("images") or [] + if images and images[0]: + mimes = out.get("image_mime_types") or [] + mime = mimes[0] if mimes and mimes[0] else None + return images[0], mime + + structured = out.get("structuredContent") or {} + b64 = structured.get("screenshot_png_b64") or structured.get("png_b64") + if b64: + mime = ( + structured.get("screenshot_mime_type") + or structured.get("mime_type") + or None + ) + return b64, mime + + return None, None + + # --------------------------------------------------------------------------- # The backend itself # --------------------------------------------------------------------------- @@ -1003,25 +1064,61 @@ class CuaDriverBackend(ComputerUseBackend): window_title = "" if mode == "vision": - # screenshot tool: just the PNG, no AX walk. - sc_out = self._session.call_tool( - "screenshot", - { - "window_id": self._active_window_id, - "format": "jpeg", - "quality": 85, - "session": self._session_id, - }, + # Plain screenshot, no AX walk. cua-driver dropped the standalone + # `screenshot` tool (≥0.5.x) and folded full-window PNG capture + # into `get_window_state`. Route accordingly: + # * Driver advertises `screenshot` (older builds) → use it; it's + # the cheapest path (no AX tree walked server-side). + # * Otherwise (current drivers) → call `get_window_state` but + # DISCARD the AX tree/elements, returning only the PNG. Vision + # mode's whole contract is "just the pixels, no element noise", + # so we drop everything but the image. + # When capability discovery hasn't run (empty map), we don't trust + # a negative `_has_tool` answer — we still try `screenshot` first + # and fall back if the driver rejects it, so the path self-heals on + # any driver version. + use_screenshot = ( + self._session._has_tool("screenshot") + or not self._session.capabilities_discovered ) - if sc_out["images"]: - png_b64 = sc_out["images"][0] - # Pick up the explicit mimeType cua-driver attaches to image - # parts (Surface 7). Empty string means the driver didn't - # carry one — callers will fall back to magic-byte sniffing. - mimes = sc_out.get("image_mime_types") or [] - image_mime_type = mimes[0] if mimes and mimes[0] else None + sc_out: Optional[Dict[str, Any]] = None + if use_screenshot: + sc_out = self._session.call_tool( + "screenshot", + { + "window_id": self._active_window_id, + "format": "jpeg", + "quality": 85, + "session": self._session_id, + }, + ) + png_b64, image_mime_type = _image_from_tool_result(sc_out) + if not png_b64: + # Driver had no usable `screenshot` (e.g. "Unknown tool: + # screenshot" on ≥0.5.x, or an empty image part). Fall + # through to the get_window_state path below. + sc_out = None + + if sc_out is None: + gws_out = self._session.call_tool( + "get_window_state", + { + "pid": self._active_pid, + "window_id": self._active_window_id, + "session": self._session_id, + }, + ) + png_b64, image_mime_type = _image_from_tool_result(gws_out) + # Still grab the window title — it's cheap and useful in the + # vision response — but deliberately leave `elements` empty so + # vision stays free of AX-tree noise. + text = gws_out["data"] if isinstance(gws_out["data"], str) else "" + _, tree = _split_tree_text(text) + wt = re.search(r'AXWindow\s+"([^"]+)"', tree) + if wt: + window_title = wt.group(1) else: - # get_window_state: AX tree + optional screenshot. + # get_window_state: AX tree + screenshot. gws_out = self._session.call_tool( "get_window_state", { @@ -1058,10 +1155,10 @@ class CuaDriverBackend(ComputerUseBackend): if e.element_token } - if gws_out["images"]: - png_b64 = gws_out["images"][0] - mimes = gws_out.get("image_mime_types") or [] - image_mime_type = mimes[0] if mimes and mimes[0] else None + # Image may arrive as an MCP image part or inside + # structuredContent (screenshot_png_b64) depending on the driver + # build — _image_from_tool_result handles both. + png_b64, image_mime_type = _image_from_tool_result(gws_out) # Extract window title from the AX tree first AXWindow line. wt = re.search(r'AXWindow\s+"([^"]+)"', tree) From 2dfcead68367c93c256a966d8314ca36fb2d679f Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 17:48:43 -0500 Subject: [PATCH 22/26] feat(computer-use): make the preflight cross-platform (win/linux) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The card was macOS-only. cua-driver also runs on Windows and Linux, so fold `cua-driver doctor` (cross-platform binary/health probes) into a single OS-aware `ready` signal: - macOS: ready == both TCC grants; keeps the permission rows + grant flow. - Windows/Linux: no TCC toggles, so ready == driver health, with a per-OS note (SmartScreen/UIAccess on Windows; X11/XWayland on Linux). `computer_use_status()` replaces the macOS-only `permissions_status()` and surfaces `platform`, `ready`, `can_grant`, and the doctor `checks` (non-ok ones render as warnings). CLI `permissions status`, the REST endpoint, and the desktop card all key off the one payload. Grant stays macOS-only (400 elsewhere — nothing to grant). --- .../src/app/settings/computer-use-panel.tsx | 121 +++++++++++------ apps/desktop/src/hermes.ts | 1 + apps/desktop/src/types/hermes.ts | 27 +++- hermes_cli/main.py | 43 +++--- hermes_cli/web_server.py | 36 ++--- tools/computer_use/permissions.py | 126 ++++++++++++------ 6 files changed, 229 insertions(+), 125 deletions(-) diff --git a/apps/desktop/src/app/settings/computer-use-panel.tsx b/apps/desktop/src/app/settings/computer-use-panel.tsx index 826ce80ae62..ada5c08e3ad 100644 --- a/apps/desktop/src/app/settings/computer-use-panel.tsx +++ b/apps/desktop/src/app/settings/computer-use-panel.tsx @@ -15,18 +15,32 @@ interface ComputerUsePanelProps { onConfiguredChange?: () => void } -function PermissionRow({ granted, label, hint }: { granted: boolean | null; label: string; hint: string }) { - const tone = granted === true ? 'primary' : 'muted' +// Per-OS one-liner shown when there's no TCC grant flow (Windows/Linux). macOS +// drives the permission rows instead, so it has no entry here. +const PLATFORM_NOTE: Record = { + linux: 'Drives your desktop via the X11/XWayland accessibility stack — no permission prompt.', + win32: 'First run may trigger a Windows SmartScreen prompt for the cua-driver UIAccess worker — allow it.' +} + +function tone(granted: boolean | null) { + return granted === true ? 'primary' : 'muted' +} + +function GrantIcon({ granted }: { granted: boolean | null }) { const Icon = granted === true ? Check : granted === false ? X : AlertTriangle + return +} + +function PermissionRow({ granted, label, hint }: { granted: boolean | null; label: string; hint: string }) { return (
{label}

{hint}

- - + + {granted === true ? 'Granted' : granted === false ? 'Not granted' : 'Unknown'}
@@ -34,17 +48,17 @@ function PermissionRow({ granted, label, hint }: { granted: boolean | null; labe } /** - * Computer Use preflight card. + * Cross-platform Computer Use preflight card. * - * Computer Use drives the Mac through cua-driver, whose Accessibility + - * Screen Recording grants attach to cua-driver's OWN TCC identity - * (`com.trycua.driver` / the installed CuaDriver.app) — not the Hermes - * desktop app. So this card reflects the driver's real grant state and - * triggers a grant via `cua-driver permissions grant`, which launches - * CuaDriver via LaunchServices so the macOS dialog is attributed correctly. + * cua-driver runs on macOS, Windows, and Linux, but readiness differs: macOS + * needs two TCC grants (Accessibility + Screen Recording) that attach to + * cua-driver's own `com.trycua.driver` identity — not Hermes — and are + * requested via `cua-driver permissions grant` (dialog attributed to + * CuaDriver). Windows/Linux have no TCC toggles, so readiness is driver health + * from `cua-driver doctor`. The backend folds both into one `ready` signal. * - * Binary install/upgrade still lives in the cua-driver provider's post-setup - * runner below this card (the generic ToolsetConfigPanel). + * Binary install/upgrade stays in the cua-driver provider's post-setup runner + * below this card (the generic ToolsetConfigPanel). */ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) { const [status, setStatus] = useState(null) @@ -54,8 +68,7 @@ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) const refresh = useCallback(async () => { try { - const next = await getComputerUseStatus() - setStatus(next) + setStatus(await getComputerUseStatus()) } catch (err) { notifyError(err, 'Could not read Computer Use status') } finally { @@ -67,9 +80,7 @@ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) activeRef.current = true void refresh() - return () => { - activeRef.current = false - } + return () => void (activeRef.current = false) }, [refresh]) const grant = useCallback(async () => { @@ -90,8 +101,7 @@ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) message: 'macOS will show a permission dialog attributed to CuaDriver. Approve it, then return here.' }) - // Poll the grant action until it exits (the driver waits for the user to - // flip the switch), then re-read the live permission state. + // The driver waits for the user to flip the switch — poll until it exits. for (let attempt = 0; attempt < 150 && activeRef.current; attempt += 1) { await new Promise(resolve => window.setTimeout(resolve, 1500)) @@ -138,7 +148,7 @@ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) if (!status.platform_supported) { return (

- Computer Use permissions are managed on macOS. On this platform, enable the cua-driver provider below. + Computer Use isn't supported on this platform ({status.platform}).

) } @@ -146,22 +156,26 @@ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps) if (!status.installed) { return (

- Install the cua-driver backend below to drive macOS. After installing, grant Accessibility and Screen - Recording here. + Install the cua-driver backend below to drive this machine. + {status.can_grant && ' Then grant Accessibility and Screen Recording here.'}

) } - const allGranted = status.accessibility === true && status.screen_recording === true + const failingChecks = status.checks.filter(c => c.status !== 'ok') return (
-

- Grants attach to CuaDriver's own identity (com.trycua.driver), not Hermes — so the dialog is - attributed to the process that drives your Mac. -

+ {status.can_grant ? ( +

+ Grants attach to CuaDriver's own identity (com.trycua.driver), not Hermes — so the dialog is + attributed to the process that drives your Mac. +

+ ) : ( +

{PLATFORM_NOTE[status.platform] ?? ''}

+ )} {status.version &&

{status.version}

}
- - + {status.can_grant ? ( + <> + + + + ) : ( +
+ Driver health + + + {status.ready === true ? 'Ready' : status.ready === false ? 'Not ready' : 'Unknown'} + +
+ )} + + {failingChecks.map(c => ( +

+ + {c.label}: {c.message} +

+ ))} {status.error && (

@@ -188,16 +221,18 @@ export function ComputerUsePanel({ onConfiguredChange }: ComputerUsePanelProps)

)} - {allGranted ? ( + {status.ready ? (
Computer Use is ready. Ask the agent to capture an app and click around.
) : ( - + status.can_grant && ( + + ) )}
) diff --git a/apps/desktop/src/hermes.ts b/apps/desktop/src/hermes.ts index 04340b0a549..a7b5ae14307 100644 --- a/apps/desktop/src/hermes.ts +++ b/apps/desktop/src/hermes.ts @@ -60,6 +60,7 @@ export type { AudioTranscriptionResponse, AuxiliaryModelsResponse, BackendUpdateCheckResponse, + ComputerUseCheck, ComputerUsePermissionSource, ComputerUseStatus, ConfigFieldSchema, diff --git a/apps/desktop/src/types/hermes.ts b/apps/desktop/src/types/hermes.ts index b860ea8e89d..338ed2d3544 100644 --- a/apps/desktop/src/types/hermes.ts +++ b/apps/desktop/src/types/hermes.ts @@ -581,11 +581,11 @@ export interface ToolsetConfig { /** Shape of `GET /api/tools/computer-use/status`. * - * Computer Use drives the Mac through cua-driver, whose Accessibility + - * Screen Recording grants attach to cua-driver's OWN TCC identity - * (`com.trycua.driver`), not the Hermes app. Permission booleans are - * `null` when unknown (binary missing, or no CuaDriver daemon running to - * answer for its own identity). */ + * cua-driver runs on macOS, Windows, and Linux. `ready` is the single OS-aware + * readiness signal: on macOS both TCC grants (Accessibility + Screen + * Recording, which attach to cua-driver's own `com.trycua.driver` identity, + * not Hermes); elsewhere, driver health from `cua-driver doctor`. `null` + * means unknown (binary missing / probe failed). */ export interface ComputerUsePermissionSource { attribution?: string executable?: string @@ -594,13 +594,28 @@ export interface ComputerUsePermissionSource { responsible_ppid?: number } +export interface ComputerUseCheck { + label: string + status: string + message: string +} + export interface ComputerUseStatus { - /** macOS is the only platform with the TCC permission model cua-driver gates. */ + /** `sys.platform`: "darwin" | "win32" | "linux" | ... */ + platform: string + /** cua-driver has a runtime backend for this platform. */ platform_supported: boolean /** cua-driver binary resolved on PATH. */ installed: boolean /** e.g. "cua-driver 0.5.1", or null when unknown. */ version: string | null + /** Unified readiness — both TCC grants (macOS) or driver health (else). */ + ready: boolean | null + /** Whether a permission grant flow exists (macOS-only TCC). */ + can_grant: boolean + /** Cross-platform `cua-driver doctor` probes. */ + checks: ComputerUseCheck[] + /** macOS TCC detail — `null` off macOS or when unknown. */ accessibility: boolean | null screen_recording: boolean | null screen_recording_capturable: boolean | null diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 906497055c8..9c0d53247f3 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -12598,27 +12598,32 @@ def main(): sys.exit(request_permissions_grant()) if perms_action == "status": import json as _json - from tools.computer_use.permissions import permissions_status - st = permissions_status() + from tools.computer_use.permissions import computer_use_status + st = computer_use_status() if bool(getattr(args, "json", False)): print(_json.dumps(st, indent=2, sort_keys=True)) - else: - if not st["installed"]: - print("cua-driver: not installed") - print(" Run: hermes computer-use install") - elif not st["platform_supported"]: - print("Computer Use permissions are managed on macOS only.") - else: - def _glyph(v): - return "✅" if v is True else ("❌" if v is False else "•") - print(f"cua-driver: {st.get('version') or 'installed'}") - print(f" {_glyph(st['accessibility'])} Accessibility") - print(f" {_glyph(st['screen_recording'])} Screen Recording") - if st.get("error"): - print(f" ⚠ {st['error']}") - if st["accessibility"] is not True or st["screen_recording"] is not True: - print(" Grant: hermes computer-use permissions grant") - sys.exit(0 if st.get("accessibility") and st.get("screen_recording") else 1) + sys.exit(0 if st["ready"] else 1) + if not st["platform_supported"]: + print(f"Computer Use is not supported on {st['platform']}.") + sys.exit(1) + if not st["installed"]: + print("cua-driver: not installed. Run: hermes computer-use install") + sys.exit(1) + glyph = lambda v: "✅" if v is True else ("❌" if v is False else "•") # noqa: E731 + print(f"cua-driver: {st['version'] or 'installed'} ({st['platform']})") + if st["can_grant"]: # macOS TCC permissions + print(f" {glyph(st['accessibility'])} Accessibility") + print(f" {glyph(st['screen_recording'])} Screen Recording") + if not st["ready"]: + print(" Grant: hermes computer-use permissions grant") + else: # no TCC model — readiness is driver health + print(f" {glyph(st['ready'])} driver health (no permission toggles on {st['platform']})") + for c in st["checks"]: + if c["status"] != "ok": + print(f" ⚠ {c['label']}: {c['message']}") + if st["error"]: + print(f" ⚠ {st['error']}") + sys.exit(0 if st["ready"] else 1) computer_use_perms.print_help() return # No subcommand → show help diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 5a6b764e00f..c6a6b065589 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -10673,43 +10673,45 @@ async def run_toolset_post_setup( # --------------------------------------------------------------------------- -# Computer Use (cua-driver) — install + macOS permission state +# Computer Use (cua-driver) — cross-platform readiness + macOS permission grant # -# Computer Use drives the Mac through cua-driver, whose Accessibility + -# Screen Recording grants attach to cua-driver's OWN TCC identity -# (com.trycua.driver / the installed CuaDriver.app) — not the Hermes desktop -# app or this server. The desktop's Computer Use card reflects that state and -# triggers a grant via the same `cua-driver permissions grant` flow the CLI -# uses, so no Hermes-side entitlement is involved. +# cua-driver runs on macOS, Windows, and Linux. The desktop card reflects +# per-OS readiness: on macOS the Accessibility + Screen Recording TCC grants +# (which attach to cua-driver's OWN identity, com.trycua.driver — not Hermes, +# so no app entitlement is involved); elsewhere, driver health from +# `cua-driver doctor`. The grant flow is macOS-only (no TCC toggles to request +# on Windows/Linux). # --------------------------------------------------------------------------- @app.get("/api/tools/computer-use/status") async def get_computer_use_status(profile: Optional[str] = None): - """Report cua-driver install + macOS permission state for the desktop card. + """Cross-platform Computer Use readiness for the desktop card. - See ``tools.computer_use.permissions.permissions_status`` for the payload - shape. Read-only and fast (shells ``cua-driver permissions status``). + See ``tools.computer_use.permissions.computer_use_status`` for the payload + shape. Read-only and fast (shells ``cua-driver doctor`` + macOS + ``permissions status``). """ - from tools.computer_use.permissions import permissions_status + from tools.computer_use.permissions import computer_use_status with _profile_scope(profile): - return permissions_status() + return computer_use_status() @app.post("/api/tools/computer-use/permissions/grant") async def grant_computer_use_permissions(profile: Optional[str] = None): """Spawn ``hermes computer-use permissions grant`` as a background action. - ``cua-driver permissions grant`` launches CuaDriver via LaunchServices so - the macOS TCC dialog is attributed to com.trycua.driver, then waits for - the user to approve. The frontend polls ``GET /api/actions/computer-use- - grant/status`` for progress and re-reads ``/status`` once it exits. + macOS-only: ``cua-driver permissions grant`` launches CuaDriver via + LaunchServices so the TCC dialog is attributed to com.trycua.driver, then + waits for approval. The frontend polls ``GET /api/actions/computer-use- + grant/status`` and re-reads ``/status`` once it exits. Windows/Linux have + no TCC toggles to grant, so this returns 400 there. """ if sys.platform != "darwin": raise HTTPException( status_code=400, - detail="Computer Use permissions are managed on macOS only.", + detail="Computer Use permission grants are a macOS concept.", ) try: proc = _spawn_hermes_action( diff --git a/tools/computer_use/permissions.py b/tools/computer_use/permissions.py index 45a6ac2534d..e72208b796e 100644 --- a/tools/computer_use/permissions.py +++ b/tools/computer_use/permissions.py @@ -1,21 +1,24 @@ """ -macOS Accessibility + Screen Recording permission helpers for Computer Use. +Cross-platform Computer Use readiness + macOS permission helpers. -cua-driver 0.5+ owns the permission model. Crucially, the grants attach to -cua-driver's OWN TCC identity (``com.trycua.driver`` — the installed -``CuaDriver.app``), NOT the terminal, the Hermes CLI, or the Hermes desktop -app. So: +cua-driver runs on macOS, Windows, and Linux, but "ready to drive" means +something different on each: - * ``cua-driver permissions status --json`` reports the driver daemon's real - grant state, independent of who asks. - * ``cua-driver permissions grant`` launches CuaDriver via LaunchServices so - the macOS dialog is attributed to ``com.trycua.driver`` — the process that - actually does the work. + * macOS — explicit TCC grants (Accessibility + Screen Recording). cua-driver + reports/requests them via ``permissions status`` / ``permissions grant``. + The grants attach to cua-driver's OWN identity (``com.trycua.driver`` / + the installed ``CuaDriver.app``), NOT Hermes — so no Hermes entitlement is + involved, and ``grant`` launches CuaDriver via LaunchServices so the macOS + dialog is attributed correctly. + * Windows — no TCC toggles; the UIAccess worker (``cua-driver-uia.exe``) may + trip a SmartScreen prompt on first run. Readiness == driver health. + * Linux — assistive control via the X11/XWayland stack. Readiness == driver + health. -Because the permission lives with the cua-driver binary, the Hermes desktop -app needs no Accessibility / Screen Recording entitlements of its own. This is -a thin, testable client driven by the ``hermes computer-use permissions`` CLI -and the desktop ``/api/tools/computer-use/status`` endpoint. +The universal signal on every platform is ``cua-driver doctor --json`` (binary +integrity + platform support). ``computer_use_status`` folds that together with +the macOS permission detail into one payload for the desktop card, the +``hermes computer-use permissions`` CLI, and ``/api/tools/computer-use/status``. """ from __future__ import annotations @@ -25,8 +28,10 @@ import os import shutil import subprocess import sys -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional +# Platforms with a cua-driver runtime backend (mirrors the toolset platform_gate). +_RUNTIME_PLATFORMS = frozenset({"darwin", "win32", "linux"}) _BOOLS = ("accessibility", "screen_recording", "screen_recording_capturable") @@ -61,18 +66,65 @@ def _run(binary: str, *args: str, timeout: float) -> subprocess.CompletedProcess ) -def permissions_status(driver_cmd: Optional[str] = None) -> Dict[str, Any]: - """Computer Use install + macOS permission state for the desktop card. +def _json_out(binary: str, *args: str, timeout: float) -> Any: + """Run ``binary args`` and parse stdout as JSON, or ``None`` on any failure.""" + raw = (_run(binary, *args, timeout=timeout).stdout or "").strip() + return json.loads(raw) if raw else None - ``None`` permission values mean "unknown" — the driver binary is missing, - the platform has no TCC model, or no CuaDriver daemon is running to answer - for its own identity yet. + +def _doctor(binary: str) -> Optional[Dict[str, Any]]: + """``cua-driver doctor --json`` → ``{ok, checks:[{label,status,message}]}``.""" + try: + data = _json_out(binary, "doctor", "--json", timeout=12) + except Exception: + return None + if not isinstance(data, dict): + return None + checks: List[Dict[str, str]] = [ + { + "label": str(p.get("label", "")), + "status": str(p.get("status", "")), + "message": str(p.get("message", "")), + } + for p in data.get("probes", []) + if isinstance(p, dict) + ] + return {"ok": bool(data.get("ok")), "checks": checks} + + +def _mac_permissions(binary: str, out: Dict[str, Any]) -> None: + """Fold ``cua-driver permissions status --json`` booleans into ``out``.""" + try: + data = _json_out(binary, "permissions", "status", "--json", timeout=10) + except subprocess.TimeoutExpired: + out["error"] = "cua-driver permissions status timed out" + return + except Exception as exc: # spawn failure or malformed JSON + out["error"] = f"cua-driver permissions status failed: {exc}" + return + if isinstance(data, dict): + out.update({k: data[k] for k in _BOOLS if isinstance(data.get(k), bool)}) + if isinstance(data.get("source"), dict): + out["source"] = data["source"] + + +def computer_use_status(driver_cmd: Optional[str] = None) -> Dict[str, Any]: + """Unified, OS-aware Computer Use readiness for the desktop card. + + ``ready`` is the single signal the UI keys off: on macOS it's both TCC + grants; elsewhere it's driver health (no TCC model). ``None`` means + unknown (binary missing / probe failed). ``can_grant`` is macOS-only. """ + plat = sys.platform binary = shutil.which(_driver_cmd(driver_cmd)) out: Dict[str, Any] = { - "platform_supported": sys.platform == "darwin", + "platform": plat, + "platform_supported": plat in _RUNTIME_PLATFORMS, "installed": bool(binary), "version": None, + "ready": None, + "can_grant": plat == "darwin", + "checks": [], "source": None, "error": None, **{k: None for k in _BOOLS}, @@ -85,24 +137,17 @@ def permissions_status(driver_cmd: Optional[str] = None) -> Dict[str, Any]: except Exception: pass - # Permissions are a macOS concept; cua-driver only exposes the subcommand there. - if sys.platform != "darwin": - return out + doctor = _doctor(binary) + if doctor is not None: + out["checks"] = doctor["checks"] - try: - raw = (_run(binary, "permissions", "status", "--json", timeout=10).stdout or "").strip() - data = json.loads(raw) if raw else {} - except subprocess.TimeoutExpired: - out["error"] = "cua-driver permissions status timed out" - return out - except Exception as exc: # spawn failure or malformed JSON - out["error"] = f"cua-driver permissions status failed: {exc}" - return out - - if isinstance(data, dict): - out.update({k: data[k] for k in _BOOLS if isinstance(data.get(k), bool)}) - if isinstance(data.get("source"), dict): - out["source"] = data["source"] + if plat == "darwin": + _mac_permissions(binary, out) + if out["error"] is None: + out["ready"] = out["accessibility"] is True and out["screen_recording"] is True + elif doctor is not None: + # No TCC model off macOS — readiness is driver health. + out["ready"] = doctor["ok"] return out @@ -111,10 +156,11 @@ def request_permissions_grant(driver_cmd: Optional[str] = None) -> int: Launches CuaDriver via LaunchServices so the TCC dialog is attributed to ``com.trycua.driver``, then waits for the grant. Returns the driver's exit - code (0 ok), 2 if the binary is missing, 64 on an unsupported platform. + code (0 ok), 2 if the binary is missing, 64 on a non-macOS platform (which + has no TCC permission model to grant). """ if sys.platform != "darwin": - print("Computer Use permissions are managed on macOS only.") + print("Computer Use permissions are a macOS concept; nothing to grant here.") return 64 binary = shutil.which(_driver_cmd(driver_cmd)) From 3c1058e2e983c45856c4417e1c47d69843e778ed Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 17:59:18 -0500 Subject: [PATCH 23/26] fix(computer-use): set stdin=DEVNULL on cua-driver subprocess calls The subprocess-stdin guard (TUI gateway fd-inheritance protection) flagged the `permissions grant` call. None of the cua-driver probes/grant read stdin, so DEVNULL is correct; apply it to the shared `_run` helper and the grant call. --- tools/computer_use/permissions.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/computer_use/permissions.py b/tools/computer_use/permissions.py index e72208b796e..ab97b60ee66 100644 --- a/tools/computer_use/permissions.py +++ b/tools/computer_use/permissions.py @@ -63,6 +63,7 @@ def _run(binary: str, *args: str, timeout: float) -> subprocess.CompletedProcess text=True, timeout=timeout, env=_child_env(), + stdin=subprocess.DEVNULL, ) @@ -174,7 +175,13 @@ def request_permissions_grant(driver_cmd: Optional[str] = None) -> int: "approve it, then return here." ) try: - return int(subprocess.run([binary, "permissions", "grant"], env=_child_env()).returncode) + return int( + subprocess.run( + [binary, "permissions", "grant"], + env=_child_env(), + stdin=subprocess.DEVNULL, + ).returncode + ) except KeyboardInterrupt: # pragma: no cover - interactive return 130 except Exception as exc: # pragma: no cover - defensive From a6b670d4a251f98ca3bac91a867bb469f7ce4e93 Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 18:19:36 -0500 Subject: [PATCH 24/26] fix(desktop): avoid stack overflow on embedded image replay Replace the giant embedded-image regex with a bounded scanner so opening sessions with multi-megabyte data URLs does not crash the renderer. --- apps/desktop/src/lib/embedded-images.test.ts | 9 ++ apps/desktop/src/lib/embedded-images.ts | 125 +++++++++++++++++-- 2 files changed, 121 insertions(+), 13 deletions(-) diff --git a/apps/desktop/src/lib/embedded-images.test.ts b/apps/desktop/src/lib/embedded-images.test.ts index 5e6df1c5061..c51742783b0 100644 --- a/apps/desktop/src/lib/embedded-images.test.ts +++ b/apps/desktop/src/lib/embedded-images.test.ts @@ -32,4 +32,13 @@ describe('extractEmbeddedImages', () => { expect(result.cleanedText).toBe('first mid tail') expect(result.images).toEqual([SAMPLE_PNG_DATA_URL, second]) }) + + it('handles multi-megabyte data URLs without overflowing the JS stack', () => { + const hugeDataUrl = 'data:image/png;base64,' + 'A'.repeat(8_000_000) + const result = extractEmbeddedImages(`describe this ${hugeDataUrl} thanks`) + + expect(result.cleanedText).toBe('describe this thanks') + expect(result.images).toHaveLength(1) + expect(result.images[0]).toHaveLength(hugeDataUrl.length) + }) }) diff --git a/apps/desktop/src/lib/embedded-images.ts b/apps/desktop/src/lib/embedded-images.ts index 3d990151353..cd68ce68292 100644 --- a/apps/desktop/src/lib/embedded-images.ts +++ b/apps/desktop/src/lib/embedded-images.ts @@ -1,7 +1,11 @@ -const EMBEDDED_IMAGE_RE = - /(\{\s*"type"\s*:\s*"image_url"\s*,\s*"image_url"\s*:\s*\{\s*"url"\s*:\s*")?(data:image\/[\w.+-]+;base64,[A-Za-z0-9+/=]{64,})("\s*\}\s*\})?/g - const DATA_URL_RE = /^data:([\w./+-]+);base64,(.*)$/i +const DATA_IMAGE_PREFIX = 'data:image/' +const BASE64_MARKER = ';base64,' +const MIN_EMBEDDED_IMAGE_BASE64_LENGTH = 64 +const JSON_IMAGE_OPEN_RE = /\{\s*"type"\s*:\s*"image_url"\s*,\s*"image_url"\s*:\s*\{\s*"url"\s*:\s*"$/ +const JSON_IMAGE_CLOSE_RE = /^"\s*\}\s*\}/ +const JSON_IMAGE_OPEN_MAX = 96 +const JSON_IMAGE_CLOSE_MAX = 16 export const DATA_IMAGE_URL_RE = /^data:image\/[\w.+-]+;base64,/i @@ -31,24 +35,119 @@ export function dataUrlToBlob(dataUrl: string): Blob | null { } } +function isImageMimeCode(code: number): boolean { + return ( + (code >= 48 && code <= 57) || + (code >= 65 && code <= 90) || + (code >= 97 && code <= 122) || + code === 43 || + code === 45 || + code === 46 || + code === 95 + ) +} + +function isBase64Code(code: number): boolean { + return ( + (code >= 48 && code <= 57) || + (code >= 65 && code <= 90) || + (code >= 97 && code <= 122) || + code === 43 || + code === 47 || + code === 61 + ) +} + +function readDataImageUrl(text: string, start: number): { end: number; url: string } | null { + if (!text.startsWith(DATA_IMAGE_PREFIX, start)) { + return null + } + + let cursor = start + DATA_IMAGE_PREFIX.length + + while (cursor < text.length && isImageMimeCode(text.charCodeAt(cursor))) { + cursor += 1 + } + + if (cursor === start + DATA_IMAGE_PREFIX.length || !text.startsWith(BASE64_MARKER, cursor)) { + return null + } + + cursor += BASE64_MARKER.length + const base64Start = cursor + + while (cursor < text.length && isBase64Code(text.charCodeAt(cursor))) { + cursor += 1 + } + + if (cursor - base64Start < MIN_EMBEDDED_IMAGE_BASE64_LENGTH) { + return null + } + + return { end: cursor, url: text.slice(start, cursor) } +} + +function embeddedImageRemovalRange(text: string, dataStart: number, dataEnd: number): { end: number; start: number } { + let start = dataStart + let end = dataEnd + const openSearchStart = Math.max(0, dataStart - JSON_IMAGE_OPEN_MAX) + const openMatch = text.slice(openSearchStart, dataStart).match(JSON_IMAGE_OPEN_RE) + + if (openMatch?.index !== undefined) { + const close = text.slice(dataEnd, dataEnd + JSON_IMAGE_CLOSE_MAX).match(JSON_IMAGE_CLOSE_RE) + + if (close) { + start = openSearchStart + openMatch.index + end = dataEnd + close[0].length + } + } + + return { end, start } +} + +function normalizeCleanedText(text: string): string { + return text.replace(/[ \t]+\n/g, '\n').replace(/\n{3,}/g, '\n\n').trim() +} + export function extractEmbeddedImages(text: string): EmbeddedImageExtraction { - if (!text || !text.includes('data:image/')) { + if (!text || !text.includes(DATA_IMAGE_PREFIX)) { return { cleanedText: text, images: [] } } const images: string[] = [] + const pieces: string[] = [] + let appendCursor = 0 + let searchCursor = 0 - const cleanedText = text - .replace(EMBEDDED_IMAGE_RE, (_match, _open, dataUrl: string) => { - images.push(dataUrl) + while (searchCursor < text.length) { + const dataStart = text.indexOf(DATA_IMAGE_PREFIX, searchCursor) - return '' - }) - .replace(/[ \t]+\n/g, '\n') - .replace(/\n{3,}/g, '\n\n') - .trim() + if (dataStart === -1) { + break + } - return { cleanedText, images } + const dataUrl = readDataImageUrl(text, dataStart) + + if (!dataUrl) { + searchCursor = dataStart + DATA_IMAGE_PREFIX.length + + continue + } + + const range = embeddedImageRemovalRange(text, dataStart, dataUrl.end) + pieces.push(text.slice(appendCursor, range.start)) + images.push(dataUrl.url) + appendCursor = range.end + searchCursor = range.end + } + + if (!images.length) { + return { cleanedText: text, images: [] } + } + + pieces.push(text.slice(appendCursor)) + + return { cleanedText: normalizeCleanedText(pieces.join('')), images } } export function embeddedImageUrls(text: string): string[] { From 88e136448d0820186d1f56b5093c40e71b3d71f5 Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 18:23:21 -0500 Subject: [PATCH 25/26] fix(agent): shrink anthropic-native image history Retry image-size rejections by rewriting Anthropic base64 image source blocks, not just OpenAI-style image_url parts. --- agent/conversation_compression.py | 41 +++++++++++++++-- tests/run_agent/test_image_shrink_recovery.py | 46 +++++++++++++++++++ 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index 94fff283893..ba67f036954 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -805,10 +805,11 @@ def try_shrink_image_parts_in_messages( Pillow couldn't help (caller should surface the original error). Strategy: look for ``image_url`` / ``input_image`` parts carrying a - ``data:image/...;base64,...`` payload. For each one whose encoded - size exceeds 4 MB (a safe target that slides under Anthropic's 5 MB - ceiling with header overhead) or whose longest side exceeds - ``max_dimension``, write the base64 to a tempfile, call + ``data:image/...;base64,...`` payload, plus Anthropic-native + ``{"type": "image", "source": {"type": "base64", ...}}`` blocks. + For each one whose encoded size exceeds 4 MB (a safe target that slides + under Anthropic's 5 MB ceiling with header overhead) or whose longest side + exceeds ``max_dimension``, write the base64 to a tempfile, call ``vision_tools._resize_image_for_vision`` to produce a smaller data URL, and substitute it in place. @@ -964,6 +965,28 @@ def try_shrink_image_parts_in_messages( logger.warning("image-shrink recovery: re-encode failed — %s", exc) return None, triggered_by is not None + def _source_to_data_url(source: Any) -> Optional[str]: + if not isinstance(source, dict) or source.get("type") != "base64": + return None + data = source.get("data") + if not isinstance(data, str) or not data: + return None + media_type = str(source.get("media_type") or "image/jpeg").strip() + if not media_type.startswith("image/"): + media_type = "image/jpeg" + return f"data:{media_type};base64,{data}" + + def _write_data_url_to_source(source: dict, data_url: str) -> None: + header, _, data = data_url.partition(",") + media_type = "image/jpeg" + if header.startswith("data:"): + candidate = header[len("data:"):].split(";", 1)[0].strip() + if candidate.startswith("image/"): + media_type = candidate + source["type"] = "base64" + source["media_type"] = media_type + source["data"] = data + for msg in api_messages: if not isinstance(msg, dict): continue @@ -974,6 +997,16 @@ def try_shrink_image_parts_in_messages( if not isinstance(part, dict): continue ptype = part.get("type") + if ptype == "image": + source = part.get("source") + url = _source_to_data_url(source) + resized, unshrinkable = _shrink_data_url(url or "") + if resized and isinstance(source, dict): + _write_data_url_to_source(source, resized) + changed_count += 1 + elif unshrinkable: + unshrinkable_oversized += 1 + continue if ptype not in {"image_url", "input_image"}: continue image_value = part.get("image_url") diff --git a/tests/run_agent/test_image_shrink_recovery.py b/tests/run_agent/test_image_shrink_recovery.py index 24f8b7e242d..bdbb905d66e 100644 --- a/tests/run_agent/test_image_shrink_recovery.py +++ b/tests/run_agent/test_image_shrink_recovery.py @@ -260,6 +260,52 @@ class TestShrinkImagePartsHelper: assert seen["max_dimension"] == 2000 assert msgs[0]["content"][0]["image_url"]["url"] == shrunk + def test_anthropic_base64_image_source_rewritten(self, monkeypatch): + """Anthropic-native image blocks are shrinkable after adapter conversion.""" + agent = _make_agent() + _install_fake_pillow(monkeypatch, (2501, 100), shrunk_size=(1500, 60)) + original = _big_png_data_url(100) + _, _, original_data = original.partition(",") + shrunk = "data:image/jpeg;base64," + "N" * 1000 + seen = {} + + def _fake_resize(path, mime_type=None, max_base64_bytes=None, max_dimension=None): + seen["mime_type"] = mime_type + seen["max_dimension"] = max_dimension + return shrunk + + monkeypatch.setattr( + "tools.vision_tools._resize_image_for_vision", + _fake_resize, + raising=False, + ) + + msgs = [{ + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": original_data, + }, + }, + ], + }] + changed = agent._try_shrink_image_parts_in_messages( + msgs, + max_dimension=2000, + ) + source = msgs[0]["content"][0]["source"] + + assert changed is True + assert seen["mime_type"] == "image/png" + assert seen["max_dimension"] == 2000 + assert source["type"] == "base64" + assert source["media_type"] == "image/jpeg" + assert source["data"] == "N" * 1000 + def test_oversized_input_image_string_shape_rewritten(self, monkeypatch): """OpenAI Responses shape: {type: input_image, image_url: "data:..."}.""" agent = _make_agent() From 3fffecbdafec0bcb08a7335da4e15181bc6ff5d6 Mon Sep 17 00:00:00 2001 From: Brooklyn Nicholson Date: Mon, 22 Jun 2026 18:33:46 -0500 Subject: [PATCH 26/26] feat(desktop): add timeline rail for long chat threads Adds a compact right-edge prompt timeline for long desktop chat sessions, with hover previews, click-to-jump, active/hover row states, and pane hover-reveal suppression so the rail can live at the hard edge without opening side panels. --- .../assistant-ui/thread-timeline-data.test.ts | 51 ++++ .../assistant-ui/thread-timeline-data.ts | 75 +++++ .../assistant-ui/thread-timeline.tsx | 272 ++++++++++++++++++ .../src/components/assistant-ui/thread.tsx | 14 +- .../src/components/pane-shell/pane-shell.tsx | 11 +- apps/desktop/src/store/panes.ts | 2 + 6 files changed, 421 insertions(+), 4 deletions(-) create mode 100644 apps/desktop/src/components/assistant-ui/thread-timeline-data.test.ts create mode 100644 apps/desktop/src/components/assistant-ui/thread-timeline-data.ts create mode 100644 apps/desktop/src/components/assistant-ui/thread-timeline.tsx diff --git a/apps/desktop/src/components/assistant-ui/thread-timeline-data.test.ts b/apps/desktop/src/components/assistant-ui/thread-timeline-data.test.ts new file mode 100644 index 00000000000..a3cc48da56a --- /dev/null +++ b/apps/desktop/src/components/assistant-ui/thread-timeline-data.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from 'vitest' + +import { activeTimelineIndex, deriveTimelineEntries, timelinePreview } from './thread-timeline-data' + +describe('timelinePreview', () => { + it('collapses whitespace to a single line', () => { + expect(timelinePreview('hello\n\n world\tagain')).toBe('hello world again') + }) + + it('truncates with an ellipsis past the limit', () => { + const out = timelinePreview('abcdefghij', 5) + expect(out).toBe('abcd…') + expect(out.length).toBe(5) + }) +}) + +describe('deriveTimelineEntries', () => { + it('keeps non-empty user prompts in order', () => { + expect( + deriveTimelineEntries([ + { id: 'u1', role: 'user', text: 'first' }, + { id: 'a1', role: 'assistant', text: 'answer' }, + { id: 'u2', role: 'user', text: ' second ' } + ]) + ).toEqual([ + { id: 'u1', preview: 'first' }, + { id: 'u2', preview: 'second' } + ]) + }) + + it('drops blanks and background-process notifications', () => { + expect( + deriveTimelineEntries([ + { id: 'u1', role: 'user', text: ' ' }, + { id: 'u2', role: 'user', text: '[IMPORTANT: Background process 123 finished]' }, + { id: 'u3', role: 'user', text: 'real prompt' } + ]).map(e => e.id) + ).toEqual(['u3']) + }) +}) + +describe('activeTimelineIndex', () => { + it('returns the last prompt scrolled to or above the top edge', () => { + expect(activeTimelineIndex([-400, -10, 320])).toBe(1) + }) + + it('falls back to the first rendered entry', () => { + expect(activeTimelineIndex([null, 120, 480])).toBe(1) + expect(activeTimelineIndex([null, null])).toBe(0) + }) +}) diff --git a/apps/desktop/src/components/assistant-ui/thread-timeline-data.ts b/apps/desktop/src/components/assistant-ui/thread-timeline-data.ts new file mode 100644 index 00000000000..e52d1d7c780 --- /dev/null +++ b/apps/desktop/src/components/assistant-ui/thread-timeline-data.ts @@ -0,0 +1,75 @@ +// Pure timeline helpers — no React/DOM; tested in thread-timeline-data.test.ts. + +export interface TimelineSourceMessage { + id: string + role: string + text: string +} + +export interface TimelineEntry { + id: string + preview: string +} + +// Injected as user messages for alternation; not human prompts (thread.tsx). +const PROCESS_NOTIFICATION_RE = /^\[IMPORTANT: Background process [\s\S]*\]$/ + +const PREVIEW_MAX = 120 + +export function timelinePreview(text: string, max: number = PREVIEW_MAX): string { + const collapsed = text.replace(/\s+/g, ' ').trim() + + if (collapsed.length <= max) { + return collapsed + } + + return `${collapsed.slice(0, max - 1).trimEnd()}…` +} + +export function deriveTimelineEntries(messages: readonly TimelineSourceMessage[]): TimelineEntry[] { + const entries: TimelineEntry[] = [] + + for (const message of messages) { + if (message.role !== 'user') { + continue + } + + const text = message.text.trim() + + if (!text || PROCESS_NOTIFICATION_RE.test(text)) { + continue + } + + entries.push({ id: message.id, preview: timelinePreview(text) }) + } + + return entries +} + +/** Last user prompt at/above the viewport top (with slack); else first rendered. */ +export function activeTimelineIndex(offsets: readonly (number | null)[], slack: number = 8): number { + let active = -1 + let firstRendered = -1 + + for (let i = 0; i < offsets.length; i++) { + const offset = offsets[i] + + if (offset == null) { + continue + } + + if (firstRendered === -1) { + firstRendered = i + } + + if (offset <= slack) { + active = i + } + } + + if (active !== -1) { + return active + } + + return firstRendered === -1 ? 0 : firstRendered +} diff --git a/apps/desktop/src/components/assistant-ui/thread-timeline.tsx b/apps/desktop/src/components/assistant-ui/thread-timeline.tsx new file mode 100644 index 00000000000..e330cb6d755 --- /dev/null +++ b/apps/desktop/src/components/assistant-ui/thread-timeline.tsx @@ -0,0 +1,272 @@ +import { useAuiState } from '@assistant-ui/react' +import { type FC, useCallback, useEffect, useMemo, useRef, useState } from 'react' + +import { composerPanelCard } from '@/components/chat/composer-dock' +import { triggerHaptic } from '@/lib/haptics' +import { cn } from '@/lib/utils' +import { setPaneHoverRevealSuppressed } from '@/store/panes' + +import { + activeTimelineIndex, + deriveTimelineEntries, + type TimelineEntry, + type TimelineSourceMessage +} from './thread-timeline-data' + +const MIN_ENTRIES = 4 +const VIEWPORT = '[data-slot="aui_thread-viewport"]' +const HOVER_CLOSE_MS = 140 + +const ROW_CLASS = + 'relative flex w-full min-w-0 max-w-full cursor-pointer select-none overflow-hidden rounded-md px-2 py-1 text-left outline-hidden transition-colors duration-100 ease-out hover:bg-(--ui-row-hover-background) hover:transition-none' + +const POPOVER_SHELL = cn( + 'absolute right-full top-1/2 z-50 mr-1.5 max-h-[min(22rem,calc(100vh-8rem))] w-80 max-w-[min(20rem,calc(100vw-2rem))] -translate-y-1/2 overflow-x-hidden overflow-y-auto overscroll-contain p-1 text-popover-foreground transition-[opacity,transform] duration-100 ease-out group-hover/timeline:transition-none', + composerPanelCard, + // Solid fill — composerPanelCard is deliberately translucent; without this, + // directive chips in the transcript bleed through and look like popover overflow. + 'bg-(--composer-fill)' +) + +function userPromptText(content: unknown): string { + if (typeof content === 'string') { + return content + } + + if (!Array.isArray(content)) { + return '' + } + + let out = '' + + for (const part of content) { + if (typeof part === 'string') { + out += part + + continue + } + + if (!part || typeof part !== 'object') { + continue + } + + const row = part as { text?: unknown; type?: unknown } + + if ((!row.type || row.type === 'text') && typeof row.text === 'string') { + out += row.text + } + } + + return out +} + +function scrollToPrompt(id: string) { + const viewport = document.querySelector(VIEWPORT) + const node = viewport?.querySelector(`[data-message-id="${CSS.escape(id)}"]`) + + if (!viewport || !node) { + return + } + + const top = viewport.scrollTop + (node.getBoundingClientRect().top - viewport.getBoundingClientRect().top) - 8 + + triggerHaptic('selection') + viewport.scrollTo({ behavior: 'smooth', top: Math.max(0, top) }) +} + +/** Right-edge prompt rail — hover previews, click to jump. ≥4 user turns only. */ +export const ThreadTimeline: FC = () => { + const sourceSignature = useAuiState(s => { + const rows: TimelineSourceMessage[] = [] + + for (const message of s.thread.messages) { + if (message.role !== 'user') { + continue + } + + rows.push({ id: message.id, role: 'user', text: userPromptText(message.content) }) + } + + return JSON.stringify(rows) + }) + + const entries = useMemo( + () => deriveTimelineEntries(JSON.parse(sourceSignature) as TimelineSourceMessage[]), + [sourceSignature] + ) + + const [activeIndex, setActiveIndex] = useState(0) + const [hoverIndex, setHoverIndex] = useState(null) + const [open, setOpen] = useState(false) + const closeTimerRef = useRef(undefined) + + const keepOpen = useCallback(() => { + window.clearTimeout(closeTimerRef.current) + setPaneHoverRevealSuppressed(true) + setOpen(true) + }, []) + + const closeSoon = useCallback(() => { + window.clearTimeout(closeTimerRef.current) + setHoverIndex(null) + setPaneHoverRevealSuppressed(false) + closeTimerRef.current = window.setTimeout(() => setOpen(false), HOVER_CLOSE_MS) + }, []) + + useEffect( + () => () => { + window.clearTimeout(closeTimerRef.current) + setPaneHoverRevealSuppressed(false) + }, + [] + ) + + useEffect(() => { + if (entries.length < MIN_ENTRIES) { + setPaneHoverRevealSuppressed(false) + } + }, [entries.length]) + + useEffect(() => { + const viewport = document.querySelector(VIEWPORT) + + if (!viewport || entries.length === 0) { + return + } + + let raf = 0 + + const compute = () => { + raf = 0 + + const top = viewport.getBoundingClientRect().top + + const offsets = entries.map(entry => { + const node = viewport.querySelector(`[data-message-id="${CSS.escape(entry.id)}"]`) + + return node ? node.getBoundingClientRect().top - top : null + }) + + const next = activeTimelineIndex(offsets) + + setActiveIndex(prev => (prev === next ? prev : next)) + } + + const onScroll = () => { + if (!raf) { + raf = requestAnimationFrame(compute) + } + } + + compute() + viewport.addEventListener('scroll', onScroll, { passive: true }) + + return () => { + viewport.removeEventListener('scroll', onScroll) + + if (raf) { + cancelAnimationFrame(raf) + } + } + }, [entries]) + + if (entries.length < MIN_ENTRIES) { + return null + } + + return ( +
+ + +
+ ) +} + +const TimelinePopover: FC<{ + activeIndex: number + entries: TimelineEntry[] + hoverIndex: number | null + onHover: (index: number) => void + onJump: (id: string) => void + open: boolean +}> = ({ activeIndex, entries, hoverIndex, onHover, onJump, open }) => ( +
+ {entries.map((entry, index) => { + const hovered = index === hoverIndex + const active = index === activeIndex + + return ( + + ) + })} +
+) + +const TimelineTicks: FC<{ + activeIndex: number + entries: TimelineEntry[] + onHover: (index: number) => void + onJump: (id: string) => void +}> = ({ activeIndex, entries, onHover, onJump }) => ( +
+ {entries.map((entry, index) => ( + + ))} +
+) diff --git a/apps/desktop/src/components/assistant-ui/thread.tsx b/apps/desktop/src/components/assistant-ui/thread.tsx index 1ac97c200ca..6057307dec3 100644 --- a/apps/desktop/src/components/assistant-ui/thread.tsx +++ b/apps/desktop/src/components/assistant-ui/thread.tsx @@ -64,6 +64,7 @@ import { ClarifyTool } from '@/components/assistant-ui/clarify-tool' import { DirectiveContent, hermesDirectiveFormatter } from '@/components/assistant-ui/directive-text' import { MarkdownText, MarkdownTextContent } from '@/components/assistant-ui/markdown-text' import { ThreadMessageList } from '@/components/assistant-ui/thread-list' +import { ThreadTimeline } from '@/components/assistant-ui/thread-timeline' import { ToolFallback, ToolGroupSlot } from '@/components/assistant-ui/tool-fallback' import { TooltipIconButton } from '@/components/assistant-ui/tooltip-icon-button' import { UserMessageText } from '@/components/assistant-ui/user-message-text' @@ -212,6 +213,7 @@ export const Thread: FC<{ sessionKey={sessionKey} /> {loading === 'session' && } + ) } @@ -797,7 +799,15 @@ function messageAttachmentRefs(value: unknown): string[] { return value.every(ref => typeof ref === 'string') ? value : EMPTY_ATTACHMENT_REFS } -function StickyHumanMessageContainer({ attachments, children }: { attachments?: ReactNode; children: ReactNode }) { +function StickyHumanMessageContainer({ + attachments, + children, + messageId +}: { + attachments?: ReactNode + children: ReactNode + messageId?: string +}) { return ( // Fragment, not a wrapper: a wrapping element becomes the sticky's // containing block (it'd stick within its own height = never). The bubble @@ -806,6 +816,7 @@ function StickyHumanMessageContainer({ attachments, children }: { attachments?: <>
@@ -990,6 +1001,7 @@ const UserMessage: FC<{ return ( (null) // Keyboard (mod+b / mod+j) pins the reveal open while collapsed; hover is CSS. @@ -378,7 +379,10 @@ export function Pane({ >