diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index bba93214f..db7603498 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -900,10 +900,16 @@ class BasePlatformAdapter(ABC): self._fatal_error_retryable = True self._fatal_error_handler: Optional[Callable[["BasePlatformAdapter"], Awaitable[None] | None]] = None - # Track active message handlers per session for interrupt support - # Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt) + # Track active message handlers per session for interrupt support. + # _active_sessions stores the per-session interrupt Event; _session_tasks + # maps session → the specific Task currently processing it so that + # session-terminating commands (/stop, /new, /reset) can cancel the + # right task and release the adapter-level guard deterministically. + # Without the owner-task map, an old task's finally block could delete + # a newer task's guard, leaving stale busy state. self._active_sessions: Dict[str, asyncio.Event] = {} self._pending_messages: Dict[str, MessageEvent] = {} + self._session_tasks: Dict[str, asyncio.Task] = {} # Background message-processing tasks spawned by handle_message(). # Gateway shutdown cancels these so an old gateway instance doesn't keep # working on a task after --replace or manual restarts. @@ -1680,6 +1686,222 @@ class BasePlatformAdapter(ABC): return f"{existing_text}\n\n{new_text}".strip() return existing_text + # ------------------------------------------------------------------ + # Session task + guard ownership helpers + # ------------------------------------------------------------------ + # These were introduced together with the _session_tasks owner map to + # make session lifecycle reconciliation deterministic across (a) the + # normal completion path, (b) /stop/ /new/ /reset bypass commands, + # and (c) stale-lock self-heal on the next inbound message. + + def _release_session_guard( + self, + session_key: str, + *, + guard: Optional[asyncio.Event] = None, + ) -> None: + """Release the adapter-level guard for a session. + + When ``guard`` is provided, only release the entry if it still points + at that exact Event. This lets reset-like commands swap in a temporary + guard while the old processing task unwinds, without having the old + task's cleanup accidentally clear the replacement guard. + """ + current_guard = self._active_sessions.get(session_key) + if current_guard is None: + return + if guard is not None and current_guard is not guard: + return + del self._active_sessions[session_key] + + def _session_task_is_stale(self, session_key: str) -> bool: + """Return True if the owner task for ``session_key`` is done/cancelled. + + A lock is "stale" when the adapter still has ``_active_sessions[key]`` + AND a known owner task in ``_session_tasks`` that has already exited. + When there is no owner task at all, that usually means the guard was + installed by some path other than handle_message() (tests sometimes + install guards directly) — don't treat that as stale. The on-entry + self-heal only needs to handle the production split-brain case where + an owner task was recorded, then exited without clearing its guard. + """ + task = self._session_tasks.get(session_key) + if task is None: + return False + done = getattr(task, "done", None) + return bool(done and done()) + + def _heal_stale_session_lock(self, session_key: str) -> bool: + """Clear a stale session lock if the owner task is already gone. + + Returns True if a stale lock was healed. Returns False if there is + no lock, or the owner task is still alive (the normal busy case). + + This is the on-entry safety net sidbin's issue #11016 analysis calls + for: without it, a split-brain — adapter still thinks the session is + active, but nothing is actually processing — traps the chat in + infinite "Interrupting current task..." until the gateway is + restarted. + """ + if session_key not in self._active_sessions: + return False + if not self._session_task_is_stale(session_key): + return False + logger.warning( + "[%s] Healing stale session lock for %s (owner task is done/absent)", + self.name, + session_key, + ) + self._active_sessions.pop(session_key, None) + self._pending_messages.pop(session_key, None) + self._session_tasks.pop(session_key, None) + return True + + def _start_session_processing( + self, + event: MessageEvent, + session_key: str, + *, + interrupt_event: Optional[asyncio.Event] = None, + ) -> bool: + """Spawn a background processing task under the given session guard. + + Returns True on success. If the runtime stubs ``create_task`` with a + non-Task sentinel (some tests do this), the guard is rolled back and + False is returned so the caller isn't left holding a half-installed + session lock. + """ + guard = interrupt_event or asyncio.Event() + self._active_sessions[session_key] = guard + + task = asyncio.create_task(self._process_message_background(event, session_key)) + self._session_tasks[session_key] = task + try: + self._background_tasks.add(task) + except TypeError: + # Tests stub create_task() with lightweight sentinels that are not + # hashable and do not support lifecycle callbacks. + self._session_tasks.pop(session_key, None) + self._release_session_guard(session_key, guard=guard) + return False + if hasattr(task, "add_done_callback"): + task.add_done_callback(self._background_tasks.discard) + task.add_done_callback(self._expected_cancelled_tasks.discard) + return True + + async def cancel_session_processing( + self, + session_key: str, + *, + release_guard: bool = True, + discard_pending: bool = True, + ) -> None: + """Cancel in-flight processing for a single session. + + ``release_guard=False`` keeps the adapter-level session guard in place + so reset-like commands can finish atomically before follow-up messages + are allowed to start a fresh background task. + """ + task = self._session_tasks.pop(session_key, None) + if task is not None and not task.done(): + logger.debug( + "[%s] Cancelling active processing for session %s", + self.name, + session_key, + ) + self._expected_cancelled_tasks.add(task) + task.cancel() + try: + await task + except asyncio.CancelledError: + pass + except Exception: + logger.debug( + "[%s] Session cancellation raised while unwinding %s", + self.name, + session_key, + exc_info=True, + ) + if discard_pending: + self._pending_messages.pop(session_key, None) + if release_guard: + self._release_session_guard(session_key) + + async def _drain_pending_after_session_command( + self, + session_key: str, + command_guard: asyncio.Event, + ) -> None: + """Resume the latest queued follow-up once a session command completes. + + Called at the tail of /stop, /new, and /reset dispatch. Releases the + command-scoped guard, then — if a follow-up message landed while the + command was running — spawns a fresh processing task for it. + """ + pending_event = self._pending_messages.pop(session_key, None) + self._release_session_guard(session_key, guard=command_guard) + if pending_event is None: + return + self._start_session_processing(pending_event, session_key) + + async def _dispatch_active_session_command( + self, + event: MessageEvent, + session_key: str, + cmd: str, + ) -> None: + """Dispatch a reset-like bypass command while preserving guard ordering. + + /stop, /new, and /reset must: + 1. Keep the session guard installed while the runner processes the + command (so a racing follow-up message stays queued, not + dispatched as a second parallel run). + 2. Cancel the old in-flight adapter task only AFTER the runner has + finished handling the command (so the runner sees consistent + state and its response is sent in order). + 3. Release the command-scoped guard and drain the latest queued + follow-up exactly once, after 1 and 2 complete. + """ + logger.debug( + "[%s] Command '/%s' bypassing active-session guard for %s", + self.name, + cmd, + session_key, + ) + + current_guard = self._active_sessions.get(session_key) + command_guard = asyncio.Event() + self._active_sessions[session_key] = command_guard + thread_meta = {"thread_id": event.source.thread_id} if event.source.thread_id else None + + try: + response = await self._message_handler(event) + # Old adapter task (if any) is cancelled AFTER the runner has + # fully handled the command — keeps ordering deterministic. + await self.cancel_session_processing( + session_key, + release_guard=False, + discard_pending=False, + ) + if response: + await self._send_with_retry( + chat_id=event.source.chat_id, + content=response, + reply_to=event.message_id, + metadata=thread_meta, + ) + except Exception: + # On failure, restore the original guard if one still exists so + # we don't leave the session in a half-reset state. + if self._active_sessions.get(session_key) is command_guard: + if session_key in self._session_tasks and current_guard is not None: + self._active_sessions[session_key] = current_guard + else: + self._release_session_guard(session_key, guard=command_guard) + raise + + await self._drain_pending_after_session_command(session_key, command_guard) + async def handle_message(self, event: MessageEvent) -> None: """ Process an incoming message. @@ -1696,7 +1918,15 @@ class BasePlatformAdapter(ABC): group_sessions_per_user=self.config.extra.get("group_sessions_per_user", True), thread_sessions_per_user=self.config.extra.get("thread_sessions_per_user", False), ) - + + # On-entry self-heal: if the adapter still has an _active_sessions + # entry for this key but the owner task has already exited (done or + # cancelled), the lock is stale. Clear it and fall through to + # normal dispatch so the user isn't trapped behind a dead guard — + # this is the split-brain tail described in issue #11016. + if session_key in self._active_sessions: + self._heal_stale_session_lock(session_key) + # Check if there's already an active handler for this session if session_key in self._active_sessions: # Certain commands must bypass the active-session guard and be @@ -1713,6 +1943,23 @@ class BasePlatformAdapter(ABC): from hermes_cli.commands import should_bypass_active_session if should_bypass_active_session(cmd): + # /stop, /new, /reset must cancel the in-flight adapter task + # and preserve ordering of queued follow-ups. Route those + # through the dedicated handoff path that serializes + # cancellation + runner response + pending drain. + if cmd in ("stop", "new", "reset"): + try: + await self._dispatch_active_session_command(event, session_key, cmd) + except Exception as e: + logger.error( + "[%s] Command '/%s' dispatch failed: %s", + self.name, cmd, e, exc_info=True, + ) + return + + # Other bypass commands (/approve, /deny, /status, + # /background, /restart) just need direct dispatch — they + # don't cancel the running task. logger.debug( "[%s] Command '/%s' bypassing active-session guard for %s", self.name, cmd, session_key, @@ -1758,19 +2005,9 @@ class BasePlatformAdapter(ABC): # starts would also pass the _active_sessions check and spawn a # duplicate task. (grammY sequentialize / aiogram EventIsolation # pattern — set the guard synchronously, not inside the task.) - self._active_sessions[session_key] = asyncio.Event() - - # Spawn background task to process this message - task = asyncio.create_task(self._process_message_background(event, session_key)) - try: - self._background_tasks.add(task) - except TypeError: - # Some tests stub create_task() with lightweight sentinels that are not - # hashable and do not support lifecycle callbacks. - return - if hasattr(task, "add_done_callback"): - task.add_done_callback(self._background_tasks.discard) - task.add_done_callback(self._expected_cancelled_tasks.discard) + # _start_session_processing installs the guard AND the owner-task + # mapping atomically so stale-lock detection works. + self._start_session_processing(event, session_key) @staticmethod def _get_human_delay() -> float: @@ -2130,6 +2367,9 @@ class BasePlatformAdapter(ABC): drain_task = asyncio.create_task( self._process_message_background(late_pending, session_key) ) + # Hand ownership of the session to the drain task so stale-lock + # detection keeps working while it runs. + self._session_tasks[session_key] = drain_task try: self._background_tasks.add(drain_task) drain_task.add_done_callback(self._background_tasks.discard) @@ -2139,9 +2379,14 @@ class BasePlatformAdapter(ABC): # Leave _active_sessions[session_key] populated — the drain # task's own lifecycle will clean it up. else: - # Clean up session tracking - if session_key in self._active_sessions: - del self._active_sessions[session_key] + # Clean up session tracking. Guard-match both deletes so a + # reset-like command that already swapped in its own + # command_guard (and cancelled us) can't be accidentally + # cleared by our unwind. The command owns the session now. + current_task = asyncio.current_task() + if current_task is not None and self._session_tasks.get(session_key) is current_task: + del self._session_tasks[session_key] + self._release_session_guard(session_key, guard=interrupt_event) async def cancel_background_tasks(self) -> None: """Cancel any in-flight background message-processing tasks. @@ -2171,6 +2416,7 @@ class BasePlatformAdapter(ABC): # will be in self._background_tasks now. Re-check. self._background_tasks.clear() self._expected_cancelled_tasks.clear() + self._session_tasks.clear() self._pending_messages.clear() self._active_sessions.clear() diff --git a/gateway/run.py b/gateway/run.py index 0110e8cad..dcee18e51 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -8665,7 +8665,12 @@ class GatewayRunner: override = self._session_model_overrides.get(session_key) return override is not None and override.get("model") == agent_model - def _release_running_agent_state(self, session_key: str) -> None: + def _release_running_agent_state( + self, + session_key: str, + *, + run_generation: Optional[int] = None, + ) -> bool: """Pop ALL per-running-agent state entries for ``session_key``. Replaces ad-hoc ``del self._running_agents[key]`` calls scattered @@ -8681,13 +8686,25 @@ class GatewayRunner: across turns (``_session_model_overrides``, ``_voice_mode``, ``_pending_approvals``, ``_update_prompt_pending``) is NOT touched here — those have their own lifecycles. + + When ``run_generation`` is provided, only clear the slot if that + generation is still current for the session. This prevents an + older async run whose generation was bumped by /stop or /new from + clobbering a newer run's state during its own unwind. Returns + True when the slot was cleared, False when an ownership guard + blocked it. """ if not session_key: - return + return False + if run_generation is not None and not self._is_session_run_current( + session_key, run_generation + ): + return False self._running_agents.pop(session_key, None) self._running_agents_ts.pop(session_key, None) if hasattr(self, "_busy_ack_ts"): self._busy_ack_ts.pop(session_key, None) + return True def _clear_session_boundary_security_state(self, session_key: str) -> None: """Clear approval state that must not survive a real conversation switch.""" @@ -10249,10 +10266,24 @@ class GatewayRunner: # Wait for agent to be created while agent_holder[0] is None: await asyncio.sleep(0.05) - if session_key: - self._running_agents[session_key] = agent_holder[0] - if self._draining: - self._update_runtime_status("draining") + if not session_key: + return + # Only promote the sentinel to the real agent if this run is still + # current. If /stop or /new bumped the generation while we were + # spinning up, leave the newer run's slot alone — we'll be + # discarded by the stale-result check in _handle_message_with_agent. + if run_generation is not None and not self._is_session_run_current( + session_key, run_generation + ): + logger.info( + "Skipping stale agent promotion for %s — generation %s is no longer current", + (session_key or "")[:20], + run_generation, + ) + return + self._running_agents[session_key] = agent_holder[0] + if self._draining: + self._update_runtime_status("draining") tracking_task = asyncio.create_task(track_agent()) @@ -10758,7 +10789,14 @@ class GatewayRunner: # Clean up tracking tracking_task.cancel() if session_key: - self._release_running_agent_state(session_key) + # Only release the slot if this run's generation still owns + # it. A /stop or /new that bumped the generation while we + # were unwinding has already installed its own state; this + # guard prevents an old run from clobbering it on the way + # out. + self._release_running_agent_state( + session_key, run_generation=run_generation + ) if self._draining: self._update_runtime_status("draining") diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 91232dc0d..6d4c49fd4 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -394,17 +394,23 @@ DEFAULT_CONFIG = { # (bash doesn't source bashrc in non-interactive login mode) or # zsh-specific files like ``~/.zshrc`` / ``~/.zprofile``. # Paths support ``~`` / ``${VAR}``. Missing files are silently - # skipped. When empty, Hermes auto-appends ``~/.bashrc`` if the + # skipped. When empty, Hermes auto-sources ``~/.profile``, + # ``~/.bash_profile``, and ``~/.bashrc`` (in that order) if the # snapshot shell is bash (this is the ``auto_source_bashrc`` # behaviour — disable with that key if you want strict login-only # semantics). "shell_init_files": [], - # When true (default), Hermes sources ``~/.bashrc`` in the login - # shell used to build the environment snapshot. This captures - # PATH additions, shell functions, and aliases defined in the - # user's bashrc — which a plain ``bash -l -c`` would otherwise - # miss because bash skips bashrc in non-interactive login mode. - # Turn this off if you have a bashrc that misbehaves when sourced + # When true (default), Hermes sources the user's shell rc files + # (``~/.profile``, ``~/.bash_profile``, ``~/.bashrc``) in the + # login shell used to build the environment snapshot. This + # captures PATH additions, shell functions, and aliases — which a + # plain ``bash -l -c`` would otherwise miss because bash skips + # bashrc in non-interactive login mode, and because a default + # Debian/Ubuntu ``~/.bashrc`` short-circuits on non-interactive + # sources. ``~/.profile`` and ``~/.bash_profile`` are tried first + # because ``n`` / ``nvm`` / ``asdf`` installers typically write + # their PATH exports there without an interactivity guard. Turn + # this off if your rc files misbehave when sourced # non-interactively (e.g. one that hard-exits on TTY checks). "auto_source_bashrc": True, "docker_image": "nikolaik/python-nodejs:python3.11-nodejs20", diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 8b360087c..7796cc575 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -761,6 +761,21 @@ def get_systemd_unit_path(system: bool = False) -> Path: return Path.home() / ".config" / "systemd" / "user" / f"{name}.service" +class UserSystemdUnavailableError(RuntimeError): + """Raised when ``systemctl --user`` cannot reach the user D-Bus session. + + Typically hit on fresh RHEL/Debian SSH sessions where linger is disabled + and no user@.service is running, so ``/run/user/$UID/bus`` never exists. + Carries a user-facing remediation message in ``args[0]``. + """ + + +def _user_dbus_socket_path() -> Path: + """Return the expected per-user D-Bus socket path (regardless of existence).""" + xdg = os.environ.get("XDG_RUNTIME_DIR") or f"/run/user/{os.getuid()}" + return Path(xdg) / "bus" + + def _ensure_user_systemd_env() -> None: """Ensure DBUS_SESSION_BUS_ADDRESS and XDG_RUNTIME_DIR are set for systemctl --user. @@ -783,6 +798,126 @@ def _ensure_user_systemd_env() -> None: os.environ["DBUS_SESSION_BUS_ADDRESS"] = f"unix:path={bus_path}" +def _wait_for_user_dbus_socket(timeout: float = 3.0) -> bool: + """Poll for the user D-Bus socket to appear, up to ``timeout`` seconds. + + Linger-enabled user@.service can take a second or two to spawn the socket + after ``loginctl enable-linger`` runs. Returns True once the socket exists. + """ + import time + + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if _user_dbus_socket_path().exists(): + _ensure_user_systemd_env() + return True + time.sleep(0.2) + return _user_dbus_socket_path().exists() + + +def _preflight_user_systemd(*, auto_enable_linger: bool = True) -> None: + """Ensure ``systemctl --user`` will reach the user D-Bus session bus. + + No-op when the bus socket is already there (the common case on desktops + and linger-enabled servers). On fresh SSH sessions where the socket is + missing: + + * If linger is already enabled, wait briefly for user@.service to spawn + the socket. + * If linger is disabled and ``auto_enable_linger`` is True, try + ``loginctl enable-linger $USER`` (works as non-root when polkit permits + it, otherwise needs sudo). + * If the socket is still missing afterwards, raise + :class:`UserSystemdUnavailableError` with a precise remediation message. + + Callers should treat the exception as a terminal condition for user-scope + systemd operations and surface the message to the user. + """ + _ensure_user_systemd_env() + bus_path = _user_dbus_socket_path() + if bus_path.exists(): + return + + import getpass + + username = getpass.getuser() + linger_enabled, linger_detail = get_systemd_linger_status() + + if linger_enabled is True: + if _wait_for_user_dbus_socket(timeout=3.0): + return + # Linger is on but socket still missing — unusual; fall through to error. + _raise_user_systemd_unavailable( + username, + reason="User D-Bus socket is missing even though linger is enabled.", + fix_hint=( + f" systemctl start user@{os.getuid()}.service\n" + " (may require sudo; try again after the command succeeds)" + ), + ) + + if auto_enable_linger and shutil.which("loginctl"): + try: + result = subprocess.run( + ["loginctl", "enable-linger", username], + capture_output=True, + text=True, + check=False, + timeout=30, + ) + except Exception as exc: + _raise_user_systemd_unavailable( + username, + reason=f"loginctl enable-linger failed ({exc}).", + fix_hint=f" sudo loginctl enable-linger {username}", + ) + else: + if result.returncode == 0: + if _wait_for_user_dbus_socket(timeout=5.0): + print(f"✓ Enabled linger for {username} — user D-Bus now available") + return + # enable-linger succeeded but the socket never appeared. + _raise_user_systemd_unavailable( + username, + reason="Linger was enabled, but the user D-Bus socket did not appear.", + fix_hint=( + " Log out and log back in, then re-run the command.\n" + f" Or reboot and run: systemctl --user start {get_service_name()}" + ), + ) + detail = (result.stderr or result.stdout or f"exit {result.returncode}").strip() + _raise_user_systemd_unavailable( + username, + reason=f"loginctl enable-linger was denied: {detail}", + fix_hint=f" sudo loginctl enable-linger {username}", + ) + + _raise_user_systemd_unavailable( + username, + reason=( + "User D-Bus session is not available " + f"({linger_detail or 'linger disabled'})." + ), + fix_hint=f" sudo loginctl enable-linger {username}", + ) + + +def _raise_user_systemd_unavailable(username: str, *, reason: str, fix_hint: str) -> None: + """Build a user-facing error message and raise UserSystemdUnavailableError.""" + msg = ( + f"{reason}\n" + " systemctl --user cannot reach the user D-Bus session in this shell.\n" + "\n" + " To fix:\n" + f"{fix_hint}\n" + "\n" + " Alternative: run the gateway in the foreground (stays up until\n" + " you exit / close the terminal):\n" + " hermes gateway run" + ) + raise UserSystemdUnavailableError(msg) + + def _systemctl_cmd(system: bool = False) -> list[str]: if not system: _ensure_user_systemd_env() @@ -1623,6 +1758,11 @@ def systemd_start(system: bool = False): system = _select_systemd_scope(system) if system: _require_root_for_system_service("start") + else: + # Fail fast with actionable guidance if the user D-Bus session is not + # reachable (common on fresh RHEL/Debian SSH sessions without linger). + # Raises UserSystemdUnavailableError with a remediation message. + _preflight_user_systemd() refresh_systemd_unit_if_needed(system=system) _run_systemctl(["start", get_service_name()], system=system, check=True, timeout=30) print(f"✓ {_service_scope_label(system).capitalize()} service started") @@ -1642,6 +1782,8 @@ def systemd_restart(system: bool = False): system = _select_systemd_scope(system) if system: _require_root_for_system_service("restart") + else: + _preflight_user_systemd() refresh_systemd_unit_if_needed(system=system) from gateway.status import get_running_pid @@ -3516,6 +3658,10 @@ def gateway_setup(): systemd_start() elif is_macos(): launchd_start() + except UserSystemdUnavailableError as e: + print_error(" Failed to start — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except subprocess.CalledProcessError as e: print_error(f" Failed to start: {e}") else: @@ -3580,6 +3726,10 @@ def gateway_setup(): else: stop_profile_gateway() print_info("Start manually: hermes gateway") + except UserSystemdUnavailableError as e: + print_error(" Restart failed — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except subprocess.CalledProcessError as e: print_error(f" Restart failed: {e}") elif service_installed: @@ -3589,6 +3739,10 @@ def gateway_setup(): systemd_start() elif is_macos(): launchd_start() + except UserSystemdUnavailableError as e: + print_error(" Start failed — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except subprocess.CalledProcessError as e: print_error(f" Start failed: {e}") else: @@ -3612,6 +3766,10 @@ def gateway_setup(): systemd_start(system=installed_scope == "system") else: launchd_start() + except UserSystemdUnavailableError as e: + print_error(" Start failed — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except subprocess.CalledProcessError as e: print_error(f" Start failed: {e}") except subprocess.CalledProcessError as e: @@ -3649,6 +3807,18 @@ def gateway_setup(): def gateway_command(args): """Handle gateway subcommands.""" + try: + return _gateway_command_inner(args) + except UserSystemdUnavailableError as e: + # Clean, actionable message instead of a traceback when the user D-Bus + # session is unreachable (fresh SSH shell, no linger, container, etc.). + print_error("User systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") + sys.exit(1) + + +def _gateway_command_inner(args): subcmd = getattr(args, 'gateway_command', None) # Default to run if no subcommand diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 1fe5ae058..362961689 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -2334,6 +2334,7 @@ def setup_gateway(config: dict): launchd_install, launchd_start, launchd_restart, + UserSystemdUnavailableError, ) service_installed = _is_service_installed() @@ -2357,6 +2358,10 @@ def setup_gateway(config: dict): systemd_restart() elif _is_macos: launchd_restart() + except UserSystemdUnavailableError as e: + print_error(" Restart failed — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except Exception as e: print_error(f" Restart failed: {e}") elif service_installed: @@ -2366,6 +2371,10 @@ def setup_gateway(config: dict): systemd_start() elif _is_macos: launchd_start() + except UserSystemdUnavailableError as e: + print_error(" Start failed — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except Exception as e: print_error(f" Start failed: {e}") elif supports_service_manager: @@ -2389,6 +2398,10 @@ def setup_gateway(config: dict): systemd_start(system=installed_scope == "system") elif _is_macos: launchd_start() + except UserSystemdUnavailableError as e: + print_error(" Start failed — user systemd not reachable:") + for line in str(e).splitlines(): + print(f" {line}") except Exception as e: print_error(f" Start failed: {e}") except Exception as e: diff --git a/pyproject.toml b/pyproject.toml index 104a9ee5e..0b22ba714 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ [project.optional-dependencies] modal = ["modal>=1.0.0,<2"] daytona = ["daytona>=0.148.0,<1"] -dev = ["debugpy>=1.8.0,<2", "pytest>=9.0.2,<10", "pytest-asyncio>=1.3.0,<2", "pytest-xdist>=3.0,<4", "mcp>=1.2.0,<2", "ty>=0.0.1a29,<0.0.22"] +dev = ["debugpy>=1.8.0,<2", "pytest>=9.0.2,<10", "pytest-asyncio>=1.3.0,<2", "pytest-xdist>=3.0,<4", "mcp>=1.2.0,<2", "ty>=0.0.1a29,<0.0.22", "ruff"] messaging = ["python-telegram-bot[webhooks]>=22.6,<23", "discord.py[voice]>=2.7.1,<3", "aiohttp>=3.13.3,<4", "slack-bolt>=1.18.0,<2", "slack-sdk>=3.27.0,<4", "qrcode>=7.0,<8"] cron = ["croniter>=6.0.0,<7"] slack = ["slack-bolt>=1.18.0,<2", "slack-sdk>=3.27.0,<4"] diff --git a/scripts/release.py b/scripts/release.py index aa564b714..a168c921b 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -109,8 +109,11 @@ AUTHOR_MAP = { "134848055+UNLINEARITY@users.noreply.github.com": "UNLINEARITY", "ben.burtenshaw@gmail.com": "burtenshaw", "roopaknijhara@gmail.com": "rnijhara", + "josephzcan@gmail.com": "j0sephz", # contributors (manual mapping from git names) "ahmedsherif95@gmail.com": "asheriif", + "dyxushuai@gmail.com": "dyxushuai", + "33860762+etcircle@users.noreply.github.com": "etcircle", "liujinkun@bytedance.com": "liujinkun2025", "dmayhem93@gmail.com": "dmahan93", "fr@tecompanytea.com": "ifrederico", diff --git a/tests/gateway/test_session_split_brain_11016.py b/tests/gateway/test_session_split_brain_11016.py new file mode 100644 index 000000000..1076a77c4 --- /dev/null +++ b/tests/gateway/test_session_split_brain_11016.py @@ -0,0 +1,399 @@ +"""Regression tests for issue #11016 — Telegram sessions trapped in +repeated 'Interrupting current task...' while /stop reports no active task. + +Covers three layers of the fix: + +1. Adapter-side task ownership (_session_tasks map): /stop, /new, /reset + actually cancel the in-flight adapter task and release the guard in + order, so follow-up messages reach the new session. + +2. Adapter-side on-entry self-heal: if _active_sessions still has an + entry but the recorded owner task is already done/cancelled, clear it + on the next inbound message rather than trapping the user. + +3. Runner-side generation guard: a stale async run can't promote itself + into _running_agents after /stop/ /new bumped the generation, and + can't clear a newer run's slot on the way out. +""" + +import asyncio +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from gateway.config import GatewayConfig, Platform, PlatformConfig +from gateway.platforms.base import ( + BasePlatformAdapter, + MessageEvent, + MessageType, +) +from gateway.run import GatewayRunner, _AGENT_PENDING_SENTINEL +from gateway.session import SessionSource, build_session_key + + +# --------------------------------------------------------------------------- +# Adapter helpers +# --------------------------------------------------------------------------- + + +class _StubAdapter(BasePlatformAdapter): + async def connect(self): + pass + + async def disconnect(self): + pass + + async def send(self, chat_id, text, **kwargs): + pass + + async def get_chat_info(self, chat_id): + return {} + + +def _make_adapter(): + config = PlatformConfig(enabled=True, token="test-token") + adapter = _StubAdapter(config, Platform.TELEGRAM) + adapter.sent_responses = [] + + async def _mock_send_retry(chat_id, content, **kwargs): + adapter.sent_responses.append(content) + + adapter._send_with_retry = _mock_send_retry + return adapter + + +def _make_event(text="hello", chat_id="12345"): + source = SessionSource( + platform=Platform.TELEGRAM, chat_id=chat_id, chat_type="dm" + ) + return MessageEvent(text=text, message_type=MessageType.TEXT, source=source) + + +def _session_key(chat_id="12345"): + source = SessionSource( + platform=Platform.TELEGRAM, chat_id=chat_id, chat_type="dm" + ) + return build_session_key(source) + + +# --------------------------------------------------------------------------- +# Runner helpers +# --------------------------------------------------------------------------- + + +def _make_runner(): + runner = object.__new__(GatewayRunner) + runner.config = GatewayConfig( + platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")} + ) + runner.adapters = {} + runner._running_agents = {} + runner._running_agents_ts = {} + runner._session_run_generation = {} + runner._pending_messages = {} + runner._draining = False + runner._update_runtime_status = MagicMock() + return runner + + +# =========================================================================== +# Layer 1: Adapter-side session cancellation on /stop /new /reset +# =========================================================================== + + +class TestAdapterSessionCancellation: + @pytest.mark.asyncio + @pytest.mark.parametrize("command_text", ["/stop", "/new", "/reset"]) + async def test_command_cancels_active_task_and_unblocks_follow_up( + self, command_text + ): + """/stop /new /reset must cancel the adapter task and let follow-ups through.""" + adapter = _make_adapter() + sk = _session_key() + processing_started = asyncio.Event() + processing_cancelled = asyncio.Event() + blocked_first_message = True + + async def _handler(event): + nonlocal blocked_first_message + cmd = event.get_command() + if cmd in {"stop", "new", "reset", "model"}: + return f"handled:{cmd}" + + if blocked_first_message: + blocked_first_message = False + processing_started.set() + try: + await asyncio.Event().wait() + except asyncio.CancelledError: + processing_cancelled.set() + raise + return f"handled:text:{event.text}" + + adapter._message_handler = _handler + + await adapter.handle_message(_make_event("hello world")) + await processing_started.wait() + await asyncio.sleep(0) + + assert sk in adapter._active_sessions + assert sk in adapter._session_tasks + + await adapter.handle_message(_make_event(command_text)) + + assert processing_cancelled.is_set(), ( + f"{command_text} did not cancel the active processing task" + ) + assert sk not in adapter._active_sessions + assert sk not in adapter._pending_messages + assert sk not in adapter._session_tasks + expected = command_text.lstrip("/") + assert any(f"handled:{expected}" in r for r in adapter.sent_responses) + + # Follow-up must go through normally now that the session is clean. + await adapter.handle_message( + _make_event("/model xiaomi/mimo-v2-pro --provider nous") + ) + await asyncio.sleep(0) + await asyncio.sleep(0) + + assert any("handled:model" in r for r in adapter.sent_responses), ( + f"follow-up /model stayed blocked after {command_text}" + ) + assert sk not in adapter._pending_messages + + @pytest.mark.asyncio + async def test_new_keeps_guard_until_command_finishes_then_runs_follow_up(self): + """/new must finish runner logic before cancelling old work or releasing the guard.""" + adapter = _make_adapter() + sk = _session_key() + processing_started = asyncio.Event() + command_started = asyncio.Event() + allow_command_finish = asyncio.Event() + follow_up_processed = asyncio.Event() + call_order = [] + + async def _handler(event): + cmd = event.get_command() + if cmd == "new": + call_order.append("command:start") + command_started.set() + await allow_command_finish.wait() + call_order.append("command:end") + return "handled:new" + + if event.text == "hello world": + processing_started.set() + try: + await asyncio.Event().wait() + except asyncio.CancelledError: + call_order.append("original:cancelled") + raise + + if event.text == "after reset": + call_order.append("followup:processed") + follow_up_processed.set() + return f"handled:text:{event.text}" + + adapter._message_handler = _handler + + await adapter.handle_message(_make_event("hello world")) + await processing_started.wait() + + command_task = asyncio.create_task(adapter.handle_message(_make_event("/new"))) + await command_started.wait() + await asyncio.sleep(0) + + assert sk in adapter._active_sessions + + await adapter.handle_message(_make_event("after reset")) + await asyncio.sleep(0) + await asyncio.sleep(0) + + assert sk in adapter._active_sessions, "guard must stay active while /new is still running" + assert sk in adapter._pending_messages, "follow-up should stay queued until /new finishes" + assert not follow_up_processed.is_set(), "follow-up ran before /new completed" + assert "original:cancelled" not in call_order, "old task was cancelled before runner completed /new" + + allow_command_finish.set() + await command_task + await asyncio.wait_for(follow_up_processed.wait(), timeout=1.0) + + assert any("handled:new" in r for r in adapter.sent_responses) + assert call_order.index("command:end") < call_order.index("original:cancelled") + assert call_order.index("original:cancelled") < call_order.index("followup:processed") + assert sk not in adapter._pending_messages + + +# =========================================================================== +# Layer 2: Adapter-side on-entry self-heal for stale session locks +# =========================================================================== + + +class TestStaleSessionLockSelfHeal: + @pytest.mark.asyncio + async def test_stale_lock_with_done_task_is_healed_on_next_message(self): + """A split-brain guard (owner task done but entry still live) heals on next inbound.""" + adapter = _make_adapter() + sk = _session_key() + + # Simulate the production split-brain: an _active_sessions entry + # remains AND a recorded owner task, but that task is already done. + async def _done(): + return None + + done_task = asyncio.create_task(_done()) + await done_task + assert done_task.done() + + adapter._active_sessions[sk] = asyncio.Event() + adapter._session_tasks[sk] = done_task + + assert adapter._session_task_is_stale(sk) + + async def _handler(event): + return f"handled:{event.get_command() or 'text'}" + + adapter._message_handler = _handler + + # An ordinary message should heal the stale lock, then fall through + # to normal dispatch. User gets a reply instead of a busy ack. + await adapter.handle_message(_make_event("hello")) + # Drain any spawned background tasks. + for _ in range(5): + await asyncio.sleep(0) + + assert any("handled:text" in r for r in adapter.sent_responses), ( + "stale lock trapped a normal message — split-brain not healed" + ) + + def test_no_owner_task_is_not_treated_as_stale(self): + """If _session_tasks has no entry at all, the guard isn't stale. + + Tests and rare legitimate code paths install _active_sessions + entries directly. Auto-healing those would break real fixtures. + """ + adapter = _make_adapter() + sk = _session_key() + + adapter._active_sessions[sk] = asyncio.Event() + # No _session_tasks entry. + + assert adapter._session_task_is_stale(sk) is False + assert adapter._heal_stale_session_lock(sk) is False + + def test_live_owner_task_is_not_stale(self): + """When the owner task is alive, do NOT heal — agent is really busy.""" + adapter = _make_adapter() + sk = _session_key() + + fake_task = MagicMock() + fake_task.done.return_value = False + adapter._active_sessions[sk] = asyncio.Event() + adapter._session_tasks[sk] = fake_task + + assert adapter._session_task_is_stale(sk) is False + assert adapter._heal_stale_session_lock(sk) is False + # Lock still in place. + assert sk in adapter._active_sessions + assert sk in adapter._session_tasks + + +# =========================================================================== +# Layer 3: Runner-side generation guard on slot promotion + release +# =========================================================================== + + +class TestRunnerSessionGenerationGuard: + def test_release_without_generation_behaves_as_before(self): + runner = _make_runner() + sk = "agent:main:telegram:dm:12345" + runner._running_agents[sk] = "agent" + runner._running_agents_ts[sk] = 1.0 + assert runner._release_running_agent_state(sk) is True + assert sk not in runner._running_agents + assert sk not in runner._running_agents_ts + + def test_release_with_current_generation_clears_slot(self): + runner = _make_runner() + sk = "agent:main:telegram:dm:12345" + gen = runner._begin_session_run_generation(sk) + runner._running_agents[sk] = "agent" + runner._running_agents_ts[sk] = 1.0 + + assert runner._release_running_agent_state(sk, run_generation=gen) is True + assert sk not in runner._running_agents + + def test_release_with_stale_generation_blocks(self): + runner = _make_runner() + sk = "agent:main:telegram:dm:12345" + stale_gen = runner._begin_session_run_generation(sk) + # /stop bumps the generation — stale run's generation is no longer current. + runner._invalidate_session_run_generation(sk, reason="stop") + # The fresh run lands next; imagine it has its own state installed. + runner._running_agents[sk] = "fresh_agent" + runner._running_agents_ts[sk] = 2.0 + + # Stale run's unwind MUST NOT clobber the fresh run's state. + released = runner._release_running_agent_state(sk, run_generation=stale_gen) + + assert released is False + assert runner._running_agents[sk] == "fresh_agent" + assert runner._running_agents_ts[sk] == 2.0 + + def test_is_session_run_current_tracks_bumps(self): + runner = _make_runner() + sk = "agent:main:telegram:dm:12345" + gen1 = runner._begin_session_run_generation(sk) + assert runner._is_session_run_current(sk, gen1) is True + + runner._invalidate_session_run_generation(sk, reason="test") + assert runner._is_session_run_current(sk, gen1) is False + + gen2 = runner._begin_session_run_generation(sk) + assert gen2 > gen1 + assert runner._is_session_run_current(sk, gen2) is True + + +# =========================================================================== +# Layer 1 (regression): old task's finally must NOT delete a newer guard +# =========================================================================== + + +class TestOldTaskCannotClobberNewerGuard: + """Direct regression for the unconditional-delete bug. + + Before the guard-match fix, a task in its finally would delete + ``_active_sessions[session_key]`` unconditionally — even if a + /stop/ /new command had already swapped in its own command_guard + (which then gets clobbered, opening a race for follow-up messages). + """ + + def test_release_session_guard_matches_on_event_identity(self): + adapter = _make_adapter() + sk = _session_key() + + old_guard = asyncio.Event() + new_guard = asyncio.Event() + # Command swapped in a newer guard. + adapter._active_sessions[sk] = new_guard + + # Old task tries to release using its captured (stale) guard. + adapter._release_session_guard(sk, guard=old_guard) + + # The newer guard survives. + assert adapter._active_sessions.get(sk) is new_guard + + # Now the command itself releases using the matching guard. + adapter._release_session_guard(sk, guard=new_guard) + assert sk not in adapter._active_sessions + + def test_release_session_guard_without_guard_releases_unconditionally(self): + adapter = _make_adapter() + sk = _session_key() + adapter._active_sessions[sk] = asyncio.Event() + # Callers that don't know the guard (e.g. cancel_session_processing's + # default path) still work. + adapter._release_session_guard(sk) + assert sk not in adapter._active_sessions + diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py index fda893e1e..68554a496 100644 --- a/tests/hermes_cli/test_gateway_service.py +++ b/tests/hermes_cli/test_gateway_service.py @@ -5,6 +5,8 @@ import pwd from pathlib import Path from types import SimpleNamespace +import pytest + import hermes_cli.gateway as gateway_cli from gateway.restart import ( DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT, @@ -1083,6 +1085,116 @@ class TestEnsureUserSystemdEnv: assert calls == [] +class TestPreflightUserSystemd: + """Tests for _preflight_user_systemd() — D-Bus reachability before systemctl --user. + + Covers issue #5130 / Rick's RHEL 9.6 SSH scenario: setup tries to start the + gateway via ``systemctl --user start`` in a shell with no user D-Bus session, + which previously failed with a raw ``CalledProcessError`` and no remediation. + """ + + def test_noop_when_bus_socket_exists(self, monkeypatch): + """Socket already there (desktop / linger + prior login) → no-op.""" + monkeypatch.setattr( + gateway_cli, "_user_dbus_socket_path", + lambda: type("P", (), {"exists": lambda self: True})(), + ) + # Should not raise, no subprocess calls needed. + gateway_cli._preflight_user_systemd() + + def test_raises_when_linger_disabled_and_loginctl_denied(self, monkeypatch): + """Rick's scenario: no D-Bus, no linger, non-root SSH → clear error.""" + monkeypatch.setattr( + gateway_cli, "_user_dbus_socket_path", + lambda: type("P", (), {"exists": lambda self: False})(), + ) + monkeypatch.setattr( + gateway_cli, "get_systemd_linger_status", lambda: (False, ""), + ) + monkeypatch.setattr(gateway_cli.shutil, "which", lambda _: "/usr/bin/loginctl") + + class _Result: + returncode = 1 + stdout = "" + stderr = "Interactive authentication required." + + monkeypatch.setattr( + gateway_cli.subprocess, "run", lambda *a, **kw: _Result(), + ) + + with pytest.raises(gateway_cli.UserSystemdUnavailableError) as exc_info: + gateway_cli._preflight_user_systemd() + + msg = str(exc_info.value) + assert "sudo loginctl enable-linger" in msg + assert "hermes gateway run" in msg # foreground fallback mentioned + assert "Interactive authentication required" in msg + + def test_raises_when_loginctl_missing(self, monkeypatch): + """No loginctl binary at all → suggest sudo install + manual fix.""" + monkeypatch.setattr( + gateway_cli, "_user_dbus_socket_path", + lambda: type("P", (), {"exists": lambda self: False})(), + ) + monkeypatch.setattr( + gateway_cli, "get_systemd_linger_status", + lambda: (None, "loginctl not found"), + ) + monkeypatch.setattr(gateway_cli.shutil, "which", lambda _: None) + + with pytest.raises(gateway_cli.UserSystemdUnavailableError) as exc_info: + gateway_cli._preflight_user_systemd() + + assert "sudo loginctl enable-linger" in str(exc_info.value) + + def test_linger_enabled_but_socket_still_missing(self, monkeypatch): + """Edge case: linger says yes but the bus socket never came up.""" + monkeypatch.setattr( + gateway_cli, "_user_dbus_socket_path", + lambda: type("P", (), {"exists": lambda self: False})(), + ) + monkeypatch.setattr( + gateway_cli, "get_systemd_linger_status", lambda: (True, ""), + ) + monkeypatch.setattr( + gateway_cli, "_wait_for_user_dbus_socket", lambda timeout=3.0: False, + ) + + with pytest.raises(gateway_cli.UserSystemdUnavailableError) as exc_info: + gateway_cli._preflight_user_systemd() + + assert "linger is enabled" in str(exc_info.value) + + def test_enable_linger_succeeds_and_socket_appears(self, monkeypatch, capsys): + """Happy remediation path: polkit allows enable-linger, socket spawns.""" + monkeypatch.setattr( + gateway_cli, "_user_dbus_socket_path", + lambda: type("P", (), {"exists": lambda self: False})(), + ) + monkeypatch.setattr( + gateway_cli, "get_systemd_linger_status", lambda: (False, ""), + ) + monkeypatch.setattr(gateway_cli.shutil, "which", lambda _: "/usr/bin/loginctl") + + class _OkResult: + returncode = 0 + stdout = "" + stderr = "" + + monkeypatch.setattr( + gateway_cli.subprocess, "run", lambda *a, **kw: _OkResult(), + ) + monkeypatch.setattr( + gateway_cli, "_wait_for_user_dbus_socket", + lambda timeout=5.0: True, + ) + + # Should not raise. + gateway_cli._preflight_user_systemd() + out = capsys.readouterr().out + assert "Enabled linger" in out + + class TestProfileArg: """Tests for _profile_arg — returns '--profile ' for named profiles.""" diff --git a/tests/tools/test_local_shell_init.py b/tests/tools/test_local_shell_init.py index 96e26e735..7dabaadf1 100644 --- a/tests/tools/test_local_shell_init.py +++ b/tests/tools/test_local_shell_init.py @@ -34,8 +34,59 @@ class TestResolveShellInitFiles: assert resolved == [str(bashrc)] + def test_auto_sources_profile_when_present(self, tmp_path, monkeypatch): + """~/.profile is where ``n`` / ``nvm`` installers typically write + their PATH export on Debian/Ubuntu, and it has no interactivity + guard so a non-interactive source actually runs it. + """ + profile = tmp_path / ".profile" + profile.write_text('export PATH="$HOME/n/bin:$PATH"\n') + monkeypatch.setenv("HOME", str(tmp_path)) + + with patch( + "tools.environments.local._read_terminal_shell_init_config", + return_value=([], True), + ): + resolved = _resolve_shell_init_files() + + assert resolved == [str(profile)] + + def test_auto_sources_bash_profile_when_present(self, tmp_path, monkeypatch): + bash_profile = tmp_path / ".bash_profile" + bash_profile.write_text('export MARKER=bp\n') + monkeypatch.setenv("HOME", str(tmp_path)) + + with patch( + "tools.environments.local._read_terminal_shell_init_config", + return_value=([], True), + ): + resolved = _resolve_shell_init_files() + + assert resolved == [str(bash_profile)] + + def test_auto_sources_profile_before_bashrc(self, tmp_path, monkeypatch): + """Both files present: profile runs first so PATH exports in + profile take effect even if bashrc short-circuits on the + non-interactive ``case $- in *i*) ;; *) return;; esac`` guard. + """ + profile = tmp_path / ".profile" + profile.write_text('export FROM_PROFILE=1\n') + bash_profile = tmp_path / ".bash_profile" + bash_profile.write_text('export FROM_BASH_PROFILE=1\n') + bashrc = tmp_path / ".bashrc" + bashrc.write_text('export FROM_BASHRC=1\n') + monkeypatch.setenv("HOME", str(tmp_path)) + + with patch( + "tools.environments.local._read_terminal_shell_init_config", + return_value=([], True), + ): + resolved = _resolve_shell_init_files() + + assert resolved == [str(profile), str(bash_profile), str(bashrc)] + def test_skips_bashrc_when_missing(self, tmp_path, monkeypatch): - # No bashrc written. + # No rc files written. monkeypatch.setenv("HOME", str(tmp_path)) with patch( @@ -49,6 +100,8 @@ class TestResolveShellInitFiles: def test_auto_source_bashrc_off_suppresses_default(self, tmp_path, monkeypatch): bashrc = tmp_path / ".bashrc" bashrc.write_text('export MARKER=seen\n') + profile = tmp_path / ".profile" + profile.write_text('export MARKER=p\n') monkeypatch.setenv("HOME", str(tmp_path)) with patch( @@ -160,3 +213,58 @@ class TestSnapshotEndToEnd: output = result.get("output", "") assert "PROBE=probe-ok" in output assert "/opt/shell-init-probe/bin" in output + + def test_profile_path_export_survives_bashrc_interactive_guard( + self, tmp_path, monkeypatch + ): + """Reproduces the Debian/Ubuntu + ``n``/``nvm`` case. + + Setup: + - ``~/.bashrc`` starts with ``case $- in *i*) ;; *) return;; esac`` + (the default on Debian/Ubuntu) and would happily export a PATH + entry below that guard — but never gets there because a + non-interactive source short-circuits. + - ``~/.profile`` exports ``$HOME/fake-n/bin`` onto PATH, no guard. + + Expectation: auto-sourced rc list picks up ``~/.profile`` before + ``~/.bashrc``, so the snapshot ends up with ``fake-n/bin`` on PATH + even though the bashrc export is silently skipped. + """ + fake_n_bin = tmp_path / "fake-n" / "bin" + fake_n_bin.mkdir(parents=True) + + profile = tmp_path / ".profile" + profile.write_text( + f'export PATH="{fake_n_bin}:$PATH"\n' + 'export FROM_PROFILE=profile-ok\n' + ) + bashrc = tmp_path / ".bashrc" + bashrc.write_text( + 'case $- in\n' + ' *i*) ;;\n' + ' *) return;;\n' + 'esac\n' + 'export FROM_BASHRC=bashrc-should-not-appear\n' + ) + + monkeypatch.setenv("HOME", str(tmp_path)) + + with patch( + "tools.environments.local._read_terminal_shell_init_config", + return_value=([], True), + ): + env = LocalEnvironment(cwd=str(tmp_path), timeout=15) + try: + result = env.execute( + 'echo "PATH=$PATH"; ' + 'echo "FROM_PROFILE=$FROM_PROFILE"; ' + 'echo "FROM_BASHRC=$FROM_BASHRC"' + ) + finally: + env.cleanup() + + output = result.get("output", "") + assert "FROM_PROFILE=profile-ok" in output + assert str(fake_n_bin) in output + # bashrc short-circuited on the interactive guard — its export never ran + assert "FROM_BASHRC=bashrc-should-not-appear" not in output diff --git a/tests/tools/test_skills_guard.py b/tests/tools/test_skills_guard.py index 6fcd05b31..12c527ca7 100644 --- a/tests/tools/test_skills_guard.py +++ b/tests/tools/test_skills_guard.py @@ -174,20 +174,27 @@ class TestShouldAllowInstall: assert allowed is True assert "agent-created" in reason - def test_dangerous_agent_created_asks(self): - """Agent-created skills with dangerous verdict return None (ask for confirmation).""" + def test_dangerous_agent_created_allowed(self): + """Agent-created skills bypass verdict gating — agent can already + execute the same code via terminal(), so skill_manage allows all + verdicts. This prevents friction when the agent writes skills that + mention risky keywords in prose (e.g. describing cache-busting or + persistence semantics in a PR-review skill).""" f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")] allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f)) - assert allowed is None - assert "Requires confirmation" in reason + assert allowed is True + assert "agent-created" in reason - def test_force_overrides_dangerous_for_agent_created(self): + def test_force_noop_for_agent_created_dangerous(self): + """With agent-created dangerous mapped to 'allow', force becomes a + no-op — the allow branch returns first. Force still works for any + trust level that maps to block (community/trusted).""" f = [Finding("x", "critical", "c", "f", 1, "m", "d")] allowed, reason = should_allow_install( self._result("agent-created", "dangerous", f), force=True ) assert allowed is True - assert "Force-installed" in reason + assert "agent-created" in reason # --------------------------------------------------------------------------- diff --git a/tests/tools/test_skills_sync.py b/tests/tools/test_skills_sync.py index 683f6503b..347366e6a 100644 --- a/tests/tools/test_skills_sync.py +++ b/tests/tools/test_skills_sync.py @@ -402,6 +402,86 @@ class TestSyncSkills: assert (user_skill / "SKILL.md").read_text() == "# User modified" + def test_collision_does_not_poison_manifest(self, tmp_path): + """Collision with an unmanifested user skill must NOT record bundled_hash. + + Otherwise the next sync compares user_hash against the recorded + bundled_hash, finds a mismatch, and permanently flags the skill as + 'user-modified' — even though the user never touched a bundled copy. + """ + bundled = self._setup_bundled(tmp_path) + skills_dir = tmp_path / "user_skills" + manifest_file = skills_dir / ".bundled_manifest" + + # Pre-existing user skill (e.g. from hub, custom, or leftover) that + # happens to share a name with a newly bundled skill. + user_skill = skills_dir / "category" / "new-skill" + user_skill.mkdir(parents=True) + (user_skill / "SKILL.md").write_text("# From hub — unrelated to bundled") + + with self._patches(bundled, skills_dir, manifest_file): + sync_skills(quiet=True) + + # User file must survive (existing invariant). + assert (user_skill / "SKILL.md").read_text() == ( + "# From hub — unrelated to bundled" + ) + + # Manifest must NOT contain the skill — it was never synced from bundled. + with patch("tools.skills_sync.MANIFEST_FILE", manifest_file): + manifest = _read_manifest() + assert "new-skill" not in manifest, ( + "Collision path wrote bundled_hash to the manifest even though " + "the on-disk copy is unrelated to bundled. This poisons update " + "detection: the next sync will mark the skill as 'user-modified'." + ) + + def test_collision_does_not_trigger_false_user_modified_on_resync(self, tmp_path): + """End-to-end: after a collision, a second sync must not flag user_modified. + + Pre-fix bug: first sync wrote bundled_hash to the manifest; second + sync then diffed user_hash vs bundled_hash, mismatched, and shoved + the skill into the user_modified bucket forever. + """ + bundled = self._setup_bundled(tmp_path) + skills_dir = tmp_path / "user_skills" + manifest_file = skills_dir / ".bundled_manifest" + + user_skill = skills_dir / "category" / "new-skill" + user_skill.mkdir(parents=True) + (user_skill / "SKILL.md").write_text("# From hub — unrelated to bundled") + + with self._patches(bundled, skills_dir, manifest_file): + sync_skills(quiet=True) # first sync: collision path + result2 = sync_skills(quiet=True) # second sync: must not flag + + assert "new-skill" not in result2["user_modified"], ( + "Second sync after a collision falsely flagged the user's skill " + "as 'user-modified' — the manifest was poisoned on the first sync." + ) + + def test_collision_prints_reset_hint(self, tmp_path, capsys): + """Non-quiet sync must print a reset hint when a collision is skipped. + + Silent skip hides the fact that a bundled skill shipped but was + shadowed by the user's local copy. The hint tells the user the + exact command to take the bundled version instead. + """ + bundled = self._setup_bundled(tmp_path) + skills_dir = tmp_path / "user_skills" + manifest_file = skills_dir / ".bundled_manifest" + + user_skill = skills_dir / "category" / "new-skill" + user_skill.mkdir(parents=True) + (user_skill / "SKILL.md").write_text("# From hub — unrelated to bundled") + + with self._patches(bundled, skills_dir, manifest_file): + sync_skills(quiet=False) + + captured = capsys.readouterr().out + assert "new-skill" in captured + assert "hermes skills reset new-skill" in captured + def test_nonexistent_bundled_dir(self, tmp_path): with patch("tools.skills_sync._get_bundled_dir", return_value=tmp_path / "nope"): result = sync_skills(quiet=True) diff --git a/tools/environments/local.py b/tools/environments/local.py index e4ef27829..4aa6b64e2 100644 --- a/tools/environments/local.py +++ b/tools/environments/local.py @@ -247,10 +247,22 @@ def _resolve_shell_init_files() -> list[str]: if explicit: candidates.extend(explicit) elif auto_bashrc and not _IS_WINDOWS: - # Bash's login-shell invocation does NOT source ~/.bashrc by default, - # so tools like nvm / asdf / pyenv that self-install there stay - # invisible to the snapshot without this nudge. - candidates.append("~/.bashrc") + # Build a login-shell-ish source list so tools like n / nvm / asdf / + # pyenv that self-install into the user's shell rc land on PATH in + # the captured snapshot. + # + # ~/.profile and ~/.bash_profile run first because they have no + # interactivity guard — installers like ``n`` and ``nvm`` append + # their PATH export there on most distros, and a non-interactive + # ``. ~/.profile`` picks that up. + # + # ~/.bashrc runs last. On Debian/Ubuntu the default bashrc starts + # with ``case $- in *i*) ;; *) return;; esac`` and exits early + # when sourced non-interactively, which is why sourcing bashrc + # alone misses nvm/n PATH additions placed below that guard. We + # still include it so users who put PATH logic in bashrc (and + # stripped the guard, or never had one) keep working. + candidates.extend(["~/.profile", "~/.bash_profile", "~/.bashrc"]) resolved: list[str] = [] for raw in candidates: diff --git a/tools/skills_guard.py b/tools/skills_guard.py index 3513f46f0..fadbb8173 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -43,7 +43,11 @@ INSTALL_POLICY = { "builtin": ("allow", "allow", "allow"), "trusted": ("allow", "allow", "block"), "community": ("allow", "block", "block"), - "agent-created": ("allow", "allow", "ask"), + # Agent-created skills run in the same process as the agent that + # wrote them — the agent could already execute the same code via + # terminal(), so a dangerous-pattern gate on skill_manage adds + # friction without meaningful security. Allow all verdicts. + "agent-created": ("allow", "allow", "allow"), } VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2} diff --git a/tools/skills_sync.py b/tools/skills_sync.py index 867566b6c..cb7955c01 100644 --- a/tools/skills_sync.py +++ b/tools/skills_sync.py @@ -206,9 +206,25 @@ def sync_skills(quiet: bool = False) -> dict: # ── New skill — never offered before ── try: if dest.exists(): - # User already has a skill with the same name — don't overwrite + # User already has a skill with the same name — don't overwrite. + # Only baseline in the manifest when the on-disk copy is + # byte-identical to bundled (e.g. a reset that re-syncs, or + # a coincidentally identical install); that case is harmless + # to track. If the copy differs (custom skill, hub-installed, + # or user-edited) skip the manifest write: recording + # bundled_hash there would poison update detection by making + # user_hash != origin_hash read as "user-modified" on every + # subsequent sync, permanently blocking bundled updates. skipped += 1 - manifest[skill_name] = bundled_hash + if _dir_hash(dest) == bundled_hash: + manifest[skill_name] = bundled_hash + elif not quiet: + print( + f" ⚠ {skill_name}: bundled version shipped but you " + f"already have a local skill by this name — yours " + f"was kept. Run `hermes skills reset {skill_name}` " + f"to replace it with the bundled version." + ) else: dest.parent.mkdir(parents=True, exist_ok=True) shutil.copytree(skill_src, dest) diff --git a/uv.lock b/uv.lock index 9b3e90697..080aefeb1 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-16T11:37:14.546044Z" +exclude-newer = "2026-04-16T11:49:00.318115Z" exclude-newer-span = "P7D" [[package]] @@ -1927,6 +1927,7 @@ all = [ { name = "python-telegram-bot", extra = ["webhooks"] }, { name = "pywinpty", marker = "sys_platform == 'win32'" }, { name = "qrcode" }, + { name = "ruff" }, { name = "simple-term-menu" }, { name = "slack-bolt" }, { name = "slack-sdk" }, @@ -1952,6 +1953,7 @@ dev = [ { name = "pytest" }, { name = "pytest-asyncio" }, { name = "pytest-xdist" }, + { name = "ruff" }, { name = "ty" }, ] dingtalk = [ @@ -2119,6 +2121,7 @@ requires-dist = [ { name = "qrcode", marker = "extra == 'messaging'", specifier = ">=7.0,<8" }, { name = "requests", specifier = ">=2.33.0,<3" }, { name = "rich", specifier = ">=14.3.3,<15" }, + { name = "ruff", marker = "extra == 'dev'" }, { name = "simple-term-menu", marker = "extra == 'cli'", specifier = ">=1.0,<2" }, { name = "slack-bolt", marker = "extra == 'messaging'", specifier = ">=1.18.0,<2" }, { name = "slack-bolt", marker = "extra == 'slack'", specifier = ">=1.18.0,<2" }, @@ -4659,6 +4662,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, ] +[[package]] +name = "ruff" +version = "0.15.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/d9/aa3f7d59a10ef6b14fe3431706f854dbf03c5976be614a9796d36326810c/ruff-0.15.10.tar.gz", hash = "sha256:d1f86e67ebfdef88e00faefa1552b5e510e1d35f3be7d423dc7e84e63788c94e", size = 4631728, upload-time = "2026-04-09T14:06:09.884Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/00/a1c2fdc9939b2c03691edbda290afcd297f1f389196172826b03d6b6a595/ruff-0.15.10-py3-none-linux_armv6l.whl", hash = "sha256:0744e31482f8f7d0d10a11fcbf897af272fefdfcb10f5af907b18c2813ff4d5f", size = 10563362, upload-time = "2026-04-09T14:06:21.189Z" }, + { url = "https://files.pythonhosted.org/packages/5c/15/006990029aea0bebe9d33c73c3e28c80c391ebdba408d1b08496f00d422d/ruff-0.15.10-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b1e7c16ea0ff5a53b7c2df52d947e685973049be1cdfe2b59a9c43601897b22e", size = 10951122, upload-time = "2026-04-09T14:06:02.236Z" }, + { url = "https://files.pythonhosted.org/packages/f2/c0/4ac978fe874d0618c7da647862afe697b281c2806f13ce904ad652fa87e4/ruff-0.15.10-py3-none-macosx_11_0_arm64.whl", hash = "sha256:93cc06a19e5155b4441dd72808fdf84290d84ad8a39ca3b0f994363ade4cebb1", size = 10314005, upload-time = "2026-04-09T14:06:00.026Z" }, + { url = "https://files.pythonhosted.org/packages/da/73/c209138a5c98c0d321266372fc4e33ad43d506d7e5dd817dd89b60a8548f/ruff-0.15.10-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e1dd04312997c99ea6965df66a14fb4f03ba978564574ffc68b0d61fd3989e", size = 10643450, upload-time = "2026-04-09T14:05:42.137Z" }, + { url = "https://files.pythonhosted.org/packages/ec/76/0deec355d8ec10709653635b1f90856735302cb8e149acfdf6f82a5feb70/ruff-0.15.10-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8154d43684e4333360fedd11aaa40b1b08a4e37d8ffa9d95fee6fa5b37b6fab1", size = 10379597, upload-time = "2026-04-09T14:05:49.984Z" }, + { url = "https://files.pythonhosted.org/packages/dc/be/86bba8fc8798c081e28a4b3bb6d143ccad3fd5f6f024f02002b8f08a9fa3/ruff-0.15.10-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ab88715f3a6deb6bde6c227f3a123410bec7b855c3ae331b4c006189e895cef", size = 11146645, upload-time = "2026-04-09T14:06:12.246Z" }, + { url = "https://files.pythonhosted.org/packages/a8/89/140025e65911b281c57be1d385ba1d932c2366ca88ae6663685aed8d4881/ruff-0.15.10-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a768ff5969b4f44c349d48edf4ab4f91eddb27fd9d77799598e130fb628aa158", size = 12030289, upload-time = "2026-04-09T14:06:04.776Z" }, + { url = "https://files.pythonhosted.org/packages/88/de/ddacca9545a5e01332567db01d44bd8cf725f2db3b3d61a80550b48308ea/ruff-0.15.10-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ee3ef42dab7078bda5ff6a1bcba8539e9857deb447132ad5566a038674540d0", size = 11496266, upload-time = "2026-04-09T14:05:55.485Z" }, + { url = "https://files.pythonhosted.org/packages/bc/bb/7ddb00a83760ff4a83c4e2fc231fd63937cc7317c10c82f583302e0f6586/ruff-0.15.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51cb8cc943e891ba99989dd92d61e29b1d231e14811db9be6440ecf25d5c1609", size = 11256418, upload-time = "2026-04-09T14:05:57.69Z" }, + { url = "https://files.pythonhosted.org/packages/dc/8d/55de0d35aacf6cd50b6ee91ee0f291672080021896543776f4170fc5c454/ruff-0.15.10-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:e59c9bdc056a320fb9ea1700a8d591718b8faf78af065484e801258d3a76bc3f", size = 11288416, upload-time = "2026-04-09T14:05:44.695Z" }, + { url = "https://files.pythonhosted.org/packages/68/cf/9438b1a27426ec46a80e0a718093c7f958ef72f43eb3111862949ead3cc1/ruff-0.15.10-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:136c00ca2f47b0018b073f28cb5c1506642a830ea941a60354b0e8bc8076b151", size = 10621053, upload-time = "2026-04-09T14:05:52.782Z" }, + { url = "https://files.pythonhosted.org/packages/4c/50/e29be6e2c135e9cd4cb15fbade49d6a2717e009dff3766dd080fcb82e251/ruff-0.15.10-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:8b80a2f3c9c8a950d6237f2ca12b206bccff626139be9fa005f14feb881a1ae8", size = 10378302, upload-time = "2026-04-09T14:06:14.361Z" }, + { url = "https://files.pythonhosted.org/packages/18/2f/e0b36a6f99c51bb89f3a30239bc7bf97e87a37ae80aa2d6542d6e5150364/ruff-0.15.10-py3-none-musllinux_1_2_i686.whl", hash = "sha256:e3e53c588164dc025b671c9df2462429d60357ea91af7e92e9d56c565a9f1b07", size = 10850074, upload-time = "2026-04-09T14:06:16.581Z" }, + { url = "https://files.pythonhosted.org/packages/11/08/874da392558ce087a0f9b709dc6ec0d60cbc694c1c772dab8d5f31efe8cb/ruff-0.15.10-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b0c52744cf9f143a393e284125d2576140b68264a93c6716464e129a3e9adb48", size = 11358051, upload-time = "2026-04-09T14:06:18.948Z" }, + { url = "https://files.pythonhosted.org/packages/e4/46/602938f030adfa043e67112b73821024dc79f3ab4df5474c25fa4c1d2d14/ruff-0.15.10-py3-none-win32.whl", hash = "sha256:d4272e87e801e9a27a2e8df7b21011c909d9ddd82f4f3281d269b6ba19789ca5", size = 10588964, upload-time = "2026-04-09T14:06:07.14Z" }, + { url = "https://files.pythonhosted.org/packages/25/b6/261225b875d7a13b33a6d02508c39c28450b2041bb01d0f7f1a83d569512/ruff-0.15.10-py3-none-win_amd64.whl", hash = "sha256:28cb32d53203242d403d819fd6983152489b12e4a3ae44993543d6fe62ab42ed", size = 11745044, upload-time = "2026-04-09T14:05:39.473Z" }, + { url = "https://files.pythonhosted.org/packages/58/ed/dea90a65b7d9e69888890fb14c90d7f51bf0c1e82ad800aeb0160e4bacfd/ruff-0.15.10-py3-none-win_arm64.whl", hash = "sha256:601d1610a9e1f1c2165a4f561eeaa2e2ea1e97f3287c5aa258d3dab8b57c6188", size = 11035607, upload-time = "2026-04-09T14:05:47.593Z" }, +] + [[package]] name = "s3transfer" version = "0.16.0"