fix(gateway): auto-restart when source files change out from under us (#17648) (#18409)

Long-running gateway processes that survive 'hermes update' keep pre-update modules cached in sys.modules. When new tool files on disk then try to 'from hermes_cli.config import cfg_get' (added in PR #17304), the import resolves against the stale module object and raises ImportError — hitting users on Matrix, Telegram, Feishu, and other platforms. Two defenses: 1. Gateway self-check (gateway/run.py). On __init__, snapshot the newest mtime across sentinel source files (hermes_cli/config.py, run_agent.py, gateway/run.py, etc.). On every inbound message, re-read those mtimes; if any is newer than boot time + 2s slack, request a graceful restart via the normal drain path and return a one-line ack to the user. Idempotent, works regardless of how the update happened (hermes update, manual git pull, installer). 2. Post-restart survivor sweep ('hermes update'). After the existing restart loop, sleep 3s, rescan for gateway PIDs we already tried to kill, and SIGKILL any survivors. The detached profile watchers and systemd then relaunch with fresh code instead of waiting out the 120s watcher timeout. Closes #17648.
2026-06-16 09:31:37 +00:00 · 2026-05-01 09:50:08 -07:00 · 2026-05-01 09:50:08 -07:00 · f99676e315
commit f99676e315
parent 77c0bc6b13
3 changed files with 399 additions and 0 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -69,6 +69,46 @@ _PLATFORM_CONNECT_TIMEOUT_SECS_DEFAULT = 30.0
 _AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60


+# --- Stale-code self-check ------------------------------------------------
+# Long-running gateway processes that survive an ``hermes update`` keep the
+# old ``hermes_cli.config`` (and friends) cached in ``sys.modules``.  When
+# the updated tool files on disk then try to ``from hermes_cli.config
+# import cfg_get`` (added in PR #17304), the import resolves against the
+# already-loaded stale module object and raises ``ImportError`` — see
+# Issue #17648.  Rather than papering over the import failure site-by-site
+# in every tool file, detect the stale state centrally and auto-restart
+# so the gateway reloads with fresh code.  The sentinel files below are
+# the canonical repo-level markers that every update touches; if any is
+# newer than the gateway's boot time, we know the running process is out
+# of date.
+_STALE_CODE_SENTINELS: tuple[str, ...] = (
+    "hermes_cli/config.py",
+    "hermes_cli/__init__.py",
+    "run_agent.py",
+    "gateway/run.py",
+    "pyproject.toml",
+)
+
+
+def _compute_repo_mtime(repo_root: Path) -> float:
+    """Return the newest mtime across the stale-code sentinel files.
+
+    Missing files are ignored (they may not exist on older checkouts).
+    Returns 0.0 if no sentinel file is readable — treat that as "can't
+    tell", which downstream callers interpret as "not stale" to avoid
+    false-positive restart loops.
+    """
+    newest = 0.0
+    for rel in _STALE_CODE_SENTINELS:
+        try:
+            st = (repo_root / rel).stat()
+        except (OSError, FileNotFoundError):
+            continue
+        if st.st_mtime > newest:
+            newest = st.st_mtime
+    return newest
+
+
 def _coerce_gateway_timestamp(value: Any) -> Optional[float]:
    """Best-effort conversion of stored gateway timestamps to epoch seconds.

@ -840,6 +880,12 @@ class GatewayRunner:
    _stop_task: Optional[asyncio.Task] = None
    _session_model_overrides: Dict[str, Dict[str, str]] = {}
    _session_reasoning_overrides: Dict[str, Dict[str, Any]] = {}
+    # Stale-code self-check defaults (see _detect_stale_code()).  Class-level
+    # so tests that construct GatewayRunner via ``object.__new__`` without
+    # running __init__ don't crash when _handle_message reads these.
+    _boot_wall_time: float = 0.0
+    _boot_repo_mtime: float = 0.0
+    _stale_code_restart_triggered: bool = False

    def __init__(self, config: Optional[GatewayConfig] = None):
        global _gateway_runner_ref
@ -848,6 +894,22 @@ class GatewayRunner:
        self._warn_if_docker_media_delivery_is_risky()
        _gateway_runner_ref = _weakref.ref(self)

+        # Boot-time snapshot used by the stale-code self-check.  Captured
+        # before any work happens so post-update file writes are guaranteed
+        # to have newer mtimes.  See _detect_stale_code() / Issue #17648.
+        try:
+            self._boot_wall_time: float = time.time()
+            self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
+            self._boot_repo_mtime: float = _compute_repo_mtime(
+                self._repo_root_for_staleness,
+            )
+        except Exception:
+            self._boot_wall_time = 0.0
+            self._repo_root_for_staleness = Path(".")
+            self._boot_repo_mtime = 0.0
+        self._stale_code_notified: set[str] = set()
+        self._stale_code_restart_triggered: bool = False
+
        # Load ephemeral config from config.yaml / env vars.
        # Both are injected at API-call time only and never persisted.
        self._prefill_messages = self._load_prefill_messages()
@ -2392,6 +2454,63 @@ class GatewayRunner:
        task.add_done_callback(self._background_tasks.discard)
        return True

+    def _detect_stale_code(self) -> bool:
+        """Return True if source files on disk are newer than the running process.
+
+        A gateway that survives ``hermes update`` (manual SIGTERM never
+        escalated, systemd restart race, detached-process respawn failed,
+        etc.) keeps pre-update modules cached in ``sys.modules``.  Later
+        imports of names added post-update — e.g. ``cfg_get`` from PR
+        #17304 — raise ImportError against the stale module object (see
+        Issue #17648).  Detecting this at the source — "the code on disk
+        is newer than me" — lets us auto-restart instead of serving
+        broken responses until the user notices and runs
+        ``hermes gateway restart`` manually.
+
+        Returns False when the boot-time snapshot is unavailable or no
+        sentinel file is readable, to avoid false-positive restart loops
+        in unusual checkouts (sparse clones, read-only filesystems).
+        """
+        if not self._boot_wall_time or not self._boot_repo_mtime:
+            return False
+        try:
+            current = _compute_repo_mtime(self._repo_root_for_staleness)
+        except Exception:
+            return False
+        if current <= 0.0:
+            return False
+        # 2-second slack guards against filesystems with coarse mtime
+        # resolution (FAT32, some NFS mounts).  Real updates always move
+        # the newest-file mtime forward by minutes, so this doesn't hide
+        # genuine staleness.
+        return current > self._boot_repo_mtime + 2.0
+
+    def _trigger_stale_code_restart(self) -> None:
+        """Idempotently kick off a graceful restart after stale-code detection.
+
+        Runs at most once per process.  The restart request goes through
+        the normal drain path so in-flight agent turns finish before the
+        process exits; the service manager (systemd / launchd / detached
+        profile watcher) then respawns with fresh code.  On manual
+        ``hermes gateway run`` installs without a supervisor, the
+        process exits and the user must restart by hand — but they get a
+        user-visible message telling them so.
+        """
+        if self._stale_code_restart_triggered:
+            return
+        self._stale_code_restart_triggered = True
+        logger.warning(
+            "Stale-code self-check: source files newer than gateway boot "
+            "time (boot=%.0f, newest=%.0f) — requesting graceful restart. "
+            "See Issue #17648.",
+            self._boot_repo_mtime,
+            _compute_repo_mtime(self._repo_root_for_staleness),
+        )
+        try:
+            self.request_restart(detached=False, via_service=True)
+        except Exception as exc:
+            logger.error("Stale-code restart request failed: %s", exc)
+
    async def start(self) -> bool:
        """
        Start the gateway and all configured platform adapters.
@ -4190,6 +4309,27 @@ class GatewayRunner:
        """
        source = event.source

+        # Stale-code self-check (Issue #17648).  A gateway that survives
+        # ``hermes update`` keeps old modules cached in sys.modules; the
+        # first inbound message is our earliest safe chance to detect
+        # this and restart gracefully before we dispatch to the agent
+        # and hit ImportError on freshly-added names (e.g. cfg_get).
+        # Idempotent — runs the real check at most once per message, and
+        # request_restart() no-ops after the first call.
+        try:
+            if self._detect_stale_code():
+                self._trigger_stale_code_restart()
+                # Acknowledge to the user so they don't see a silent
+                # drop; the gateway will be back up in a moment via the
+                # service manager / profile-watcher respawn.
+                return (
+                    "⟳ Gateway code was updated in the background — "
+                    "restarting this gateway so your next message runs "
+                    "on the new code. Please retry in a moment."
+                )
+        except Exception as _stale_exc:
+            logger.debug("Stale-code self-check failed: %s", _stale_exc)
+
        # Internal events (e.g. background-process completion notifications)
        # are system-generated and must skip user authorization.
        is_internal = bool(getattr(event, "internal", False))
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@ -7548,6 +7548,42 @@ def _cmd_update_impl(args, gateway_mode: bool):
                # No gateways were running — nothing to do
                pass

+            # --- Post-restart survivor sweep -----------------------------
+            # Issue #17648: some gateways ignore SIGTERM (stuck drain,
+            # blocked I/O, PID dead but zombie).  The detached profile
+            # watchers wait 120s for the old PID to exit — if it never
+            # does, no respawn happens and the user keeps hitting
+            # ImportError against a stale sys.modules.  Give the
+            # graceful paths a brief window to complete, then SIGKILL
+            # any remaining pre-update PIDs so the watcher / service
+            # manager can relaunch with fresh code.
+            try:
+                _time.sleep(3.0)
+                _service_pids_after = _get_service_pids()
+                _surviving = find_gateway_pids(
+                    exclude_pids=_service_pids_after, all_profiles=True,
+                )
+                # Scope to PIDs we already tried to kill during this
+                # update (killed_pids).  Anything new is a gateway that
+                # started AFTER our restart attempt — respecting user
+                # intent, we don't kill those.
+                _stuck = [pid for pid in _surviving if pid in killed_pids]
+                if _stuck:
+                    print()
+                    print(
+                        f"  ⚠ {len(_stuck)} gateway process(es) ignored SIGTERM — force-killing"
+                    )
+                    for pid in _stuck:
+                        try:
+                            os.kill(pid, _signal.SIGKILL)
+                        except (ProcessLookupError, PermissionError):
+                            pass
+                    # Give the OS a beat to reap the processes so the
+                    # watchers see them exit and respawn.
+                    _time.sleep(1.5)
+            except Exception as _sweep_exc:
+                logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc)
+
        except Exception as e:
            logger.debug("Gateway restart during update failed: %s", e)

--- a/tests/gateway/test_stale_code_self_check.py
+++ b/tests/gateway/test_stale_code_self_check.py
@ -0,0 +1,223 @@
+"""Tests for the gateway stale-code self-check (Issue #17648).
+
+A gateway that survives ``hermes update`` keeps pre-update modules cached
+in ``sys.modules``.  Later imports of names added post-update (e.g.
+``cfg_get`` from PR #17304) raise ImportError against the stale module
+object.  The self-check in ``GatewayRunner._detect_stale_code()`` detects
+this by comparing boot-time sentinel-file mtimes against current ones,
+and ``_trigger_stale_code_restart()`` triggers a graceful restart.
+"""
+
+import os
+import time
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from gateway.run import (
+    GatewayRunner,
+    _compute_repo_mtime,
+    _STALE_CODE_SENTINELS,
+)
+
+
+def _make_tmp_repo(tmp_path: Path) -> Path:
+    """Create a fake repo with all stale-code sentinel files."""
+    for rel in _STALE_CODE_SENTINELS:
+        p = tmp_path / rel
+        p.parent.mkdir(parents=True, exist_ok=True)
+        p.write_text("# test sentinel\n")
+    return tmp_path
+
+
+def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
+    """Bare GatewayRunner with just the stale-check attributes set."""
+    runner = object.__new__(GatewayRunner)
+    runner._repo_root_for_staleness = repo_root
+    runner._boot_wall_time = boot_wall
+    runner._boot_repo_mtime = boot_mtime
+    runner._stale_code_notified = set()
+    runner._stale_code_restart_triggered = False
+    return runner
+
+
+def test_compute_repo_mtime_returns_newest(tmp_path):
+    """_compute_repo_mtime returns the newest mtime across sentinel files."""
+    repo = _make_tmp_repo(tmp_path)
+
+    # Stamp a baseline mtime across all sentinels
+    baseline = time.time() - 100
+    for rel in _STALE_CODE_SENTINELS:
+        os.utime(repo / rel, (baseline, baseline))
+
+    # Touch one file forward
+    newer = time.time()
+    os.utime(repo / "hermes_cli/config.py", (newer, newer))
+
+    result = _compute_repo_mtime(repo)
+    assert abs(result - newer) < 1.0  # within 1s (filesystem mtime resolution)
+
+
+def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
+    """Missing sentinel files return 0.0 (treated as 'can't tell' upstream)."""
+    # tmp_path has none of the sentinels
+    assert _compute_repo_mtime(tmp_path) == 0.0
+
+
+def test_compute_repo_mtime_partial_files_still_works(tmp_path):
+    """Partial sentinel presence still returns newest of the readable ones."""
+    (tmp_path / "hermes_cli").mkdir()
+    target = tmp_path / "hermes_cli" / "config.py"
+    target.write_text("# partial\n")
+    target_mtime = time.time() - 50
+    os.utime(target, (target_mtime, target_mtime))
+
+    result = _compute_repo_mtime(tmp_path)
+    assert abs(result - target_mtime) < 1.0
+
+
+def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
+    """No boot snapshot → can't tell → not stale (no restart loop)."""
+    repo = _make_tmp_repo(tmp_path)
+    runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0)
+    assert runner._detect_stale_code() is False
+
+
+def test_detect_stale_code_false_when_files_unchanged(tmp_path):
+    """Source files at boot mtime → not stale."""
+    repo = _make_tmp_repo(tmp_path)
+    # Freeze all sentinels to the same mtime
+    baseline = time.time() - 100
+    for rel in _STALE_CODE_SENTINELS:
+        os.utime(repo / rel, (baseline, baseline))
+
+    runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
+    assert runner._detect_stale_code() is False
+
+
+def test_detect_stale_code_true_after_update(tmp_path):
+    """Sentinel files newer than boot snapshot → stale."""
+    repo = _make_tmp_repo(tmp_path)
+    baseline = time.time() - 100
+    for rel in _STALE_CODE_SENTINELS:
+        os.utime(repo / rel, (baseline, baseline))
+
+    runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
+
+    # Simulate hermes update touching config.py
+    new_mtime = time.time()
+    os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime))
+
+    assert runner._detect_stale_code() is True
+
+
+def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
+    """2-second slack prevents false positives on coarse-mtime filesystems."""
+    repo = _make_tmp_repo(tmp_path)
+    baseline = time.time() - 100
+    for rel in _STALE_CODE_SENTINELS:
+        os.utime(repo / rel, (baseline, baseline))
+
+    runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
+
+    # Touch config.py 1s newer — within the 2s slack → not stale
+    os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0))
+    assert runner._detect_stale_code() is False
+
+    # Touch 5s newer → stale
+    os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0))
+    assert runner._detect_stale_code() is True
+
+
+def test_trigger_stale_code_restart_is_idempotent(tmp_path):
+    """Calling _trigger_stale_code_restart twice only requests restart once."""
+    repo = _make_tmp_repo(tmp_path)
+    runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)
+
+    calls = []
+
+    def fake_request_restart(*, detached=False, via_service=False):
+        calls.append((detached, via_service))
+        return True
+
+    runner.request_restart = fake_request_restart
+
+    runner._trigger_stale_code_restart()
+    runner._trigger_stale_code_restart()
+    runner._trigger_stale_code_restart()
+
+    assert len(calls) == 1
+    assert runner._stale_code_restart_triggered is True
+
+
+def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
+    """If request_restart raises, we swallow and mark as triggered anyway."""
+    repo = _make_tmp_repo(tmp_path)
+    runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)
+
+    def boom(*, detached=False, via_service=False):
+        raise RuntimeError("no event loop")
+
+    runner.request_restart = boom
+
+    # Should not raise
+    runner._trigger_stale_code_restart()
+
+    # Marked triggered so we don't retry on every subsequent message
+    assert runner._stale_code_restart_triggered is True
+
+
+def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
+    """If the repo root vanishes after boot, return False (don't loop)."""
+    repo = _make_tmp_repo(tmp_path)
+    baseline = time.time() - 100
+    for rel in _STALE_CODE_SENTINELS:
+        os.utime(repo / rel, (baseline, baseline))
+
+    runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
+
+    # Remove all sentinel files — _compute_repo_mtime returns 0.0
+    for rel in _STALE_CODE_SENTINELS:
+        (repo / rel).unlink(missing_ok=True)
+
+    assert runner._detect_stale_code() is False
+
+
+def test_class_level_defaults_prevent_uninitialized_access():
+    """Partial construction via object.__new__ must not crash _detect_stale_code."""
+    runner = object.__new__(GatewayRunner)
+    # Don't set any instance attrs — class-level defaults should kick in
+    runner._repo_root_for_staleness = Path(".")
+    # _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0)
+    assert runner._detect_stale_code() is False
+    # _stale_code_restart_triggered falls through to class default (False)
+    assert runner._stale_code_restart_triggered is False
+
+
+def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
+    """GatewayRunner.__init__ captures a usable stale-code baseline."""
+    # Stub out the heavy parts of __init__ we don't need.  We only want
+    # to prove the stale-code snapshot is captured before anything else.
+    from gateway import run as run_mod
+
+    calls = {}
+
+    def fake_compute(repo_root):
+        calls["repo_root"] = repo_root
+        return 1234567890.0
+
+    monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute)
+
+    # Build a runner without running the full __init__ — then manually
+    # exercise the stale-check init block that __init__ contains.
+    runner = object.__new__(GatewayRunner)
+    runner._boot_wall_time = time.time()
+    runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent
+    runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness)
+    runner._stale_code_notified = set()
+    runner._stale_code_restart_triggered = False
+
+    assert runner._boot_repo_mtime == 1234567890.0
+    assert calls["repo_root"] == runner._repo_root_for_staleness
+    assert runner._boot_wall_time > 0