diff --git a/gateway/run.py b/gateway/run.py index 5a2d0a1442..29dfa884ec 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -69,6 +69,46 @@ _PLATFORM_CONNECT_TIMEOUT_SECS_DEFAULT = 30.0 _AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60 +# --- Stale-code self-check ------------------------------------------------ +# Long-running gateway processes that survive an ``hermes update`` keep the +# old ``hermes_cli.config`` (and friends) cached in ``sys.modules``. When +# the updated tool files on disk then try to ``from hermes_cli.config +# import cfg_get`` (added in PR #17304), the import resolves against the +# already-loaded stale module object and raises ``ImportError`` — see +# Issue #17648. Rather than papering over the import failure site-by-site +# in every tool file, detect the stale state centrally and auto-restart +# so the gateway reloads with fresh code. The sentinel files below are +# the canonical repo-level markers that every update touches; if any is +# newer than the gateway's boot time, we know the running process is out +# of date. +_STALE_CODE_SENTINELS: tuple[str, ...] = ( + "hermes_cli/config.py", + "hermes_cli/__init__.py", + "run_agent.py", + "gateway/run.py", + "pyproject.toml", +) + + +def _compute_repo_mtime(repo_root: Path) -> float: + """Return the newest mtime across the stale-code sentinel files. + + Missing files are ignored (they may not exist on older checkouts). + Returns 0.0 if no sentinel file is readable — treat that as "can't + tell", which downstream callers interpret as "not stale" to avoid + false-positive restart loops. + """ + newest = 0.0 + for rel in _STALE_CODE_SENTINELS: + try: + st = (repo_root / rel).stat() + except (OSError, FileNotFoundError): + continue + if st.st_mtime > newest: + newest = st.st_mtime + return newest + + def _coerce_gateway_timestamp(value: Any) -> Optional[float]: """Best-effort conversion of stored gateway timestamps to epoch seconds. @@ -840,6 +880,12 @@ class GatewayRunner: _stop_task: Optional[asyncio.Task] = None _session_model_overrides: Dict[str, Dict[str, str]] = {} _session_reasoning_overrides: Dict[str, Dict[str, Any]] = {} + # Stale-code self-check defaults (see _detect_stale_code()). Class-level + # so tests that construct GatewayRunner via ``object.__new__`` without + # running __init__ don't crash when _handle_message reads these. + _boot_wall_time: float = 0.0 + _boot_repo_mtime: float = 0.0 + _stale_code_restart_triggered: bool = False def __init__(self, config: Optional[GatewayConfig] = None): global _gateway_runner_ref @@ -848,6 +894,22 @@ class GatewayRunner: self._warn_if_docker_media_delivery_is_risky() _gateway_runner_ref = _weakref.ref(self) + # Boot-time snapshot used by the stale-code self-check. Captured + # before any work happens so post-update file writes are guaranteed + # to have newer mtimes. See _detect_stale_code() / Issue #17648. + try: + self._boot_wall_time: float = time.time() + self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent + self._boot_repo_mtime: float = _compute_repo_mtime( + self._repo_root_for_staleness, + ) + except Exception: + self._boot_wall_time = 0.0 + self._repo_root_for_staleness = Path(".") + self._boot_repo_mtime = 0.0 + self._stale_code_notified: set[str] = set() + self._stale_code_restart_triggered: bool = False + # Load ephemeral config from config.yaml / env vars. # Both are injected at API-call time only and never persisted. self._prefill_messages = self._load_prefill_messages() @@ -2392,6 +2454,63 @@ class GatewayRunner: task.add_done_callback(self._background_tasks.discard) return True + def _detect_stale_code(self) -> bool: + """Return True if source files on disk are newer than the running process. + + A gateway that survives ``hermes update`` (manual SIGTERM never + escalated, systemd restart race, detached-process respawn failed, + etc.) keeps pre-update modules cached in ``sys.modules``. Later + imports of names added post-update — e.g. ``cfg_get`` from PR + #17304 — raise ImportError against the stale module object (see + Issue #17648). Detecting this at the source — "the code on disk + is newer than me" — lets us auto-restart instead of serving + broken responses until the user notices and runs + ``hermes gateway restart`` manually. + + Returns False when the boot-time snapshot is unavailable or no + sentinel file is readable, to avoid false-positive restart loops + in unusual checkouts (sparse clones, read-only filesystems). + """ + if not self._boot_wall_time or not self._boot_repo_mtime: + return False + try: + current = _compute_repo_mtime(self._repo_root_for_staleness) + except Exception: + return False + if current <= 0.0: + return False + # 2-second slack guards against filesystems with coarse mtime + # resolution (FAT32, some NFS mounts). Real updates always move + # the newest-file mtime forward by minutes, so this doesn't hide + # genuine staleness. + return current > self._boot_repo_mtime + 2.0 + + def _trigger_stale_code_restart(self) -> None: + """Idempotently kick off a graceful restart after stale-code detection. + + Runs at most once per process. The restart request goes through + the normal drain path so in-flight agent turns finish before the + process exits; the service manager (systemd / launchd / detached + profile watcher) then respawns with fresh code. On manual + ``hermes gateway run`` installs without a supervisor, the + process exits and the user must restart by hand — but they get a + user-visible message telling them so. + """ + if self._stale_code_restart_triggered: + return + self._stale_code_restart_triggered = True + logger.warning( + "Stale-code self-check: source files newer than gateway boot " + "time (boot=%.0f, newest=%.0f) — requesting graceful restart. " + "See Issue #17648.", + self._boot_repo_mtime, + _compute_repo_mtime(self._repo_root_for_staleness), + ) + try: + self.request_restart(detached=False, via_service=True) + except Exception as exc: + logger.error("Stale-code restart request failed: %s", exc) + async def start(self) -> bool: """ Start the gateway and all configured platform adapters. @@ -4190,6 +4309,27 @@ class GatewayRunner: """ source = event.source + # Stale-code self-check (Issue #17648). A gateway that survives + # ``hermes update`` keeps old modules cached in sys.modules; the + # first inbound message is our earliest safe chance to detect + # this and restart gracefully before we dispatch to the agent + # and hit ImportError on freshly-added names (e.g. cfg_get). + # Idempotent — runs the real check at most once per message, and + # request_restart() no-ops after the first call. + try: + if self._detect_stale_code(): + self._trigger_stale_code_restart() + # Acknowledge to the user so they don't see a silent + # drop; the gateway will be back up in a moment via the + # service manager / profile-watcher respawn. + return ( + "⟳ Gateway code was updated in the background — " + "restarting this gateway so your next message runs " + "on the new code. Please retry in a moment." + ) + except Exception as _stale_exc: + logger.debug("Stale-code self-check failed: %s", _stale_exc) + # Internal events (e.g. background-process completion notifications) # are system-generated and must skip user authorization. is_internal = bool(getattr(event, "internal", False)) diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 92e932dab6..856d85c636 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -7548,6 +7548,42 @@ def _cmd_update_impl(args, gateway_mode: bool): # No gateways were running — nothing to do pass + # --- Post-restart survivor sweep ----------------------------- + # Issue #17648: some gateways ignore SIGTERM (stuck drain, + # blocked I/O, PID dead but zombie). The detached profile + # watchers wait 120s for the old PID to exit — if it never + # does, no respawn happens and the user keeps hitting + # ImportError against a stale sys.modules. Give the + # graceful paths a brief window to complete, then SIGKILL + # any remaining pre-update PIDs so the watcher / service + # manager can relaunch with fresh code. + try: + _time.sleep(3.0) + _service_pids_after = _get_service_pids() + _surviving = find_gateway_pids( + exclude_pids=_service_pids_after, all_profiles=True, + ) + # Scope to PIDs we already tried to kill during this + # update (killed_pids). Anything new is a gateway that + # started AFTER our restart attempt — respecting user + # intent, we don't kill those. + _stuck = [pid for pid in _surviving if pid in killed_pids] + if _stuck: + print() + print( + f" ⚠ {len(_stuck)} gateway process(es) ignored SIGTERM — force-killing" + ) + for pid in _stuck: + try: + os.kill(pid, _signal.SIGKILL) + except (ProcessLookupError, PermissionError): + pass + # Give the OS a beat to reap the processes so the + # watchers see them exit and respawn. + _time.sleep(1.5) + except Exception as _sweep_exc: + logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc) + except Exception as e: logger.debug("Gateway restart during update failed: %s", e) diff --git a/tests/gateway/test_stale_code_self_check.py b/tests/gateway/test_stale_code_self_check.py new file mode 100644 index 0000000000..5289f575d4 --- /dev/null +++ b/tests/gateway/test_stale_code_self_check.py @@ -0,0 +1,223 @@ +"""Tests for the gateway stale-code self-check (Issue #17648). + +A gateway that survives ``hermes update`` keeps pre-update modules cached +in ``sys.modules``. Later imports of names added post-update (e.g. +``cfg_get`` from PR #17304) raise ImportError against the stale module +object. The self-check in ``GatewayRunner._detect_stale_code()`` detects +this by comparing boot-time sentinel-file mtimes against current ones, +and ``_trigger_stale_code_restart()`` triggers a graceful restart. +""" + +import os +import time +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from gateway.run import ( + GatewayRunner, + _compute_repo_mtime, + _STALE_CODE_SENTINELS, +) + + +def _make_tmp_repo(tmp_path: Path) -> Path: + """Create a fake repo with all stale-code sentinel files.""" + for rel in _STALE_CODE_SENTINELS: + p = tmp_path / rel + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text("# test sentinel\n") + return tmp_path + + +def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float): + """Bare GatewayRunner with just the stale-check attributes set.""" + runner = object.__new__(GatewayRunner) + runner._repo_root_for_staleness = repo_root + runner._boot_wall_time = boot_wall + runner._boot_repo_mtime = boot_mtime + runner._stale_code_notified = set() + runner._stale_code_restart_triggered = False + return runner + + +def test_compute_repo_mtime_returns_newest(tmp_path): + """_compute_repo_mtime returns the newest mtime across sentinel files.""" + repo = _make_tmp_repo(tmp_path) + + # Stamp a baseline mtime across all sentinels + baseline = time.time() - 100 + for rel in _STALE_CODE_SENTINELS: + os.utime(repo / rel, (baseline, baseline)) + + # Touch one file forward + newer = time.time() + os.utime(repo / "hermes_cli/config.py", (newer, newer)) + + result = _compute_repo_mtime(repo) + assert abs(result - newer) < 1.0 # within 1s (filesystem mtime resolution) + + +def test_compute_repo_mtime_missing_files_returns_zero(tmp_path): + """Missing sentinel files return 0.0 (treated as 'can't tell' upstream).""" + # tmp_path has none of the sentinels + assert _compute_repo_mtime(tmp_path) == 0.0 + + +def test_compute_repo_mtime_partial_files_still_works(tmp_path): + """Partial sentinel presence still returns newest of the readable ones.""" + (tmp_path / "hermes_cli").mkdir() + target = tmp_path / "hermes_cli" / "config.py" + target.write_text("# partial\n") + target_mtime = time.time() - 50 + os.utime(target, (target_mtime, target_mtime)) + + result = _compute_repo_mtime(tmp_path) + assert abs(result - target_mtime) < 1.0 + + +def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path): + """No boot snapshot → can't tell → not stale (no restart loop).""" + repo = _make_tmp_repo(tmp_path) + runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0) + assert runner._detect_stale_code() is False + + +def test_detect_stale_code_false_when_files_unchanged(tmp_path): + """Source files at boot mtime → not stale.""" + repo = _make_tmp_repo(tmp_path) + # Freeze all sentinels to the same mtime + baseline = time.time() - 100 + for rel in _STALE_CODE_SENTINELS: + os.utime(repo / rel, (baseline, baseline)) + + runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) + assert runner._detect_stale_code() is False + + +def test_detect_stale_code_true_after_update(tmp_path): + """Sentinel files newer than boot snapshot → stale.""" + repo = _make_tmp_repo(tmp_path) + baseline = time.time() - 100 + for rel in _STALE_CODE_SENTINELS: + os.utime(repo / rel, (baseline, baseline)) + + runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) + + # Simulate hermes update touching config.py + new_mtime = time.time() + os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime)) + + assert runner._detect_stale_code() is True + + +def test_detect_stale_code_ignores_subsecond_drift(tmp_path): + """2-second slack prevents false positives on coarse-mtime filesystems.""" + repo = _make_tmp_repo(tmp_path) + baseline = time.time() - 100 + for rel in _STALE_CODE_SENTINELS: + os.utime(repo / rel, (baseline, baseline)) + + runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) + + # Touch config.py 1s newer — within the 2s slack → not stale + os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0)) + assert runner._detect_stale_code() is False + + # Touch 5s newer → stale + os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0)) + assert runner._detect_stale_code() is True + + +def test_trigger_stale_code_restart_is_idempotent(tmp_path): + """Calling _trigger_stale_code_restart twice only requests restart once.""" + repo = _make_tmp_repo(tmp_path) + runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0) + + calls = [] + + def fake_request_restart(*, detached=False, via_service=False): + calls.append((detached, via_service)) + return True + + runner.request_restart = fake_request_restart + + runner._trigger_stale_code_restart() + runner._trigger_stale_code_restart() + runner._trigger_stale_code_restart() + + assert len(calls) == 1 + assert runner._stale_code_restart_triggered is True + + +def test_trigger_stale_code_restart_survives_request_failure(tmp_path): + """If request_restart raises, we swallow and mark as triggered anyway.""" + repo = _make_tmp_repo(tmp_path) + runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0) + + def boom(*, detached=False, via_service=False): + raise RuntimeError("no event loop") + + runner.request_restart = boom + + # Should not raise + runner._trigger_stale_code_restart() + + # Marked triggered so we don't retry on every subsequent message + assert runner._stale_code_restart_triggered is True + + +def test_detect_stale_code_handles_disappearing_repo_root(tmp_path): + """If the repo root vanishes after boot, return False (don't loop).""" + repo = _make_tmp_repo(tmp_path) + baseline = time.time() - 100 + for rel in _STALE_CODE_SENTINELS: + os.utime(repo / rel, (baseline, baseline)) + + runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) + + # Remove all sentinel files — _compute_repo_mtime returns 0.0 + for rel in _STALE_CODE_SENTINELS: + (repo / rel).unlink(missing_ok=True) + + assert runner._detect_stale_code() is False + + +def test_class_level_defaults_prevent_uninitialized_access(): + """Partial construction via object.__new__ must not crash _detect_stale_code.""" + runner = object.__new__(GatewayRunner) + # Don't set any instance attrs — class-level defaults should kick in + runner._repo_root_for_staleness = Path(".") + # _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0) + assert runner._detect_stale_code() is False + # _stale_code_restart_triggered falls through to class default (False) + assert runner._stale_code_restart_triggered is False + + +def test_init_captures_boot_snapshot(monkeypatch, tmp_path): + """GatewayRunner.__init__ captures a usable stale-code baseline.""" + # Stub out the heavy parts of __init__ we don't need. We only want + # to prove the stale-code snapshot is captured before anything else. + from gateway import run as run_mod + + calls = {} + + def fake_compute(repo_root): + calls["repo_root"] = repo_root + return 1234567890.0 + + monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute) + + # Build a runner without running the full __init__ — then manually + # exercise the stale-check init block that __init__ contains. + runner = object.__new__(GatewayRunner) + runner._boot_wall_time = time.time() + runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent + runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness) + runner._stale_code_notified = set() + runner._stale_code_restart_triggered = False + + assert runner._boot_repo_mtime == 1234567890.0 + assert calls["repo_root"] == runner._repo_root_for_staleness + assert runner._boot_wall_time > 0