fix(gateway): auto-restart when source files change out from under us (#17648) (#18409)

Long-running gateway processes that survive 'hermes update' keep
pre-update modules cached in sys.modules. When new tool files on
disk then try to 'from hermes_cli.config import cfg_get' (added in
PR #17304), the import resolves against the stale module object
and raises ImportError — hitting users on Matrix, Telegram, Feishu,
and other platforms.

Two defenses:

1. Gateway self-check (gateway/run.py). On __init__, snapshot the
   newest mtime across sentinel source files (hermes_cli/config.py,
   run_agent.py, gateway/run.py, etc.). On every inbound message,
   re-read those mtimes; if any is newer than boot time + 2s slack,
   request a graceful restart via the normal drain path and return
   a one-line ack to the user. Idempotent, works regardless of how
   the update happened (hermes update, manual git pull, installer).

2. Post-restart survivor sweep ('hermes update'). After the existing
   restart loop, sleep 3s, rescan for gateway PIDs we already tried
   to kill, and SIGKILL any survivors. The detached profile watchers
   and systemd then relaunch with fresh code instead of waiting out
   the 120s watcher timeout.

Closes #17648.
This commit is contained in:
Teknium 2026-05-01 09:50:08 -07:00 committed by GitHub
parent 77c0bc6b13
commit f99676e315
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 399 additions and 0 deletions

View file

@ -69,6 +69,46 @@ _PLATFORM_CONNECT_TIMEOUT_SECS_DEFAULT = 30.0
_AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60
# --- Stale-code self-check ------------------------------------------------
# Long-running gateway processes that survive an ``hermes update`` keep the
# old ``hermes_cli.config`` (and friends) cached in ``sys.modules``. When
# the updated tool files on disk then try to ``from hermes_cli.config
# import cfg_get`` (added in PR #17304), the import resolves against the
# already-loaded stale module object and raises ``ImportError`` — see
# Issue #17648. Rather than papering over the import failure site-by-site
# in every tool file, detect the stale state centrally and auto-restart
# so the gateway reloads with fresh code. The sentinel files below are
# the canonical repo-level markers that every update touches; if any is
# newer than the gateway's boot time, we know the running process is out
# of date.
_STALE_CODE_SENTINELS: tuple[str, ...] = (
"hermes_cli/config.py",
"hermes_cli/__init__.py",
"run_agent.py",
"gateway/run.py",
"pyproject.toml",
)
def _compute_repo_mtime(repo_root: Path) -> float:
"""Return the newest mtime across the stale-code sentinel files.
Missing files are ignored (they may not exist on older checkouts).
Returns 0.0 if no sentinel file is readable treat that as "can't
tell", which downstream callers interpret as "not stale" to avoid
false-positive restart loops.
"""
newest = 0.0
for rel in _STALE_CODE_SENTINELS:
try:
st = (repo_root / rel).stat()
except (OSError, FileNotFoundError):
continue
if st.st_mtime > newest:
newest = st.st_mtime
return newest
def _coerce_gateway_timestamp(value: Any) -> Optional[float]:
"""Best-effort conversion of stored gateway timestamps to epoch seconds.
@ -840,6 +880,12 @@ class GatewayRunner:
_stop_task: Optional[asyncio.Task] = None
_session_model_overrides: Dict[str, Dict[str, str]] = {}
_session_reasoning_overrides: Dict[str, Dict[str, Any]] = {}
# Stale-code self-check defaults (see _detect_stale_code()). Class-level
# so tests that construct GatewayRunner via ``object.__new__`` without
# running __init__ don't crash when _handle_message reads these.
_boot_wall_time: float = 0.0
_boot_repo_mtime: float = 0.0
_stale_code_restart_triggered: bool = False
def __init__(self, config: Optional[GatewayConfig] = None):
global _gateway_runner_ref
@ -848,6 +894,22 @@ class GatewayRunner:
self._warn_if_docker_media_delivery_is_risky()
_gateway_runner_ref = _weakref.ref(self)
# Boot-time snapshot used by the stale-code self-check. Captured
# before any work happens so post-update file writes are guaranteed
# to have newer mtimes. See _detect_stale_code() / Issue #17648.
try:
self._boot_wall_time: float = time.time()
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
self._boot_repo_mtime: float = _compute_repo_mtime(
self._repo_root_for_staleness,
)
except Exception:
self._boot_wall_time = 0.0
self._repo_root_for_staleness = Path(".")
self._boot_repo_mtime = 0.0
self._stale_code_notified: set[str] = set()
self._stale_code_restart_triggered: bool = False
# Load ephemeral config from config.yaml / env vars.
# Both are injected at API-call time only and never persisted.
self._prefill_messages = self._load_prefill_messages()
@ -2392,6 +2454,63 @@ class GatewayRunner:
task.add_done_callback(self._background_tasks.discard)
return True
def _detect_stale_code(self) -> bool:
"""Return True if source files on disk are newer than the running process.
A gateway that survives ``hermes update`` (manual SIGTERM never
escalated, systemd restart race, detached-process respawn failed,
etc.) keeps pre-update modules cached in ``sys.modules``. Later
imports of names added post-update e.g. ``cfg_get`` from PR
#17304 — raise ImportError against the stale module object (see
Issue #17648). Detecting this at the source — "the code on disk
is newer than me" — lets us auto-restart instead of serving
broken responses until the user notices and runs
``hermes gateway restart`` manually.
Returns False when the boot-time snapshot is unavailable or no
sentinel file is readable, to avoid false-positive restart loops
in unusual checkouts (sparse clones, read-only filesystems).
"""
if not self._boot_wall_time or not self._boot_repo_mtime:
return False
try:
current = _compute_repo_mtime(self._repo_root_for_staleness)
except Exception:
return False
if current <= 0.0:
return False
# 2-second slack guards against filesystems with coarse mtime
# resolution (FAT32, some NFS mounts). Real updates always move
# the newest-file mtime forward by minutes, so this doesn't hide
# genuine staleness.
return current > self._boot_repo_mtime + 2.0
def _trigger_stale_code_restart(self) -> None:
"""Idempotently kick off a graceful restart after stale-code detection.
Runs at most once per process. The restart request goes through
the normal drain path so in-flight agent turns finish before the
process exits; the service manager (systemd / launchd / detached
profile watcher) then respawns with fresh code. On manual
``hermes gateway run`` installs without a supervisor, the
process exits and the user must restart by hand but they get a
user-visible message telling them so.
"""
if self._stale_code_restart_triggered:
return
self._stale_code_restart_triggered = True
logger.warning(
"Stale-code self-check: source files newer than gateway boot "
"time (boot=%.0f, newest=%.0f) — requesting graceful restart. "
"See Issue #17648.",
self._boot_repo_mtime,
_compute_repo_mtime(self._repo_root_for_staleness),
)
try:
self.request_restart(detached=False, via_service=True)
except Exception as exc:
logger.error("Stale-code restart request failed: %s", exc)
async def start(self) -> bool:
"""
Start the gateway and all configured platform adapters.
@ -4190,6 +4309,27 @@ class GatewayRunner:
"""
source = event.source
# Stale-code self-check (Issue #17648). A gateway that survives
# ``hermes update`` keeps old modules cached in sys.modules; the
# first inbound message is our earliest safe chance to detect
# this and restart gracefully before we dispatch to the agent
# and hit ImportError on freshly-added names (e.g. cfg_get).
# Idempotent — runs the real check at most once per message, and
# request_restart() no-ops after the first call.
try:
if self._detect_stale_code():
self._trigger_stale_code_restart()
# Acknowledge to the user so they don't see a silent
# drop; the gateway will be back up in a moment via the
# service manager / profile-watcher respawn.
return (
"⟳ Gateway code was updated in the background — "
"restarting this gateway so your next message runs "
"on the new code. Please retry in a moment."
)
except Exception as _stale_exc:
logger.debug("Stale-code self-check failed: %s", _stale_exc)
# Internal events (e.g. background-process completion notifications)
# are system-generated and must skip user authorization.
is_internal = bool(getattr(event, "internal", False))

View file

@ -7548,6 +7548,42 @@ def _cmd_update_impl(args, gateway_mode: bool):
# No gateways were running — nothing to do
pass
# --- Post-restart survivor sweep -----------------------------
# Issue #17648: some gateways ignore SIGTERM (stuck drain,
# blocked I/O, PID dead but zombie). The detached profile
# watchers wait 120s for the old PID to exit — if it never
# does, no respawn happens and the user keeps hitting
# ImportError against a stale sys.modules. Give the
# graceful paths a brief window to complete, then SIGKILL
# any remaining pre-update PIDs so the watcher / service
# manager can relaunch with fresh code.
try:
_time.sleep(3.0)
_service_pids_after = _get_service_pids()
_surviving = find_gateway_pids(
exclude_pids=_service_pids_after, all_profiles=True,
)
# Scope to PIDs we already tried to kill during this
# update (killed_pids). Anything new is a gateway that
# started AFTER our restart attempt — respecting user
# intent, we don't kill those.
_stuck = [pid for pid in _surviving if pid in killed_pids]
if _stuck:
print()
print(
f"{len(_stuck)} gateway process(es) ignored SIGTERM — force-killing"
)
for pid in _stuck:
try:
os.kill(pid, _signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass
# Give the OS a beat to reap the processes so the
# watchers see them exit and respawn.
_time.sleep(1.5)
except Exception as _sweep_exc:
logger.debug("Post-restart survivor sweep failed: %s", _sweep_exc)
except Exception as e:
logger.debug("Gateway restart during update failed: %s", e)

View file

@ -0,0 +1,223 @@
"""Tests for the gateway stale-code self-check (Issue #17648).
A gateway that survives ``hermes update`` keeps pre-update modules cached
in ``sys.modules``. Later imports of names added post-update (e.g.
``cfg_get`` from PR #17304) raise ImportError against the stale module
object. The self-check in ``GatewayRunner._detect_stale_code()`` detects
this by comparing boot-time sentinel-file mtimes against current ones,
and ``_trigger_stale_code_restart()`` triggers a graceful restart.
"""
import os
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from gateway.run import (
GatewayRunner,
_compute_repo_mtime,
_STALE_CODE_SENTINELS,
)
def _make_tmp_repo(tmp_path: Path) -> Path:
"""Create a fake repo with all stale-code sentinel files."""
for rel in _STALE_CODE_SENTINELS:
p = tmp_path / rel
p.parent.mkdir(parents=True, exist_ok=True)
p.write_text("# test sentinel\n")
return tmp_path
def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float):
"""Bare GatewayRunner with just the stale-check attributes set."""
runner = object.__new__(GatewayRunner)
runner._repo_root_for_staleness = repo_root
runner._boot_wall_time = boot_wall
runner._boot_repo_mtime = boot_mtime
runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False
return runner
def test_compute_repo_mtime_returns_newest(tmp_path):
"""_compute_repo_mtime returns the newest mtime across sentinel files."""
repo = _make_tmp_repo(tmp_path)
# Stamp a baseline mtime across all sentinels
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
# Touch one file forward
newer = time.time()
os.utime(repo / "hermes_cli/config.py", (newer, newer))
result = _compute_repo_mtime(repo)
assert abs(result - newer) < 1.0 # within 1s (filesystem mtime resolution)
def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
"""Missing sentinel files return 0.0 (treated as 'can't tell' upstream)."""
# tmp_path has none of the sentinels
assert _compute_repo_mtime(tmp_path) == 0.0
def test_compute_repo_mtime_partial_files_still_works(tmp_path):
"""Partial sentinel presence still returns newest of the readable ones."""
(tmp_path / "hermes_cli").mkdir()
target = tmp_path / "hermes_cli" / "config.py"
target.write_text("# partial\n")
target_mtime = time.time() - 50
os.utime(target, (target_mtime, target_mtime))
result = _compute_repo_mtime(tmp_path)
assert abs(result - target_mtime) < 1.0
def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path):
"""No boot snapshot → can't tell → not stale (no restart loop)."""
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0)
assert runner._detect_stale_code() is False
def test_detect_stale_code_false_when_files_unchanged(tmp_path):
"""Source files at boot mtime → not stale."""
repo = _make_tmp_repo(tmp_path)
# Freeze all sentinels to the same mtime
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
assert runner._detect_stale_code() is False
def test_detect_stale_code_true_after_update(tmp_path):
"""Sentinel files newer than boot snapshot → stale."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Simulate hermes update touching config.py
new_mtime = time.time()
os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime))
assert runner._detect_stale_code() is True
def test_detect_stale_code_ignores_subsecond_drift(tmp_path):
"""2-second slack prevents false positives on coarse-mtime filesystems."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Touch config.py 1s newer — within the 2s slack → not stale
os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0))
assert runner._detect_stale_code() is False
# Touch 5s newer → stale
os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0))
assert runner._detect_stale_code() is True
def test_trigger_stale_code_restart_is_idempotent(tmp_path):
"""Calling _trigger_stale_code_restart twice only requests restart once."""
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)
calls = []
def fake_request_restart(*, detached=False, via_service=False):
calls.append((detached, via_service))
return True
runner.request_restart = fake_request_restart
runner._trigger_stale_code_restart()
runner._trigger_stale_code_restart()
runner._trigger_stale_code_restart()
assert len(calls) == 1
assert runner._stale_code_restart_triggered is True
def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
"""If request_restart raises, we swallow and mark as triggered anyway."""
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0)
def boom(*, detached=False, via_service=False):
raise RuntimeError("no event loop")
runner.request_restart = boom
# Should not raise
runner._trigger_stale_code_restart()
# Marked triggered so we don't retry on every subsequent message
assert runner._stale_code_restart_triggered is True
def test_detect_stale_code_handles_disappearing_repo_root(tmp_path):
"""If the repo root vanishes after boot, return False (don't loop)."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Remove all sentinel files — _compute_repo_mtime returns 0.0
for rel in _STALE_CODE_SENTINELS:
(repo / rel).unlink(missing_ok=True)
assert runner._detect_stale_code() is False
def test_class_level_defaults_prevent_uninitialized_access():
"""Partial construction via object.__new__ must not crash _detect_stale_code."""
runner = object.__new__(GatewayRunner)
# Don't set any instance attrs — class-level defaults should kick in
runner._repo_root_for_staleness = Path(".")
# _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0)
assert runner._detect_stale_code() is False
# _stale_code_restart_triggered falls through to class default (False)
assert runner._stale_code_restart_triggered is False
def test_init_captures_boot_snapshot(monkeypatch, tmp_path):
"""GatewayRunner.__init__ captures a usable stale-code baseline."""
# Stub out the heavy parts of __init__ we don't need. We only want
# to prove the stale-code snapshot is captured before anything else.
from gateway import run as run_mod
calls = {}
def fake_compute(repo_root):
calls["repo_root"] = repo_root
return 1234567890.0
monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute)
# Build a runner without running the full __init__ — then manually
# exercise the stale-check init block that __init__ contains.
runner = object.__new__(GatewayRunner)
runner._boot_wall_time = time.time()
runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent
runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness)
runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False
assert runner._boot_repo_mtime == 1234567890.0
assert calls["repo_root"] == runner._repo_root_for_staleness
assert runner._boot_wall_time > 0