fix(gateway): use git HEAD SHA, not file mtimes, for stale-code check (#19740)

The stale-code self-check (Issue #17648) used sentinel-file mtimes to
decide whether the gateway survived a `hermes update` with stale
`sys.modules`. That signal false-positives on any write to the
sentinel files — including agent-driven edits during Hermes-on-Hermes
dev sessions. Telling the agent to patch `run_agent.py` would flip
the check to True on the next user message and force a gateway
restart even though no update happened.

Switch the signal to `git rev-parse HEAD`. Agent file edits don't
move HEAD; `hermes update` (git pull) always does. Reading .git/HEAD
directly (no subprocess) with a 5s cache keeps the overhead negligible
on bursty chats. Non-git installs short-circuit to False — the
stale-modules class can't occur without a git-backed update path, so
there's nothing to detect.

The legacy `_compute_repo_mtime` helper is kept but unused by
detection, reserved as a fallback hook for future pip-install update
paths.

- _read_git_head_sha(): resolves HEAD across main checkout, worktree
  (follows `gitdir:` + `commondir` pointers), and packed-refs layouts.
- _current_git_sha_cached(): per-runner 5s SHA cache.
- _detect_stale_code(): boot SHA vs current SHA, returns False when
  either is unavailable.
- Tests cover all four layouts, the agent-edits-don't-trigger
  regression, and cache behavior.

Refs #17648.
This commit is contained in:
Teknium 2026-05-04 12:33:21 -07:00 committed by GitHub
parent a21f364ad7
commit d90f73bcec
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 482 additions and 139 deletions

View file

@ -101,10 +101,21 @@ _AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60
# already-loaded stale module object and raises ``ImportError`` — see # already-loaded stale module object and raises ``ImportError`` — see
# Issue #17648. Rather than papering over the import failure site-by-site # Issue #17648. Rather than papering over the import failure site-by-site
# in every tool file, detect the stale state centrally and auto-restart # in every tool file, detect the stale state centrally and auto-restart
# so the gateway reloads with fresh code. The sentinel files below are # so the gateway reloads with fresh code.
# the canonical repo-level markers that every update touches; if any is #
# newer than the gateway's boot time, we know the running process is out # The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
# of date. # update`` moves that is NOT moved by agent-driven file edits. Earlier
# revisions of this check compared file mtimes across a sentinel set
# (run_agent.py, gateway/run.py, ...), but that produced false positives
# whenever the agent edited its own source files during a session:
# mtime jumps, stale-check fires, gateway restarts, user must retype.
# See the discussion on PR #19740 for the motivating incident.
#
# The legacy mtime sentinels (and ``_compute_repo_mtime``) are no longer
# consulted by the stale-code check; they are kept only as a hook for a
# possible future non-git update path. On non-git installs (pip install
# from wheel, sparse clones with no .git dir) ``hermes update`` is not a
# supported path, so the check is a no-op — which is the safe behavior:
# better to ship one broken import than to restart on every agent-edit.
_STALE_CODE_SENTINELS: tuple[str, ...] = ( _STALE_CODE_SENTINELS: tuple[str, ...] = (
"hermes_cli/config.py", "hermes_cli/config.py",
"hermes_cli/__init__.py", "hermes_cli/__init__.py",
@ -113,10 +124,106 @@ _STALE_CODE_SENTINELS: tuple[str, ...] = (
"pyproject.toml", "pyproject.toml",
) )
# Cache git HEAD reads across consecutive messages so a chat burst doesn't
# re-read ``.git/HEAD`` (and the ref it points at) from disk on every
# message. 5s is long enough to collapse a burst and short enough that the
# real post-update detection still fires within the user's perceived
# "next message" window.
_GIT_SHA_CACHE_TTL_SECS = 5.0
def _lookup_git_ref(ref_rel: str, git_dir: Path, common_dir: Path) -> Optional[str]:
    """Return the stored value of ``ref_rel``, or None if it cannot be found.

    Loose ref files are tried first (under ``git_dir``, then under
    ``common_dir`` when the two differ), followed by
    ``<common_dir>/packed-refs``. The value is returned exactly as stored
    (stripped), so it may itself be a ``ref: ...`` pointer that the caller
    has to follow.
    """
    bases = (git_dir,) if git_dir == common_dir else (git_dir, common_dir)
    for base in bases:
        ref_path = base / ref_rel
        if not ref_path.is_file():
            continue
        try:
            value = ref_path.read_text(encoding="utf-8").strip()
        except OSError:
            # Unreadable here; the other base or packed-refs may still work.
            continue
        if value:
            return value

    # Packed refs fallback — always stored in the common dir.
    packed = common_dir / "packed-refs"
    if not packed.is_file():
        return None
    try:
        packed_lines = packed.read_text(encoding="utf-8").splitlines()
    except OSError:
        return None
    for line in packed_lines:
        line = line.strip()
        # Skip blanks, the "# pack-refs with: ..." header, and "^<sha>" lines.
        if not line or line.startswith(("#", "^")):
            continue
        parts = line.split(None, 1)
        if len(parts) == 2 and parts[1] == ref_rel:
            return parts[0]
    return None


def _read_git_head_sha(repo_root: Path) -> Optional[str]:
    """Return the git HEAD SHA for ``repo_root``, or None if unavailable.

    Reads ``.git/HEAD`` directly (following symbolic refs) instead of
    shelling out to ``git`` — cheaper, no subprocess tax, and it works on
    gateway hosts that don't have a ``git`` binary on PATH. Returns None
    for non-git installs (no ``.git`` dir) or on any I/O or decoding
    error; callers treat None as "can't tell" and skip the check.

    Supports the three layouts we care about:

    1. Main checkout: ``<repo>/.git/`` is a directory.
    2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
       points at ``<main>/.git/worktrees/<name>/``. The worktree's gitdir
       has HEAD + index but NOT refs/heads/ — those live in the main
       checkout, and ``<worktree-gitdir>/commondir`` points at the main
       ``.git``. Both locations are searched for refs.
    3. Packed refs: ``refs/heads/<branch>`` is absent on disk but listed
       in ``<main-git-dir>/packed-refs``.

    A ref whose stored value is itself ``ref: <other>`` is followed too,
    up to a small fixed number of hops, so a cyclic chain yields None
    instead of looping forever.
    """
    max_symref_hops = 5
    try:
        git_dir = repo_root / ".git"
        # Worktrees store ``.git`` as a file containing ``gitdir: <path>``.
        if git_dir.is_file():
            pointer = git_dir.read_text(encoding="utf-8").strip()
            if pointer.startswith("gitdir:"):
                git_dir = Path(pointer.split(":", 1)[1].strip())
                if not git_dir.is_absolute():
                    git_dir = (repo_root / git_dir).resolve()
        # Also rejects a ``.git`` file that carried no ``gitdir:`` pointer.
        if not git_dir.is_dir():
            return None

        # The "common" git dir owns the shared refs. For a worktree,
        # ``commondir`` points at it (a relative path is resolved against
        # git_dir). For a main checkout, common_dir == git_dir.
        common_dir = git_dir
        commondir_file = git_dir / "commondir"
        if commondir_file.is_file():
            try:
                rel = commondir_file.read_text(encoding="utf-8").strip()
                candidate = (git_dir / rel).resolve() if rel else git_dir
                if candidate.is_dir():
                    common_dir = candidate
            except OSError:
                # Best effort: fall back to treating git_dir as the common dir.
                pass

        head_path = git_dir / "HEAD"
        if not head_path.is_file():
            return None

        value = head_path.read_text(encoding="utf-8").strip()
        hops = 0
        while value.startswith("ref:"):
            if hops == max_symref_hops:
                return None
            hops += 1
            ref_rel = value.split(":", 1)[1].strip()
            found = _lookup_git_ref(ref_rel, git_dir, common_dir)
            if found is None:
                return None
            value = found
        # Detached HEAD, or the end of a symref chain: the SHA itself.
        return value or None
    except Exception:
        # Deliberately broad: this is a best-effort probe and must never
        # take the gateway down; any failure means "can't tell".
        return None
def _compute_repo_mtime(repo_root: Path) -> float: def _compute_repo_mtime(repo_root: Path) -> float:
"""Return the newest mtime across the stale-code sentinel files. """Return the newest mtime across the stale-code sentinel files.
Legacy helper: stale-code detection now compares git HEAD SHAs and no
longer calls this; it is kept as a hook for a possible non-git update path.
Missing files are ignored (they may not exist on older checkouts). Missing files are ignored (they may not exist on older checkouts).
Returns 0.0 if no sentinel file is readable treat that as "can't Returns 0.0 if no sentinel file is readable treat that as "can't
tell", which downstream callers interpret as "not stale" to avoid tell", which downstream callers interpret as "not stale" to avoid
@ -1005,6 +1112,7 @@ class GatewayRunner:
# running __init__ don't crash when _handle_message reads these. # running __init__ don't crash when _handle_message reads these.
_boot_wall_time: float = 0.0 _boot_wall_time: float = 0.0
_boot_repo_mtime: float = 0.0 _boot_repo_mtime: float = 0.0
_boot_git_sha: Optional[str] = None
_stale_code_restart_triggered: bool = False _stale_code_restart_triggered: bool = False
def __init__(self, config: Optional[GatewayConfig] = None): def __init__(self, config: Optional[GatewayConfig] = None):
@ -1020,15 +1128,23 @@ class GatewayRunner:
try: try:
self._boot_wall_time: float = time.time() self._boot_wall_time: float = time.time()
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
self._boot_git_sha: Optional[str] = _read_git_head_sha(
self._repo_root_for_staleness,
)
self._boot_repo_mtime: float = _compute_repo_mtime( self._boot_repo_mtime: float = _compute_repo_mtime(
self._repo_root_for_staleness, self._repo_root_for_staleness,
) )
except Exception: except Exception:
self._boot_wall_time = 0.0 self._boot_wall_time = 0.0
self._repo_root_for_staleness = Path(".") self._repo_root_for_staleness = Path(".")
self._boot_git_sha = None
self._boot_repo_mtime = 0.0 self._boot_repo_mtime = 0.0
self._stale_code_notified: set[str] = set() self._stale_code_notified: set[str] = set()
self._stale_code_restart_triggered: bool = False self._stale_code_restart_triggered: bool = False
# Cached current-SHA read, refreshed at most every
# _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
self._cached_current_sha: Optional[str] = self._boot_git_sha
self._cached_current_sha_at: float = self._boot_wall_time
# Load ephemeral config from config.yaml / env vars. # Load ephemeral config from config.yaml / env vars.
# Both are injected at API-call time only and never persisted. # Both are injected at API-call time only and never persisted.
@ -2737,36 +2853,69 @@ class GatewayRunner:
task.add_done_callback(self._background_tasks.discard) task.add_done_callback(self._background_tasks.discard)
return True return True
def _current_git_sha_cached(self) -> Optional[str]:
    """Return the repo's current HEAD SHA, reusing a recent read when possible.

    A non-None cached SHA younger than ``_GIT_SHA_CACHE_TTL_SECS`` is
    returned as-is, so a burst of messages does not re-read ``.git/HEAD``
    once per message. Otherwise the SHA is read again via
    ``_read_git_head_sha`` and both the cached value and its timestamp are
    refreshed. A failed read stores and returns None; because None is
    never served from the cache, the next call retries the read.
    """
    checked_at = time.time()
    cached = self._cached_current_sha
    if cached is not None:
        age = checked_at - self._cached_current_sha_at
        if age < _GIT_SHA_CACHE_TTL_SECS:
            return cached
    try:
        fresh = _read_git_head_sha(self._repo_root_for_staleness)
    except Exception:
        fresh = None
    self._cached_current_sha = fresh
    self._cached_current_sha_at = checked_at
    return fresh
def _detect_stale_code(self) -> bool: def _detect_stale_code(self) -> bool:
"""Return True if source files on disk are newer than the running process. """Return True if the git HEAD moved since this process booted.
A gateway that survives ``hermes update`` (manual SIGTERM never A gateway that survives ``hermes update`` (manual SIGTERM never
escalated, systemd restart race, detached-process respawn failed, escalated, systemd restart race, detached-process respawn failed,
etc.) keeps pre-update modules cached in ``sys.modules``. Later etc.) keeps pre-update modules cached in ``sys.modules``. Later
imports of names added post-update e.g. ``cfg_get`` from PR imports of names added post-update e.g. ``cfg_get`` from PR
#17304 — raise ImportError against the stale module object (see #17304 — raise ImportError against the stale module object (see
Issue #17648). Detecting this at the source — "the code on disk Issue #17648).
is newer than me" — lets us auto-restart instead of serving
broken responses until the user notices and runs
``hermes gateway restart`` manually.
Returns False when the boot-time snapshot is unavailable or no We compare the git HEAD SHA at boot to the current SHA on disk.
sentinel file is readable, to avoid false-positive restart loops ``hermes update`` always moves HEAD forward via ``git pull``;
in unusual checkouts (sparse clones, read-only filesystems). agent file edits (the agent patching ``run_agent.py`` or
``gateway/run.py`` during a self-dev session) never move HEAD.
That makes SHA comparison free of the false-positive class that
the old mtime check suffered from — the agent can edit any file
without triggering a phantom restart.
Returns False when:
- the boot SHA is unavailable (non-git install, first call
during partial init, etc.); we can't tell and refuse to loop
- the current SHA matches the boot SHA
- reading the current SHA fails for any reason
""" """
if not self._boot_wall_time or not self._boot_repo_mtime: if not self._boot_wall_time:
return False
if not self._boot_git_sha:
# Non-git install. ``hermes update`` is git-based, so a
# non-git install can't experience the stale-modules class
# this check exists to catch. Return False — no check, no
# false positives. (If we ever ship a pip-install update
# path, we'd add a persistent update marker here and compare
# its timestamp to self._boot_wall_time.)
return False return False
try: try:
current = _compute_repo_mtime(self._repo_root_for_staleness) current = self._current_git_sha_cached()
except Exception: except Exception:
return False return False
if current <= 0.0: if not current:
return False return False
# 2-second slack guards against filesystems with coarse mtime return current != self._boot_git_sha
# resolution (FAT32, some NFS mounts). Real updates always move
# the newest-file mtime forward by minutes, so this doesn't hide
# genuine staleness.
return current > self._boot_repo_mtime + 2.0
def _trigger_stale_code_restart(self) -> None: def _trigger_stale_code_restart(self) -> None:
"""Idempotently kick off a graceful restart after stale-code detection. """Idempotently kick off a graceful restart after stale-code detection.
@ -2782,12 +2931,17 @@ class GatewayRunner:
if self._stale_code_restart_triggered: if self._stale_code_restart_triggered:
return return
self._stale_code_restart_triggered = True self._stale_code_restart_triggered = True
current_sha = None
try:
current_sha = self._current_git_sha_cached()
except Exception:
pass
logger.warning( logger.warning(
"Stale-code self-check: source files newer than gateway boot " "Stale-code self-check: git HEAD moved since gateway boot "
"time (boot=%.0f, newest=%.0f) — requesting graceful restart. " "(boot=%s, current=%s) — requesting graceful restart. "
"See Issue #17648.", "See Issue #17648.",
self._boot_repo_mtime, (self._boot_git_sha or "?")[:12],
_compute_repo_mtime(self._repo_root_for_staleness), (current_sha or "?")[:12],
) )
try: try:
self.request_restart(detached=False, via_service=True) self.request_restart(detached=False, via_service=True)

View file

@ -3,25 +3,34 @@
A gateway that survives ``hermes update`` keeps pre-update modules cached A gateway that survives ``hermes update`` keeps pre-update modules cached
in ``sys.modules``. Later imports of names added post-update (e.g. in ``sys.modules``. Later imports of names added post-update (e.g.
``cfg_get`` from PR #17304) raise ImportError against the stale module ``cfg_get`` from PR #17304) raise ImportError against the stale module
object. The self-check in ``GatewayRunner._detect_stale_code()`` detects object.
this by comparing boot-time sentinel-file mtimes against current ones,
and ``_trigger_stale_code_restart()`` triggers a graceful restart. The self-check compares the git HEAD SHA at boot to the current SHA on
disk. ``hermes update`` always moves HEAD forward via ``git pull``;
agent-driven file edits (Hermes editing ``run_agent.py`` / ``gateway/run.py``
during a self-dev session) never move HEAD — so the SHA signal is free of
the false-positive class that the earlier mtime-based check suffered from.
""" """
import os import os
import time import time
from pathlib import Path from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest import pytest
from gateway.run import ( from gateway.run import (
GatewayRunner, GatewayRunner,
_compute_repo_mtime, _compute_repo_mtime,
_read_git_head_sha,
_STALE_CODE_SENTINELS, _STALE_CODE_SENTINELS,
_GIT_SHA_CACHE_TTL_SECS,
) )
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_tmp_repo(tmp_path: Path) -> Path: def _make_tmp_repo(tmp_path: Path) -> Path:
"""Create a fake repo with all stale-code sentinel files.""" """Create a fake repo with all stale-code sentinel files."""
for rel in _STALE_CODE_SENTINELS: for rel in _STALE_CODE_SENTINELS:
@ -31,109 +40,303 @@ def _make_tmp_repo(tmp_path: Path) -> Path:
return tmp_path return tmp_path
def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float): def _make_git_repo(tmp_path: Path, sha: str = "a" * 40, branch: str = "main") -> Path:
"""Stamp a minimal .git directory so _read_git_head_sha can resolve a SHA.
We don't run real git — just lay down the files the reader walks
(.git/HEAD pointing at refs/heads/<branch>, refs/heads/<branch>
containing the SHA).
"""
git_dir = tmp_path / ".git"
git_dir.mkdir(parents=True, exist_ok=True)
(git_dir / "HEAD").write_text(f"ref: refs/heads/{branch}\n")
refs_dir = git_dir / "refs" / "heads"
refs_dir.mkdir(parents=True, exist_ok=True)
(refs_dir / branch).write_text(f"{sha}\n")
return tmp_path
def _set_head_sha(repo_root: Path, sha: str, branch: str = "main") -> None:
"""Rewrite the current branch ref to a new SHA (simulates git pull)."""
(repo_root / ".git" / "refs" / "heads" / branch).write_text(f"{sha}\n")
def _make_runner(
repo_root: Path,
*,
boot_sha: str | None,
boot_wall: float = None,
boot_mtime: float = 0.0,
):
"""Bare GatewayRunner with just the stale-check attributes set.""" """Bare GatewayRunner with just the stale-check attributes set."""
if boot_wall is None:
boot_wall = time.time()
runner = object.__new__(GatewayRunner) runner = object.__new__(GatewayRunner)
runner._repo_root_for_staleness = repo_root runner._repo_root_for_staleness = repo_root
runner._boot_wall_time = boot_wall runner._boot_wall_time = boot_wall
runner._boot_git_sha = boot_sha
runner._boot_repo_mtime = boot_mtime runner._boot_repo_mtime = boot_mtime
runner._stale_code_notified = set() runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False runner._stale_code_restart_triggered = False
runner._cached_current_sha = boot_sha
runner._cached_current_sha_at = boot_wall
return runner return runner
def test_compute_repo_mtime_returns_newest(tmp_path): # ---------------------------------------------------------------------------
"""_compute_repo_mtime returns the newest mtime across sentinel files.""" # _read_git_head_sha — raw SHA reader
repo = _make_tmp_repo(tmp_path) # ---------------------------------------------------------------------------
# Stamp a baseline mtime across all sentinels def test_read_git_head_sha_branch_ref(tmp_path):
baseline = time.time() - 100 """Resolves ref: refs/heads/<branch> → SHA from refs/heads/<branch>."""
for rel in _STALE_CODE_SENTINELS: sha = "b" * 40
os.utime(repo / rel, (baseline, baseline)) _make_git_repo(tmp_path, sha=sha, branch="main")
assert _read_git_head_sha(tmp_path) == sha
# Touch one file forward
newer = time.time()
os.utime(repo / "hermes_cli/config.py", (newer, newer))
result = _compute_repo_mtime(repo)
assert abs(result - newer) < 1.0 # within 1s (filesystem mtime resolution)
def test_compute_repo_mtime_missing_files_returns_zero(tmp_path): def test_read_git_head_sha_detached_head(tmp_path):
"""Missing sentinel files return 0.0 (treated as 'can't tell' upstream).""" """Detached HEAD: .git/HEAD contains the SHA directly."""
# tmp_path has none of the sentinels sha = "c" * 40
assert _compute_repo_mtime(tmp_path) == 0.0 git_dir = tmp_path / ".git"
git_dir.mkdir()
(git_dir / "HEAD").write_text(f"{sha}\n")
assert _read_git_head_sha(tmp_path) == sha
def test_compute_repo_mtime_partial_files_still_works(tmp_path): def test_read_git_head_sha_packed_refs(tmp_path):
"""Partial sentinel presence still returns newest of the readable ones.""" """Falls back to packed-refs when refs/heads/<branch> is missing."""
(tmp_path / "hermes_cli").mkdir() sha = "d" * 40
target = tmp_path / "hermes_cli" / "config.py" git_dir = tmp_path / ".git"
target.write_text("# partial\n") git_dir.mkdir()
target_mtime = time.time() - 50 (git_dir / "HEAD").write_text("ref: refs/heads/main\n")
os.utime(target, (target_mtime, target_mtime)) # No refs/heads/main file — only packed-refs
(git_dir / "packed-refs").write_text(
result = _compute_repo_mtime(tmp_path) f"# pack-refs with: peeled fully-peeled sorted\n"
assert abs(result - target_mtime) < 1.0 f"{sha} refs/heads/main\n"
)
assert _read_git_head_sha(tmp_path) == sha
def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path): def test_read_git_head_sha_worktree_gitdir_file(tmp_path):
"""No boot snapshot → can't tell → not stale (no restart loop).""" """Worktree: .git is a file with `gitdir: <path>` pointing to the real git dir.
repo = _make_tmp_repo(tmp_path)
runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0) Real git worktrees store shared refs (refs/heads/*) in the main
checkout's .git/ and write a ``commondir`` pointer into the
worktree-gitdir. The reader must follow commondir to resolve the
branch ref — this is the layout Hermes dev sessions actually use.
"""
sha = "e" * 40
# Main repo layout
main_repo = tmp_path / "main-repo"
main_git = main_repo / ".git"
(main_git / "refs" / "heads").mkdir(parents=True)
(main_git / "HEAD").write_text("ref: refs/heads/main\n")
(main_git / "refs" / "heads" / "main").write_text("0" * 40 + "\n")
# Worktree lives in main-repo/.git/worktrees/<name>/
worktree_git_dir = main_git / "worktrees" / "feature"
worktree_git_dir.mkdir(parents=True)
(worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n")
# commondir points back at the main .git (relative path, "../..")
(worktree_git_dir / "commondir").write_text("../..\n")
# Feature branch ref lives in the shared refs/heads
(main_git / "refs" / "heads" / "feature").write_text(f"{sha}\n")
# Worktree checkout with .git file pointing at worktree_git_dir
worktree = tmp_path / "wt"
worktree.mkdir()
(worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n")
assert _read_git_head_sha(worktree) == sha
def test_read_git_head_sha_worktree_packed_refs_in_common(tmp_path):
    """A worktree branch listed only in the main repo's packed-refs resolves."""
    expected = "f" * 40

    common_git = tmp_path / "main-repo" / ".git"
    common_git.mkdir(parents=True)
    (common_git / "HEAD").write_text("ref: refs/heads/main\n")
    # No loose refs/heads/feature file: the branch exists only in the
    # packed-refs file of the common (main) git dir.
    packed_body = (
        "# pack-refs with: peeled fully-peeled sorted\n"
        + f"{expected} refs/heads/feature\n"
    )
    (common_git / "packed-refs").write_text(packed_body)

    # Per-worktree git dir: its own HEAD plus a commondir pointer back to
    # the main .git.
    wt_git = common_git / "worktrees" / "feature"
    wt_git.mkdir(parents=True)
    (wt_git / "HEAD").write_text("ref: refs/heads/feature\n")
    (wt_git / "commondir").write_text("../..\n")

    # The checkout itself only carries a ``.git`` pointer file.
    checkout = tmp_path / "wt"
    checkout.mkdir()
    (checkout / ".git").write_text(f"gitdir: {wt_git}\n")

    assert _read_git_head_sha(checkout) == expected
def test_read_git_head_sha_no_git_returns_none(tmp_path):
    """A directory without ``.git`` (non-git install) yields None."""
    resolved = _read_git_head_sha(tmp_path)
    assert resolved is None
def test_read_git_head_sha_malformed_head_returns_none(tmp_path):
    """An empty ``.git/HEAD`` resolves to None rather than a bogus SHA."""
    head_file = tmp_path / ".git" / "HEAD"
    head_file.parent.mkdir()
    head_file.write_text("")
    resolved = _read_git_head_sha(tmp_path)
    assert resolved is None
# ---------------------------------------------------------------------------
# _detect_stale_code — the main regression guard
# ---------------------------------------------------------------------------
def test_detect_stale_code_false_when_sha_unchanged(tmp_path):
"""Boot SHA == current SHA → not stale (no restart)."""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
# Force fresh read by expiring the cache
runner._cached_current_sha_at = 0.0
assert runner._detect_stale_code() is False assert runner._detect_stale_code() is False
def test_detect_stale_code_false_when_files_unchanged(tmp_path): def test_detect_stale_code_true_after_git_pull(tmp_path):
"""Source files at boot mtime → not stale.""" """Boot SHA != current SHA → stale (hermes update happened)."""
repo = _make_tmp_repo(tmp_path) boot_sha = "a" * 40
# Freeze all sentinels to the same mtime _make_git_repo(tmp_path, sha=boot_sha)
baseline = time.time() - 100 runner = _make_runner(tmp_path, boot_sha=boot_sha)
for rel in _STALE_CODE_SENTINELS: # Simulate git pull moving HEAD forward
os.utime(repo / rel, (baseline, baseline)) _set_head_sha(tmp_path, "b" * 40)
runner._cached_current_sha_at = 0.0 # expire cache
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
assert runner._detect_stale_code() is False
def test_detect_stale_code_true_after_update(tmp_path):
"""Sentinel files newer than boot snapshot → stale."""
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Simulate hermes update touching config.py
new_mtime = time.time()
os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime))
assert runner._detect_stale_code() is True assert runner._detect_stale_code() is True
def test_detect_stale_code_ignores_subsecond_drift(tmp_path): def test_detect_stale_code_ignores_agent_file_edits(tmp_path):
"""2-second slack prevents false positives on coarse-mtime filesystems.""" """THE CORE REGRESSION: agent edits to source files do NOT trigger restart.
repo = _make_tmp_repo(tmp_path)
baseline = time.time() - 100 This is the motivating incident for the SHA-based check. Under the
previous mtime-based scheme, any ``patch`` / ``write_file`` call
against run_agent.py / gateway/run.py / hermes_cli/config.py would
flip the stale-check to True and force a gateway restart on the
next message — even though no update actually happened. SHA
comparison decouples the two: git HEAD only moves on ``git pull``,
never on file writes.
"""
sha = "a" * 40
_make_git_repo(tmp_path, sha=sha)
_make_tmp_repo(tmp_path) # lay down sentinel files too
runner = _make_runner(tmp_path, boot_sha=sha)
# Simulate the agent editing run_agent.py and gateway/run.py with
# mtimes far into the future — exactly the scenario that used to
# false-positive the old mtime check.
future = time.time() + 10_000
for rel in _STALE_CODE_SENTINELS: for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline)) p = tmp_path / rel
if p.is_file():
p.write_text("# agent just edited this\n")
os.utime(p, (future, future))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) # HEAD SHA has NOT moved — check must stay False.
runner._cached_current_sha_at = 0.0 # expire cache
# Touch config.py 1s newer — within the 2s slack → not stale
os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0))
assert runner._detect_stale_code() is False assert runner._detect_stale_code() is False
# Touch 5s newer → stale
os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0))
assert runner._detect_stale_code() is True
def test_detect_stale_code_false_for_non_git_install(tmp_path):
    """With no boot SHA (no ``.git`` dir), the stale check never fires."""
    # tmp_path has no .git at all, so the runner is built with boot_sha=None.
    # The check must short-circuit on the missing boot SHA, whatever the
    # current SHA would be.
    runner = _make_runner(tmp_path, boot_sha=None)
    stale = runner._detect_stale_code()
    assert stale is False
def test_detect_stale_code_false_when_no_boot_wall_time(tmp_path):
    """A zero boot wall time means "no snapshot": report not stale."""
    boot_sha = "a" * 40
    runner = _make_runner(tmp_path, boot_sha=boot_sha, boot_wall=0.0)
    stale = runner._detect_stale_code()
    assert stale is False
def test_detect_stale_code_handles_disappearing_git_dir(tmp_path):
    """If ``.git`` vanishes after boot, the current SHA is None: not stale."""
    import shutil

    boot_sha = "a" * 40
    _make_git_repo(tmp_path, sha=boot_sha)
    runner = _make_runner(tmp_path, boot_sha=boot_sha)

    # Remove the git dir after the boot snapshot was taken, then expire the
    # cache so the next check has to go back to disk.
    shutil.rmtree(tmp_path / ".git")
    runner._cached_current_sha_at = 0.0

    stale = runner._detect_stale_code()
    assert stale is False
# ---------------------------------------------------------------------------
# SHA cache
# ---------------------------------------------------------------------------
def test_current_sha_cache_collapses_bursts(tmp_path, monkeypatch):
    """Calls made inside the TTL window are served from the cached SHA."""
    from gateway import run as run_mod

    head = "a" * 40
    _make_git_repo(tmp_path, sha=head)
    runner = _make_runner(tmp_path, boot_sha=head)

    # Wrap the real reader so every disk read is recorded.
    seen_roots = []
    original_reader = _read_git_head_sha

    def recording_reader(repo_root):
        seen_roots.append(repo_root)
        return original_reader(repo_root)

    monkeypatch.setattr(run_mod, "_read_git_head_sha", recording_reader)

    # Expire the cache so the first call is guaranteed to hit the reader.
    runner._cached_current_sha_at = 0.0
    runner._current_git_sha_cached()
    reads_after_first = len(seen_roots)

    # Two immediate follow-up calls must not trigger another read.
    for _ in range(2):
        runner._current_git_sha_cached()
    assert len(seen_roots) == reads_after_first
def test_current_sha_cache_expires_after_ttl(tmp_path, monkeypatch):
    """Once the cache is older than the TTL, the next call reads again."""
    from gateway import run as run_mod

    head = "a" * 40
    _make_git_repo(tmp_path, sha=head)
    runner = _make_runner(tmp_path, boot_sha=head)

    # Wrap the real reader so every disk read is recorded.
    seen_roots = []
    original_reader = _read_git_head_sha

    def recording_reader(repo_root):
        seen_roots.append(repo_root)
        return original_reader(repo_root)

    monkeypatch.setattr(run_mod, "_read_git_head_sha", recording_reader)

    # The first call starts from an expired cache and must read.
    runner._cached_current_sha_at = 0.0
    runner._current_git_sha_cached()
    reads_before = len(seen_roots)

    # Backdate the cache timestamp to just past the TTL.
    stale_age = _GIT_SHA_CACHE_TTL_SECS + 1.0
    runner._cached_current_sha_at = time.time() - stale_age
    runner._current_git_sha_cached()
    assert len(seen_roots) == reads_before + 1
# ---------------------------------------------------------------------------
# _trigger_stale_code_restart — idempotency preserved
# ---------------------------------------------------------------------------
def test_trigger_stale_code_restart_is_idempotent(tmp_path): def test_trigger_stale_code_restart_is_idempotent(tmp_path):
"""Calling _trigger_stale_code_restart twice only requests restart once.""" """Calling _trigger_stale_code_restart twice only requests restart once."""
repo = _make_tmp_repo(tmp_path) sha = "a" * 40
runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0) _make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
calls = [] calls = []
@ -153,8 +356,9 @@ def test_trigger_stale_code_restart_is_idempotent(tmp_path):
def test_trigger_stale_code_restart_survives_request_failure(tmp_path): def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
"""If request_restart raises, we swallow and mark as triggered anyway.""" """If request_restart raises, we swallow and mark as triggered anyway."""
repo = _make_tmp_repo(tmp_path) sha = "a" * 40
runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0) _make_git_repo(tmp_path, sha=sha)
runner = _make_runner(tmp_path, boot_sha=sha)
def boom(*, detached=False, via_service=False): def boom(*, detached=False, via_service=False):
raise RuntimeError("no event loop") raise RuntimeError("no event loop")
@ -168,56 +372,41 @@ def test_trigger_stale_code_restart_survives_request_failure(tmp_path):
assert runner._stale_code_restart_triggered is True assert runner._stale_code_restart_triggered is True
def test_detect_stale_code_handles_disappearing_repo_root(tmp_path): # ---------------------------------------------------------------------------
"""If the repo root vanishes after boot, return False (don't loop).""" # Class-level defaults — tests that build bare runners via object.__new__
repo = _make_tmp_repo(tmp_path) # ---------------------------------------------------------------------------
baseline = time.time() - 100
for rel in _STALE_CODE_SENTINELS:
os.utime(repo / rel, (baseline, baseline))
runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline)
# Remove all sentinel files — _compute_repo_mtime returns 0.0
for rel in _STALE_CODE_SENTINELS:
(repo / rel).unlink(missing_ok=True)
assert runner._detect_stale_code() is False
def test_class_level_defaults_prevent_uninitialized_access(): def test_class_level_defaults_prevent_uninitialized_access():
"""Partial construction via object.__new__ must not crash _detect_stale_code.""" """Partial construction via object.__new__ must not crash _detect_stale_code."""
runner = object.__new__(GatewayRunner) runner = object.__new__(GatewayRunner)
# Don't set any instance attrs — class-level defaults should kick in # Don't set any instance attrs — class-level defaults should kick in
runner._repo_root_for_staleness = Path(".") runner._repo_root_for_staleness = Path(".")
# _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0) # _boot_wall_time / _boot_git_sha fall through to class defaults
# (0.0 and None respectively)
assert runner._detect_stale_code() is False assert runner._detect_stale_code() is False
# _stale_code_restart_triggered falls through to class default (False) # _stale_code_restart_triggered falls through to class default (False)
assert runner._stale_code_restart_triggered is False assert runner._stale_code_restart_triggered is False
def test_init_captures_boot_snapshot(monkeypatch, tmp_path): # ---------------------------------------------------------------------------
"""GatewayRunner.__init__ captures a usable stale-code baseline.""" # Legacy mtime reader kept for compatibility — light sanity check only
# Stub out the heavy parts of __init__ we don't need. We only want # ---------------------------------------------------------------------------
# to prove the stale-code snapshot is captured before anything else.
from gateway import run as run_mod
calls = {} def test_compute_repo_mtime_still_returns_newest(tmp_path):
"""_compute_repo_mtime remains available for any legacy callers."""
repo = _make_tmp_repo(tmp_path)
def fake_compute(repo_root): baseline = time.time() - 100
calls["repo_root"] = repo_root for rel in _STALE_CODE_SENTINELS:
return 1234567890.0 os.utime(repo / rel, (baseline, baseline))
monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute) newer = time.time()
os.utime(repo / "hermes_cli/config.py", (newer, newer))
# Build a runner without running the full __init__ — then manually result = _compute_repo_mtime(repo)
# exercise the stale-check init block that __init__ contains. assert abs(result - newer) < 1.0
runner = object.__new__(GatewayRunner)
runner._boot_wall_time = time.time()
runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent
runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness)
runner._stale_code_notified = set()
runner._stale_code_restart_triggered = False
assert runner._boot_repo_mtime == 1234567890.0
assert calls["repo_root"] == runner._repo_root_for_staleness def test_compute_repo_mtime_missing_files_returns_zero(tmp_path):
assert runner._boot_wall_time > 0 """Legacy sanity: missing sentinels → 0.0."""
assert _compute_repo_mtime(tmp_path) == 0.0