From d90f73bcec3daad4fc72b9f3471392acabdd5747 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 4 May 2026 12:33:21 -0700 Subject: [PATCH] fix(gateway): use git HEAD SHA, not file mtimes, for stale-code check (#19740) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stale-code self-check (Issue #17648) used sentinel-file mtimes to decide whether the gateway survived a `hermes update` with stale `sys.modules`. That signal false-positives on any write to the sentinel files — including agent-driven edits during Hermes-on-Hermes dev sessions. Telling the agent to patch `run_agent.py` would flip the check to True on the next user message and force a gateway restart even though no update happened. Switch the signal to `git rev-parse HEAD`. Agent file edits don't move HEAD; `hermes update` (git pull) always does. Reading .git/HEAD directly (no subprocess) with a 5s cache keeps the overhead negligible on bursty chats. Non-git installs short-circuit to False — the stale-modules class can't occur without a git-backed update path, so there's nothing to detect. The legacy `_compute_repo_mtime` helper is kept but unused by detection, reserved as a fallback hook for future pip-install update paths. - _read_git_head_sha(): resolves HEAD across main checkout, worktree (follows `gitdir:` + `commondir` pointers), and packed-refs layouts. - _current_git_sha_cached(): per-runner 5s SHA cache. - _detect_stale_code(): boot SHA vs current SHA, returns False when either is unavailable. - Tests cover all four layouts, the agent-edits-don't-trigger regression, and cache behavior. Refs #17648. --- gateway/run.py | 202 ++++++++-- tests/gateway/test_stale_code_self_check.py | 419 ++++++++++++++------ 2 files changed, 482 insertions(+), 139 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index f90b2b1b03..2b085d9915 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -101,10 +101,21 @@ _AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60 # already-loaded stale module object and raises ``ImportError`` — see # Issue #17648. Rather than papering over the import failure site-by-site # in every tool file, detect the stale state centrally and auto-restart -# so the gateway reloads with fresh code. The sentinel files below are -# the canonical repo-level markers that every update touches; if any is -# newer than the gateway's boot time, we know the running process is out -# of date. +# so the gateway reloads with fresh code. +# +# The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes +# update`` moves that is NOT moved by agent-driven file edits. Earlier +# revisions of this check compared file mtimes across a sentinel set +# (run_agent.py, gateway/run.py, ...), but that produced false positives +# whenever the agent edited its own source files during a session: +# mtime jumps, stale-check fires, gateway restarts, user must retype. +# See the conversation at PR # for the motivating incident. +# +# The legacy mtime sentinels are kept ONLY as a last-resort fallback for +# non-git installs (pip install from wheel, sparse clones with no .git +# dir). In those environments ``hermes update`` is not a supported path, +# so the check effectively no-ops — which is the safe behavior: better +# to ship one broken import than to restart on every agent-edit. _STALE_CODE_SENTINELS: tuple[str, ...] = ( "hermes_cli/config.py", "hermes_cli/__init__.py", @@ -113,10 +124,106 @@ _STALE_CODE_SENTINELS: tuple[str, ...] = ( "pyproject.toml", ) +# Cache git HEAD reads across consecutive messages so a chat burst doesn't +# spawn one subprocess per message. 5s is long enough to collapse a burst +# and short enough that the real post-update detection still fires within +# the user's perceived "next message" window. +_GIT_SHA_CACHE_TTL_SECS = 5.0 + + +def _read_git_head_sha(repo_root: Path) -> Optional[str]: + """Return the git HEAD SHA for ``repo_root``, or None if unavailable. + + Reads ``.git/HEAD`` directly (and follows one level of ref) instead + of shelling out to ``git`` — cheaper, no subprocess tax, works on + gateway hosts that don't have a ``git`` binary on PATH. Returns + None for non-git installs (no ``.git`` dir) or any I/O error; callers + treat None as "can't tell" and skip the check. + + Supports the three layouts we care about: + 1. Main checkout: ``/.git/`` is a directory. + 2. Git worktree: ``/.git`` is a file ``gitdir: `` that + points at ``
/.git/worktrees//``. The worktree's + gitdir has HEAD + index but NOT refs/heads/ — those live in + the main checkout, and ``/commondir`` points + at the main ``.git``. We search both locations for refs. + 3. Packed refs: ``refs/heads/`` is absent on disk but + listed in ``/packed-refs``. + """ + try: + git_dir = repo_root / ".git" + # Worktrees store ``.git`` as a file pointing at gitdir: + if git_dir.is_file(): + try: + content = git_dir.read_text().strip() + if content.startswith("gitdir:"): + git_dir = Path(content.split(":", 1)[1].strip()) + if not git_dir.is_absolute(): + git_dir = (repo_root / git_dir).resolve() + except OSError: + return None + if not git_dir.is_dir(): + return None + + # Figure out the "common" git dir — the one that owns shared refs. + # For a worktree, commondir points at it (relative path, resolve + # against git_dir). For a main checkout, common_dir == git_dir. + common_dir = git_dir + commondir_file = git_dir / "commondir" + if commondir_file.is_file(): + try: + rel = commondir_file.read_text().strip() + candidate = (git_dir / rel).resolve() if rel else git_dir + if candidate.is_dir(): + common_dir = candidate + except OSError: + pass + + head_path = git_dir / "HEAD" + if not head_path.is_file(): + return None + head_content = head_path.read_text().strip() + + if head_content.startswith("ref:"): + # Symbolic ref — follow one level (e.g. ref: refs/heads/main). + # Worktree-local refs (bisect, rebase-merge state) live under + # git_dir; shared refs (refs/heads/*, refs/tags/*) live under + # common_dir. Try git_dir first, then common_dir. + ref_rel = head_content.split(":", 1)[1].strip() + for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,): + ref_path = base / ref_rel + if ref_path.is_file(): + try: + sha = ref_path.read_text().strip() + except OSError: + continue + if sha: + return sha + # Packed refs fallback — always stored in the common dir. + packed = common_dir / "packed-refs" + if packed.is_file(): + try: + for line in packed.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#") or line.startswith("^"): + continue + parts = line.split(None, 1) + if len(parts) == 2 and parts[1] == ref_rel: + return parts[0] or None + except OSError: + return None + return None + + # Detached HEAD — content is the SHA directly. + return head_content or None + except Exception: + return None + def _compute_repo_mtime(repo_root: Path) -> float: """Return the newest mtime across the stale-code sentinel files. + Legacy fallback used only for non-git installs (``.git`` missing). Missing files are ignored (they may not exist on older checkouts). Returns 0.0 if no sentinel file is readable — treat that as "can't tell", which downstream callers interpret as "not stale" to avoid @@ -1005,6 +1112,7 @@ class GatewayRunner: # running __init__ don't crash when _handle_message reads these. _boot_wall_time: float = 0.0 _boot_repo_mtime: float = 0.0 + _boot_git_sha: Optional[str] = None _stale_code_restart_triggered: bool = False def __init__(self, config: Optional[GatewayConfig] = None): @@ -1020,15 +1128,23 @@ class GatewayRunner: try: self._boot_wall_time: float = time.time() self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent + self._boot_git_sha: Optional[str] = _read_git_head_sha( + self._repo_root_for_staleness, + ) self._boot_repo_mtime: float = _compute_repo_mtime( self._repo_root_for_staleness, ) except Exception: self._boot_wall_time = 0.0 self._repo_root_for_staleness = Path(".") + self._boot_git_sha = None self._boot_repo_mtime = 0.0 self._stale_code_notified: set[str] = set() self._stale_code_restart_triggered: bool = False + # Cached current-SHA read, refreshed at most every + # _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem. + self._cached_current_sha: Optional[str] = self._boot_git_sha + self._cached_current_sha_at: float = self._boot_wall_time # Load ephemeral config from config.yaml / env vars. # Both are injected at API-call time only and never persisted. @@ -2737,36 +2853,69 @@ class GatewayRunner: task.add_done_callback(self._background_tasks.discard) return True + def _current_git_sha_cached(self) -> Optional[str]: + """Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS. + + A bursty chat (user mashes "hello?" three times) would otherwise + re-read ``.git/HEAD`` on every message. Caching collapses that + into a single read and still re-checks within the user's + perceived "next message" window. + """ + now = time.time() + if ( + self._cached_current_sha is not None + and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS + ): + return self._cached_current_sha + try: + sha = _read_git_head_sha(self._repo_root_for_staleness) + except Exception: + sha = None + self._cached_current_sha = sha + self._cached_current_sha_at = now + return sha + def _detect_stale_code(self) -> bool: - """Return True if source files on disk are newer than the running process. + """Return True if the git HEAD moved since this process booted. A gateway that survives ``hermes update`` (manual SIGTERM never escalated, systemd restart race, detached-process respawn failed, etc.) keeps pre-update modules cached in ``sys.modules``. Later imports of names added post-update — e.g. ``cfg_get`` from PR #17304 — raise ImportError against the stale module object (see - Issue #17648). Detecting this at the source — "the code on disk - is newer than me" — lets us auto-restart instead of serving - broken responses until the user notices and runs - ``hermes gateway restart`` manually. + Issue #17648). - Returns False when the boot-time snapshot is unavailable or no - sentinel file is readable, to avoid false-positive restart loops - in unusual checkouts (sparse clones, read-only filesystems). + We compare the git HEAD SHA at boot to the current SHA on disk. + ``hermes update`` always moves HEAD forward via ``git pull``; + agent file edits (the agent patching ``run_agent.py`` or + ``gateway/run.py`` during a self-dev session) never move HEAD. + That makes SHA comparison free of the false-positive class that + the old mtime check suffered from — the agent can edit any file + without triggering a phantom restart. + + Returns False when: + - the boot SHA is unavailable (non-git install, first call + during partial init, etc.); we can't tell and refuse to loop + - the current SHA matches the boot SHA + - reading the current SHA fails for any reason """ - if not self._boot_wall_time or not self._boot_repo_mtime: + if not self._boot_wall_time: + return False + if not self._boot_git_sha: + # Non-git install. ``hermes update`` is git-based, so a + # non-git install can't experience the stale-modules class + # this check exists to catch. Return False — no check, no + # false positives. (If we ever ship a pip-install update + # path, we'd add a persistent update marker here and compare + # its timestamp to self._boot_wall_time.) return False try: - current = _compute_repo_mtime(self._repo_root_for_staleness) + current = self._current_git_sha_cached() except Exception: return False - if current <= 0.0: + if not current: return False - # 2-second slack guards against filesystems with coarse mtime - # resolution (FAT32, some NFS mounts). Real updates always move - # the newest-file mtime forward by minutes, so this doesn't hide - # genuine staleness. - return current > self._boot_repo_mtime + 2.0 + return current != self._boot_git_sha def _trigger_stale_code_restart(self) -> None: """Idempotently kick off a graceful restart after stale-code detection. @@ -2782,12 +2931,17 @@ class GatewayRunner: if self._stale_code_restart_triggered: return self._stale_code_restart_triggered = True + current_sha = None + try: + current_sha = self._current_git_sha_cached() + except Exception: + pass logger.warning( - "Stale-code self-check: source files newer than gateway boot " - "time (boot=%.0f, newest=%.0f) — requesting graceful restart. " + "Stale-code self-check: git HEAD moved since gateway boot " + "(boot=%s, current=%s) — requesting graceful restart. " "See Issue #17648.", - self._boot_repo_mtime, - _compute_repo_mtime(self._repo_root_for_staleness), + (self._boot_git_sha or "?")[:12], + (current_sha or "?")[:12], ) try: self.request_restart(detached=False, via_service=True) diff --git a/tests/gateway/test_stale_code_self_check.py b/tests/gateway/test_stale_code_self_check.py index 5289f575d4..64ad347145 100644 --- a/tests/gateway/test_stale_code_self_check.py +++ b/tests/gateway/test_stale_code_self_check.py @@ -3,25 +3,34 @@ A gateway that survives ``hermes update`` keeps pre-update modules cached in ``sys.modules``. Later imports of names added post-update (e.g. ``cfg_get`` from PR #17304) raise ImportError against the stale module -object. The self-check in ``GatewayRunner._detect_stale_code()`` detects -this by comparing boot-time sentinel-file mtimes against current ones, -and ``_trigger_stale_code_restart()`` triggers a graceful restart. +object. + +The self-check compares the git HEAD SHA at boot to the current SHA on +disk. ``hermes update`` always moves HEAD forward via ``git pull``; +agent-driven file edits (Hermes editing ``run_agent.py`` / ``gateway/run.py`` +during a self-dev session) never move HEAD — so the SHA signal is free of +the false-positive class that the earlier mtime-based check suffered from. """ import os import time from pathlib import Path -from unittest.mock import MagicMock, patch import pytest from gateway.run import ( GatewayRunner, _compute_repo_mtime, + _read_git_head_sha, _STALE_CODE_SENTINELS, + _GIT_SHA_CACHE_TTL_SECS, ) +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + def _make_tmp_repo(tmp_path: Path) -> Path: """Create a fake repo with all stale-code sentinel files.""" for rel in _STALE_CODE_SENTINELS: @@ -31,109 +40,303 @@ def _make_tmp_repo(tmp_path: Path) -> Path: return tmp_path -def _make_runner(repo_root: Path, *, boot_mtime: float, boot_wall: float): +def _make_git_repo(tmp_path: Path, sha: str = "a" * 40, branch: str = "main") -> Path: + """Stamp a minimal .git directory so _read_git_head_sha can resolve a SHA. + + We don't run real git — just lay down the files the reader walks + (.git/HEAD pointing at refs/heads/, refs/heads/ + containing the SHA). + """ + git_dir = tmp_path / ".git" + git_dir.mkdir(parents=True, exist_ok=True) + (git_dir / "HEAD").write_text(f"ref: refs/heads/{branch}\n") + refs_dir = git_dir / "refs" / "heads" + refs_dir.mkdir(parents=True, exist_ok=True) + (refs_dir / branch).write_text(f"{sha}\n") + return tmp_path + + +def _set_head_sha(repo_root: Path, sha: str, branch: str = "main") -> None: + """Rewrite the current branch ref to a new SHA (simulates git pull).""" + (repo_root / ".git" / "refs" / "heads" / branch).write_text(f"{sha}\n") + + +def _make_runner( + repo_root: Path, + *, + boot_sha: str | None, + boot_wall: float = None, + boot_mtime: float = 0.0, +): """Bare GatewayRunner with just the stale-check attributes set.""" + if boot_wall is None: + boot_wall = time.time() runner = object.__new__(GatewayRunner) runner._repo_root_for_staleness = repo_root runner._boot_wall_time = boot_wall + runner._boot_git_sha = boot_sha runner._boot_repo_mtime = boot_mtime runner._stale_code_notified = set() runner._stale_code_restart_triggered = False + runner._cached_current_sha = boot_sha + runner._cached_current_sha_at = boot_wall return runner -def test_compute_repo_mtime_returns_newest(tmp_path): - """_compute_repo_mtime returns the newest mtime across sentinel files.""" - repo = _make_tmp_repo(tmp_path) +# --------------------------------------------------------------------------- +# _read_git_head_sha — raw SHA reader +# --------------------------------------------------------------------------- - # Stamp a baseline mtime across all sentinels - baseline = time.time() - 100 - for rel in _STALE_CODE_SENTINELS: - os.utime(repo / rel, (baseline, baseline)) - - # Touch one file forward - newer = time.time() - os.utime(repo / "hermes_cli/config.py", (newer, newer)) - - result = _compute_repo_mtime(repo) - assert abs(result - newer) < 1.0 # within 1s (filesystem mtime resolution) +def test_read_git_head_sha_branch_ref(tmp_path): + """Resolves ref: refs/heads/ → SHA from refs/heads/.""" + sha = "b" * 40 + _make_git_repo(tmp_path, sha=sha, branch="main") + assert _read_git_head_sha(tmp_path) == sha -def test_compute_repo_mtime_missing_files_returns_zero(tmp_path): - """Missing sentinel files return 0.0 (treated as 'can't tell' upstream).""" - # tmp_path has none of the sentinels - assert _compute_repo_mtime(tmp_path) == 0.0 +def test_read_git_head_sha_detached_head(tmp_path): + """Detached HEAD: .git/HEAD contains the SHA directly.""" + sha = "c" * 40 + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text(f"{sha}\n") + assert _read_git_head_sha(tmp_path) == sha -def test_compute_repo_mtime_partial_files_still_works(tmp_path): - """Partial sentinel presence still returns newest of the readable ones.""" - (tmp_path / "hermes_cli").mkdir() - target = tmp_path / "hermes_cli" / "config.py" - target.write_text("# partial\n") - target_mtime = time.time() - 50 - os.utime(target, (target_mtime, target_mtime)) - - result = _compute_repo_mtime(tmp_path) - assert abs(result - target_mtime) < 1.0 +def test_read_git_head_sha_packed_refs(tmp_path): + """Falls back to packed-refs when refs/heads/ is missing.""" + sha = "d" * 40 + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text("ref: refs/heads/main\n") + # No refs/heads/main file — only packed-refs + (git_dir / "packed-refs").write_text( + f"# pack-refs with: peeled fully-peeled sorted\n" + f"{sha} refs/heads/main\n" + ) + assert _read_git_head_sha(tmp_path) == sha -def test_detect_stale_code_false_when_no_boot_snapshot(tmp_path): - """No boot snapshot → can't tell → not stale (no restart loop).""" - repo = _make_tmp_repo(tmp_path) - runner = _make_runner(repo, boot_mtime=0.0, boot_wall=0.0) +def test_read_git_head_sha_worktree_gitdir_file(tmp_path): + """Worktree: .git is a file with `gitdir: ` pointing to the real git dir. + + Real git worktrees store shared refs (refs/heads/*) in the main + checkout's .git/ and write a ``commondir`` pointer into the + worktree-gitdir. The reader must follow commondir to resolve the + branch ref — this is the layout Hermes dev sessions actually use. + """ + sha = "e" * 40 + # Main repo layout + main_repo = tmp_path / "main-repo" + main_git = main_repo / ".git" + (main_git / "refs" / "heads").mkdir(parents=True) + (main_git / "HEAD").write_text("ref: refs/heads/main\n") + (main_git / "refs" / "heads" / "main").write_text("0" * 40 + "\n") + + # Worktree lives in main-repo/.git/worktrees// + worktree_git_dir = main_git / "worktrees" / "feature" + worktree_git_dir.mkdir(parents=True) + (worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n") + # commondir points back at the main .git (relative path, "../..") + (worktree_git_dir / "commondir").write_text("../..\n") + # Feature branch ref lives in the shared refs/heads + (main_git / "refs" / "heads" / "feature").write_text(f"{sha}\n") + + # Worktree checkout with .git file pointing at worktree_git_dir + worktree = tmp_path / "wt" + worktree.mkdir() + (worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n") + + assert _read_git_head_sha(worktree) == sha + + +def test_read_git_head_sha_worktree_packed_refs_in_common(tmp_path): + """Worktree + packed-refs in common dir: fallback still resolves.""" + sha = "f" * 40 + main_repo = tmp_path / "main-repo" + main_git = main_repo / ".git" + main_git.mkdir(parents=True) + (main_git / "HEAD").write_text("ref: refs/heads/main\n") + # packed-refs in the common (main) .git + (main_git / "packed-refs").write_text( + f"# pack-refs with: peeled fully-peeled sorted\n" + f"{sha} refs/heads/feature\n" + ) + + worktree_git_dir = main_git / "worktrees" / "feature" + worktree_git_dir.mkdir(parents=True) + (worktree_git_dir / "HEAD").write_text("ref: refs/heads/feature\n") + (worktree_git_dir / "commondir").write_text("../..\n") + + worktree = tmp_path / "wt" + worktree.mkdir() + (worktree / ".git").write_text(f"gitdir: {worktree_git_dir}\n") + + assert _read_git_head_sha(worktree) == sha + + +def test_read_git_head_sha_no_git_returns_none(tmp_path): + """No .git dir → None (non-git install, safely disables the check).""" + assert _read_git_head_sha(tmp_path) is None + + +def test_read_git_head_sha_malformed_head_returns_none(tmp_path): + """Empty HEAD file → None (don't loop on corrupt repos).""" + git_dir = tmp_path / ".git" + git_dir.mkdir() + (git_dir / "HEAD").write_text("") + assert _read_git_head_sha(tmp_path) is None + + +# --------------------------------------------------------------------------- +# _detect_stale_code — the main regression guard +# --------------------------------------------------------------------------- + +def test_detect_stale_code_false_when_sha_unchanged(tmp_path): + """Boot SHA == current SHA → not stale (no restart).""" + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + runner = _make_runner(tmp_path, boot_sha=sha) + # Force fresh read by expiring the cache + runner._cached_current_sha_at = 0.0 assert runner._detect_stale_code() is False -def test_detect_stale_code_false_when_files_unchanged(tmp_path): - """Source files at boot mtime → not stale.""" - repo = _make_tmp_repo(tmp_path) - # Freeze all sentinels to the same mtime - baseline = time.time() - 100 - for rel in _STALE_CODE_SENTINELS: - os.utime(repo / rel, (baseline, baseline)) - - runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) - assert runner._detect_stale_code() is False - - -def test_detect_stale_code_true_after_update(tmp_path): - """Sentinel files newer than boot snapshot → stale.""" - repo = _make_tmp_repo(tmp_path) - baseline = time.time() - 100 - for rel in _STALE_CODE_SENTINELS: - os.utime(repo / rel, (baseline, baseline)) - - runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) - - # Simulate hermes update touching config.py - new_mtime = time.time() - os.utime(repo / "hermes_cli/config.py", (new_mtime, new_mtime)) - +def test_detect_stale_code_true_after_git_pull(tmp_path): + """Boot SHA != current SHA → stale (hermes update happened).""" + boot_sha = "a" * 40 + _make_git_repo(tmp_path, sha=boot_sha) + runner = _make_runner(tmp_path, boot_sha=boot_sha) + # Simulate git pull moving HEAD forward + _set_head_sha(tmp_path, "b" * 40) + runner._cached_current_sha_at = 0.0 # expire cache assert runner._detect_stale_code() is True -def test_detect_stale_code_ignores_subsecond_drift(tmp_path): - """2-second slack prevents false positives on coarse-mtime filesystems.""" - repo = _make_tmp_repo(tmp_path) - baseline = time.time() - 100 +def test_detect_stale_code_ignores_agent_file_edits(tmp_path): + """THE CORE REGRESSION: agent edits to source files do NOT trigger restart. + + This is the motivating incident for the SHA-based check. Under the + previous mtime-based scheme, any ``patch`` / ``write_file`` call + against run_agent.py / gateway/run.py / hermes_cli/config.py would + flip the stale-check to True and force a gateway restart on the + next message — even though no update actually happened. SHA + comparison decouples the two: git HEAD only moves on ``git pull``, + never on file writes. + """ + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + _make_tmp_repo(tmp_path) # lay down sentinel files too + runner = _make_runner(tmp_path, boot_sha=sha) + + # Simulate the agent editing run_agent.py and gateway/run.py with + # mtimes far into the future — exactly the scenario that used to + # false-positive the old mtime check. + future = time.time() + 10_000 for rel in _STALE_CODE_SENTINELS: - os.utime(repo / rel, (baseline, baseline)) + p = tmp_path / rel + if p.is_file(): + p.write_text("# agent just edited this\n") + os.utime(p, (future, future)) - runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) - - # Touch config.py 1s newer — within the 2s slack → not stale - os.utime(repo / "hermes_cli/config.py", (baseline + 1.0, baseline + 1.0)) + # HEAD SHA has NOT moved — check must stay False. + runner._cached_current_sha_at = 0.0 # expire cache assert runner._detect_stale_code() is False - # Touch 5s newer → stale - os.utime(repo / "hermes_cli/config.py", (baseline + 5.0, baseline + 5.0)) - assert runner._detect_stale_code() is True +def test_detect_stale_code_false_for_non_git_install(tmp_path): + """Non-git install (no .git dir) → check disabled, never fires.""" + # No .git dir at all; runner's boot_sha is None + runner = _make_runner(tmp_path, boot_sha=None) + # Even if we pretended the current SHA differed, the check should + # short-circuit on boot_sha=None and return False. + assert runner._detect_stale_code() is False + + +def test_detect_stale_code_false_when_no_boot_wall_time(tmp_path): + """No boot snapshot at all → can't tell → not stale (no restart loop).""" + runner = _make_runner(tmp_path, boot_sha="a" * 40, boot_wall=0.0) + assert runner._detect_stale_code() is False + + +def test_detect_stale_code_handles_disappearing_git_dir(tmp_path): + """.git vanishes mid-run → current_sha = None → not stale (don't loop).""" + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + runner = _make_runner(tmp_path, boot_sha=sha) + # Nuke the git dir after boot + import shutil + shutil.rmtree(tmp_path / ".git") + runner._cached_current_sha_at = 0.0 # expire cache + assert runner._detect_stale_code() is False + + +# --------------------------------------------------------------------------- +# SHA cache +# --------------------------------------------------------------------------- + +def test_current_sha_cache_collapses_bursts(tmp_path, monkeypatch): + """Consecutive calls inside the TTL window reuse the cached SHA.""" + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + runner = _make_runner(tmp_path, boot_sha=sha) + + read_calls = {"n": 0} + real_reader = _read_git_head_sha + + def counting_reader(repo_root): + read_calls["n"] += 1 + return real_reader(repo_root) + + from gateway import run as run_mod + monkeypatch.setattr(run_mod, "_read_git_head_sha", counting_reader) + + # Force cache expiry so the first call definitely reads + runner._cached_current_sha_at = 0.0 + runner._current_git_sha_cached() + first_count = read_calls["n"] + + # Immediate second/third calls should hit cache (no new read) + runner._current_git_sha_cached() + runner._current_git_sha_cached() + assert read_calls["n"] == first_count + + +def test_current_sha_cache_expires_after_ttl(tmp_path, monkeypatch): + """After _GIT_SHA_CACHE_TTL_SECS elapses, a fresh read happens.""" + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + runner = _make_runner(tmp_path, boot_sha=sha) + + read_calls = {"n": 0} + real_reader = _read_git_head_sha + + def counting_reader(repo_root): + read_calls["n"] += 1 + return real_reader(repo_root) + + from gateway import run as run_mod + monkeypatch.setattr(run_mod, "_read_git_head_sha", counting_reader) + + runner._cached_current_sha_at = 0.0 + runner._current_git_sha_cached() + first = read_calls["n"] + + # Age the cache past the TTL + runner._cached_current_sha_at = time.time() - (_GIT_SHA_CACHE_TTL_SECS + 1.0) + runner._current_git_sha_cached() + assert read_calls["n"] == first + 1 + + +# --------------------------------------------------------------------------- +# _trigger_stale_code_restart — idempotency preserved +# --------------------------------------------------------------------------- def test_trigger_stale_code_restart_is_idempotent(tmp_path): """Calling _trigger_stale_code_restart twice only requests restart once.""" - repo = _make_tmp_repo(tmp_path) - runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0) + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + runner = _make_runner(tmp_path, boot_sha=sha) calls = [] @@ -153,8 +356,9 @@ def test_trigger_stale_code_restart_is_idempotent(tmp_path): def test_trigger_stale_code_restart_survives_request_failure(tmp_path): """If request_restart raises, we swallow and mark as triggered anyway.""" - repo = _make_tmp_repo(tmp_path) - runner = _make_runner(repo, boot_mtime=1.0, boot_wall=1.0) + sha = "a" * 40 + _make_git_repo(tmp_path, sha=sha) + runner = _make_runner(tmp_path, boot_sha=sha) def boom(*, detached=False, via_service=False): raise RuntimeError("no event loop") @@ -168,56 +372,41 @@ def test_trigger_stale_code_restart_survives_request_failure(tmp_path): assert runner._stale_code_restart_triggered is True -def test_detect_stale_code_handles_disappearing_repo_root(tmp_path): - """If the repo root vanishes after boot, return False (don't loop).""" - repo = _make_tmp_repo(tmp_path) - baseline = time.time() - 100 - for rel in _STALE_CODE_SENTINELS: - os.utime(repo / rel, (baseline, baseline)) - - runner = _make_runner(repo, boot_mtime=baseline, boot_wall=baseline) - - # Remove all sentinel files — _compute_repo_mtime returns 0.0 - for rel in _STALE_CODE_SENTINELS: - (repo / rel).unlink(missing_ok=True) - - assert runner._detect_stale_code() is False - +# --------------------------------------------------------------------------- +# Class-level defaults — tests that build bare runners via object.__new__ +# --------------------------------------------------------------------------- def test_class_level_defaults_prevent_uninitialized_access(): """Partial construction via object.__new__ must not crash _detect_stale_code.""" runner = object.__new__(GatewayRunner) # Don't set any instance attrs — class-level defaults should kick in runner._repo_root_for_staleness = Path(".") - # _boot_wall_time / _boot_repo_mtime fall through to class defaults (0.0) + # _boot_wall_time / _boot_git_sha fall through to class defaults + # (0.0 and None respectively) assert runner._detect_stale_code() is False # _stale_code_restart_triggered falls through to class default (False) assert runner._stale_code_restart_triggered is False -def test_init_captures_boot_snapshot(monkeypatch, tmp_path): - """GatewayRunner.__init__ captures a usable stale-code baseline.""" - # Stub out the heavy parts of __init__ we don't need. We only want - # to prove the stale-code snapshot is captured before anything else. - from gateway import run as run_mod +# --------------------------------------------------------------------------- +# Legacy mtime reader kept for compatibility — light sanity check only +# --------------------------------------------------------------------------- - calls = {} +def test_compute_repo_mtime_still_returns_newest(tmp_path): + """_compute_repo_mtime remains available for any legacy callers.""" + repo = _make_tmp_repo(tmp_path) - def fake_compute(repo_root): - calls["repo_root"] = repo_root - return 1234567890.0 + baseline = time.time() - 100 + for rel in _STALE_CODE_SENTINELS: + os.utime(repo / rel, (baseline, baseline)) - monkeypatch.setattr(run_mod, "_compute_repo_mtime", fake_compute) + newer = time.time() + os.utime(repo / "hermes_cli/config.py", (newer, newer)) - # Build a runner without running the full __init__ — then manually - # exercise the stale-check init block that __init__ contains. - runner = object.__new__(GatewayRunner) - runner._boot_wall_time = time.time() - runner._repo_root_for_staleness = Path(run_mod.__file__).resolve().parent.parent - runner._boot_repo_mtime = run_mod._compute_repo_mtime(runner._repo_root_for_staleness) - runner._stale_code_notified = set() - runner._stale_code_restart_triggered = False + result = _compute_repo_mtime(repo) + assert abs(result - newer) < 1.0 - assert runner._boot_repo_mtime == 1234567890.0 - assert calls["repo_root"] == runner._repo_root_for_staleness - assert runner._boot_wall_time > 0 + +def test_compute_repo_mtime_missing_files_returns_zero(tmp_path): + """Legacy sanity: missing sentinels → 0.0.""" + assert _compute_repo_mtime(tmp_path) == 0.0