mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-07 02:51:50 +00:00
fix(gateway): use git HEAD SHA, not file mtimes, for stale-code check (#19740)
The stale-code self-check (Issue #17648) used sentinel-file mtimes to decide whether the gateway survived a `hermes update` with stale `sys.modules`. That signal false-positives on any write to the sentinel files — including agent-driven edits during Hermes-on-Hermes dev sessions. Telling the agent to patch `run_agent.py` would flip the check to True on the next user message and force a gateway restart even though no update happened. Switch the signal to `git rev-parse HEAD`. Agent file edits don't move HEAD; `hermes update` (git pull) always does. Reading .git/HEAD directly (no subprocess) with a 5s cache keeps the overhead negligible on bursty chats. Non-git installs short-circuit to False — the stale-modules class can't occur without a git-backed update path, so there's nothing to detect. The legacy `_compute_repo_mtime` helper is kept but unused by detection, reserved as a fallback hook for future pip-install update paths. - _read_git_head_sha(): resolves HEAD across main checkout, worktree (follows `gitdir:` + `commondir` pointers), and packed-refs layouts. - _current_git_sha_cached(): per-runner 5s SHA cache. - _detect_stale_code(): boot SHA vs current SHA, returns False when either is unavailable. - Tests cover all four layouts, the agent-edits-don't-trigger regression, and cache behavior. Refs #17648.
This commit is contained in:
parent
a21f364ad7
commit
d90f73bcec
2 changed files with 482 additions and 139 deletions
202
gateway/run.py
202
gateway/run.py
|
|
@ -101,10 +101,21 @@ _AUTO_CONTINUE_FRESHNESS_SECS_DEFAULT = 60 * 60
|
|||
# already-loaded stale module object and raises ``ImportError`` — see
|
||||
# Issue #17648. Rather than papering over the import failure site-by-site
|
||||
# in every tool file, detect the stale state centrally and auto-restart
|
||||
# so the gateway reloads with fresh code. The sentinel files below are
|
||||
# the canonical repo-level markers that every update touches; if any is
|
||||
# newer than the gateway's boot time, we know the running process is out
|
||||
# of date.
|
||||
# so the gateway reloads with fresh code.
|
||||
#
|
||||
# The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
|
||||
# update`` moves that is NOT moved by agent-driven file edits. Earlier
|
||||
# revisions of this check compared file mtimes across a sentinel set
|
||||
# (run_agent.py, gateway/run.py, ...), but that produced false positives
|
||||
# whenever the agent edited its own source files during a session:
|
||||
# mtime jumps, stale-check fires, gateway restarts, user must retype.
|
||||
# See the conversation at PR #<this> for the motivating incident.
|
||||
#
|
||||
# The legacy mtime sentinels are kept ONLY as a last-resort fallback for
|
||||
# non-git installs (pip install from wheel, sparse clones with no .git
|
||||
# dir). In those environments ``hermes update`` is not a supported path,
|
||||
# so the check effectively no-ops — which is the safe behavior: better
|
||||
# to ship one broken import than to restart on every agent-edit.
|
||||
_STALE_CODE_SENTINELS: tuple[str, ...] = (
|
||||
"hermes_cli/config.py",
|
||||
"hermes_cli/__init__.py",
|
||||
|
|
@ -113,10 +124,106 @@ _STALE_CODE_SENTINELS: tuple[str, ...] = (
|
|||
"pyproject.toml",
|
||||
)
|
||||
|
||||
# Cache git HEAD reads across consecutive messages so a chat burst doesn't
|
||||
# spawn one subprocess per message. 5s is long enough to collapse a burst
|
||||
# and short enough that the real post-update detection still fires within
|
||||
# the user's perceived "next message" window.
|
||||
_GIT_SHA_CACHE_TTL_SECS = 5.0
|
||||
|
||||
|
||||
def _read_git_head_sha(repo_root: Path) -> Optional[str]:
|
||||
"""Return the git HEAD SHA for ``repo_root``, or None if unavailable.
|
||||
|
||||
Reads ``.git/HEAD`` directly (and follows one level of ref) instead
|
||||
of shelling out to ``git`` — cheaper, no subprocess tax, works on
|
||||
gateway hosts that don't have a ``git`` binary on PATH. Returns
|
||||
None for non-git installs (no ``.git`` dir) or any I/O error; callers
|
||||
treat None as "can't tell" and skip the check.
|
||||
|
||||
Supports the three layouts we care about:
|
||||
1. Main checkout: ``<repo>/.git/`` is a directory.
|
||||
2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
|
||||
points at ``<main>/.git/worktrees/<name>/``. The worktree's
|
||||
gitdir has HEAD + index but NOT refs/heads/ — those live in
|
||||
the main checkout, and ``<worktree-gitdir>/commondir`` points
|
||||
at the main ``.git``. We search both locations for refs.
|
||||
3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
|
||||
listed in ``<main-git-dir>/packed-refs``.
|
||||
"""
|
||||
try:
|
||||
git_dir = repo_root / ".git"
|
||||
# Worktrees store ``.git`` as a file pointing at gitdir: <path>
|
||||
if git_dir.is_file():
|
||||
try:
|
||||
content = git_dir.read_text().strip()
|
||||
if content.startswith("gitdir:"):
|
||||
git_dir = Path(content.split(":", 1)[1].strip())
|
||||
if not git_dir.is_absolute():
|
||||
git_dir = (repo_root / git_dir).resolve()
|
||||
except OSError:
|
||||
return None
|
||||
if not git_dir.is_dir():
|
||||
return None
|
||||
|
||||
# Figure out the "common" git dir — the one that owns shared refs.
|
||||
# For a worktree, commondir points at it (relative path, resolve
|
||||
# against git_dir). For a main checkout, common_dir == git_dir.
|
||||
common_dir = git_dir
|
||||
commondir_file = git_dir / "commondir"
|
||||
if commondir_file.is_file():
|
||||
try:
|
||||
rel = commondir_file.read_text().strip()
|
||||
candidate = (git_dir / rel).resolve() if rel else git_dir
|
||||
if candidate.is_dir():
|
||||
common_dir = candidate
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
head_path = git_dir / "HEAD"
|
||||
if not head_path.is_file():
|
||||
return None
|
||||
head_content = head_path.read_text().strip()
|
||||
|
||||
if head_content.startswith("ref:"):
|
||||
# Symbolic ref — follow one level (e.g. ref: refs/heads/main).
|
||||
# Worktree-local refs (bisect, rebase-merge state) live under
|
||||
# git_dir; shared refs (refs/heads/*, refs/tags/*) live under
|
||||
# common_dir. Try git_dir first, then common_dir.
|
||||
ref_rel = head_content.split(":", 1)[1].strip()
|
||||
for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
|
||||
ref_path = base / ref_rel
|
||||
if ref_path.is_file():
|
||||
try:
|
||||
sha = ref_path.read_text().strip()
|
||||
except OSError:
|
||||
continue
|
||||
if sha:
|
||||
return sha
|
||||
# Packed refs fallback — always stored in the common dir.
|
||||
packed = common_dir / "packed-refs"
|
||||
if packed.is_file():
|
||||
try:
|
||||
for line in packed.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#") or line.startswith("^"):
|
||||
continue
|
||||
parts = line.split(None, 1)
|
||||
if len(parts) == 2 and parts[1] == ref_rel:
|
||||
return parts[0] or None
|
||||
except OSError:
|
||||
return None
|
||||
return None
|
||||
|
||||
# Detached HEAD — content is the SHA directly.
|
||||
return head_content or None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _compute_repo_mtime(repo_root: Path) -> float:
|
||||
"""Return the newest mtime across the stale-code sentinel files.
|
||||
|
||||
Legacy fallback used only for non-git installs (``.git`` missing).
|
||||
Missing files are ignored (they may not exist on older checkouts).
|
||||
Returns 0.0 if no sentinel file is readable — treat that as "can't
|
||||
tell", which downstream callers interpret as "not stale" to avoid
|
||||
|
|
@ -1005,6 +1112,7 @@ class GatewayRunner:
|
|||
# running __init__ don't crash when _handle_message reads these.
|
||||
_boot_wall_time: float = 0.0
|
||||
_boot_repo_mtime: float = 0.0
|
||||
_boot_git_sha: Optional[str] = None
|
||||
_stale_code_restart_triggered: bool = False
|
||||
|
||||
def __init__(self, config: Optional[GatewayConfig] = None):
|
||||
|
|
@ -1020,15 +1128,23 @@ class GatewayRunner:
|
|||
try:
|
||||
self._boot_wall_time: float = time.time()
|
||||
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
|
||||
self._boot_git_sha: Optional[str] = _read_git_head_sha(
|
||||
self._repo_root_for_staleness,
|
||||
)
|
||||
self._boot_repo_mtime: float = _compute_repo_mtime(
|
||||
self._repo_root_for_staleness,
|
||||
)
|
||||
except Exception:
|
||||
self._boot_wall_time = 0.0
|
||||
self._repo_root_for_staleness = Path(".")
|
||||
self._boot_git_sha = None
|
||||
self._boot_repo_mtime = 0.0
|
||||
self._stale_code_notified: set[str] = set()
|
||||
self._stale_code_restart_triggered: bool = False
|
||||
# Cached current-SHA read, refreshed at most every
|
||||
# _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
|
||||
self._cached_current_sha: Optional[str] = self._boot_git_sha
|
||||
self._cached_current_sha_at: float = self._boot_wall_time
|
||||
|
||||
# Load ephemeral config from config.yaml / env vars.
|
||||
# Both are injected at API-call time only and never persisted.
|
||||
|
|
@ -2737,36 +2853,69 @@ class GatewayRunner:
|
|||
task.add_done_callback(self._background_tasks.discard)
|
||||
return True
|
||||
|
||||
def _current_git_sha_cached(self) -> Optional[str]:
|
||||
"""Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
|
||||
|
||||
A bursty chat (user mashes "hello?" three times) would otherwise
|
||||
re-read ``.git/HEAD`` on every message. Caching collapses that
|
||||
into a single read and still re-checks within the user's
|
||||
perceived "next message" window.
|
||||
"""
|
||||
now = time.time()
|
||||
if (
|
||||
self._cached_current_sha is not None
|
||||
and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
|
||||
):
|
||||
return self._cached_current_sha
|
||||
try:
|
||||
sha = _read_git_head_sha(self._repo_root_for_staleness)
|
||||
except Exception:
|
||||
sha = None
|
||||
self._cached_current_sha = sha
|
||||
self._cached_current_sha_at = now
|
||||
return sha
|
||||
|
||||
def _detect_stale_code(self) -> bool:
|
||||
"""Return True if source files on disk are newer than the running process.
|
||||
"""Return True if the git HEAD moved since this process booted.
|
||||
|
||||
A gateway that survives ``hermes update`` (manual SIGTERM never
|
||||
escalated, systemd restart race, detached-process respawn failed,
|
||||
etc.) keeps pre-update modules cached in ``sys.modules``. Later
|
||||
imports of names added post-update — e.g. ``cfg_get`` from PR
|
||||
#17304 — raise ImportError against the stale module object (see
|
||||
Issue #17648). Detecting this at the source — "the code on disk
|
||||
is newer than me" — lets us auto-restart instead of serving
|
||||
broken responses until the user notices and runs
|
||||
``hermes gateway restart`` manually.
|
||||
Issue #17648).
|
||||
|
||||
Returns False when the boot-time snapshot is unavailable or no
|
||||
sentinel file is readable, to avoid false-positive restart loops
|
||||
in unusual checkouts (sparse clones, read-only filesystems).
|
||||
We compare the git HEAD SHA at boot to the current SHA on disk.
|
||||
``hermes update`` always moves HEAD forward via ``git pull``;
|
||||
agent file edits (the agent patching ``run_agent.py`` or
|
||||
``gateway/run.py`` during a self-dev session) never move HEAD.
|
||||
That makes SHA comparison free of the false-positive class that
|
||||
the old mtime check suffered from — the agent can edit any file
|
||||
without triggering a phantom restart.
|
||||
|
||||
Returns False when:
|
||||
- the boot SHA is unavailable (non-git install, first call
|
||||
during partial init, etc.); we can't tell and refuse to loop
|
||||
- the current SHA matches the boot SHA
|
||||
- reading the current SHA fails for any reason
|
||||
"""
|
||||
if not self._boot_wall_time or not self._boot_repo_mtime:
|
||||
if not self._boot_wall_time:
|
||||
return False
|
||||
if not self._boot_git_sha:
|
||||
# Non-git install. ``hermes update`` is git-based, so a
|
||||
# non-git install can't experience the stale-modules class
|
||||
# this check exists to catch. Return False — no check, no
|
||||
# false positives. (If we ever ship a pip-install update
|
||||
# path, we'd add a persistent update marker here and compare
|
||||
# its timestamp to self._boot_wall_time.)
|
||||
return False
|
||||
try:
|
||||
current = _compute_repo_mtime(self._repo_root_for_staleness)
|
||||
current = self._current_git_sha_cached()
|
||||
except Exception:
|
||||
return False
|
||||
if current <= 0.0:
|
||||
if not current:
|
||||
return False
|
||||
# 2-second slack guards against filesystems with coarse mtime
|
||||
# resolution (FAT32, some NFS mounts). Real updates always move
|
||||
# the newest-file mtime forward by minutes, so this doesn't hide
|
||||
# genuine staleness.
|
||||
return current > self._boot_repo_mtime + 2.0
|
||||
return current != self._boot_git_sha
|
||||
|
||||
def _trigger_stale_code_restart(self) -> None:
|
||||
"""Idempotently kick off a graceful restart after stale-code detection.
|
||||
|
|
@ -2782,12 +2931,17 @@ class GatewayRunner:
|
|||
if self._stale_code_restart_triggered:
|
||||
return
|
||||
self._stale_code_restart_triggered = True
|
||||
current_sha = None
|
||||
try:
|
||||
current_sha = self._current_git_sha_cached()
|
||||
except Exception:
|
||||
pass
|
||||
logger.warning(
|
||||
"Stale-code self-check: source files newer than gateway boot "
|
||||
"time (boot=%.0f, newest=%.0f) — requesting graceful restart. "
|
||||
"Stale-code self-check: git HEAD moved since gateway boot "
|
||||
"(boot=%s, current=%s) — requesting graceful restart. "
|
||||
"See Issue #17648.",
|
||||
self._boot_repo_mtime,
|
||||
_compute_repo_mtime(self._repo_root_for_staleness),
|
||||
(self._boot_git_sha or "?")[:12],
|
||||
(current_sha or "?")[:12],
|
||||
)
|
||||
try:
|
||||
self.request_restart(detached=False, via_service=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue