mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): refuse model switch on stale checkout to avoid env_float ImportError
This commit is contained in:
parent
807bdc17f6
commit
0ba1dfed78
5 changed files with 275 additions and 0 deletions
64
gateway/code_skew.py
Normal file
64
gateway/code_skew.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
"""Detect when the gateway is running stale code after a hot ``git pull``.
|
||||
|
||||
The gateway is a single long-lived process; its ``sys.modules`` is frozen at
|
||||
boot. If the checkout is updated underneath it (a manual ``git pull``, or the
|
||||
window before ``hermes update``'s graceful restart fires), a first-time lazy
|
||||
import on a new code path can resolve a freshly-pulled consumer module against a
|
||||
stale cached dependency -> ImportError (see
|
||||
``tests/test_stale_utils_module_import.py`` for the exact failure).
|
||||
|
||||
We snapshot the checkout revision at gateway startup and compare on demand, so
|
||||
risky callers (e.g. ``/model`` switching) can refuse with a clear "restart the
|
||||
gateway" message instead of crashing on a cryptic import error.
|
||||
|
||||
If the revision can't be read (non-git install, IO error), the boot snapshot
|
||||
stays ``None`` and skew detection no-ops — it never produces a false positive.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
_boot_fingerprint: str | None = None
|
||||
|
||||
|
||||
def _fingerprint() -> str | None:
|
||||
"""Current checkout fingerprint, reusing the CLI's git-rev reader.
|
||||
|
||||
``hermes_cli.main`` is always already imported in a gateway process (it's
|
||||
the entry point), so this import is free and avoids duplicating the
|
||||
worktree-aware ref resolution.
|
||||
"""
|
||||
try:
|
||||
from hermes_cli.main import _read_git_revision_fingerprint
|
||||
|
||||
return _read_git_revision_fingerprint(_PROJECT_ROOT)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def record_boot_fingerprint() -> None:
|
||||
"""Snapshot the checkout revision at gateway startup (idempotent)."""
|
||||
global _boot_fingerprint
|
||||
if _boot_fingerprint is None:
|
||||
_boot_fingerprint = _fingerprint()
|
||||
|
||||
|
||||
def _short(fingerprint: str) -> str:
|
||||
"""Render a ``git:<ref>:<sha>`` fingerprint as a compact label."""
|
||||
sha = fingerprint.rsplit(":", 1)[-1]
|
||||
if sha and sha != "unresolved" and len(sha) > 10:
|
||||
return sha[:10]
|
||||
return sha or fingerprint
|
||||
|
||||
|
||||
def detect_code_skew() -> tuple[str, str] | None:
|
||||
"""Return ``(boot_rev, disk_rev)`` short labels if the checkout drifted
|
||||
since boot, else ``None``."""
|
||||
if _boot_fingerprint is None:
|
||||
return None
|
||||
current = _fingerprint()
|
||||
if current is None or current == _boot_fingerprint:
|
||||
return None
|
||||
return _short(_boot_fingerprint), _short(current)
|
||||
|
|
@ -17369,6 +17369,13 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
|
|||
Useful for systemd services to avoid restart-loop deadlocks
|
||||
when the previous process hasn't fully exited yet.
|
||||
"""
|
||||
# Snapshot the checkout revision now, while sys.modules still matches disk,
|
||||
# so a later `git pull` under this long-lived process can be detected (and
|
||||
# risky work like model switching refused) instead of crashing on a stale
|
||||
# in-memory module.
|
||||
from gateway.code_skew import record_boot_fingerprint
|
||||
record_boot_fingerprint()
|
||||
|
||||
# ── Duplicate-instance guard ──────────────────────────────────────
|
||||
# Prevent two gateways from running under the same HERMES_HOME.
|
||||
# The PID file is scoped to HERMES_HOME, so future multi-profile
|
||||
|
|
|
|||
|
|
@ -45,6 +45,35 @@ from utils import (
|
|||
logger = logging.getLogger("gateway.run")
|
||||
|
||||
|
||||
def _model_switch_skew_guard() -> Optional[str]:
|
||||
"""Refuse a model switch when the gateway is running stale code.
|
||||
|
||||
A long-lived gateway holds its modules in memory from boot. If the checkout
|
||||
changed underneath it (e.g. a manual ``git pull``), switching models can hit
|
||||
a first-time lazy import on a new code path and crash on a stale cached
|
||||
dependency — the cryptic ``cannot import name 'env_float' from 'utils'``.
|
||||
Detect the drift and tell the user to restart instead.
|
||||
|
||||
Intentionally scoped to model switching — the known, highest-risk trigger.
|
||||
Any first-time lazy import on a stale process is technically exposed; we
|
||||
don't guard every import site, only this one.
|
||||
"""
|
||||
from gateway.code_skew import detect_code_skew
|
||||
|
||||
skew = detect_code_skew()
|
||||
if not skew:
|
||||
return None
|
||||
boot_rev, disk_rev = skew
|
||||
return t(
|
||||
"gateway.model.error_prefix",
|
||||
error=(
|
||||
f"This gateway is running code from {boot_rev} but the checkout on "
|
||||
f"disk is now {disk_rev}. Switching models would risk a stale-module "
|
||||
f"crash — restart the gateway to load the new code: hermes gateway restart"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class GatewaySlashCommandsMixin:
|
||||
"""In-session slash-command handlers for GatewayRunner."""
|
||||
|
||||
|
|
@ -1146,6 +1175,9 @@ class GatewaySlashCommandsMixin:
|
|||
_chat_id: str, model_id: str, provider_slug: str
|
||||
) -> str:
|
||||
"""Perform the model switch and return confirmation text."""
|
||||
skew_error = _model_switch_skew_guard()
|
||||
if skew_error:
|
||||
return skew_error
|
||||
result = _switch_model(
|
||||
raw_input=model_id,
|
||||
current_provider=_cur_provider,
|
||||
|
|
@ -1366,6 +1398,9 @@ class GatewaySlashCommandsMixin:
|
|||
return "\n".join(lines)
|
||||
|
||||
# Perform the switch
|
||||
skew_error = _model_switch_skew_guard()
|
||||
if skew_error:
|
||||
return skew_error
|
||||
result = _switch_model(
|
||||
raw_input=model_input,
|
||||
current_provider=current_provider,
|
||||
|
|
|
|||
79
tests/test_code_skew.py
Normal file
79
tests/test_code_skew.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
"""Tests for gateway code-skew detection (stale-checkout guard).
|
||||
|
||||
Companion to ``tests/test_stale_utils_module_import.py``: that test proves the
|
||||
crash; these prove the guard that turns it into a clear "restart the gateway"
|
||||
message before a model switch can hit it.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from gateway import code_skew
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _reset_boot_fingerprint(monkeypatch):
|
||||
"""Each test starts with no recorded boot fingerprint."""
|
||||
monkeypatch.setattr(code_skew, "_boot_fingerprint", None)
|
||||
|
||||
|
||||
class TestDetectCodeSkew:
|
||||
def test_no_boot_fingerprint_means_no_skew(self, monkeypatch):
|
||||
# Nothing recorded (e.g. non-git install) -> never a false positive.
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:def456")
|
||||
assert code_skew.detect_code_skew() is None
|
||||
|
||||
def test_unchanged_checkout_is_not_skew(self, monkeypatch):
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:abc1234567890")
|
||||
code_skew.record_boot_fingerprint()
|
||||
assert code_skew.detect_code_skew() is None
|
||||
|
||||
def test_drift_is_detected_with_short_revs(self, monkeypatch):
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:abc1234567890")
|
||||
code_skew.record_boot_fingerprint()
|
||||
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:def4567890123")
|
||||
skew = code_skew.detect_code_skew()
|
||||
assert skew == ("abc1234567", "def4567890")
|
||||
|
||||
def test_unreadable_current_rev_does_not_false_positive(self, monkeypatch):
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:abc1234567890")
|
||||
code_skew.record_boot_fingerprint()
|
||||
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: None)
|
||||
assert code_skew.detect_code_skew() is None
|
||||
|
||||
def test_record_is_idempotent(self, monkeypatch):
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:first")
|
||||
code_skew.record_boot_fingerprint()
|
||||
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:second")
|
||||
code_skew.record_boot_fingerprint() # must not overwrite the boot snapshot
|
||||
assert code_skew._boot_fingerprint == "git:refs/heads/main:first"
|
||||
|
||||
|
||||
class TestShort:
|
||||
def test_shortens_long_sha(self):
|
||||
assert code_skew._short("git:refs/heads/main:abcdef0123456789") == "abcdef0123"
|
||||
|
||||
def test_keeps_unresolved_marker(self):
|
||||
assert code_skew._short("git:refs/heads/main:unresolved") == "unresolved"
|
||||
|
||||
def test_passes_short_sha_through_untruncated(self):
|
||||
assert code_skew._short("git:HEAD:abc1234") == "abc1234"
|
||||
|
||||
|
||||
class TestModelSwitchSkewGuard:
|
||||
def test_guard_returns_none_without_skew(self, monkeypatch):
|
||||
from gateway import slash_commands
|
||||
|
||||
monkeypatch.setattr(code_skew, "detect_code_skew", lambda: None)
|
||||
assert slash_commands._model_switch_skew_guard() is None
|
||||
|
||||
def test_guard_message_names_revs_and_restart(self, monkeypatch):
|
||||
from gateway import slash_commands
|
||||
|
||||
monkeypatch.setattr(code_skew, "detect_code_skew", lambda: ("abc1234567", "def4567890"))
|
||||
msg = slash_commands._model_switch_skew_guard()
|
||||
assert msg is not None
|
||||
assert "abc1234567" in msg
|
||||
assert "def4567890" in msg
|
||||
assert "hermes gateway restart" in msg
|
||||
90
tests/test_stale_utils_module_import.py
Normal file
90
tests/test_stale_utils_module_import.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
"""Regression for the stale-``utils``-module ImportError after a hot ``git pull``.
|
||||
|
||||
Real incident (gateway session 1518671026962174144)::
|
||||
|
||||
Sorry, I encountered an error (ImportError).
|
||||
cannot import name 'env_float' from 'utils' (~/.hermes/hermes-agent/utils.py)
|
||||
|
||||
Mechanism:
|
||||
|
||||
1. A long-running gateway/agent process imported ``utils`` BEFORE ``env_float``
|
||||
existed (added in 06ca1e99, 2026-06-20 14:00). The cached module object in
|
||||
``sys.modules`` therefore has no ``env_float`` attribute.
|
||||
2. ``hermes update`` ran ``git pull``, updating ``utils.py`` (now defining
|
||||
``env_float``) and ~22 consumer modules (now doing ``from utils import
|
||||
env_float``) on disk -- WITHOUT restarting the process.
|
||||
3. Switching the live session's model (anthropic/opus -> opencode/glm) forced the
|
||||
FIRST import of a consumer module on the new provider's code path. Its
|
||||
top-level ``from utils import env_float`` resolved against the STALE cached
|
||||
``utils`` -> ImportError. The path in parentheses is the consumer-reported
|
||||
``utils.__file__`` on disk (which *does* define ``env_float``), which is why
|
||||
the error is so confusing: the file on disk is fine, the in-memory module is not.
|
||||
|
||||
``hermes_cli/main.py`` (the ``hermes update`` flow, ~line 9326) already
|
||||
acknowledges this exact hazard -- "source files on disk are newer than cached
|
||||
Python modules in this process" -- and reloads ``hermes_constants`` after the
|
||||
pull, but NOT ``utils``. Any ``utils`` consumer added in the same release stays
|
||||
exposed until the process restarts.
|
||||
|
||||
The messaging client (Discord/Telegram/Feishu/...) is incidental: the trigger is
|
||||
a fresh import on a stale process, not the platform. We assert that below by
|
||||
reproducing the failure with the Discord adapter's exact import line.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _import_fresh_consumer(name: str, source: str) -> types.ModuleType:
|
||||
"""Import a brand-new module whose body runs ``source`` -- mimicking a
|
||||
consumer module being imported for the first time on the model-switch path."""
|
||||
mod = types.ModuleType(name)
|
||||
mod.__file__ = f"{name}.py"
|
||||
sys.modules.pop(name, None)
|
||||
exec(compile(source, mod.__file__, "exec"), mod.__dict__)
|
||||
sys.modules[name] = mod
|
||||
return mod
|
||||
|
||||
|
||||
class TestStaleUtilsModuleImport:
|
||||
def test_fresh_consumer_import_fails_against_stale_utils(self, monkeypatch):
|
||||
"""The bug: stale in-memory ``utils`` + fresh ``from utils import env_float``."""
|
||||
import utils
|
||||
|
||||
# Sanity: today's on-disk source is healthy.
|
||||
assert hasattr(utils, "env_float")
|
||||
|
||||
# Simulate the pre-06-20 cached module (monkeypatch auto-restores after).
|
||||
monkeypatch.delattr(utils, "env_float")
|
||||
|
||||
with pytest.raises(ImportError, match=r"cannot import name 'env_float' from 'utils'"):
|
||||
_import_fresh_consumer("stale_switch_path_consumer", "from utils import env_float\n")
|
||||
|
||||
def test_client_is_incidental_discord_import_line_fails_identically(self, monkeypatch):
|
||||
"""Same failure via the Discord adapter's exact import line -- the client
|
||||
does not determine the bug, the stale process does."""
|
||||
import utils
|
||||
|
||||
monkeypatch.delattr(utils, "env_float")
|
||||
|
||||
# plugins/platforms/discord/adapter.py:106
|
||||
with pytest.raises(ImportError, match=r"cannot import name 'env_float' from 'utils'"):
|
||||
_import_fresh_consumer(
|
||||
"stale_discord_consumer",
|
||||
"from utils import atomic_json_write, env_float\n",
|
||||
)
|
||||
|
||||
def test_healthy_process_imports_consumer_fine(self):
|
||||
"""Control: when the cached ``utils`` matches disk (env_float present),
|
||||
the same consumer import succeeds -- proving the harness isolates the
|
||||
staleness, not an unrelated import error."""
|
||||
import utils
|
||||
|
||||
assert hasattr(utils, "env_float")
|
||||
mod = _import_fresh_consumer(
|
||||
"healthy_consumer",
|
||||
"from utils import env_float\nVALUE = env_float('UNSET_FOR_TEST', 1.5)\n",
|
||||
)
|
||||
assert mod.VALUE == 1.5
|
||||
Loading…
Add table
Add a link
Reference in a new issue