fix(gateway): refuse model switch on stale checkout to avoid env_float ImportError

This commit is contained in:
fyzanshaik 2026-06-24 00:02:48 +05:30 committed by kshitij
parent 807bdc17f6
commit 0ba1dfed78
5 changed files with 275 additions and 0 deletions

64
gateway/code_skew.py Normal file
View file

@ -0,0 +1,64 @@
"""Detect when the gateway is running stale code after a hot ``git pull``.
The gateway is a single long-lived process; its ``sys.modules`` is frozen at
boot. If the checkout is updated underneath it (a manual ``git pull``, or the
window before ``hermes update``'s graceful restart fires), a first-time lazy
import on a new code path can resolve a freshly-pulled consumer module against a
stale cached dependency -> ImportError (see
``tests/test_stale_utils_module_import.py`` for the exact failure).
We snapshot the checkout revision at gateway startup and compare on demand, so
risky callers (e.g. ``/model`` switching) can refuse with a clear "restart the
gateway" message instead of crashing on a cryptic import error.
If the revision can't be read (non-git install, IO error), the boot snapshot
stays ``None`` and skew detection no-ops it never produces a false positive.
"""
from __future__ import annotations
from pathlib import Path
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
_boot_fingerprint: str | None = None
def _fingerprint() -> str | None:
"""Current checkout fingerprint, reusing the CLI's git-rev reader.
``hermes_cli.main`` is always already imported in a gateway process (it's
the entry point), so this import is free and avoids duplicating the
worktree-aware ref resolution.
"""
try:
from hermes_cli.main import _read_git_revision_fingerprint
return _read_git_revision_fingerprint(_PROJECT_ROOT)
except Exception:
return None
def record_boot_fingerprint() -> None:
"""Snapshot the checkout revision at gateway startup (idempotent)."""
global _boot_fingerprint
if _boot_fingerprint is None:
_boot_fingerprint = _fingerprint()
def _short(fingerprint: str) -> str:
"""Render a ``git:<ref>:<sha>`` fingerprint as a compact label."""
sha = fingerprint.rsplit(":", 1)[-1]
if sha and sha != "unresolved" and len(sha) > 10:
return sha[:10]
return sha or fingerprint
def detect_code_skew() -> tuple[str, str] | None:
"""Return ``(boot_rev, disk_rev)`` short labels if the checkout drifted
since boot, else ``None``."""
if _boot_fingerprint is None:
return None
current = _fingerprint()
if current is None or current == _boot_fingerprint:
return None
return _short(_boot_fingerprint), _short(current)

View file

@ -17369,6 +17369,13 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
Useful for systemd services to avoid restart-loop deadlocks
when the previous process hasn't fully exited yet.
"""
# Snapshot the checkout revision now, while sys.modules still matches disk,
# so a later `git pull` under this long-lived process can be detected (and
# risky work like model switching refused) instead of crashing on a stale
# in-memory module.
from gateway.code_skew import record_boot_fingerprint
record_boot_fingerprint()
# ── Duplicate-instance guard ──────────────────────────────────────
# Prevent two gateways from running under the same HERMES_HOME.
# The PID file is scoped to HERMES_HOME, so future multi-profile

View file

@ -45,6 +45,35 @@ from utils import (
logger = logging.getLogger("gateway.run")
def _model_switch_skew_guard() -> Optional[str]:
"""Refuse a model switch when the gateway is running stale code.
A long-lived gateway holds its modules in memory from boot. If the checkout
changed underneath it (e.g. a manual ``git pull``), switching models can hit
a first-time lazy import on a new code path and crash on a stale cached
dependency the cryptic ``cannot import name 'env_float' from 'utils'``.
Detect the drift and tell the user to restart instead.
Intentionally scoped to model switching the known, highest-risk trigger.
Any first-time lazy import on a stale process is technically exposed; we
don't guard every import site, only this one.
"""
from gateway.code_skew import detect_code_skew
skew = detect_code_skew()
if not skew:
return None
boot_rev, disk_rev = skew
return t(
"gateway.model.error_prefix",
error=(
f"This gateway is running code from {boot_rev} but the checkout on "
f"disk is now {disk_rev}. Switching models would risk a stale-module "
f"crash — restart the gateway to load the new code: hermes gateway restart"
),
)
class GatewaySlashCommandsMixin:
"""In-session slash-command handlers for GatewayRunner."""
@ -1146,6 +1175,9 @@ class GatewaySlashCommandsMixin:
_chat_id: str, model_id: str, provider_slug: str
) -> str:
"""Perform the model switch and return confirmation text."""
skew_error = _model_switch_skew_guard()
if skew_error:
return skew_error
result = _switch_model(
raw_input=model_id,
current_provider=_cur_provider,
@ -1366,6 +1398,9 @@ class GatewaySlashCommandsMixin:
return "\n".join(lines)
# Perform the switch
skew_error = _model_switch_skew_guard()
if skew_error:
return skew_error
result = _switch_model(
raw_input=model_input,
current_provider=current_provider,

79
tests/test_code_skew.py Normal file
View file

@ -0,0 +1,79 @@
"""Tests for gateway code-skew detection (stale-checkout guard).
Companion to ``tests/test_stale_utils_module_import.py``: that test proves the
crash; these prove the guard that turns it into a clear "restart the gateway"
message before a model switch can hit it.
"""
import pytest
from gateway import code_skew
@pytest.fixture(autouse=True)
def _reset_boot_fingerprint(monkeypatch):
"""Each test starts with no recorded boot fingerprint."""
monkeypatch.setattr(code_skew, "_boot_fingerprint", None)
class TestDetectCodeSkew:
def test_no_boot_fingerprint_means_no_skew(self, monkeypatch):
# Nothing recorded (e.g. non-git install) -> never a false positive.
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:def456")
assert code_skew.detect_code_skew() is None
def test_unchanged_checkout_is_not_skew(self, monkeypatch):
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:abc1234567890")
code_skew.record_boot_fingerprint()
assert code_skew.detect_code_skew() is None
def test_drift_is_detected_with_short_revs(self, monkeypatch):
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:abc1234567890")
code_skew.record_boot_fingerprint()
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:def4567890123")
skew = code_skew.detect_code_skew()
assert skew == ("abc1234567", "def4567890")
def test_unreadable_current_rev_does_not_false_positive(self, monkeypatch):
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:abc1234567890")
code_skew.record_boot_fingerprint()
monkeypatch.setattr(code_skew, "_fingerprint", lambda: None)
assert code_skew.detect_code_skew() is None
def test_record_is_idempotent(self, monkeypatch):
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:first")
code_skew.record_boot_fingerprint()
monkeypatch.setattr(code_skew, "_fingerprint", lambda: "git:refs/heads/main:second")
code_skew.record_boot_fingerprint() # must not overwrite the boot snapshot
assert code_skew._boot_fingerprint == "git:refs/heads/main:first"
class TestShort:
def test_shortens_long_sha(self):
assert code_skew._short("git:refs/heads/main:abcdef0123456789") == "abcdef0123"
def test_keeps_unresolved_marker(self):
assert code_skew._short("git:refs/heads/main:unresolved") == "unresolved"
def test_passes_short_sha_through_untruncated(self):
assert code_skew._short("git:HEAD:abc1234") == "abc1234"
class TestModelSwitchSkewGuard:
def test_guard_returns_none_without_skew(self, monkeypatch):
from gateway import slash_commands
monkeypatch.setattr(code_skew, "detect_code_skew", lambda: None)
assert slash_commands._model_switch_skew_guard() is None
def test_guard_message_names_revs_and_restart(self, monkeypatch):
from gateway import slash_commands
monkeypatch.setattr(code_skew, "detect_code_skew", lambda: ("abc1234567", "def4567890"))
msg = slash_commands._model_switch_skew_guard()
assert msg is not None
assert "abc1234567" in msg
assert "def4567890" in msg
assert "hermes gateway restart" in msg

View file

@ -0,0 +1,90 @@
"""Regression for the stale-``utils``-module ImportError after a hot ``git pull``.
Real incident (gateway session 1518671026962174144)::
Sorry, I encountered an error (ImportError).
cannot import name 'env_float' from 'utils' (~/.hermes/hermes-agent/utils.py)
Mechanism:
1. A long-running gateway/agent process imported ``utils`` BEFORE ``env_float``
existed (added in 06ca1e99, 2026-06-20 14:00). The cached module object in
``sys.modules`` therefore has no ``env_float`` attribute.
2. ``hermes update`` ran ``git pull``, updating ``utils.py`` (now defining
``env_float``) and ~22 consumer modules (now doing ``from utils import
env_float``) on disk -- WITHOUT restarting the process.
3. Switching the live session's model (anthropic/opus -> opencode/glm) forced the
FIRST import of a consumer module on the new provider's code path. Its
top-level ``from utils import env_float`` resolved against the STALE cached
``utils`` -> ImportError. The path in parentheses is the consumer-reported
``utils.__file__`` on disk (which *does* define ``env_float``), which is why
the error is so confusing: the file on disk is fine, the in-memory module is not.
``hermes_cli/main.py`` (the ``hermes update`` flow, ~line 9326) already
acknowledges this exact hazard -- "source files on disk are newer than cached
Python modules in this process" -- and reloads ``hermes_constants`` after the
pull, but NOT ``utils``. Any ``utils`` consumer added in the same release stays
exposed until the process restarts.
The messaging client (Discord/Telegram/Feishu/...) is incidental: the trigger is
a fresh import on a stale process, not the platform. We assert that below by
reproducing the failure with the Discord adapter's exact import line.
"""
import sys
import types
import pytest
def _import_fresh_consumer(name: str, source: str) -> types.ModuleType:
"""Import a brand-new module whose body runs ``source`` -- mimicking a
consumer module being imported for the first time on the model-switch path."""
mod = types.ModuleType(name)
mod.__file__ = f"{name}.py"
sys.modules.pop(name, None)
exec(compile(source, mod.__file__, "exec"), mod.__dict__)
sys.modules[name] = mod
return mod
class TestStaleUtilsModuleImport:
def test_fresh_consumer_import_fails_against_stale_utils(self, monkeypatch):
"""The bug: stale in-memory ``utils`` + fresh ``from utils import env_float``."""
import utils
# Sanity: today's on-disk source is healthy.
assert hasattr(utils, "env_float")
# Simulate the pre-06-20 cached module (monkeypatch auto-restores after).
monkeypatch.delattr(utils, "env_float")
with pytest.raises(ImportError, match=r"cannot import name 'env_float' from 'utils'"):
_import_fresh_consumer("stale_switch_path_consumer", "from utils import env_float\n")
def test_client_is_incidental_discord_import_line_fails_identically(self, monkeypatch):
"""Same failure via the Discord adapter's exact import line -- the client
does not determine the bug, the stale process does."""
import utils
monkeypatch.delattr(utils, "env_float")
# plugins/platforms/discord/adapter.py:106
with pytest.raises(ImportError, match=r"cannot import name 'env_float' from 'utils'"):
_import_fresh_consumer(
"stale_discord_consumer",
"from utils import atomic_json_write, env_float\n",
)
def test_healthy_process_imports_consumer_fine(self):
"""Control: when the cached ``utils`` matches disk (env_float present),
the same consumer import succeeds -- proving the harness isolates the
staleness, not an unrelated import error."""
import utils
assert hasattr(utils, "env_float")
mod = _import_fresh_consumer(
"healthy_consumer",
"from utils import env_float\nVALUE = env_float('UNSET_FOR_TEST', 1.5)\n",
)
assert mod.VALUE == 1.5