mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
The external-drain marker .drain_request.json is written under HERMES_HOME, which on Hermes Cloud is a persistent Fly volume (/opt/data). A begin-drain marker therefore SURVIVES the post-update machine restart. But the disruptive lifecycle actions a drain protects (auto-update / image migrate / env edit / profile change) all restart the machine — which is exactly the signal the drain is over. The freshly-restarted gateway re-read the orphaned marker on its startup reconcile and parked itself back in 'draining', refusing every new turn indefinitely (NS-570: ~52 min until manually cleared). Fix: stamp the marker with an identity of THIS container/VM instantiation (kernel boot_id + PID 1 start time, read from /proc) and treat a marker whose epoch differs from the current instantiation as absent. A deliberate restart → new PID 1 → new epoch → stale marker ignored → gateway boots 'running'. A marker written during the current instantiation (the live drain) still matches; an s6 respawn of just the gateway (PID 1/init unchanged) keeps the same epoch, so an in-flight drain is still honoured (D4a reversibility preserved). The staleness check is lenient and never fail-closed: a legacy marker with no epoch, a corrupt/contentless marker, or an environment with no /proc (epoch unavailable) all degrade to the original presence-only behaviour. NAS is untouched — it only ever POSTs begin/cancel-drain over HTTP; the marker file is purely gateway-internal IPC. The fix is entirely within gateway/drain_control.py; the watcher and the dashboard endpoint go through the same drain_requested()/write_drain_request() chokepoints and need no functional change.
285 lines
11 KiB
Python
285 lines
11 KiB
Python
"""Tests for the external drain-control marker contract + gateway state machine.
|
|
|
|
Task 2.2/2.3. Two layers:
|
|
* drain_control.py — the presence-based marker contract (write/clear/read,
|
|
HERMES_HOME-scoped, never-raises).
|
|
* GatewayRunner enter/exit/watcher + the new-turn accept gate — the
|
|
reversible state machine driven by the marker.
|
|
|
|
Mocked tests are necessary-not-sufficient here (the HARD live-validation gate,
|
|
Q-B, exercises a real `hermes gateway run`); these lock the unit contract.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
import gateway.drain_control as dc
|
|
from gateway.run import GatewayRunner
|
|
from gateway.platforms.base import MessageEvent, MessageType
|
|
from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Marker contract (drain_control.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture
def home(tmp_path, monkeypatch):
    """Point ``HERMES_HOME`` at a per-test temporary directory.

    The environment variable is set through ``monkeypatch`` and the
    directory itself is returned so tests can build expected paths from it.
    """
    hermes_home = tmp_path
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    return hermes_home
|
|
|
|
|
|
class TestMarkerContract:
    """Unit contract of the drain marker helpers in ``gateway.drain_control``.

    Every test uses the ``home`` fixture so the marker lives under a
    throwaway ``HERMES_HOME``.
    """

    def test_absent_by_default(self, home):
        """Without a marker file no drain is requested and no body is read."""
        assert dc.drain_requested() is False
        assert dc.read_drain_request() is None

    def test_write_then_present(self, home):
        """Writing a marker makes it present and round-trips the principal."""
        payload = dc.write_drain_request(principal="nas")
        assert dc.drain_requested() is True
        assert payload["action"] == "drain"
        assert payload["principal"] == "nas"
        body = dc.read_drain_request()
        assert body is not None and body["principal"] == "nas"

    def test_clear_removes(self, home):
        """Clearing removes the marker and reports whether it did anything."""
        dc.write_drain_request()
        assert dc.clear_drain_request() is True
        assert dc.drain_requested() is False
        # idempotent: clearing again is a no-op, returns False
        assert dc.clear_drain_request() is False

    def test_path_respects_hermes_home(self, home):
        """The marker path is ``.drain_request.json`` under ``HERMES_HOME``."""
        assert dc.drain_request_path() == home / ".drain_request.json"

    def test_corrupt_marker_reads_as_present_contentless(self, home):
        """A malformed marker still counts as present, with an empty body."""
        # A half-written / malformed marker must still count as "drain active"
        # (fail-safe toward quiescing).
        dc.drain_request_path().write_text("{not valid json", encoding="utf-8")
        assert dc.drain_requested() is True
        assert dc.read_drain_request() == {}

    def test_write_is_atomic_json(self, home):
        """The on-disk marker parses as JSON carrying ``action == "drain"``.

        Only the final file content is checked here; atomicity of the write
        itself is not exercised by this test.
        """
        import json

        dc.write_drain_request(principal="x")
        # Decode explicitly as UTF-8, matching how the other tests in this
        # module write marker files, instead of relying on the locale default.
        raw = dc.drain_request_path().read_text(encoding="utf-8")
        data = json.loads(raw)
        assert data["action"] == "drain"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Instantiation-epoch staleness (NS-570: orphaned marker on durable volume)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestInstantiationEpoch:
    """Marker staleness relative to the current instantiation epoch (NS-570)."""

    def test_write_stamps_current_epoch(self, home):
        """A written marker carries the current epoch, in memory and on disk."""
        written = dc.write_drain_request(principal="nas")
        assert written["epoch"] == dc.current_instantiation_epoch()
        stored = dc.read_drain_request()
        assert stored is not None
        assert stored["epoch"] == dc.current_instantiation_epoch()

    def test_current_epoch_is_stable_within_process(self):
        """Two consecutive lookups in one process yield the same epoch."""
        # The epoch is memoised: an s6 respawn of only the gateway keeps
        # PID 1, so repeated calls inside a process must agree and an
        # in-flight drain stays honoured.
        first = dc.current_instantiation_epoch()
        second = dc.current_instantiation_epoch()
        assert first == second

    def test_marker_from_prior_instantiation_reads_as_absent(self, home, monkeypatch):
        """THE NS-570 REGRESSION: a marker from an older epoch is ignored."""
        # A begin-drain marker written by a PREVIOUS container/VM
        # instantiation survives on the durable HERMES_HOME volume across a
        # machine restart. The restarted gateway (new epoch) must treat it
        # as absent and must NOT re-engage drain.
        monkeypatch.setattr(dc, "current_instantiation_epoch", lambda: "epoch-OLD")
        dc.write_drain_request(principal="nas")  # stamped with "epoch-OLD"
        assert dc.drain_requested() is True  # epochs match -> active

        # Simulate the restart by switching to a brand-new epoch.
        monkeypatch.setattr(dc, "current_instantiation_epoch", lambda: "epoch-NEW")
        # The file is still physically there on the volume...
        assert dc.drain_request_path().exists() is True
        # ...yet it is disregarded because its epoch is from a prior instantiation.
        assert dc.drain_requested() is False

    def test_marker_from_current_instantiation_is_honoured(self, home, monkeypatch):
        """A marker stamped with the live epoch keeps the drain active."""
        monkeypatch.setattr(dc, "current_instantiation_epoch", lambda: "epoch-A")
        dc.write_drain_request()
        assert dc.drain_requested() is True

    def test_legacy_marker_without_epoch_still_active(self, home):
        """A pre-change marker with no "epoch" key is never silently ignored."""
        import json

        # Fail-safe toward quiescing: absence of an epoch must not disable
        # the drain.
        legacy_body = {"action": "drain", "requested_at": "x", "principal": "p"}
        marker = dc.drain_request_path()
        marker.write_text(json.dumps(legacy_body), encoding="utf-8")
        assert dc.drain_requested() is True

    def test_corrupt_marker_with_no_parseable_epoch_still_active(self, home):
        """A malformed marker (no readable epoch) still counts as active."""
        # Half-written / malformed -> read_drain_request returns {} -> no
        # epoch -> the lenient check keeps it active, as before the change.
        marker = dc.drain_request_path()
        marker.write_text("{not valid json", encoding="utf-8")
        assert dc.drain_requested() is True

    def test_unavailable_epoch_disables_staleness_check(self, home, monkeypatch):
        """With an empty current epoch, any present marker reads as active."""
        import json

        # No /proc (non-Linux, etc.) -> epoch "" -> presence-only behaviour:
        # even a marker carrying a foreign epoch is honoured rather than the
        # check failing closed.
        foreign_body = {"action": "drain", "epoch": "some-other-epoch"}
        marker = dc.drain_request_path()
        marker.write_text(json.dumps(foreign_body), encoding="utf-8")
        monkeypatch.setattr(dc, "current_instantiation_epoch", lambda: "")
        assert dc.drain_requested() is True

    def test_current_epoch_empty_when_proc_unreadable(self, monkeypatch):
        """Unreadable /proc sources give an empty epoch instead of raising."""
        real_read_text = Path.read_text

        def _deny_proc(self, *args, **kwargs):
            # Fail only for /proc paths; everything else reads normally.
            if str(self).startswith("/proc/"):
                raise OSError("no /proc")
            return real_read_text(self, *args, **kwargs)

        # Drop any memoised value so the patched reader is actually consulted.
        dc.current_instantiation_epoch.cache_clear()
        monkeypatch.setattr(Path, "read_text", _deny_proc)
        try:
            assert dc.current_instantiation_epoch() == ""
        finally:
            # Do not let the empty epoch computed here leak into later tests.
            dc.current_instantiation_epoch.cache_clear()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gateway state machine (enter / exit / idempotency)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _drain_runner():
    """Build a restart-test runner with the real drain enter/exit methods.

    Returns the ``(runner, adapter)`` pair produced by
    ``make_restart_runner``, with ``_external_drain_active`` set to ``False``
    and ``GatewayRunner``'s ``_enter_external_drain`` /
    ``_exit_external_drain`` bound onto the runner instance.
    """
    runner, adapter = make_restart_runner()
    runner._external_drain_active = False
    # Attach the genuine implementations under test to this runner.
    for method_name in ("_enter_external_drain", "_exit_external_drain"):
        implementation = getattr(GatewayRunner, method_name)
        setattr(runner, method_name, implementation.__get__(runner, GatewayRunner))
    return runner, adapter
|
|
|
|
|
|
class TestDrainStateMachine:
    """Enter/exit transitions of the external drain and their idempotency."""

    @staticmethod
    def _entered_runner():
        """Return a runner already in external drain with status calls reset."""
        runner, _adapter = _drain_runner()
        runner._enter_external_drain()
        runner._update_runtime_status.reset_mock()
        return runner

    def test_enter_sets_flag_and_flips_state(self):
        """Entering sets the flag and reports the "draining" status."""
        runner, _adapter = _drain_runner()
        runner._enter_external_drain()
        assert runner._external_drain_active is True
        runner._update_runtime_status.assert_called_with("draining")

    def test_enter_idempotent(self):
        """A second enter while already draining reports nothing."""
        runner = self._entered_runner()
        runner._enter_external_drain()  # repeat call must be a no-op
        runner._update_runtime_status.assert_not_called()

    def test_exit_reverts_to_running(self):
        """Exiting clears the flag and reports the "running" status."""
        runner = self._entered_runner()
        runner._exit_external_drain()
        assert runner._external_drain_active is False
        runner._update_runtime_status.assert_called_with("running")

    def test_exit_idempotent_when_not_draining(self):
        """Exiting without ever entering reports nothing."""
        runner, _adapter = _drain_runner()
        runner._exit_external_drain()  # drain was never entered
        runner._update_runtime_status.assert_not_called()

    def test_exit_during_shutdown_does_not_revert_to_running(self):
        """Exit clears the flag but reports no status while ``_draining``."""
        runner = self._entered_runner()
        # A shutdown drain is in progress; exit must NOT resurrect "running".
        runner._draining = True
        runner._exit_external_drain()
        assert runner._external_drain_active is False
        runner._update_runtime_status.assert_not_called()

    def test_exit_when_loop_stopped_does_not_revert(self):
        """Exit reports no status once ``_running`` is False."""
        runner = self._entered_runner()
        runner._running = False
        runner._exit_external_drain()
        runner._update_runtime_status.assert_not_called()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Watcher reconciliation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDrainWatcher:
    """The watcher reconciles the runner's drain flag with the marker file."""

    @pytest.mark.asyncio
    async def test_watcher_enters_then_exits_with_marker(self, home):
        """Flag follows the marker: set once written, cleared once removed.

        The watcher task is always cancelled and awaited, even when an
        assertion fails, so no task outlives the test. State changes are
        awaited by bounded polling rather than a single fixed sleep, so a
        slow scheduler does not produce a spurious failure.
        """
        runner, _ = _drain_runner()
        runner._drain_control_watcher = GatewayRunner._drain_control_watcher.__get__(
            runner, GatewayRunner
        )
        loop = asyncio.get_running_loop()

        dc.write_drain_request()
        task = asyncio.create_task(runner._drain_control_watcher(interval=0.02))

        async def _wait_for_flag(expected, timeout=2.0, step=0.01):
            # Poll until the flag matches, the watcher dies, or time runs out;
            # the caller asserts on the flag afterwards.
            deadline = loop.time() + timeout
            while runner._external_drain_active is not expected:
                if task.done() or loop.time() >= deadline:
                    return
                await asyncio.sleep(step)

        try:
            await _wait_for_flag(True)
            assert runner._external_drain_active is True
            dc.clear_drain_request()
            await _wait_for_flag(False)
            assert runner._external_drain_active is False
            # Give the watcher a chance to observe the stop flag on its own.
            runner._running = False
            await asyncio.sleep(0.04)
        finally:
            # Always stop and reap the watcher. If it crashed, awaiting it
            # re-raises its exception so the failure is visible.
            runner._running = False
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# New-turn accept gate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestNewTurnGate:
    """Behaviour of message handling and running agents during external drain."""

    @pytest.mark.asyncio
    async def test_new_turn_refused_during_external_drain(self):
        """A message handled while draining gets a reply mentioning "draining"."""
        runner, _adapter = _drain_runner()
        runner._external_drain_active = True
        incoming = MessageEvent(
            text="hello",
            message_type=MessageType.TEXT,
            source=make_restart_source(),
            message_id="m1",
        )
        reply = await runner._handle_message(incoming)
        assert reply is not None
        assert "draining" in reply.lower()

    @pytest.mark.asyncio
    async def test_in_flight_turn_not_interrupted_by_drain(self):
        """Entering drain leaves the running-agents entry in place, uninterrupted."""
        runner, _adapter = _drain_runner()
        in_flight = MagicMock()
        runner._running_agents["k"] = in_flight
        runner._enter_external_drain()
        # Same object still registered, and nobody asked it to stop.
        assert runner._running_agents.get("k") is in_flight
        in_flight.interrupt.assert_not_called()
|