mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
Tasks 2.1 + 2.2 + 2.3 of the safe-shutdown plan — the reversible
quiesce-without-restart machinery NAS drives during a lifecycle action (D4a).
These ship together because the endpoint, the control channel, and the gateway
state machine are one coherent slice.
2.2 — control channel (gateway/drain_control.py, new):
The dashboard has no HTTP path into a running gateway (guardrails: "there is NO
external control channel into a running gateway"); restart/drain is driven only
by markers the gateway reacts to. So begin/cancel-drain writes/removes a
presence-based marker .drain_request.json (HERMES_HOME-scoped, atomic write,
never-raises read; a corrupt marker reads as present-contentless → fail-safe
toward quiescing). This is Q-B option A.
2.3 — gateway state machine (gateway/run.py):
- _external_drain_active flag, DISTINCT from the shutdown _draining flag: this
one does NOT exit the process and is fully reversible.
- _enter_external_drain / _exit_external_drain: idempotent transitions that
flip gateway_state→draining / →running via _update_runtime_status (preserving
the live active_agents count). exit refuses to revert to running during a
real shutdown or after the loop stops (shutdown wins).
- _drain_control_watcher: 1s background task (modelled on _handoff_watcher)
reconciling accept-state with the marker; honours a marker that survived a
restart on its first tick. Registered alongside the other watchers in start.
- New-turn accept gate in _handle_message, placed BEFORE the session-slot
claim: when draining, refuse to START a new turn (so active_agents can only
fall → no TOCTOU race), while in-flight turns finish untouched. Internal/
system events (restart-recovery replays, bg-process completions) bypass it.
2.1 — endpoint (hermes_cli/web_server.py):
POST /api/gateway/drain {action: drain|cancel}. Authenticated by the Task-2.0a
token seam (the drain plugin registered this exact path as a token route);
attributes the request to the verified token principal. Begin writes the
marker, cancel removes it — the gateway process owns the actual transition.
Force-override (D6) is NOT here; it maps onto the existing immediate
/api/gateway/restart force path.
Tests (mocked — necessary-not-sufficient; the HARD live gate Q-B is next):
- tests/gateway/test_external_drain_control.py — marker contract (write/clear/
read/corrupt/atomic), state machine (enter/exit/idempotency/shutdown-wins/
loop-stopped), watcher reconcile-enter-then-exit, new-turn refusal, and
in-flight-not-interrupted. 15 tests.
- tests/hermes_cli/test_web_server.py — /api/gateway/drain begin/default-begin/
cancel/cancel-idempotent/bad-action-400. 6 tests.
- dashboard.drain_auth config section already added in 2.0b commit.
All touched suites green: 301 (gateway+auth) + 9 (web_server endpoints) passed.
Intentionally deferred:
- HARD live-validation gate (Q-B): real isolated `hermes gateway run`, drive a
real begin-drain marker, prove the 5-point checklist a–e.
- Spec-doc status flip + Phase-2 PR.
Build status: external-drain, restart-drain, status, dashboard-auth, drain-plugin,
token-auth, and web_server-endpoint suites green.
196 lines
7.1 KiB
Python
196 lines
7.1 KiB
Python
"""Tests for the external drain-control marker contract + gateway state machine.

Task 2.2/2.3. Two layers:
  * drain_control.py — the presence-based marker contract (write/clear/read,
    HERMES_HOME-scoped, never-raises).
  * GatewayRunner enter/exit/watcher + the new-turn accept gate — the
    reversible state machine driven by the marker.

Mocked tests are necessary-not-sufficient here (the HARD live-validation gate,
Q-B, exercises a real `hermes gateway run`); these lock the unit contract.
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
import gateway.drain_control as dc
|
|
from gateway.run import GatewayRunner
|
|
from gateway.platforms.base import MessageEvent, MessageType
|
|
from tests.gateway.restart_test_helpers import make_restart_runner, make_restart_source
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Marker contract (drain_control.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture
def home(tmp_path, monkeypatch):
    """Point ``HERMES_HOME`` at the per-test temp directory and return it.

    The environment variable is set through ``monkeypatch`` so it is restored
    when the test finishes.
    """
    hermes_home = tmp_path
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
    return hermes_home
|
|
|
|
|
|
class TestMarkerContract:
    """Contract of the drain-request marker exposed by ``gateway.drain_control``.

    Every test takes the ``home`` fixture so the marker lives under a
    throw-away ``HERMES_HOME``.
    """

    def test_absent_by_default(self, home):
        # Nothing has been written yet: no request is pending and there is
        # no body to read.
        assert dc.drain_requested() is False
        assert dc.read_drain_request() is None

    def test_write_then_present(self, home):
        written = dc.write_drain_request(principal="nas")

        assert dc.drain_requested() is True
        # The returned payload records the action and who asked for it.
        assert (written["action"], written["principal"]) == ("drain", "nas")

        # Reading the marker back yields the same principal.
        stored = dc.read_drain_request()
        assert stored is not None
        assert stored["principal"] == "nas"

    def test_clear_removes(self, home):
        dc.write_drain_request()

        first_clear = dc.clear_drain_request()
        assert first_clear is True
        assert dc.drain_requested() is False

        # Idempotent: clearing again is a no-op and reports False.
        second_clear = dc.clear_drain_request()
        assert second_clear is False

    def test_path_respects_hermes_home(self, home):
        expected = home / ".drain_request.json"
        assert dc.drain_request_path() == expected

    def test_corrupt_marker_reads_as_present_contentless(self, home):
        # A half-written / malformed marker must still count as "drain active"
        # (fail-safe toward quiescing) while exposing an empty body.
        marker = dc.drain_request_path()
        marker.write_text("{not valid json", encoding="utf-8")

        assert dc.drain_requested() is True
        assert dc.read_drain_request() == {}

    def test_write_is_atomic_json(self, home):
        import json

        dc.write_drain_request(principal="x")

        # The file on disk must parse as JSON and carry the drain action.
        raw = dc.drain_request_path().read_text()
        parsed = json.loads(raw)
        assert parsed["action"] == "drain"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gateway state machine (enter / exit / idempotency)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _drain_runner():
    """Build a restart-test runner with the real external-drain transitions.

    Takes the ``(runner, adapter)`` pair from ``make_restart_runner``, sets
    ``_external_drain_active`` to False, and binds ``GatewayRunner``'s
    ``_enter_external_drain`` and ``_exit_external_drain`` onto the runner so
    the tests exercise the real methods.

    Returns:
        The ``(runner, adapter)`` pair.
    """
    runner, adapter = make_restart_runner()
    runner._external_drain_active = False

    # Bind the real methods under test, in the same order as the state
    # machine uses them (enter first, then exit).
    for method_name in ("_enter_external_drain", "_exit_external_drain"):
        bound = getattr(GatewayRunner, method_name).__get__(runner, GatewayRunner)
        setattr(runner, method_name, bound)

    return runner, adapter
|
|
|
|
|
|
class TestDrainStateMachine:
    """Enter/exit transitions of the external-drain flag.

    Each test observes the transition through ``_external_drain_active`` and
    through the calls recorded on the runner's ``_update_runtime_status`` mock.
    """

    @staticmethod
    def _runner_in_drain():
        """Return a runner that has entered drain, with the status mock reset.

        Resetting the mock lets the caller assert only on calls made after
        the initial enter.
        """
        runner, _adapter = _drain_runner()
        runner._enter_external_drain()
        runner._update_runtime_status.reset_mock()
        return runner

    def test_enter_sets_flag_and_flips_state(self):
        runner, _adapter = _drain_runner()

        runner._enter_external_drain()

        assert runner._external_drain_active is True
        runner._update_runtime_status.assert_called_with("draining")

    def test_enter_idempotent(self):
        runner = self._runner_in_drain()

        # Second enter while already draining must not report status again.
        runner._enter_external_drain()

        runner._update_runtime_status.assert_not_called()

    def test_exit_reverts_to_running(self):
        runner = self._runner_in_drain()

        runner._exit_external_drain()

        assert runner._external_drain_active is False
        runner._update_runtime_status.assert_called_with("running")

    def test_exit_idempotent_when_not_draining(self):
        runner, _adapter = _drain_runner()

        # Drain was never entered, so exit has nothing to undo.
        runner._exit_external_drain()

        runner._update_runtime_status.assert_not_called()

    def test_exit_during_shutdown_does_not_revert_to_running(self):
        runner = self._runner_in_drain()
        # A shutdown drain is now in progress — exit must NOT resurrect running.
        runner._draining = True

        runner._exit_external_drain()

        assert runner._external_drain_active is False
        runner._update_runtime_status.assert_not_called()

    def test_exit_when_loop_stopped_does_not_revert(self):
        runner = self._runner_in_drain()
        # The runner is marked as stopped; exit must not report "running".
        runner._running = False

        runner._exit_external_drain()

        runner._update_runtime_status.assert_not_called()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Watcher reconciliation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDrainWatcher:
    """Reconciliation of the runner's drain flag with the on-disk marker."""

    @pytest.mark.asyncio
    async def test_watcher_enters_then_exits_with_marker(self, home):
        """The watcher enters drain while the marker exists and exits once it is cleared.

        The watcher runs as a background task with a short interval. The task
        is cancelled and awaited in a ``finally`` block so that a failing
        assertion cannot leave a pending task behind in the event loop.
        """
        runner, _ = _drain_runner()
        # Bind the real watcher coroutine under test.
        runner._drain_control_watcher = GatewayRunner._drain_control_watcher.__get__(
            runner, GatewayRunner
        )

        # Write the marker before the watcher starts so its early ticks see it.
        dc.write_drain_request()
        task = asyncio.create_task(runner._drain_control_watcher(interval=0.02))
        try:
            # Sleep for roughly three watcher intervals so it can tick.
            await asyncio.sleep(0.06)
            assert runner._external_drain_active is True

            dc.clear_drain_request()
            await asyncio.sleep(0.06)
            assert runner._external_drain_active is False

            # NOTE(review): presumably the watcher loop ends on its own once
            # _running is False — confirm against GatewayRunner.
            runner._running = False
            await asyncio.sleep(0.04)
        finally:
            # Always reap the task, whether or not the assertions passed.
            # Cancelling an already-finished task is a no-op.
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# New-turn accept gate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestNewTurnGate:
    """Behaviour of message handling and running agents while drain is active."""

    @pytest.mark.asyncio
    async def test_new_turn_refused_during_external_drain(self):
        runner, _adapter = _drain_runner()
        runner._external_drain_active = True

        incoming = MessageEvent(
            text="hello",
            message_type=MessageType.TEXT,
            source=make_restart_source(),
            message_id="m1",
        )
        reply = await runner._handle_message(incoming)

        # The handler must answer with text that mentions draining.
        assert reply is not None
        assert "draining" in reply.lower()

    @pytest.mark.asyncio
    async def test_in_flight_turn_not_interrupted_by_drain(self):
        # Entering drain must NOT touch the running-agents set.
        runner, _adapter = _drain_runner()
        in_flight = MagicMock()
        runner._running_agents["k"] = in_flight

        runner._enter_external_drain()

        # The agent is still registered and was never asked to interrupt.
        assert runner._running_agents.get("k") is in_flight
        in_flight.interrupt.assert_not_called()
|