feat(dashboard): surface gateway busy/drainable on /api/status

Give an external consumer (NAS) a trustworthy, always-reachable busy/idle
readout it can poll before a disruptive lifecycle action (restart,
migrate, stop, auto-update). The dashboard /api/status is the only HTTP
surface guaranteed up on a hosted agent regardless of which gateway
platforms are enabled, and it already reads gateway_state.json.

Add to /api/status (additive, non-breaking):
  - active_agents       — in-flight gateway-turn count (now refreshed
                          per-turn by the companion gateway-side commit)
  - gateway_busy        — live, in the running state, AND active_agents > 0
  - gateway_drainable   — running and live (a valid begin-drain target)
  - restart_drain_timeout — resolved seconds, so the consumer can size its
                          poll deadline without out-of-band knowledge
                          (env HERMES_RESTART_DRAIN_TIMEOUT → config
                          agent.restart_drain_timeout → default)

The busy/drainable contract is defined once in gateway.status
(derive_gateway_busy / derive_gateway_drainable) and consumed by both
/api/status and /health/detailed so the two surfaces can never disagree.
Liveness keys off gateway_running (a live PID/health probe), NEVER
gateway_updated_at — a healthy idle gateway never advances that timestamp.
All derived fields degrade to safe falsy values when the gateway is down
or the status file is absent/corrupt (never a spurious "busy" that would
wedge the consumer). active_sessions (the 5-min DB recency heuristic the
SPA reads) is left exactly as-is — new signal, new fields.

Tests (behaviour contracts, not snapshots): the pure derivation contract
across every running/state/count/liveness combination; /api/status
integration for busy, idle-drainable, draining, down, stale-busy-file,
corrupt-count, and timeout surfacing; and /health/detailed parity.
This commit is contained in:
Ben 2026-06-21 20:17:53 +10:00 committed by kshitijk4poor
parent 51a338a1b6
commit 0ee75469d7
6 changed files with 302 additions and 3 deletions

View file

@ -1103,16 +1103,34 @@ class APIServerAdapter(BasePlatformAdapter):
dashboard can display full status without needing a shared PID file or
/proc access. No authentication required.
"""
from gateway.status import read_runtime_status
from gateway.status import (
derive_gateway_busy,
derive_gateway_drainable,
read_runtime_status,
)
runtime = read_runtime_status() or {}
gw_state = runtime.get("gateway_state")
gw_active = runtime.get("active_agents", 0)
# This endpoint is served BY the gateway process, so it is by definition
# alive — gateway_running is True. Derive busy/drainable from the same
# shared contract /api/status uses so the two surfaces never disagree.
return web.json_response({
"status": "ok",
"platform": "hermes-agent",
"version": _hermes_version(),
"gateway_state": runtime.get("gateway_state"),
"gateway_state": gw_state,
"platforms": runtime.get("platforms", {}),
"active_agents": runtime.get("active_agents", 0),
"active_agents": gw_active,
"gateway_busy": derive_gateway_busy(
gateway_running=True,
gateway_state=gw_state,
active_agents=gw_active,
),
"gateway_drainable": derive_gateway_drainable(
gateway_running=True,
gateway_state=gw_state,
),
"exit_reason": runtime.get("exit_reason"),
"updated_at": runtime.get("updated_at"),
"pid": os.getpid(),

View file

@ -621,6 +621,49 @@ def read_runtime_status() -> Optional[dict[str, Any]]:
return _read_json_file(_get_runtime_status_path())
# States in which the gateway is alive and could be asked to drain. Anything
# else (draining already, stopping, stopped, startup_failed, None) is NOT a
# valid begin-drain target.
_DRAINABLE_GATEWAY_STATES = frozenset({"running"})


def _is_drainable_state(gateway_state: Any) -> bool:
    """Return True iff *gateway_state* names a drainable gateway state.

    The value is typed ``Any`` because callers pass it through unvalidated.
    A bare ``x in frozenset`` test raises ``TypeError`` for unhashable values
    (e.g. a list or dict), so only strings are allowed to match; everything
    else reads as "not drainable" instead of raising.
    """
    return isinstance(gateway_state, str) and gateway_state in _DRAINABLE_GATEWAY_STATES


def derive_gateway_busy(
    *, gateway_running: bool, gateway_state: Any, active_agents: Any
) -> bool:
    """Whether the gateway is actively processing in-flight turns.

    The contract NAS gates lifecycle actions on. Busy iff the gateway is live
    (``gateway_running``), in the ``running`` state, AND at least one agent is
    mid-turn (``active_agents > 0``). Degrades to ``False`` whenever liveness
    is unknown, the state is anything but ``running``, or the count is
    absent/unparseable — i.e. a down or file-absent gateway reads "not busy",
    never a spurious "busy".

    NOTE: liveness keys off ``gateway_running`` (a live PID / health probe),
    NEVER ``updated_at`` — a healthy idle gateway never advances that timestamp.

    Args:
        gateway_running: Result of the caller's liveness probe.
        gateway_state: Raw state value; any non-string or unknown value is
            treated as "not running".
        active_agents: Raw in-flight turn count; anything ``int()`` cannot
            convert is treated as zero turns.

    Returns:
        ``True`` only when all three conditions above hold, else ``False``.
        Never raises on malformed input.
    """
    if not gateway_running:
        return False
    if not _is_drainable_state(gateway_state):
        return False
    try:
        return int(active_agents) > 0
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / arbitrary objects; ValueError: non-numeric text or
        # NaN; OverflowError: +/-infinity (which JSON decoding can yield).
        return False


def derive_gateway_drainable(*, gateway_running: bool, gateway_state: Any) -> bool:
    """Whether the gateway can accept a begin-drain request right now.

    True iff the gateway is live and in the ``running`` state — i.e. not already
    draining/stopping/stopped and not in a failed-start state. This is
    independent of ``active_agents``: an idle running gateway is drainable (the
    drain just completes immediately). Degrades to ``False`` for a down or
    non-running gateway, and for a malformed (non-string) state value.

    Args:
        gateway_running: Result of the caller's liveness probe.
        gateway_state: Raw state value; any non-string or unknown value is
            treated as "not running".

    Returns:
        ``True`` when the gateway is a valid begin-drain target, else ``False``.
    """
    return bool(gateway_running) and _is_drainable_state(gateway_state)
def get_runtime_status_running_pid(
runtime: Optional[dict[str, Any]] = None,
) -> Optional[int]:

View file

@ -69,6 +69,8 @@ from hermes_cli.memory_providers import (
get_memory_provider,
)
from gateway.status import (
derive_gateway_busy,
derive_gateway_drainable,
get_running_pid,
get_runtime_status_running_pid,
read_runtime_status,
@ -1835,6 +1837,42 @@ async def get_status(profile: Optional[str] = None):
except Exception:
pass
# Busy/drainable readout (NAS lifecycle-safety gate). active_agents is
# the in-flight gateway-turn count the gateway now persists at every
# turn boundary; gateway_busy/gateway_drainable are derived from it +
# liveness via the single shared contract in gateway.status. Liveness
# keys off gateway_running (a live PID/health probe), NEVER
# gateway_updated_at — a healthy idle gateway never advances that.
active_agents = 0
if runtime:
try:
active_agents = max(0, int(runtime.get("active_agents", 0) or 0))
except (TypeError, ValueError):
active_agents = 0
gateway_busy = derive_gateway_busy(
gateway_running=gateway_running,
gateway_state=gateway_state,
active_agents=active_agents,
)
gateway_drainable = derive_gateway_drainable(
gateway_running=gateway_running,
gateway_state=gateway_state,
)
# Resolved drain timeout (seconds) so NAS can size its poll deadline
# without out-of-band knowledge. Mirrors gateway/restart.py precedence:
# HERMES_RESTART_DRAIN_TIMEOUT env override → config agent.* → default.
from gateway.restart import parse_restart_drain_timeout
_drain_timeout_raw = os.environ.get("HERMES_RESTART_DRAIN_TIMEOUT")
if _drain_timeout_raw is None:
try:
_drain_timeout_raw = cfg_get(
load_config(), "agent", "restart_drain_timeout", default=None
)
except Exception:
_drain_timeout_raw = None
restart_drain_timeout = parse_restart_drain_timeout(_drain_timeout_raw)
# Dashboard auth gate (Phase 7): surface whether the gate is engaged
# and which providers are registered so ``hermes status`` and the
# SPA's StatusPage can show "OAuth gate ON via Nous Research" or
@ -1863,6 +1901,10 @@ async def get_status(profile: Optional[str] = None):
"gateway_platforms": gateway_platforms,
"gateway_exit_reason": gateway_exit_reason,
"gateway_updated_at": gateway_updated_at,
"active_agents": active_agents,
"gateway_busy": gateway_busy,
"gateway_drainable": gateway_drainable,
"restart_drain_timeout": restart_drain_timeout,
"active_sessions": active_sessions,
"auth_required": auth_required,
"auth_providers": auth_providers,

View file

@ -584,6 +584,10 @@ class TestHealthDetailedEndpoint:
assert data["gateway_state"] == "running"
assert data["platforms"] == {"telegram": {"state": "connected"}}
assert data["active_agents"] == 2
# Derived busy/drainable: this endpoint is served BY the live
# gateway, so running + 2 agents ⇒ busy and drainable.
assert data["gateway_busy"] is True
assert data["gateway_drainable"] is True
assert isinstance(data["pid"], int)
assert "updated_at" in data
@ -599,6 +603,9 @@ class TestHealthDetailedEndpoint:
assert data["status"] == "ok"
assert data["gateway_state"] is None
assert data["platforms"] == {}
# No runtime file ⇒ state None ⇒ not busy, not drainable.
assert data["gateway_busy"] is False
assert data["gateway_drainable"] is False
@pytest.mark.asyncio
async def test_health_detailed_does_not_require_auth(self, auth_adapter):

View file

@ -1132,4 +1132,50 @@ class TestActiveAgentsTurnBoundaryWrite:
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
status.write_runtime_status(gateway_state="running", active_agents=-5)
assert status.read_runtime_status()["active_agents"] == 0
class TestGatewayBusyDerivation:
    """Contract tests for ``derive_gateway_busy`` / ``derive_gateway_drainable``.

    These helpers are the single shared definition that both /api/status and
    /health/detailed consume, so they are exercised here as pure functions.
    """

    def test_busy_requires_running_state_and_positive_count(self):
        with_one_turn = status.derive_gateway_busy(
            gateway_running=True, gateway_state="running", active_agents=1
        )
        assert with_one_turn is True

        with_no_turns = status.derive_gateway_busy(
            gateway_running=True, gateway_state="running", active_agents=0
        )
        assert with_no_turns is False

    def test_busy_false_when_not_live_even_if_file_says_active(self):
        # Liveness wins: gateway_running False ⇒ never busy, whatever the count.
        verdict = status.derive_gateway_busy(
            gateway_running=False, gateway_state="running", active_agents=9
        )
        assert verdict is False

    def test_busy_false_for_non_running_states(self):
        non_running = ("draining", "stopping", "stopped", "startup_failed", None)
        for candidate in non_running:
            verdict = status.derive_gateway_busy(
                gateway_running=True, gateway_state=candidate, active_agents=5
            )
            assert verdict is False, candidate

    def test_busy_degrades_on_unparseable_count(self):
        for bad_count in (None, "garbage", object()):
            verdict = status.derive_gateway_busy(
                gateway_running=True,
                gateway_state="running",
                active_agents=bad_count,
            )
            assert verdict is False

    def test_drainable_is_running_and_live_independent_of_count(self):
        # An idle running gateway is a valid drain target but is NOT busy.
        drainable = status.derive_gateway_drainable(
            gateway_running=True, gateway_state="running"
        )
        assert drainable is True

        busy = status.derive_gateway_busy(
            gateway_running=True, gateway_state="running", active_agents=0
        )
        assert busy is False

    def test_drainable_false_when_down_or_not_running(self):
        when_down = status.derive_gateway_drainable(
            gateway_running=False, gateway_state="running"
        )
        assert when_down is False

        for candidate in ("draining", "stopped", None):
            verdict = status.derive_gateway_drainable(
                gateway_running=True, gateway_state=candidate
            )
            assert verdict is False, candidate

View file

@ -4271,6 +4271,149 @@ class TestStatusRemoteGateway:
assert data["gateway_state"] == "running"
class TestGatewayBusyReadout:
    """Behaviour contracts for the NAS busy/drainable readout on /api/status.

    Not snapshots: each test asserts how gateway_busy / gateway_drainable must
    RELATE to gateway_running + gateway_state + active_agents, and that every
    field degrades to a safe falsy value when the gateway is down or its status
    file is absent. Liveness must key off gateway_running, NEVER
    gateway_updated_at.
    """

    @pytest.fixture(autouse=True)
    def _setup_test_client(self):
        try:
            from starlette.testclient import TestClient
        except ImportError:
            pytest.skip("fastapi/starlette not installed")
        from hermes_cli.web_server import app, _SESSION_HEADER_NAME, _SESSION_TOKEN

        client = TestClient(app)
        client.headers[_SESSION_HEADER_NAME] = _SESSION_TOKEN
        self.client = client

    def _status_payload(self):
        """GET /api/status through the test client and return the decoded JSON."""
        return self.client.get("/api/status").json()

    def test_busy_when_running_with_active_agents(self, monkeypatch):
        """gateway_busy is True iff running AND active_agents > 0."""
        import hermes_cli.web_server as ws

        def fake_runtime():
            return {
                "gateway_state": "running",
                "platforms": {},
                "active_agents": 2,
                # A deliberately stale timestamp: busy must NOT depend on it.
                "updated_at": "2020-01-01T00:00:00+00:00",
            }

        monkeypatch.setattr(ws, "get_running_pid", lambda: 1234)
        monkeypatch.setattr(ws, "read_runtime_status", fake_runtime)

        payload = self._status_payload()
        assert payload["active_agents"] == 2
        assert payload["gateway_busy"] is True
        assert payload["gateway_drainable"] is True

    def test_idle_running_is_drainable_but_not_busy(self, monkeypatch):
        """A running gateway with zero in-flight turns is drainable, not busy."""
        import hermes_cli.web_server as ws

        def fake_runtime():
            return {
                "gateway_state": "running",
                "platforms": {},
                "active_agents": 0,
            }

        monkeypatch.setattr(ws, "get_running_pid", lambda: 1234)
        monkeypatch.setattr(ws, "read_runtime_status", fake_runtime)

        payload = self._status_payload()
        assert payload["active_agents"] == 0
        assert payload["gateway_busy"] is False
        assert payload["gateway_drainable"] is True

    def test_draining_state_is_neither_busy_nor_drainable(self, monkeypatch):
        """While draining, the gateway is not a fresh begin-drain target, and
        busy is False even with a stale active_agents>0 in the file — the state
        gate dominates."""
        import hermes_cli.web_server as ws

        def fake_runtime():
            return {
                "gateway_state": "draining",
                "platforms": {},
                "active_agents": 3,
            }

        monkeypatch.setattr(ws, "get_running_pid", lambda: 1234)
        monkeypatch.setattr(ws, "read_runtime_status", fake_runtime)

        payload = self._status_payload()
        assert payload["gateway_busy"] is False
        assert payload["gateway_drainable"] is False

    def test_down_gateway_degrades_to_safe_falsy(self, monkeypatch):
        """Gateway down (no PID, no remote probe): busy/drainable False,
        active_agents 0 — never a spurious busy that would wedge NAS."""
        import hermes_cli.web_server as ws

        monkeypatch.setattr(ws, "get_running_pid", lambda: None)
        monkeypatch.setattr(ws, "read_runtime_status", lambda: None)
        monkeypatch.setattr(ws, "_GATEWAY_HEALTH_URL", None)

        payload = self._status_payload()
        assert payload["gateway_running"] is False
        assert payload["active_agents"] == 0
        assert payload["gateway_busy"] is False
        assert payload["gateway_drainable"] is False

    def test_down_gateway_with_stale_busy_file_still_not_busy(self, monkeypatch):
        """A leftover status file claiming running + active_agents>0 must NOT
        read as busy when the live PID probe says the gateway is down. Liveness
        wins over the file."""
        import hermes_cli.web_server as ws

        def stale_runtime():
            return {
                "gateway_state": "running",
                "platforms": {},
                "active_agents": 5,
            }

        monkeypatch.setattr(ws, "get_running_pid", lambda: None)
        monkeypatch.setattr(ws, "_GATEWAY_HEALTH_URL", None)
        # File says running with active turns, but get_running_pid()==None and
        # get_runtime_status_running_pid finds no live PID → gateway_running False.
        monkeypatch.setattr(
            ws, "get_runtime_status_running_pid", lambda *_a, **_k: None
        )
        monkeypatch.setattr(ws, "read_runtime_status", stale_runtime)

        payload = self._status_payload()
        assert payload["gateway_running"] is False
        assert payload["gateway_busy"] is False
        assert payload["gateway_drainable"] is False

    def test_restart_drain_timeout_surfaced_and_numeric(self, monkeypatch):
        """restart_drain_timeout is present and resolves to a non-negative
        float so NAS can size its poll deadline without out-of-band knowledge."""
        import hermes_cli.web_server as ws

        def fake_runtime():
            return {
                "gateway_state": "running",
                "platforms": {},
                "active_agents": 0,
            }

        monkeypatch.setattr(ws, "get_running_pid", lambda: 1234)
        monkeypatch.setattr(ws, "read_runtime_status", fake_runtime)
        monkeypatch.setenv("HERMES_RESTART_DRAIN_TIMEOUT", "90")

        payload = self._status_payload()
        assert "restart_drain_timeout" in payload
        assert isinstance(payload["restart_drain_timeout"], (int, float))
        assert payload["restart_drain_timeout"] == 90.0

    def test_active_agents_unparseable_in_file_degrades_to_zero(self, monkeypatch):
        """A corrupt active_agents value in the status file must not 500 or
        produce a spurious busy — it degrades to 0/not-busy."""
        import hermes_cli.web_server as ws

        def corrupt_runtime():
            return {
                "gateway_state": "running",
                "platforms": {},
                "active_agents": "garbage",
            }

        monkeypatch.setattr(ws, "get_running_pid", lambda: 1234)
        monkeypatch.setattr(ws, "read_runtime_status", corrupt_runtime)

        payload = self._status_payload()
        assert payload["active_agents"] == 0
        assert payload["gateway_busy"] is False
# ---------------------------------------------------------------------------
# Dashboard theme normaliser tests
# ---------------------------------------------------------------------------