refactor(gateway): dedupe drain-timeout resolution + share active_agents parse

Follow-up cleanups on top of the busy/idle readout (PR #50103):

- web_server.py /api/status reused the single drain-timeout resolver
  hermes_cli.gateway._get_restart_drain_timeout() (HERMES_RESTART_DRAIN_TIMEOUT
  env -> agent.restart_drain_timeout config -> default) instead of inlining a
  third hand-rolled copy of that precedence chain. Also fixes a subtle
  divergence: the inline copy used os.environ.get() so a set-but-empty env var
  was treated as a value rather than falling through to config; the shared
  resolver .strip()s and falls through correctly.
- Added gateway.status.parse_active_agents() and routed BOTH HTTP surfaces
  (/api/status and /health/detailed) through it, so the exposed active_agents
  field is consistently clamped non-negative. Previously /api/status clamped
  while /health/detailed exposed the raw file value, diverging on a corrupt
  count.
- Added TestParseActiveAgents covering the shared coercion contract.
This commit is contained in:
kshitijk4poor 2026-06-21 16:37:42 +05:30
parent 0ee75469d7
commit b577f25100
4 changed files with 55 additions and 19 deletions

View file

@ -1106,12 +1106,13 @@ class APIServerAdapter(BasePlatformAdapter):
from gateway.status import (
derive_gateway_busy,
derive_gateway_drainable,
parse_active_agents,
read_runtime_status,
)
runtime = read_runtime_status() or {}
gw_state = runtime.get("gateway_state")
gw_active = runtime.get("active_agents", 0)
gw_active = parse_active_agents(runtime.get("active_agents", 0))
# This endpoint is served BY the gateway process, so it is by definition
# alive — gateway_running is True. Derive busy/drainable from the same
# shared contract /api/status uses so the two surfaces never disagree.

View file

@ -621,6 +621,21 @@ def read_runtime_status() -> Optional[dict[str, Any]]:
return _read_json_file(_get_runtime_status_path())
def parse_active_agents(raw: Any) -> int:
"""Coerce a persisted ``active_agents`` value to a clamped non-negative int.
The status file is written atomically but can still hold an
absent/None/garbage ``active_agents`` after a partial write or a manual
edit. Both HTTP surfaces (``/api/status`` and ``/health/detailed``) read it
through this single helper so the field they expose is consistent and never
negative. Mirrors the write-side clamp in ``write_runtime_status``.
"""
try:
return max(0, int(raw))
except (TypeError, ValueError):
return 0
# States in which the gateway is alive and could be asked to drain. Anything
# else (draining already, stopping, stopped, startup_failed, None) is NOT a
# valid begin-drain target.

View file

@ -73,6 +73,7 @@ from gateway.status import (
derive_gateway_drainable,
get_running_pid,
get_runtime_status_running_pid,
parse_active_agents,
read_runtime_status,
)
from utils import env_var_enabled
@ -1843,12 +1844,7 @@ async def get_status(profile: Optional[str] = None):
# liveness via the single shared contract in gateway.status. Liveness
# keys off gateway_running (a live PID/health probe), NEVER
# gateway_updated_at — a healthy idle gateway never advances that.
active_agents = 0
if runtime:
try:
active_agents = max(0, int(runtime.get("active_agents", 0) or 0))
except (TypeError, ValueError):
active_agents = 0
active_agents = parse_active_agents(runtime.get("active_agents", 0)) if runtime else 0
gateway_busy = derive_gateway_busy(
gateway_running=gateway_running,
gateway_state=gateway_state,
@ -1859,19 +1855,15 @@ async def get_status(profile: Optional[str] = None):
gateway_state=gateway_state,
)
# Resolved drain timeout (seconds) so NAS can size its poll deadline
# without out-of-band knowledge. Mirrors gateway/restart.py precedence:
# HERMES_RESTART_DRAIN_TIMEOUT env override → config agent.* → default.
from gateway.restart import parse_restart_drain_timeout
# without out-of-band knowledge. Reuse the single resolver
# (HERMES_RESTART_DRAIN_TIMEOUT env → config agent.restart_drain_timeout
# → default) rather than re-deriving the precedence chain here.
try:
from hermes_cli.gateway import _get_restart_drain_timeout
_drain_timeout_raw = os.environ.get("HERMES_RESTART_DRAIN_TIMEOUT")
if _drain_timeout_raw is None:
try:
_drain_timeout_raw = cfg_get(
load_config(), "agent", "restart_drain_timeout", default=None
)
except Exception:
_drain_timeout_raw = None
restart_drain_timeout = parse_restart_drain_timeout(_drain_timeout_raw)
restart_drain_timeout = _get_restart_drain_timeout()
except Exception:
restart_drain_timeout = None
# Dashboard auth gate (Phase 7): surface whether the gate is engaged
# and which providers are registered so ``hermes status`` and the

View file

@ -1093,6 +1093,34 @@ class TestCorruptStatusFiles:
assert status._read_pid_record(p) == {"pid": 4242}
class TestParseActiveAgents:
"""The shared read-side coercion used by BOTH HTTP surfaces (/api/status
and /health/detailed) so the exposed active_agents field is consistent and
never negative regardless of what the status file holds."""
def test_valid_int_passthrough(self):
assert status.parse_active_agents(3) == 3
def test_zero(self):
assert status.parse_active_agents(0) == 0
def test_numeric_string_coerced(self):
assert status.parse_active_agents("5") == 5
def test_negative_clamped_to_zero(self):
assert status.parse_active_agents(-3) == 0
def test_none_degrades_to_zero(self):
assert status.parse_active_agents(None) == 0
def test_garbage_string_degrades_to_zero(self):
assert status.parse_active_agents("garbage") == 0
def test_float_truncates(self):
# int() truncation, then clamp — never raises.
assert status.parse_active_agents(2.9) == 2
class TestActiveAgentsTurnBoundaryWrite:
"""The load-bearing Phase 1a contract: writing the in-flight count at a
turn boundary must PRESERVE the lifecycle gateway_state. The whole readout