diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py index 8d67aec85c4..aa968dcb98c 100644 --- a/gateway/platforms/api_server.py +++ b/gateway/platforms/api_server.py @@ -1106,12 +1106,13 @@ class APIServerAdapter(BasePlatformAdapter): from gateway.status import ( derive_gateway_busy, derive_gateway_drainable, + parse_active_agents, read_runtime_status, ) runtime = read_runtime_status() or {} gw_state = runtime.get("gateway_state") - gw_active = runtime.get("active_agents", 0) + gw_active = parse_active_agents(runtime.get("active_agents", 0)) # This endpoint is served BY the gateway process, so it is by definition # alive — gateway_running is True. Derive busy/drainable from the same # shared contract /api/status uses so the two surfaces never disagree. diff --git a/gateway/status.py b/gateway/status.py index d5f956a6cd6..b925571c96d 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -621,6 +621,21 @@ def read_runtime_status() -> Optional[dict[str, Any]]: return _read_json_file(_get_runtime_status_path()) +def parse_active_agents(raw: Any) -> int: + """Coerce a persisted ``active_agents`` value to a clamped non-negative int. + + The status file is written atomically but can still hold an + absent/None/garbage ``active_agents`` after a partial write or a manual + edit. Both HTTP surfaces (``/api/status`` and ``/health/detailed``) read it + through this single helper so the field they expose is consistent and never + negative. Mirrors the write-side clamp in ``write_runtime_status``. + """ + try: + return max(0, int(raw)) + except (TypeError, ValueError): + return 0 + + # States in which the gateway is alive and could be asked to drain. Anything # else (draining already, stopping, stopped, startup_failed, None) is NOT a # valid begin-drain target. diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 487ba7a3538..8e1e0e72124 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -73,6 +73,7 @@ from gateway.status import ( derive_gateway_drainable, get_running_pid, get_runtime_status_running_pid, + parse_active_agents, read_runtime_status, ) from utils import env_var_enabled @@ -1843,12 +1844,7 @@ async def get_status(profile: Optional[str] = None): # liveness via the single shared contract in gateway.status. Liveness # keys off gateway_running (a live PID/health probe), NEVER # gateway_updated_at — a healthy idle gateway never advances that. - active_agents = 0 - if runtime: - try: - active_agents = max(0, int(runtime.get("active_agents", 0) or 0)) - except (TypeError, ValueError): - active_agents = 0 + active_agents = parse_active_agents(runtime.get("active_agents", 0)) if runtime else 0 gateway_busy = derive_gateway_busy( gateway_running=gateway_running, gateway_state=gateway_state, @@ -1859,19 +1855,15 @@ async def get_status(profile: Optional[str] = None): gateway_state=gateway_state, ) # Resolved drain timeout (seconds) so NAS can size its poll deadline - # without out-of-band knowledge. Mirrors gateway/restart.py precedence: - # HERMES_RESTART_DRAIN_TIMEOUT env override → config agent.* → default. - from gateway.restart import parse_restart_drain_timeout + # without out-of-band knowledge. Reuse the single resolver + # (HERMES_RESTART_DRAIN_TIMEOUT env → config agent.restart_drain_timeout + # → default) rather than re-deriving the precedence chain here. + try: + from hermes_cli.gateway import _get_restart_drain_timeout - _drain_timeout_raw = os.environ.get("HERMES_RESTART_DRAIN_TIMEOUT") - if _drain_timeout_raw is None: - try: - _drain_timeout_raw = cfg_get( - load_config(), "agent", "restart_drain_timeout", default=None - ) - except Exception: - _drain_timeout_raw = None - restart_drain_timeout = parse_restart_drain_timeout(_drain_timeout_raw) + restart_drain_timeout = _get_restart_drain_timeout() + except Exception: + restart_drain_timeout = None # Dashboard auth gate (Phase 7): surface whether the gate is engaged # and which providers are registered so ``hermes status`` and the diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 22f92c81ef4..63f90fe3332 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -1093,6 +1093,34 @@ class TestCorruptStatusFiles: assert status._read_pid_record(p) == {"pid": 4242} +class TestParseActiveAgents: + """The shared read-side coercion used by BOTH HTTP surfaces (/api/status + and /health/detailed) so the exposed active_agents field is consistent and + never negative regardless of what the status file holds.""" + + def test_valid_int_passthrough(self): + assert status.parse_active_agents(3) == 3 + + def test_zero(self): + assert status.parse_active_agents(0) == 0 + + def test_numeric_string_coerced(self): + assert status.parse_active_agents("5") == 5 + + def test_negative_clamped_to_zero(self): + assert status.parse_active_agents(-3) == 0 + + def test_none_degrades_to_zero(self): + assert status.parse_active_agents(None) == 0 + + def test_garbage_string_degrades_to_zero(self): + assert status.parse_active_agents("garbage") == 0 + + def test_float_truncates(self): + # int() truncation, then clamp — never raises. + assert status.parse_active_agents(2.9) == 2 + + class TestActiveAgentsTurnBoundaryWrite: """The load-bearing Phase 1a contract: writing the in-flight count at a turn boundary must PRESERVE the lifecycle gateway_state. The whole readout