refactor(gateway): route all active_agents coercion through parse_active_agents; harden drain-timeout fallback

Second cleanup pass (simplify-code review of the first follow-up):

- write_runtime_status now clamps active_agents via parse_active_agents
  instead of an inline max(0, int(...)). Removes the duplicated clamp the
  helper's docstring acknowledged AND closes a write-side ValueError gap
  (a non-numeric active_agents previously raised; now degrades to 0).
- hermes_cli/gateway.py draining-status line routes its active-agents count
  through parse_active_agents too — the third coercion site of the same
  persisted field, now non-raising and consistent with the two HTTP surfaces.
- web_server.py /api/status: the drain-timeout resolver fallback now catches
  ImportError specifically and falls back to DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
  (a real float) instead of using a blanket 'except Exception' that set the
  value to None. None would have violated the surfaced field's int/float
  contract and silently stripped NAS's poll-deadline hint.
- Dropped a redundant 'if runtime else 0' branch (parse_active_agents already
  handles the empty/None case) and tightened the parse_active_agents docstring
  to describe the actual single-contract role (write + both reads).
This commit is contained in:
kshitijk4poor 2026-06-21 16:43:13 +05:30
parent b577f25100
commit 4d7bb382b0
3 changed files with 16 additions and 10 deletions

View file

@ -595,7 +595,7 @@ def write_runtime_status(
if restart_requested is not _UNSET:
payload["restart_requested"] = bool(restart_requested)
if active_agents is not _UNSET:
payload["active_agents"] = max(0, int(active_agents))
payload["active_agents"] = parse_active_agents(active_agents)
if served_profiles is not _UNSET:
# Profiles this gateway multiplexes (multi-profile mode). Absent/empty
# for a single-profile gateway. Lets `hermes status` show per-profile
@ -624,11 +624,11 @@ def read_runtime_status() -> Optional[dict[str, Any]]:
def parse_active_agents(raw: Any) -> int:
"""Coerce a persisted ``active_agents`` value to a clamped non-negative int.
The status file is written atomically but can still hold an
absent/None/garbage ``active_agents`` after a partial write or a manual
edit. Both HTTP surfaces (``/api/status`` and ``/health/detailed``) read it
through this single helper so the field they expose is consistent and never
negative. Mirrors the write-side clamp in ``write_runtime_status``.
The shared coercion for the in-flight gateway-turn count. Used on the WRITE
side (``write_runtime_status``) and by both HTTP read surfaces
(``/api/status`` and ``/health/detailed``) so the count is held to a
single contract: never negative, and never raising on a manually-edited or
otherwise non-numeric value (degrades to ``0``).
"""
try:
return max(0, int(raw))

View file

@ -4573,7 +4573,9 @@ def _runtime_health_lines() -> list[str]:
lines.append(f"⚠ Last startup issue: {exit_reason}")
elif gateway_state == "draining":
action = "restart" if restart_requested else "shutdown"
count = int(active_agents or 0)
from gateway.status import parse_active_agents
count = parse_active_agents(active_agents)
lines.append(f"⏳ Gateway draining for {action} ({count} active agent(s))")
elif gateway_state == "stopped" and exit_reason:
lines.append(f"⚠ Last shutdown reason: {exit_reason}")

View file

@ -1844,7 +1844,7 @@ async def get_status(profile: Optional[str] = None):
# liveness via the single shared contract in gateway.status. Liveness
# keys off gateway_running (a live PID/health probe), NEVER
# gateway_updated_at — a healthy idle gateway never advances that.
active_agents = parse_active_agents(runtime.get("active_agents", 0)) if runtime else 0
active_agents = parse_active_agents((runtime or {}).get("active_agents", 0))
gateway_busy = derive_gateway_busy(
gateway_running=gateway_running,
gateway_state=gateway_state,
@ -1862,8 +1862,12 @@ async def get_status(profile: Optional[str] = None):
from hermes_cli.gateway import _get_restart_drain_timeout
restart_drain_timeout = _get_restart_drain_timeout()
except Exception:
restart_drain_timeout = None
except ImportError:
# Resolver moved/renamed — fall back to the real default so the
# field stays a numeric poll-deadline hint, never None.
from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
restart_drain_timeout = DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
# Dashboard auth gate (Phase 7): surface whether the gate is engaged
# and which providers are registered so ``hermes status`` and the