From 4d7bb382b08d1d3b6a3e70869a6ffcc143efebde Mon Sep 17 00:00:00 2001
From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
Date: Sun, 21 Jun 2026 16:43:13 +0530
Subject: [PATCH] refactor(gateway): route all active_agents coercion through
 parse_active_agents; harden drain-timeout fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Second cleanup pass (simplify-code review of the first follow-up):

- write_runtime_status now clamps active_agents via parse_active_agents
  instead of an inline max(0, int(...)). Removes the duplicated clamp the
  helper's docstring acknowledged AND closes a write-side ValueError gap
  (a non-numeric active_agents previously raised; now degrades to 0).
- hermes_cli/gateway.py draining-status line routes its active-agents
  count through parse_active_agents too — the third coercion site of the
  same persisted field, now consistent and non-raising with the two HTTP
  surfaces.
- web_server.py /api/status: the drain-timeout resolver fallback now
  catches ImportError specifically and falls back to
  DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT (a real float) instead of a
  blanket 'except Exception -> None'. None would have violated the
  surfaced field's int/float contract and stripped NAS's poll-deadline
  hint silently.
- Dropped a redundant 'if runtime else 0' branch (parse_active_agents
  already handles the empty/None case) and tightened the
  parse_active_agents docstring to describe the actual single-contract
  role (write + both reads).

--- gateway/status.py | 12 ++++++------ hermes_cli/gateway.py | 4 +++- hermes_cli/web_server.py | 10 +++++++--- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/gateway/status.py b/gateway/status.py index b925571c96d..c13752af171 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -595,7 +595,7 @@ def write_runtime_status( if restart_requested is not _UNSET: payload["restart_requested"] = bool(restart_requested) if active_agents is not _UNSET: - payload["active_agents"] = max(0, int(active_agents)) + payload["active_agents"] = parse_active_agents(active_agents) if served_profiles is not _UNSET: # Profiles this gateway multiplexes (multi-profile mode). Absent/empty # for a single-profile gateway. Lets `hermes status` show per-profile @@ -624,11 +624,11 @@ def read_runtime_status() -> Optional[dict[str, Any]]: def parse_active_agents(raw: Any) -> int: """Coerce a persisted ``active_agents`` value to a clamped non-negative int. - The status file is written atomically but can still hold an - absent/None/garbage ``active_agents`` after a partial write or a manual - edit. Both HTTP surfaces (``/api/status`` and ``/health/detailed``) read it - through this single helper so the field they expose is consistent and never - negative. Mirrors the write-side clamp in ``write_runtime_status``. + The shared coercion for the in-flight gateway-turn count. Used on the WRITE + side (``write_runtime_status``) and by both HTTP read surfaces + (``/api/status`` and ``/health/detailed``) so the count is clamped to a + single contract — never negative, never raising on a manually-edited or + otherwise non-numeric value (degrades to ``0``). 
""" try: return max(0, int(raw)) diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index cf65af98c40..34f7b96a984 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -4573,7 +4573,9 @@ def _runtime_health_lines() -> list[str]: lines.append(f"⚠ Last startup issue: {exit_reason}") elif gateway_state == "draining": action = "restart" if restart_requested else "shutdown" - count = int(active_agents or 0) + from gateway.status import parse_active_agents + + count = parse_active_agents(active_agents) lines.append(f"⏳ Gateway draining for {action} ({count} active agent(s))") elif gateway_state == "stopped" and exit_reason: lines.append(f"⚠ Last shutdown reason: {exit_reason}") diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 8e1e0e72124..74ea8182533 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -1844,7 +1844,7 @@ async def get_status(profile: Optional[str] = None): # liveness via the single shared contract in gateway.status. Liveness # keys off gateway_running (a live PID/health probe), NEVER # gateway_updated_at — a healthy idle gateway never advances that. - active_agents = parse_active_agents(runtime.get("active_agents", 0)) if runtime else 0 + active_agents = parse_active_agents((runtime or {}).get("active_agents", 0)) gateway_busy = derive_gateway_busy( gateway_running=gateway_running, gateway_state=gateway_state, @@ -1862,8 +1862,12 @@ async def get_status(profile: Optional[str] = None): from hermes_cli.gateway import _get_restart_drain_timeout restart_drain_timeout = _get_restart_drain_timeout() - except Exception: - restart_drain_timeout = None + except ImportError: + # Resolver moved/renamed — fall back to the real default so the + # field stays a numeric poll-deadline hint, never None. 
+ from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT + + restart_drain_timeout = DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT # Dashboard auth gate (Phase 7): surface whether the gate is engaged # and which providers are registered so ``hermes status`` and the