mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 10:52:21 +00:00
refactor(gateway): dedupe drain-timeout resolution + share active_agents parse
Follow-up cleanups on top of the busy/idle readout (PR #50103): - web_server.py /api/status reused the single drain-timeout resolver hermes_cli.gateway._get_restart_drain_timeout() (HERMES_RESTART_DRAIN_TIMEOUT env -> agent.restart_drain_timeout config -> default) instead of inlining a third hand-rolled copy of that precedence chain. Also fixes a subtle divergence: the inline copy used os.environ.get() so a set-but-empty env var was treated as a value rather than falling through to config; the shared resolver .strip()s and falls through correctly. - Added gateway.status.parse_active_agents() and routed BOTH HTTP surfaces (/api/status and /health/detailed) through it, so the exposed active_agents field is consistently clamped non-negative. Previously /api/status clamped while /health/detailed exposed the raw file value, diverging on a corrupt count. - Added TestParseActiveAgents covering the shared coercion contract.
This commit is contained in:
parent
0ee75469d7
commit
b577f25100
4 changed files with 55 additions and 19 deletions
|
|
@ -1106,12 +1106,13 @@ class APIServerAdapter(BasePlatformAdapter):
|
|||
from gateway.status import (
|
||||
derive_gateway_busy,
|
||||
derive_gateway_drainable,
|
||||
parse_active_agents,
|
||||
read_runtime_status,
|
||||
)
|
||||
|
||||
runtime = read_runtime_status() or {}
|
||||
gw_state = runtime.get("gateway_state")
|
||||
gw_active = runtime.get("active_agents", 0)
|
||||
gw_active = parse_active_agents(runtime.get("active_agents", 0))
|
||||
# This endpoint is served BY the gateway process, so it is by definition
|
||||
# alive — gateway_running is True. Derive busy/drainable from the same
|
||||
# shared contract /api/status uses so the two surfaces never disagree.
|
||||
|
|
|
|||
|
|
@ -621,6 +621,21 @@ def read_runtime_status() -> Optional[dict[str, Any]]:
|
|||
return _read_json_file(_get_runtime_status_path())
|
||||
|
||||
|
||||
def parse_active_agents(raw: Any) -> int:
|
||||
"""Coerce a persisted ``active_agents`` value to a clamped non-negative int.
|
||||
|
||||
The status file is written atomically but can still hold an
|
||||
absent/None/garbage ``active_agents`` after a partial write or a manual
|
||||
edit. Both HTTP surfaces (``/api/status`` and ``/health/detailed``) read it
|
||||
through this single helper so the field they expose is consistent and never
|
||||
negative. Mirrors the write-side clamp in ``write_runtime_status``.
|
||||
"""
|
||||
try:
|
||||
return max(0, int(raw))
|
||||
except (TypeError, ValueError):
|
||||
return 0
|
||||
|
||||
|
||||
# States in which the gateway is alive and could be asked to drain. Anything
|
||||
# else (draining already, stopping, stopped, startup_failed, None) is NOT a
|
||||
# valid begin-drain target.
|
||||
|
|
|
|||
|
|
@ -73,6 +73,7 @@ from gateway.status import (
|
|||
derive_gateway_drainable,
|
||||
get_running_pid,
|
||||
get_runtime_status_running_pid,
|
||||
parse_active_agents,
|
||||
read_runtime_status,
|
||||
)
|
||||
from utils import env_var_enabled
|
||||
|
|
@ -1843,12 +1844,7 @@ async def get_status(profile: Optional[str] = None):
|
|||
# liveness via the single shared contract in gateway.status. Liveness
|
||||
# keys off gateway_running (a live PID/health probe), NEVER
|
||||
# gateway_updated_at — a healthy idle gateway never advances that.
|
||||
active_agents = 0
|
||||
if runtime:
|
||||
try:
|
||||
active_agents = max(0, int(runtime.get("active_agents", 0) or 0))
|
||||
except (TypeError, ValueError):
|
||||
active_agents = 0
|
||||
active_agents = parse_active_agents(runtime.get("active_agents", 0)) if runtime else 0
|
||||
gateway_busy = derive_gateway_busy(
|
||||
gateway_running=gateway_running,
|
||||
gateway_state=gateway_state,
|
||||
|
|
@ -1859,19 +1855,15 @@ async def get_status(profile: Optional[str] = None):
|
|||
gateway_state=gateway_state,
|
||||
)
|
||||
# Resolved drain timeout (seconds) so NAS can size its poll deadline
|
||||
# without out-of-band knowledge. Mirrors gateway/restart.py precedence:
|
||||
# HERMES_RESTART_DRAIN_TIMEOUT env override → config agent.* → default.
|
||||
from gateway.restart import parse_restart_drain_timeout
|
||||
# without out-of-band knowledge. Reuse the single resolver
|
||||
# (HERMES_RESTART_DRAIN_TIMEOUT env → config agent.restart_drain_timeout
|
||||
# → default) rather than re-deriving the precedence chain here.
|
||||
try:
|
||||
from hermes_cli.gateway import _get_restart_drain_timeout
|
||||
|
||||
_drain_timeout_raw = os.environ.get("HERMES_RESTART_DRAIN_TIMEOUT")
|
||||
if _drain_timeout_raw is None:
|
||||
try:
|
||||
_drain_timeout_raw = cfg_get(
|
||||
load_config(), "agent", "restart_drain_timeout", default=None
|
||||
)
|
||||
except Exception:
|
||||
_drain_timeout_raw = None
|
||||
restart_drain_timeout = parse_restart_drain_timeout(_drain_timeout_raw)
|
||||
restart_drain_timeout = _get_restart_drain_timeout()
|
||||
except Exception:
|
||||
restart_drain_timeout = None
|
||||
|
||||
# Dashboard auth gate (Phase 7): surface whether the gate is engaged
|
||||
# and which providers are registered so ``hermes status`` and the
|
||||
|
|
|
|||
|
|
@ -1093,6 +1093,34 @@ class TestCorruptStatusFiles:
|
|||
assert status._read_pid_record(p) == {"pid": 4242}
|
||||
|
||||
|
||||
class TestParseActiveAgents:
|
||||
"""The shared read-side coercion used by BOTH HTTP surfaces (/api/status
|
||||
and /health/detailed) so the exposed active_agents field is consistent and
|
||||
never negative regardless of what the status file holds."""
|
||||
|
||||
def test_valid_int_passthrough(self):
|
||||
assert status.parse_active_agents(3) == 3
|
||||
|
||||
def test_zero(self):
|
||||
assert status.parse_active_agents(0) == 0
|
||||
|
||||
def test_numeric_string_coerced(self):
|
||||
assert status.parse_active_agents("5") == 5
|
||||
|
||||
def test_negative_clamped_to_zero(self):
|
||||
assert status.parse_active_agents(-3) == 0
|
||||
|
||||
def test_none_degrades_to_zero(self):
|
||||
assert status.parse_active_agents(None) == 0
|
||||
|
||||
def test_garbage_string_degrades_to_zero(self):
|
||||
assert status.parse_active_agents("garbage") == 0
|
||||
|
||||
def test_float_truncates(self):
|
||||
# int() truncation, then clamp — never raises.
|
||||
assert status.parse_active_agents(2.9) == 2
|
||||
|
||||
|
||||
class TestActiveAgentsTurnBoundaryWrite:
|
||||
"""The load-bearing Phase 1a contract: writing the in-flight count at a
|
||||
turn boundary must PRESERVE the lifecycle gateway_state. The whole readout
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue