mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
refactor(gateway): dedupe drain-timeout resolution + share active_agents parse
Follow-up cleanups on top of the busy/idle readout (PR #50103): - web_server.py /api/status reused the single drain-timeout resolver hermes_cli.gateway._get_restart_drain_timeout() (HERMES_RESTART_DRAIN_TIMEOUT env -> agent.restart_drain_timeout config -> default) instead of inlining a third hand-rolled copy of that precedence chain. Also fixes a subtle divergence: the inline copy used os.environ.get() so a set-but-empty env var was treated as a value rather than falling through to config; the shared resolver .strip()s and falls through correctly. - Added gateway.status.parse_active_agents() and routed BOTH HTTP surfaces (/api/status and /health/detailed) through it, so the exposed active_agents field is consistently clamped non-negative. Previously /api/status clamped while /health/detailed exposed the raw file value, diverging on a corrupt count. - Added TestParseActiveAgents covering the shared coercion contract.
This commit is contained in:
parent
0ee75469d7
commit
b577f25100
4 changed files with 55 additions and 19 deletions
|
|
@ -73,6 +73,7 @@ from gateway.status import (
|
|||
derive_gateway_drainable,
|
||||
get_running_pid,
|
||||
get_runtime_status_running_pid,
|
||||
parse_active_agents,
|
||||
read_runtime_status,
|
||||
)
|
||||
from utils import env_var_enabled
|
||||
|
|
@ -1843,12 +1844,7 @@ async def get_status(profile: Optional[str] = None):
|
|||
# liveness via the single shared contract in gateway.status. Liveness
|
||||
# keys off gateway_running (a live PID/health probe), NEVER
|
||||
# gateway_updated_at — a healthy idle gateway never advances that.
|
||||
active_agents = 0
|
||||
if runtime:
|
||||
try:
|
||||
active_agents = max(0, int(runtime.get("active_agents", 0) or 0))
|
||||
except (TypeError, ValueError):
|
||||
active_agents = 0
|
||||
active_agents = parse_active_agents(runtime.get("active_agents", 0)) if runtime else 0
|
||||
gateway_busy = derive_gateway_busy(
|
||||
gateway_running=gateway_running,
|
||||
gateway_state=gateway_state,
|
||||
|
|
@ -1859,19 +1855,15 @@ async def get_status(profile: Optional[str] = None):
|
|||
gateway_state=gateway_state,
|
||||
)
|
||||
# Resolved drain timeout (seconds) so NAS can size its poll deadline
|
||||
# without out-of-band knowledge. Mirrors gateway/restart.py precedence:
|
||||
# HERMES_RESTART_DRAIN_TIMEOUT env override → config agent.* → default.
|
||||
from gateway.restart import parse_restart_drain_timeout
|
||||
# without out-of-band knowledge. Reuse the single resolver
|
||||
# (HERMES_RESTART_DRAIN_TIMEOUT env → config agent.restart_drain_timeout
|
||||
# → default) rather than re-deriving the precedence chain here.
|
||||
try:
|
||||
from hermes_cli.gateway import _get_restart_drain_timeout
|
||||
|
||||
_drain_timeout_raw = os.environ.get("HERMES_RESTART_DRAIN_TIMEOUT")
|
||||
if _drain_timeout_raw is None:
|
||||
try:
|
||||
_drain_timeout_raw = cfg_get(
|
||||
load_config(), "agent", "restart_drain_timeout", default=None
|
||||
)
|
||||
except Exception:
|
||||
_drain_timeout_raw = None
|
||||
restart_drain_timeout = parse_restart_drain_timeout(_drain_timeout_raw)
|
||||
restart_drain_timeout = _get_restart_drain_timeout()
|
||||
except Exception:
|
||||
restart_drain_timeout = None
|
||||
|
||||
# Dashboard auth gate (Phase 7): surface whether the gate is engaged
|
||||
# and which providers are registered so ``hermes status`` and the
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue