mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(profiles): detect a separate-process gateway in profile status
The dashboard Profiles view showed "Gateway stopped" for a gateway that
is in fact running — while the sidebar status strip and `hermes gateway
status` (CLI) both correctly showed it running. Reported on v0.17.0
running the gateway + dashboard in one Docker container.
Root cause: three liveness surfaces with three different detection
strengths, all reading the same `gateway.pid`:
- `hermes gateway status` -> find_gateway_pids() (process-table scan)
- sidebar /api/status -> get_running_pid() + gateway_state.json PID
  fallback + health-URL probe
- Profiles view -> _check_gateway_running() = get_running_pid()
  ONLY, no fallback
`get_running_pid()` short-circuits to None the moment the runtime lock
(`gateway.lock`) doesn't register as held by the *calling* process —
which is always the case when the reader is a separate process from the
gateway (the dashboard is its own s6 service in the container). The same
miss occurs for any launch-service-managed gateway that left a fresh
`gateway_state.json` but no live PID file. So the Profiles view alone
reported the live gateway as stopped.
Fix: give _check_gateway_running the same fallback the sidebar already
has — after the pid-file/lock check misses, validate the PID recorded in
that profile's gateway_state.json against the live process table via the
existing get_runtime_status_running_pid(). read_runtime_status() gains an
optional path arg so a profile's state file can be read without mutating
the process-global HERMES_HOME (preserving the contextvar-based profile
isolation the dashboard relies on). Backward compatible: every existing
caller passes no argument.
Tests: a regression test that fails pre-fix (live gateway, lock check
returns None -> must still report running) and a guard test that a
'stopped' state file is never reported running even with a live PID.
This commit is contained in:
parent
fa2f0bf3da
commit
31bced1607
3 changed files with 100 additions and 5 deletions
|
|
@ -660,9 +660,15 @@ def write_runtime_status(
|
|||
_write_json_file(path, payload)
|
||||
|
||||
|
||||
def read_runtime_status() -> Optional[dict[str, Any]]:
|
||||
"""Read the persisted gateway runtime health/status information."""
|
||||
return _read_json_file(_get_runtime_status_path())
|
||||
def read_runtime_status(path: Optional[Path] = None) -> Optional[dict[str, Any]]:
|
||||
"""Read the persisted gateway runtime health/status information.
|
||||
|
||||
``path`` is optional so callers that need to inspect a *different*
|
||||
profile's state file (e.g. the dashboard enumerating every profile)
|
||||
can do so without mutating ``HERMES_HOME`` in-process. Defaults to
|
||||
the active profile's ``gateway_state.json``.
|
||||
"""
|
||||
return _read_json_file(path or _get_runtime_status_path())
|
||||
|
||||
|
||||
def parse_active_agents(raw: Any) -> int:
|
||||
|
|
|
|||
|
|
@ -612,10 +612,34 @@ def _read_config_model(profile_dir: Path) -> tuple:
|
|||
|
||||
|
||||
def _check_gateway_running(profile_dir: Path) -> bool:
|
||||
"""Check if a gateway is running for a given profile directory."""
|
||||
"""Check if a gateway is running for a given profile directory.
|
||||
|
||||
Primary signal is the profile's ``gateway.pid`` (verified against the
|
||||
runtime lock). That check fails closed whenever the lock isn't held by
|
||||
*this* reader — which is exactly the case for a dashboard process that is
|
||||
a separate s6 service from the gateway it's reporting on (Docker), or any
|
||||
launch-service-managed gateway that left a fresh ``gateway_state.json`` but
|
||||
no live PID file. In those cases fall back to validating the PID recorded
|
||||
in the profile's own ``gateway_state.json`` against the live process table,
|
||||
mirroring the ``/api/status`` sidebar's liveness logic so the two surfaces
|
||||
agree. Parameterized by ``profile_dir`` so it never mutates ``HERMES_HOME``.
|
||||
"""
|
||||
try:
|
||||
from gateway.status import get_running_pid
|
||||
return get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False) is not None
|
||||
if (
|
||||
get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False)
|
||||
is not None
|
||||
):
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
from gateway.status import (
|
||||
get_runtime_status_running_pid,
|
||||
read_runtime_status,
|
||||
)
|
||||
runtime = read_runtime_status(profile_dir / "gateway_state.json")
|
||||
return get_runtime_status_running_pid(runtime) is not None
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
|
|
|||
|
|
@ -1446,6 +1446,71 @@ class TestEdgeCases:
|
|||
cleanup_stale=False,
|
||||
)
|
||||
|
||||
def test_gateway_running_check_falls_back_to_runtime_state(self, profile_env):
|
||||
"""A live gateway whose PID-file/lock check fails closed (separate-process
|
||||
reader, e.g. the dashboard s6 service in Docker) is still detected via the
|
||||
profile's gateway_state.json validated against the live process table.
|
||||
|
||||
Regression: the Profiles view used to show "Gateway stopped" while the
|
||||
sidebar (which already has this fallback) showed "Gateway running" for the
|
||||
same live gateway. See get_running_pid() short-circuiting on an
|
||||
unheld runtime lock before it inspects the PID record.
|
||||
"""
|
||||
import os
|
||||
import gateway.status as gw_status
|
||||
from hermes_cli.profiles import _check_gateway_running
|
||||
|
||||
tmp_path = profile_env
|
||||
default_home = tmp_path / ".hermes"
|
||||
default_home.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Write a realistic gateway_state.json pointing at THIS live process with
|
||||
# a gateway-shaped argv, so get_runtime_status_running_pid validates it.
|
||||
live_pid = os.getpid()
|
||||
(default_home / "gateway_state.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"pid": live_pid,
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"start_time": gw_status._get_process_start_time(live_pid),
|
||||
"gateway_state": "running",
|
||||
"active_agents": 0,
|
||||
}
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
# Primary pid-file/lock check returns None (no lock held by this reader),
|
||||
# exactly as it does for a separate-process dashboard. The fallback must
|
||||
# then read the state file and confirm the gateway is alive.
|
||||
with patch("gateway.status.get_running_pid", return_value=None):
|
||||
assert _check_gateway_running(default_home) is True
|
||||
|
||||
def test_gateway_running_check_runtime_state_stopped(self, profile_env):
|
||||
"""A gateway_state.json with state 'stopped' must NOT be reported running,
|
||||
even when the recorded PID happens to be alive."""
|
||||
import os
|
||||
from hermes_cli.profiles import _check_gateway_running
|
||||
|
||||
tmp_path = profile_env
|
||||
default_home = tmp_path / ".hermes"
|
||||
default_home.mkdir(parents=True, exist_ok=True)
|
||||
(default_home / "gateway_state.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"pid": os.getpid(),
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"gateway_state": "stopped",
|
||||
}
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
with patch("gateway.status.get_running_pid", return_value=None):
|
||||
assert _check_gateway_running(default_home) is False
|
||||
|
||||
def test_profile_name_boundary_single_char(self):
|
||||
"""Single alphanumeric character is valid."""
|
||||
validate_profile_name("a")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue