fix(profiles): detect a separate-process gateway in profile status

The dashboard Profiles view showed "Gateway stopped" for a gateway that
was in fact running — while the sidebar status strip and `hermes gateway
status` (CLI) both correctly showed it running. Reported on v0.17.0 with
the gateway and dashboard running in one Docker container.

Root cause: three liveness surfaces with three detection strengths, all
reading the same `gateway.pid`:

  - `hermes gateway status` -> find_gateway_pids() (process-table scan)
  - sidebar /api/status     -> get_running_pid() + gateway_state.json PID
                               fallback + health-URL probe
  - Profiles view           -> _check_gateway_running() = get_running_pid()
                               ONLY, no fallback

`get_running_pid()` short-circuits to None the moment the runtime lock
(`gateway.lock`) doesn't register as held by the *calling* process. That
is always the case when the reader is a separate process from the
gateway (the dashboard is its own s6 service in the container), and it is
also the case for any launch-service-managed gateway that left a fresh
`gateway_state.json` but no live PID file. So the Profiles view alone
reported the live gateway as stopped.

Fix: give _check_gateway_running the same fallback the sidebar already
has — after the pid-file/lock check misses, validate the PID recorded in
that profile's gateway_state.json against the live process table via the
existing get_runtime_status_running_pid(). read_runtime_status() gains an
optional path arg so a profile's state file can be read without mutating
the process-global HERMES_HOME (preserving the contextvar-based profile
isolation the dashboard relies on). Backward compatible: every existing
caller passes no argument.

Tests: a regression test that fails pre-fix (live gateway, lock check
returns None -> must still report running) and a guard test that a
'stopped' state file is never reported running even with a live PID.
This commit is contained in:
Ben 2026-06-24 16:08:36 +10:00 committed by Ben Barclay
parent fa2f0bf3da
commit 31bced1607
3 changed files with 100 additions and 5 deletions

View file

@ -660,9 +660,15 @@ def write_runtime_status(
_write_json_file(path, payload)
def read_runtime_status(path: Optional[Path] = None) -> Optional[dict[str, Any]]:
    """Read the persisted gateway runtime health/status information.

    ``path`` is optional so callers that need to inspect a *different*
    profile's state file (e.g. the dashboard enumerating every profile)
    can do so without mutating ``HERMES_HOME`` in-process.

    Args:
        path: State file to read. When ``None`` (the default), the path
            returned by ``_get_runtime_status_path()`` is used, i.e. the
            active profile's ``gateway_state.json``.

    Returns:
        Whatever ``_read_json_file`` yields for the chosen path.
    """
    # Explicit ``is None`` test: only an omitted argument selects the
    # default location, rather than any falsy value.
    if path is None:
        path = _get_runtime_status_path()
    return _read_json_file(path)
def parse_active_agents(raw: Any) -> int:

View file

@ -612,10 +612,34 @@ def _read_config_model(profile_dir: Path) -> tuple:
def _check_gateway_running(profile_dir: Path) -> bool:
    """Check if a gateway is running for a given profile directory.

    Primary signal is the profile's ``gateway.pid`` (verified against the
    runtime lock). That check fails closed whenever the lock isn't held by
    *this* reader, which is exactly the case for a dashboard process that is
    a separate s6 service from the gateway it's reporting on (Docker), or any
    launch-service-managed gateway that left a fresh ``gateway_state.json``
    but no live PID file. In those cases fall back to validating the PID
    recorded in the profile's own ``gateway_state.json`` against the live
    process table, mirroring the ``/api/status`` sidebar's liveness logic so
    the two surfaces agree. Parameterized by ``profile_dir`` so it never
    mutates ``HERMES_HOME``.

    Args:
        profile_dir: Profile directory expected to hold ``gateway.pid`` and
            ``gateway_state.json``.

    Returns:
        ``True`` if either the PID-file check or the runtime-state fallback
        yields a PID; ``False`` otherwise, including when a check raises.
    """
    import logging

    log = logging.getLogger(__name__)

    # Primary check: PID file. A miss (None) or an error both fall through
    # to the runtime-state fallback instead of returning early.
    try:
        from gateway.status import get_running_pid

        pid = get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False)
        if pid is not None:
            return True
    except Exception:
        # Best-effort by design: a broken primary check must not hide a
        # gateway the fallback can still detect.
        log.debug(
            "gateway.pid check failed for %s; trying gateway_state.json",
            profile_dir,
            exc_info=True,
        )

    # Fallback: PID recorded in this profile's own state file.
    try:
        from gateway.status import (
            get_runtime_status_running_pid,
            read_runtime_status,
        )

        runtime = read_runtime_status(profile_dir / "gateway_state.json")
        return get_runtime_status_running_pid(runtime) is not None
    except Exception:
        log.debug(
            "gateway_state.json check failed for %s",
            profile_dir,
            exc_info=True,
        )
        return False

View file

@ -1446,6 +1446,71 @@ class TestEdgeCases:
cleanup_stale=False,
)
def test_gateway_running_check_falls_back_to_runtime_state(self, profile_env):
    """Runtime-state fallback detects a gateway the PID-file check misses.

    A live gateway whose PID-file/lock check fails closed (separate-process
    reader, e.g. the dashboard s6 service in Docker) must still be detected
    via the profile's gateway_state.json validated against the live process
    table.

    Regression: the Profiles view used to show "Gateway stopped" while the
    sidebar (which already has this fallback) showed "Gateway running" for
    the same live gateway. See get_running_pid() short-circuiting on an
    unheld runtime lock before it inspects the PID record.
    """
    import os
    import gateway.status as gw_status
    from hermes_cli.profiles import _check_gateway_running

    home = profile_env / ".hermes"
    home.mkdir(parents=True, exist_ok=True)

    # Describe THIS live process with a gateway-shaped argv so that
    # get_runtime_status_running_pid validates the record.
    this_pid = os.getpid()
    state = {
        "pid": this_pid,
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "start_time": gw_status._get_process_start_time(this_pid),
        "gateway_state": "running",
        "active_agents": 0,
    }
    state_file = home / "gateway_state.json"
    state_file.write_text(json.dumps(state), encoding="utf-8")

    # Force the primary pid-file/lock check to miss, exactly as it does for
    # a separate-process dashboard; the fallback must then read the state
    # file and confirm the gateway is alive.
    with patch("gateway.status.get_running_pid", return_value=None):
        running = _check_gateway_running(home)
    assert running is True
def test_gateway_running_check_runtime_state_stopped(self, profile_env):
    """A 'stopped' gateway_state.json must NOT be reported as running.

    This holds even when the PID recorded in the file happens to belong to
    a live process.
    """
    import os
    from hermes_cli.profiles import _check_gateway_running

    home = profile_env / ".hermes"
    home.mkdir(parents=True, exist_ok=True)

    # The recorded PID is alive (it is this process), but the persisted
    # state says the gateway is stopped.
    state = {
        "pid": os.getpid(),
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "gateway_state": "stopped",
    }
    state_file = home / "gateway_state.json"
    state_file.write_text(json.dumps(state), encoding="utf-8")

    # Make the primary pid-file check miss so only the fallback decides.
    with patch("gateway.status.get_running_pid", return_value=None):
        running = _check_gateway_running(home)
    assert running is False
def test_profile_name_boundary_single_char(self):
"""Single alphanumeric character is valid."""
validate_profile_name("a")