From 31bced160742548738679c9fdce7f81ff7057211 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 24 Jun 2026 16:08:36 +1000 Subject: [PATCH] fix(profiles): detect a separate-process gateway in profile status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dashboard Profiles view showed "Gateway stopped" for a gateway that is in fact running — while the sidebar status strip and `hermes gateway status` (CLI) both correctly showed it running. Reported on v0.17.0 running the gateway + dashboard in one Docker container. Root cause: three liveness surfaces with three detection strengths, all reading the same `gateway.pid`: - `hermes gateway status` -> find_gateway_pids() (process-table scan) - sidebar /api/status -> get_running_pid() + gateway_state.json PID fallback + health-URL probe - Profiles view -> _check_gateway_running() = get_running_pid() ONLY, no fallback `get_running_pid()` short-circuits to None the moment the runtime lock (`gateway.lock`) doesn't register as held by the *calling* process — which is always true when the reader is a separate process from the gateway (the dashboard is its own s6 service in the container), and also for any launch-service-managed gateway that left a fresh `gateway_state.json` but no live PID file. So the Profiles view alone reported the live gateway as stopped. Fix: give _check_gateway_running the same fallback the sidebar already has — after the pid-file/lock check misses, validate the PID recorded in that profile's gateway_state.json against the live process table via the existing get_runtime_status_running_pid(). read_runtime_status() gains an optional path arg so a profile's state file can be read without mutating the process-global HERMES_HOME (preserving the contextvar-based profile isolation the dashboard relies on). Backward compatible: every existing caller passes no argument. Tests: a regression test that fails pre-fix (live gateway, lock check returns None -> must still report running) and a guard test that a 'stopped' state file is never reported running even with a live PID. --- gateway/status.py | 12 ++++-- hermes_cli/profiles.py | 28 ++++++++++++- tests/hermes_cli/test_profiles.py | 65 +++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 5 deletions(-) diff --git a/gateway/status.py b/gateway/status.py index a27d8c7c02e..9958e0d5553 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -660,9 +660,15 @@ def write_runtime_status( _write_json_file(path, payload) -def read_runtime_status() -> Optional[dict[str, Any]]: - """Read the persisted gateway runtime health/status information.""" - return _read_json_file(_get_runtime_status_path()) +def read_runtime_status(path: Optional[Path] = None) -> Optional[dict[str, Any]]: + """Read the persisted gateway runtime health/status information. + + ``path`` is optional so callers that need to inspect a *different* + profile's state file (e.g. the dashboard enumerating every profile) + can do so without mutating ``HERMES_HOME`` in-process. Defaults to + the active profile's ``gateway_state.json``. + """ + return _read_json_file(path or _get_runtime_status_path()) def parse_active_agents(raw: Any) -> int: diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index 490077884e5..025eff5176c 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -612,10 +612,34 @@ def _read_config_model(profile_dir: Path) -> tuple: def _check_gateway_running(profile_dir: Path) -> bool: - """Check if a gateway is running for a given profile directory.""" + """Check if a gateway is running for a given profile directory. + + Primary signal is the profile's ``gateway.pid`` (verified against the + runtime lock). That check fails closed whenever the lock isn't held by + *this* reader — which is exactly the case for a dashboard process that is + a separate s6 service from the gateway it's reporting on (Docker), or any + launch-service-managed gateway that left a fresh ``gateway_state.json`` but + no live PID file. In those cases fall back to validating the PID recorded + in the profile's own ``gateway_state.json`` against the live process table, + mirroring the ``/api/status`` sidebar's liveness logic so the two surfaces + agree. Parameterized by ``profile_dir`` so it never mutates ``HERMES_HOME``. + """ try: from gateway.status import get_running_pid - return get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False) is not None + if ( + get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False) + is not None + ): + return True + except Exception: + pass + try: + from gateway.status import ( + get_runtime_status_running_pid, + read_runtime_status, + ) + runtime = read_runtime_status(profile_dir / "gateway_state.json") + return get_runtime_status_running_pid(runtime) is not None except Exception: return False diff --git a/tests/hermes_cli/test_profiles.py b/tests/hermes_cli/test_profiles.py index 59afe84e563..29840d8c728 100644 --- a/tests/hermes_cli/test_profiles.py +++ b/tests/hermes_cli/test_profiles.py @@ -1446,6 +1446,71 @@ class TestEdgeCases: cleanup_stale=False, ) + def test_gateway_running_check_falls_back_to_runtime_state(self, profile_env): + """A live gateway whose PID-file/lock check fails closed (separate-process + reader, e.g. the dashboard s6 service in Docker) is still detected via the + profile's gateway_state.json validated against the live process table. + + Regression: the Profiles view used to show "Gateway stopped" while the + sidebar (which already has this fallback) showed "Gateway running" for the + same live gateway. See get_running_pid() short-circuiting on an + unheld runtime lock before it inspects the PID record. + """ + import os + import gateway.status as gw_status + from hermes_cli.profiles import _check_gateway_running + + tmp_path = profile_env + default_home = tmp_path / ".hermes" + default_home.mkdir(parents=True, exist_ok=True) + + # Write a realistic gateway_state.json pointing at THIS live process with + # a gateway-shaped argv, so get_runtime_status_running_pid validates it. + live_pid = os.getpid() + (default_home / "gateway_state.json").write_text( + json.dumps( + { + "pid": live_pid, + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "start_time": gw_status._get_process_start_time(live_pid), + "gateway_state": "running", + "active_agents": 0, + } + ), + encoding="utf-8", + ) + + # Primary pid-file/lock check returns None (no lock held by this reader), + # exactly as it does for a separate-process dashboard. The fallback must + # then read the state file and confirm the gateway is alive. + with patch("gateway.status.get_running_pid", return_value=None): + assert _check_gateway_running(default_home) is True + + def test_gateway_running_check_runtime_state_stopped(self, profile_env): + """A gateway_state.json with state 'stopped' must NOT be reported running, + even when the recorded PID happens to be alive.""" + import os + from hermes_cli.profiles import _check_gateway_running + + tmp_path = profile_env + default_home = tmp_path / ".hermes" + default_home.mkdir(parents=True, exist_ok=True) + (default_home / "gateway_state.json").write_text( + json.dumps( + { + "pid": os.getpid(), + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "gateway_state": "stopped", + } + ), + encoding="utf-8", + ) + + with patch("gateway.status.get_running_pid", return_value=None): + assert _check_gateway_running(default_home) is False + def test_profile_name_boundary_single_char(self): """Single alphanumeric character is valid.""" validate_profile_name("a")