diff --git a/gateway/status.py b/gateway/status.py
index a27d8c7c02e..9958e0d5553 100644
--- a/gateway/status.py
+++ b/gateway/status.py
@@ -660,9 +660,15 @@ def write_runtime_status(
     _write_json_file(path, payload)
 
 
-def read_runtime_status() -> Optional[dict[str, Any]]:
-    """Read the persisted gateway runtime health/status information."""
-    return _read_json_file(_get_runtime_status_path())
+def read_runtime_status(path: Optional[Path] = None) -> Optional[dict[str, Any]]:
+    """Read the persisted gateway runtime health/status information.
+
+    ``path`` is optional so callers that need to inspect a *different*
+    profile's state file (e.g. the dashboard enumerating every profile)
+    can do so without mutating ``HERMES_HOME`` in-process. Defaults to
+    the active profile's ``gateway_state.json``.
+    """
+    return _read_json_file(path or _get_runtime_status_path())
 
 
 def parse_active_agents(raw: Any) -> int:
diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py
index 490077884e5..025eff5176c 100644
--- a/hermes_cli/profiles.py
+++ b/hermes_cli/profiles.py
@@ -612,10 +612,34 @@ def _read_config_model(profile_dir: Path) -> tuple:
 
 
 def _check_gateway_running(profile_dir: Path) -> bool:
-    """Check if a gateway is running for a given profile directory."""
+    """Check if a gateway is running for a given profile directory.
+
+    Primary signal is the profile's ``gateway.pid`` (verified against the
+    runtime lock). That check fails closed whenever the lock isn't held by
+    *this* reader — which is exactly the case for a dashboard process that is
+    a separate s6 service from the gateway it's reporting on (Docker), or any
+    launch-service-managed gateway that left a fresh ``gateway_state.json`` but
+    no live PID file. In those cases fall back to validating the PID recorded
+    in the profile's own ``gateway_state.json`` against the live process table,
+    mirroring the ``/api/status`` sidebar's liveness logic so the two surfaces
+    agree. Parameterized by ``profile_dir`` so it never mutates ``HERMES_HOME``.
+    """
     try:
         from gateway.status import get_running_pid
-        return get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False) is not None
+        if (
+            get_running_pid(profile_dir / "gateway.pid", cleanup_stale=False)
+            is not None
+        ):
+            return True
+    except Exception:
+        pass
+    try:
+        from gateway.status import (
+            get_runtime_status_running_pid,
+            read_runtime_status,
+        )
+        runtime = read_runtime_status(profile_dir / "gateway_state.json")
+        return get_runtime_status_running_pid(runtime) is not None
     except Exception:
         return False
 
diff --git a/tests/hermes_cli/test_profiles.py b/tests/hermes_cli/test_profiles.py
index 59afe84e563..29840d8c728 100644
--- a/tests/hermes_cli/test_profiles.py
+++ b/tests/hermes_cli/test_profiles.py
@@ -1446,6 +1446,71 @@ class TestEdgeCases:
                 cleanup_stale=False,
             )
 
+    def test_gateway_running_check_falls_back_to_runtime_state(self, profile_env):
+        """A live gateway whose PID-file/lock check fails closed (separate-process
+        reader, e.g. the dashboard s6 service in Docker) is still detected via the
+        profile's gateway_state.json validated against the live process table.
+
+        Regression: the Profiles view used to show "Gateway stopped" while the
+        sidebar (which already has this fallback) showed "Gateway running" for the
+        same live gateway. See get_running_pid() short-circuiting on an
+        unheld runtime lock before it inspects the PID record.
+        """
+        import os
+        import gateway.status as gw_status
+        from hermes_cli.profiles import _check_gateway_running
+
+        tmp_path = profile_env
+        default_home = tmp_path / ".hermes"
+        default_home.mkdir(parents=True, exist_ok=True)
+
+        # Write a realistic gateway_state.json pointing at THIS live process with
+        # a gateway-shaped argv, so get_runtime_status_running_pid validates it.
+        live_pid = os.getpid()
+        (default_home / "gateway_state.json").write_text(
+            json.dumps(
+                {
+                    "pid": live_pid,
+                    "kind": "hermes-gateway",
+                    "argv": ["hermes", "gateway", "run"],
+                    "start_time": gw_status._get_process_start_time(live_pid),
+                    "gateway_state": "running",
+                    "active_agents": 0,
+                }
+            ),
+            encoding="utf-8",
+        )
+
+        # Primary pid-file/lock check returns None (no lock held by this reader),
+        # exactly as it does for a separate-process dashboard. The fallback must
+        # then read the state file and confirm the gateway is alive.
+        with patch("gateway.status.get_running_pid", return_value=None):
+            assert _check_gateway_running(default_home) is True
+
+    def test_gateway_running_check_runtime_state_stopped(self, profile_env):
+        """A gateway_state.json with state 'stopped' must NOT be reported running,
+        even when the recorded PID happens to be alive."""
+        import os
+        from hermes_cli.profiles import _check_gateway_running
+
+        tmp_path = profile_env
+        default_home = tmp_path / ".hermes"
+        default_home.mkdir(parents=True, exist_ok=True)
+        (default_home / "gateway_state.json").write_text(
+            json.dumps(
+                {
+                    "pid": os.getpid(),
+                    "kind": "hermes-gateway",
+                    "argv": ["hermes", "gateway", "run"],
+                    "gateway_state": "stopped",
+                }
+            ),
+            encoding="utf-8",
+        )
+
+        with patch("gateway.status.get_running_pid", return_value=None):
+            assert _check_gateway_running(default_home) is False
+
     def test_profile_name_boundary_single_char(self):
         """Single alphanumeric character is valid."""
         validate_profile_name("a")