From 538c419d2e0349003613b12e5a330c512cedb629 Mon Sep 17 00:00:00 2001 From: Ben Date: Thu, 25 Jun 2026 09:56:39 +1000 Subject: [PATCH] fix(gateway): scope dashboard liveness fallback to the profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #52151 hardened the runtime-status liveness check to trust a readable live process command line over stale gateway_state.json argv, so a recycled PID now owned by an s6 supervisor no longer counts as a running gateway. That fix is correct but incomplete for the reported symptom: the web dashboard showed a named profile's gateway green while `hermes -p gateway status` showed it stopped. Two further issues: 1. Cross-profile PID reuse. In per-profile Docker supervision, one profile's stale `gateway_state.json` can record a PID the OS later recycled onto a DIFFERENT profile's live gateway. That PID's command line still `looks_like_gateway`, so the dead profile was reported running. The recorded argv has its `-p ` selector stripped in-process by `_apply_profile_override`, so it cannot disambiguate; the live `/proc` cmdline still carries it. `get_runtime_status_running_pid` now accepts an `expected_home` and validates the live command line belongs to THAT profile (mirroring `hermes_cli.gateway._matches_current_profile`, the logic the CLI scan path already uses — which is why the CLI was correct). `_check_gateway_running` passes the enumerated profile dir. 2. The existing regression test `test_gateway_running_check_falls_back_to_ runtime_state` used the live pytest PID with a gateway-shaped record; once the live cmdline became authoritative it no longer looked like a gateway. Updated to mock the live cmdline to the real separate-process scenario it describes. The active-profile path (`get_running_pid`) is intentionally left unscoped: it is lock-verified and any live gateway cmdline is acceptable there. Multiplex mode is unaffected — `running` state is only ever written to a gateway's own home, never a secondary served profile's. Adds coverage for: cross-profile PID reuse (named + default), matching profile cmdline (`-p`, `--profile`, explicit HERMES_HOME=), the bare default gateway, and the unreadable-cmdline cross-platform fallback. Each new cross-profile assertion fails without the profile scope and passes with it. Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com> --- gateway/status.py | 83 +++++++++++++++++++- hermes_cli/profiles.py | 2 +- tests/gateway/test_status.py | 124 ++++++++++++++++++++++++++++++ tests/hermes_cli/test_profiles.py | 82 +++++++++++++++++++- 4 files changed, 285 insertions(+), 6 deletions(-) diff --git a/gateway/status.py b/gateway/status.py index cc27de23549..8998c7a7a64 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -304,17 +304,85 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool: return looks_like_gateway_runtime_command_line(cmdline) -def _record_matches_live_gateway_pid(record: dict[str, Any], pid: int) -> bool: +def _profile_name_for_home(profile_home: Path) -> Optional[str]: + """Return the profile id a HERMES_HOME directory represents, or None. + + A named profile's home is ``/profiles/`` (immediate parent is + ``profiles``). The root/default home (``~/.hermes`` or ``$HERMES_HOME``) + has no such parent, so it maps to the default profile (``None`` here, which + callers treat as "the bare, flag-less gateway"). + """ + if profile_home.parent.name == "profiles": + return profile_home.name + return None + + +def _command_line_belongs_to_profile(command: str, profile_home: Path) -> bool: + """Return True when a gateway command line belongs to ``profile_home``. + + Mirrors ``hermes_cli.gateway._matches_current_profile`` so the dashboard's + cross-profile liveness fallback scopes a live PID to the *right* profile. + In a per-profile container, one profile's stale ``gateway_state.json`` can + record a PID that the OS has since recycled onto a DIFFERENT profile's live + gateway. That recycled PID's command line still ``looks_like_gateway`` — + so without a profile check the dead profile is reported running. A named + profile gateway carries ``-p ``/``--profile `` (or, rarely, an + explicit ``HERMES_HOME=``) on its argv; the default/root gateway runs + bare with no profile flag. + """ + command_lc = command.lower() + profile_name = _profile_name_for_home(profile_home) + home_lc = str(profile_home).lower() + + if profile_name is not None and profile_name != "default": + profile_lc = profile_name.lower() + return ( + f"--profile {profile_lc}" in command_lc + or f"-p {profile_lc}" in command_lc + or f"hermes_home={home_lc}" in command_lc + ) + + # Default/root profile: the gateway runs with no profile flag. Accept unless + # the command advertises *some other* profile (an explicit -p/--profile) or + # a non-matching explicit HERMES_HOME= on the argv. HERMES_HOME is usually + # passed via the environment (not visible on the command line), so its mere + # absence is not disqualifying — only a conflicting explicit value is. + if "--profile " in command_lc or " -p " in command_lc: + return False + if "hermes_home=" in command_lc and f"hermes_home={home_lc}" not in command_lc: + return False + return True + + +def _record_matches_live_gateway_pid( + record: dict[str, Any], + pid: int, + *, + expected_home: Optional[Path] = None, +) -> bool: """Return True when a live PID still identifies as this gateway record. Prefer the live command line whenever it is readable. Runtime status files can outlive the gateway process they describe; if PID reuse leaves the same PID occupied by an s6 supervisor/log process, the stale record's argv should not make that unrelated process count as a running gateway. + + When ``expected_home`` is provided (the dashboard enumerating a specific + profile's state file), the readable live command line must additionally + belong to *that* profile — otherwise a PID recycled onto a different + profile's live gateway would make the dead profile look alive. When the + live command line cannot be read (Windows/permission), fall back to the + persisted record so cross-platform behavior is preserved. """ live_cmdline = _read_process_cmdline(pid) if live_cmdline: - return looks_like_gateway_runtime_command_line(live_cmdline) + if not looks_like_gateway_runtime_command_line(live_cmdline): + return False + if expected_home is not None and not _command_line_belongs_to_profile( + live_cmdline, expected_home + ): + return False + return True return _record_looks_like_gateway(record) @@ -745,6 +813,8 @@ def derive_gateway_drainable(*, gateway_running: bool, gateway_state: Any) -> bo def get_runtime_status_running_pid( runtime: Optional[dict[str, Any]] = None, + *, + expected_home: Optional[Path] = None, ) -> Optional[int]: """Return a live gateway PID from the runtime status record, if valid. @@ -753,6 +823,13 @@ def get_runtime_status_running_pid( a live process and a fresh ``gateway_state.json`` but no ``gateway.pid``; use this as a conservative fallback by checking both the persisted state and the OS process identity. + + ``expected_home`` scopes the OS-identity check to a specific profile's + HERMES_HOME. Pass it when validating *another* profile's state file (the + dashboard enumerating every profile): a stale record whose PID the OS has + recycled onto a different profile's live gateway must not be reported + running for the dead profile. Omit it (the default) for the active + profile, where any live gateway command line is acceptable. """ payload = runtime if runtime is not None else read_runtime_status() if not isinstance(payload, dict): @@ -773,7 +850,7 @@ def get_runtime_status_running_pid( ): return None - if _record_matches_live_gateway_pid(payload, pid): + if _record_matches_live_gateway_pid(payload, pid, expected_home=expected_home): return pid return None diff --git a/hermes_cli/profiles.py b/hermes_cli/profiles.py index 025eff5176c..65d3d73dbe1 100644 --- a/hermes_cli/profiles.py +++ b/hermes_cli/profiles.py @@ -639,7 +639,7 @@ def _check_gateway_running(profile_dir: Path) -> bool: read_runtime_status, ) runtime = read_runtime_status(profile_dir / "gateway_state.json") - return get_runtime_status_running_pid(runtime) is not None + return get_runtime_status_running_pid(runtime, expected_home=profile_dir) is not None except Exception: return False diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index acb4c9850ab..42165b74803 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -397,6 +397,130 @@ class TestGatewayRuntimeStatus: assert status.get_runtime_status_running_pid(payload) == 132 + def test_runtime_status_running_pid_rejects_pid_reused_by_other_profile(self, monkeypatch): + """Regression (user report): a stale profile's recycled PID must not be + reported running just because it now hosts a DIFFERENT profile's gateway. + + Per-profile Docker supervision: ``coder``'s gateway died leaving a + ``gateway_state=running`` record at PID 139. The OS then recycled 139 + onto the live *default* gateway (``hermes gateway run``). The recorded + ``start_time`` is absent (older state file), so the start-time PID-reuse + guard does not catch it. Without the profile scope the live command + line still ``looks_like_gateway`` and ``coder`` is wrongly reported up. + """ + payload = { + "pid": 139, + "gateway_state": "running", + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + } + coder_home = Path("/opt/data/profiles/coder") + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + # PID 139 is now the live DEFAULT gateway (bare, no -p coder). + monkeypatch.setattr( + status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace" + ) + + assert ( + status.get_runtime_status_running_pid(payload, expected_home=coder_home) + is None + ) + + def test_runtime_status_running_pid_accepts_matching_profile_cmdline(self, monkeypatch): + """A genuinely-live named gateway carries ``-p `` / ``--profile`` + on its command line and must be reported running for that profile.""" + payload = { + "pid": 139, + "gateway_state": "running", + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "start_time": 1000, + } + coder_home = Path("/opt/data/profiles/coder") + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000) + for cmdline in ( + "hermes -p coder gateway run --replace", + "/opt/hermes/.venv/bin/hermes --profile coder gateway run --replace", + "hermes_home=/opt/data/profiles/coder hermes gateway run --replace", + ): + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid, c=cmdline: c) + assert ( + status.get_runtime_status_running_pid(payload, expected_home=coder_home) + == 139 + ), cmdline + + def test_runtime_status_running_pid_default_profile_rejects_named_cmdline(self, monkeypatch): + """The default/root profile runs a bare gateway (no profile flag). A + recycled PID now hosting a *named* profile gateway must not be reported + running for the default profile.""" + payload = { + "pid": 139, + "gateway_state": "running", + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + } + default_home = Path("/opt/data") + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + monkeypatch.setattr( + status, "_read_process_cmdline", lambda pid: "hermes -p coder gateway run --replace" + ) + + assert ( + status.get_runtime_status_running_pid(payload, expected_home=default_home) + is None + ) + + def test_runtime_status_running_pid_default_profile_accepts_bare_cmdline(self, monkeypatch): + """The default/root gateway (bare ``hermes gateway run``) is reported + running for the default profile.""" + payload = { + "pid": 139, + "gateway_state": "running", + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "start_time": 1000, + } + default_home = Path("/opt/data") + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000) + monkeypatch.setattr( + status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace" + ) + + assert ( + status.get_runtime_status_running_pid(payload, expected_home=default_home) + == 139 + ) + + def test_runtime_status_running_pid_profile_scope_falls_back_when_cmdline_unreadable(self, monkeypatch): + """When the live command line is unreadable (Windows/permission), the + profile scope cannot apply — fall back to the persisted record so the + cross-platform behavior is preserved.""" + payload = { + "pid": 139, + "gateway_state": "running", + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "start_time": 1000, + } + coder_home = Path("/opt/data/profiles/coder") + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000) + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None) + + assert ( + status.get_runtime_status_running_pid(payload, expected_home=coder_home) + == 139 + ) + def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch): monkeypatch.setenv("HERMES_HOME", str(tmp_path)) diff --git a/tests/hermes_cli/test_profiles.py b/tests/hermes_cli/test_profiles.py index 29840d8c728..44dfd6d4dae 100644 --- a/tests/hermes_cli/test_profiles.py +++ b/tests/hermes_cli/test_profiles.py @@ -1483,8 +1483,15 @@ class TestEdgeCases: # Primary pid-file/lock check returns None (no lock held by this reader), # exactly as it does for a separate-process dashboard. The fallback must - # then read the state file and confirm the gateway is alive. - with patch("gateway.status.get_running_pid", return_value=None): + # then read the state file and confirm the gateway is alive by checking + # the recorded PID's live command line. In the real separate-process + # scenario that PID belongs to the live gateway, so mock its command + # line to a bare ``gateway run`` (this is the default/root home, which + # runs the gateway with no profile flag). + with patch("gateway.status.get_running_pid", return_value=None), patch( + "gateway.status._read_process_cmdline", + return_value="hermes gateway run --replace", + ): assert _check_gateway_running(default_home) is True def test_gateway_running_check_runtime_state_stopped(self, profile_env): @@ -1511,6 +1518,77 @@ class TestEdgeCases: with patch("gateway.status.get_running_pid", return_value=None): assert _check_gateway_running(default_home) is False + def test_gateway_running_check_rejects_pid_reused_by_other_profile(self, profile_env): + """Regression (user report): the dashboard showed a NAMED profile's + gateway green while ``hermes -p gateway status`` showed it + stopped. + + Per-profile Docker supervision: a named profile (``coder``) left a + ``gateway_state=running`` record whose PID the OS later recycled onto a + DIFFERENT live process (here the default profile's gateway). The + ``_check_gateway_running`` fallback must scope the live PID to *this* + profile's command line, so a recycled PID hosting another profile's + gateway is not reported running for ``coder``. + """ + from hermes_cli.profiles import _check_gateway_running + + tmp_path = profile_env + coder_home = tmp_path / ".hermes" / "profiles" / "coder" + coder_home.mkdir(parents=True, exist_ok=True) + (coder_home / "gateway_state.json").write_text( + json.dumps( + { + "pid": 139, + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "gateway_state": "running", + "active_agents": 0, + } + ), + encoding="utf-8", + ) + + # PID 139 is alive but is the DEFAULT gateway (bare, no -p coder), not + # coder's. start_time is absent so the PID-reuse guard cannot catch it; + # the profile scope must. + with patch("gateway.status.get_running_pid", return_value=None), patch( + "gateway.status._pid_exists", return_value=True + ), patch("gateway.status._get_process_start_time", return_value=None), patch( + "gateway.status._read_process_cmdline", + return_value="hermes gateway run --replace", + ): + assert _check_gateway_running(coder_home) is False + + def test_gateway_running_check_detects_matching_named_profile(self, profile_env): + """A genuinely-live named gateway (``-p coder`` on its command line) is + still reported running for that profile.""" + from hermes_cli.profiles import _check_gateway_running + + tmp_path = profile_env + coder_home = tmp_path / ".hermes" / "profiles" / "coder" + coder_home.mkdir(parents=True, exist_ok=True) + (coder_home / "gateway_state.json").write_text( + json.dumps( + { + "pid": 139, + "kind": "hermes-gateway", + "argv": ["hermes", "gateway", "run"], + "start_time": 1000, + "gateway_state": "running", + "active_agents": 0, + } + ), + encoding="utf-8", + ) + + with patch("gateway.status.get_running_pid", return_value=None), patch( + "gateway.status._pid_exists", return_value=True + ), patch("gateway.status._get_process_start_time", return_value=1000), patch( + "gateway.status._read_process_cmdline", + return_value="hermes -p coder gateway run --replace", + ): + assert _check_gateway_running(coder_home) is True + def test_profile_name_boundary_single_char(self): """Single alphanumeric character is valid.""" validate_profile_name("a")