fix(gateway): scope dashboard liveness fallback to the profile

PR #52151 hardened the runtime-status liveness check to trust a readable
live process command line over stale gateway_state.json argv, so a recycled
PID now owned by an s6 supervisor no longer counts as a running gateway.

That fix is correct but incomplete for the reported symptom: the web
dashboard showed a named profile's gateway green while
`hermes -p <name> gateway status` showed it stopped. Two further issues:

1. Cross-profile PID reuse. In per-profile Docker supervision, one profile's
   stale `gateway_state.json` can record a PID the OS later recycled onto a
   DIFFERENT profile's live gateway. That PID's command line still
   `looks_like_gateway`, so the dead profile was reported running. The
   recorded argv has its `-p <name>` selector stripped in-process by
   `_apply_profile_override`, so it cannot disambiguate; the live `/proc`
   cmdline still carries it. `get_runtime_status_running_pid` now accepts an
   `expected_home` and validates the live command line belongs to THAT
   profile (mirroring `hermes_cli.gateway._matches_current_profile`, the
   logic the CLI scan path already uses — which is why the CLI was correct).
   `_check_gateway_running` passes the enumerated profile dir.

2. The existing regression test `test_gateway_running_check_falls_back_to_
   runtime_state` used the live pytest PID with a gateway-shaped record; once
   the live cmdline became authoritative it no longer looked like a gateway.
   Updated to mock the live cmdline to the real separate-process scenario it
   describes.

The active-profile path (`get_running_pid`) is intentionally left unscoped:
it is lock-verified and any live gateway cmdline is acceptable there. Multiplex
mode is unaffected — `running` state is only ever written to a gateway's own
home, never a secondary served profile's.

Adds coverage for: cross-profile PID reuse (named + default), matching
profile cmdline (`-p`, `--profile`, explicit HERMES_HOME=), the bare default
gateway, and the unreadable-cmdline cross-platform fallback. Each new
cross-profile assertion fails without the profile scope and passes with it.

Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com>
This commit is contained in:
Ben 2026-06-25 09:56:39 +10:00 committed by Ben Barclay
parent f1617a7ebb
commit 538c419d2e
4 changed files with 285 additions and 6 deletions

View file

@ -397,6 +397,130 @@ class TestGatewayRuntimeStatus:
assert status.get_runtime_status_running_pid(payload) == 132
def test_runtime_status_running_pid_rejects_pid_reused_by_other_profile(self, monkeypatch):
"""Regression (user report): a stale profile's recycled PID must not be
reported running just because it now hosts a DIFFERENT profile's gateway.
Per-profile Docker supervision: ``coder``'s gateway died leaving a
``gateway_state=running`` record at PID 139. The OS then recycled 139
onto the live *default* gateway (``hermes gateway run``). The recorded
``start_time`` is absent (older state file), so the start-time PID-reuse
guard does not catch it. Without the profile scope the live command
line still ``looks_like_gateway`` and ``coder`` is wrongly reported up.
"""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
}
coder_home = Path("/opt/data/profiles/coder")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
# PID 139 is now the live DEFAULT gateway (bare, no -p coder).
monkeypatch.setattr(
status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace"
)
assert (
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
is None
)
def test_runtime_status_running_pid_accepts_matching_profile_cmdline(self, monkeypatch):
"""A genuinely-live named gateway carries ``-p <profile>`` / ``--profile``
on its command line and must be reported running for that profile."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
}
coder_home = Path("/opt/data/profiles/coder")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
for cmdline in (
"hermes -p coder gateway run --replace",
"/opt/hermes/.venv/bin/hermes --profile coder gateway run --replace",
"hermes_home=/opt/data/profiles/coder hermes gateway run --replace",
):
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid, c=cmdline: c)
assert (
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
== 139
), cmdline
def test_runtime_status_running_pid_default_profile_rejects_named_cmdline(self, monkeypatch):
"""The default/root profile runs a bare gateway (no profile flag). A
recycled PID now hosting a *named* profile gateway must not be reported
running for the default profile."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
}
default_home = Path("/opt/data")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
monkeypatch.setattr(
status, "_read_process_cmdline", lambda pid: "hermes -p coder gateway run --replace"
)
assert (
status.get_runtime_status_running_pid(payload, expected_home=default_home)
is None
)
def test_runtime_status_running_pid_default_profile_accepts_bare_cmdline(self, monkeypatch):
"""The default/root gateway (bare ``hermes gateway run``) is reported
running for the default profile."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
}
default_home = Path("/opt/data")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
monkeypatch.setattr(
status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace"
)
assert (
status.get_runtime_status_running_pid(payload, expected_home=default_home)
== 139
)
def test_runtime_status_running_pid_profile_scope_falls_back_when_cmdline_unreadable(self, monkeypatch):
"""When the live command line is unreadable (Windows/permission), the
profile scope cannot apply fall back to the persisted record so the
cross-platform behavior is preserved."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
}
coder_home = Path("/opt/data/profiles/coder")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
assert (
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
== 139
)
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))

View file

@ -1483,8 +1483,15 @@ class TestEdgeCases:
# Primary pid-file/lock check returns None (no lock held by this reader),
# exactly as it does for a separate-process dashboard. The fallback must
# then read the state file and confirm the gateway is alive.
with patch("gateway.status.get_running_pid", return_value=None):
# then read the state file and confirm the gateway is alive by checking
# the recorded PID's live command line. In the real separate-process
# scenario that PID belongs to the live gateway, so mock its command
# line to a bare ``gateway run`` (this is the default/root home, which
# runs the gateway with no profile flag).
with patch("gateway.status.get_running_pid", return_value=None), patch(
"gateway.status._read_process_cmdline",
return_value="hermes gateway run --replace",
):
assert _check_gateway_running(default_home) is True
def test_gateway_running_check_runtime_state_stopped(self, profile_env):
@ -1511,6 +1518,77 @@ class TestEdgeCases:
with patch("gateway.status.get_running_pid", return_value=None):
assert _check_gateway_running(default_home) is False
def test_gateway_running_check_rejects_pid_reused_by_other_profile(self, profile_env):
"""Regression (user report): the dashboard showed a NAMED profile's
gateway green while ``hermes -p <name> gateway status`` showed it
stopped.
Per-profile Docker supervision: a named profile (``coder``) left a
``gateway_state=running`` record whose PID the OS later recycled onto a
DIFFERENT live process (here the default profile's gateway). The
``_check_gateway_running`` fallback must scope the live PID to *this*
profile's command line, so a recycled PID hosting another profile's
gateway is not reported running for ``coder``.
"""
from hermes_cli.profiles import _check_gateway_running
tmp_path = profile_env
coder_home = tmp_path / ".hermes" / "profiles" / "coder"
coder_home.mkdir(parents=True, exist_ok=True)
(coder_home / "gateway_state.json").write_text(
json.dumps(
{
"pid": 139,
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"gateway_state": "running",
"active_agents": 0,
}
),
encoding="utf-8",
)
# PID 139 is alive but is the DEFAULT gateway (bare, no -p coder), not
# coder's. start_time is absent so the PID-reuse guard cannot catch it;
# the profile scope must.
with patch("gateway.status.get_running_pid", return_value=None), patch(
"gateway.status._pid_exists", return_value=True
), patch("gateway.status._get_process_start_time", return_value=None), patch(
"gateway.status._read_process_cmdline",
return_value="hermes gateway run --replace",
):
assert _check_gateway_running(coder_home) is False
def test_gateway_running_check_detects_matching_named_profile(self, profile_env):
"""A genuinely-live named gateway (``-p coder`` on its command line) is
still reported running for that profile."""
from hermes_cli.profiles import _check_gateway_running
tmp_path = profile_env
coder_home = tmp_path / ".hermes" / "profiles" / "coder"
coder_home.mkdir(parents=True, exist_ok=True)
(coder_home / "gateway_state.json").write_text(
json.dumps(
{
"pid": 139,
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
"gateway_state": "running",
"active_agents": 0,
}
),
encoding="utf-8",
)
with patch("gateway.status.get_running_pid", return_value=None), patch(
"gateway.status._pid_exists", return_value=True
), patch("gateway.status._get_process_start_time", return_value=1000), patch(
"gateway.status._read_process_cmdline",
return_value="hermes -p coder gateway run --replace",
):
assert _check_gateway_running(coder_home) is True
def test_profile_name_boundary_single_char(self):
"""Single alphanumeric character is valid."""
validate_profile_name("a")