fix(gateway): scope dashboard liveness fallback to the profile

PR #52151 hardened the runtime-status liveness check to trust a readable
live process command line over stale gateway_state.json argv, so a recycled
PID now owned by an s6 supervisor no longer counts as a running gateway.

That fix is correct but incomplete for the reported symptom: the web
dashboard showed a named profile's gateway green while
`hermes -p <name> gateway status` showed it stopped. Two further issues:

1. Cross-profile PID reuse. In per-profile Docker supervision, one profile's
   stale `gateway_state.json` can record a PID the OS later recycled onto a
   DIFFERENT profile's live gateway. That PID's command line still
   `looks_like_gateway`, so the dead profile was reported running. The
   recorded argv has its `-p <name>` selector stripped in-process by
   `_apply_profile_override`, so it cannot disambiguate; the live `/proc`
   cmdline still carries it. `get_runtime_status_running_pid` now accepts an
   `expected_home` and validates the live command line belongs to THAT
   profile (mirroring `hermes_cli.gateway._matches_current_profile`, the
   logic the CLI scan path already uses — which is why the CLI was correct).
   `_check_gateway_running` passes the enumerated profile dir.

2. The existing regression test `test_gateway_running_check_falls_back_to_
   runtime_state` used the live pytest PID with a gateway-shaped record; once
   the live cmdline became authoritative it no longer looked like a gateway.
   Updated to mock the live cmdline to the real separate-process scenario it
   describes.

The active-profile path (`get_running_pid`) is intentionally left unscoped:
it is lock-verified and any live gateway cmdline is acceptable there. Multiplex
mode is unaffected — `running` state is only ever written to a gateway's own
home, never a secondary served profile's.

Adds coverage for: cross-profile PID reuse (named + default), matching
profile cmdline (`-p`, `--profile`, explicit HERMES_HOME=), the bare default
gateway, and the unreadable-cmdline cross-platform fallback. Each new
cross-profile assertion fails without the profile scope and passes with it.

Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com>
This commit is contained in:
Ben 2026-06-25 09:56:39 +10:00 committed by Ben Barclay
parent f1617a7ebb
commit 538c419d2e
4 changed files with 285 additions and 6 deletions

View file

@ -304,17 +304,85 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
return looks_like_gateway_runtime_command_line(cmdline)
def _record_matches_live_gateway_pid(record: dict[str, Any], pid: int) -> bool:
def _profile_name_for_home(profile_home: Path) -> Optional[str]:
"""Return the profile id a HERMES_HOME directory represents, or None.
A named profile's home is ``<root>/profiles/<name>`` (immediate parent is
``profiles``). The root/default home (``~/.hermes`` or ``$HERMES_HOME``)
has no such parent, so it maps to the default profile (``None`` here, which
callers treat as "the bare, flag-less gateway").
"""
if profile_home.parent.name == "profiles":
return profile_home.name
return None
def _command_line_belongs_to_profile(command: str, profile_home: Path) -> bool:
"""Return True when a gateway command line belongs to ``profile_home``.
Mirrors ``hermes_cli.gateway._matches_current_profile`` so the dashboard's
cross-profile liveness fallback scopes a live PID to the *right* profile.
In a per-profile container, one profile's stale ``gateway_state.json`` can
record a PID that the OS has since recycled onto a DIFFERENT profile's live
gateway. That recycled PID's command line still ``looks_like_gateway`` —
so without a profile check the dead profile is reported running. A named
profile gateway carries ``-p <name>``/``--profile <name>`` (or, rarely, an
explicit ``HERMES_HOME=<path>``) on its argv; the default/root gateway runs
bare with no profile flag.
"""
command_lc = command.lower()
profile_name = _profile_name_for_home(profile_home)
home_lc = str(profile_home).lower()
if profile_name is not None and profile_name != "default":
profile_lc = profile_name.lower()
return (
f"--profile {profile_lc}" in command_lc
or f"-p {profile_lc}" in command_lc
or f"hermes_home={home_lc}" in command_lc
)
# Default/root profile: the gateway runs with no profile flag. Accept unless
# the command advertises *some other* profile (an explicit -p/--profile) or
# a non-matching explicit HERMES_HOME= on the argv. HERMES_HOME is usually
# passed via the environment (not visible on the command line), so its mere
# absence is not disqualifying — only a conflicting explicit value is.
if "--profile " in command_lc or " -p " in command_lc:
return False
if "hermes_home=" in command_lc and f"hermes_home={home_lc}" not in command_lc:
return False
return True
def _record_matches_live_gateway_pid(
record: dict[str, Any],
pid: int,
*,
expected_home: Optional[Path] = None,
) -> bool:
"""Return True when a live PID still identifies as this gateway record.
Prefer the live command line whenever it is readable. Runtime status files
can outlive the gateway process they describe; if PID reuse leaves the same
PID occupied by an s6 supervisor/log process, the stale record's argv should
not make that unrelated process count as a running gateway.
When ``expected_home`` is provided (the dashboard enumerating a specific
profile's state file), the readable live command line must additionally
belong to *that* profile otherwise a PID recycled onto a different
profile's live gateway would make the dead profile look alive. When the
live command line cannot be read (Windows/permission), fall back to the
persisted record so cross-platform behavior is preserved.
"""
live_cmdline = _read_process_cmdline(pid)
if live_cmdline:
return looks_like_gateway_runtime_command_line(live_cmdline)
if not looks_like_gateway_runtime_command_line(live_cmdline):
return False
if expected_home is not None and not _command_line_belongs_to_profile(
live_cmdline, expected_home
):
return False
return True
return _record_looks_like_gateway(record)
@ -745,6 +813,8 @@ def derive_gateway_drainable(*, gateway_running: bool, gateway_state: Any) -> bo
def get_runtime_status_running_pid(
runtime: Optional[dict[str, Any]] = None,
*,
expected_home: Optional[Path] = None,
) -> Optional[int]:
"""Return a live gateway PID from the runtime status record, if valid.
@ -753,6 +823,13 @@ def get_runtime_status_running_pid(
a live process and a fresh ``gateway_state.json`` but no ``gateway.pid``; use
this as a conservative fallback by checking both the persisted state and the
OS process identity.
``expected_home`` scopes the OS-identity check to a specific profile's
HERMES_HOME. Pass it when validating *another* profile's state file (the
dashboard enumerating every profile): a stale record whose PID the OS has
recycled onto a different profile's live gateway must not be reported
running for the dead profile. Omit it (the default) for the active
profile, where any live gateway command line is acceptable.
"""
payload = runtime if runtime is not None else read_runtime_status()
if not isinstance(payload, dict):
@ -773,7 +850,7 @@ def get_runtime_status_running_pid(
):
return None
if _record_matches_live_gateway_pid(payload, pid):
if _record_matches_live_gateway_pid(payload, pid, expected_home=expected_home):
return pid
return None

View file

@ -639,7 +639,7 @@ def _check_gateway_running(profile_dir: Path) -> bool:
read_runtime_status,
)
runtime = read_runtime_status(profile_dir / "gateway_state.json")
return get_runtime_status_running_pid(runtime) is not None
return get_runtime_status_running_pid(runtime, expected_home=profile_dir) is not None
except Exception:
return False

View file

@ -397,6 +397,130 @@ class TestGatewayRuntimeStatus:
assert status.get_runtime_status_running_pid(payload) == 132
def test_runtime_status_running_pid_rejects_pid_reused_by_other_profile(self, monkeypatch):
"""Regression (user report): a stale profile's recycled PID must not be
reported running just because it now hosts a DIFFERENT profile's gateway.
Per-profile Docker supervision: ``coder``'s gateway died leaving a
``gateway_state=running`` record at PID 139. The OS then recycled 139
onto the live *default* gateway (``hermes gateway run``). The recorded
``start_time`` is absent (older state file), so the start-time PID-reuse
guard does not catch it. Without the profile scope the live command
line still ``looks_like_gateway`` and ``coder`` is wrongly reported up.
"""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
}
coder_home = Path("/opt/data/profiles/coder")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
# PID 139 is now the live DEFAULT gateway (bare, no -p coder).
monkeypatch.setattr(
status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace"
)
assert (
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
is None
)
def test_runtime_status_running_pid_accepts_matching_profile_cmdline(self, monkeypatch):
"""A genuinely-live named gateway carries ``-p <profile>`` / ``--profile``
on its command line and must be reported running for that profile."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
}
coder_home = Path("/opt/data/profiles/coder")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
for cmdline in (
"hermes -p coder gateway run --replace",
"/opt/hermes/.venv/bin/hermes --profile coder gateway run --replace",
"hermes_home=/opt/data/profiles/coder hermes gateway run --replace",
):
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid, c=cmdline: c)
assert (
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
== 139
), cmdline
def test_runtime_status_running_pid_default_profile_rejects_named_cmdline(self, monkeypatch):
"""The default/root profile runs a bare gateway (no profile flag). A
recycled PID now hosting a *named* profile gateway must not be reported
running for the default profile."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
}
default_home = Path("/opt/data")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
monkeypatch.setattr(
status, "_read_process_cmdline", lambda pid: "hermes -p coder gateway run --replace"
)
assert (
status.get_runtime_status_running_pid(payload, expected_home=default_home)
is None
)
def test_runtime_status_running_pid_default_profile_accepts_bare_cmdline(self, monkeypatch):
"""The default/root gateway (bare ``hermes gateway run``) is reported
running for the default profile."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
}
default_home = Path("/opt/data")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
monkeypatch.setattr(
status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace"
)
assert (
status.get_runtime_status_running_pid(payload, expected_home=default_home)
== 139
)
def test_runtime_status_running_pid_profile_scope_falls_back_when_cmdline_unreadable(self, monkeypatch):
"""When the live command line is unreadable (Windows/permission), the
profile scope cannot apply fall back to the persisted record so the
cross-platform behavior is preserved."""
payload = {
"pid": 139,
"gateway_state": "running",
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
}
coder_home = Path("/opt/data/profiles/coder")
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
assert (
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
== 139
)
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))

View file

@ -1483,8 +1483,15 @@ class TestEdgeCases:
# Primary pid-file/lock check returns None (no lock held by this reader),
# exactly as it does for a separate-process dashboard. The fallback must
# then read the state file and confirm the gateway is alive.
with patch("gateway.status.get_running_pid", return_value=None):
# then read the state file and confirm the gateway is alive by checking
# the recorded PID's live command line. In the real separate-process
# scenario that PID belongs to the live gateway, so mock its command
# line to a bare ``gateway run`` (this is the default/root home, which
# runs the gateway with no profile flag).
with patch("gateway.status.get_running_pid", return_value=None), patch(
"gateway.status._read_process_cmdline",
return_value="hermes gateway run --replace",
):
assert _check_gateway_running(default_home) is True
def test_gateway_running_check_runtime_state_stopped(self, profile_env):
@ -1511,6 +1518,77 @@ class TestEdgeCases:
with patch("gateway.status.get_running_pid", return_value=None):
assert _check_gateway_running(default_home) is False
def test_gateway_running_check_rejects_pid_reused_by_other_profile(self, profile_env):
"""Regression (user report): the dashboard showed a NAMED profile's
gateway green while ``hermes -p <name> gateway status`` showed it
stopped.
Per-profile Docker supervision: a named profile (``coder``) left a
``gateway_state=running`` record whose PID the OS later recycled onto a
DIFFERENT live process (here the default profile's gateway). The
``_check_gateway_running`` fallback must scope the live PID to *this*
profile's command line, so a recycled PID hosting another profile's
gateway is not reported running for ``coder``.
"""
from hermes_cli.profiles import _check_gateway_running
tmp_path = profile_env
coder_home = tmp_path / ".hermes" / "profiles" / "coder"
coder_home.mkdir(parents=True, exist_ok=True)
(coder_home / "gateway_state.json").write_text(
json.dumps(
{
"pid": 139,
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"gateway_state": "running",
"active_agents": 0,
}
),
encoding="utf-8",
)
# PID 139 is alive but is the DEFAULT gateway (bare, no -p coder), not
# coder's. start_time is absent so the PID-reuse guard cannot catch it;
# the profile scope must.
with patch("gateway.status.get_running_pid", return_value=None), patch(
"gateway.status._pid_exists", return_value=True
), patch("gateway.status._get_process_start_time", return_value=None), patch(
"gateway.status._read_process_cmdline",
return_value="hermes gateway run --replace",
):
assert _check_gateway_running(coder_home) is False
def test_gateway_running_check_detects_matching_named_profile(self, profile_env):
"""A genuinely-live named gateway (``-p coder`` on its command line) is
still reported running for that profile."""
from hermes_cli.profiles import _check_gateway_running
tmp_path = profile_env
coder_home = tmp_path / ".hermes" / "profiles" / "coder"
coder_home.mkdir(parents=True, exist_ok=True)
(coder_home / "gateway_state.json").write_text(
json.dumps(
{
"pid": 139,
"kind": "hermes-gateway",
"argv": ["hermes", "gateway", "run"],
"start_time": 1000,
"gateway_state": "running",
"active_agents": 0,
}
),
encoding="utf-8",
)
with patch("gateway.status.get_running_pid", return_value=None), patch(
"gateway.status._pid_exists", return_value=True
), patch("gateway.status._get_process_start_time", return_value=1000), patch(
"gateway.status._read_process_cmdline",
return_value="hermes -p coder gateway run --replace",
):
assert _check_gateway_running(coder_home) is True
def test_profile_name_boundary_single_char(self):
"""Single alphanumeric character is valid."""
validate_profile_name("a")