mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(gateway): scope dashboard liveness fallback to the profile
PR #52151 hardened the runtime-status liveness check to trust a readable live process command line over stale gateway_state.json argv, so a recycled PID now owned by an s6 supervisor no longer counts as a running gateway. That fix is correct but incomplete for the reported symptom: the web dashboard showed a named profile's gateway green while `hermes -p <name> gateway status` showed it stopped. Two further issues: 1. Cross-profile PID reuse. In per-profile Docker supervision, one profile's stale `gateway_state.json` can record a PID the OS later recycled onto a DIFFERENT profile's live gateway. That PID's command line still `looks_like_gateway`, so the dead profile was reported running. The recorded argv has its `-p <name>` selector stripped in-process by `_apply_profile_override`, so it cannot disambiguate; the live `/proc` cmdline still carries it. `get_runtime_status_running_pid` now accepts an `expected_home` and validates the live command line belongs to THAT profile (mirroring `hermes_cli.gateway._matches_current_profile`, the logic the CLI scan path already uses — which is why the CLI was correct). `_check_gateway_running` passes the enumerated profile dir. 2. The existing regression test `test_gateway_running_check_falls_back_to_runtime_state` used the live pytest PID with a gateway-shaped record; once the live cmdline became authoritative it no longer looked like a gateway. Updated to mock the live cmdline to the real separate-process scenario it describes. The active-profile path (`get_running_pid`) is intentionally left unscoped: it is lock-verified and any live gateway cmdline is acceptable there. Multiplex mode is unaffected — `running` state is only ever written to a gateway's own home, never a secondary served profile's. Adds coverage for: cross-profile PID reuse (named + default), matching profile cmdline (`-p`, `--profile`, explicit HERMES_HOME=), the bare default gateway, and the unreadable-cmdline cross-platform fallback.
Each new cross-profile assertion fails without the profile scope and passes with it. Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com>
This commit is contained in:
parent
f1617a7ebb
commit
538c419d2e
4 changed files with 285 additions and 6 deletions
|
|
@ -397,6 +397,130 @@ class TestGatewayRuntimeStatus:
|
|||
|
||||
assert status.get_runtime_status_running_pid(payload) == 132
|
||||
|
||||
def test_runtime_status_running_pid_rejects_pid_reused_by_other_profile(self, monkeypatch):
    """Regression (user report): a recycled PID that now hosts a DIFFERENT
    profile's gateway must not make a stale profile look like it is running.

    Scenario (per-profile Docker supervision): ``coder``'s gateway died but
    left a ``gateway_state=running`` record pointing at PID 139, and the OS
    later recycled 139 onto the live *default* gateway (``hermes gateway
    run``). The record carries no ``start_time`` (older state file), so the
    start-time PID-reuse guard does not catch it. Without the profile scope
    the live command line still looks like a gateway and ``coder`` would be
    wrongly reported up.
    """
    stale_record = {
        "pid": 139,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
    }
    profile_home = Path("/opt/data/profiles/coder")

    def _pid_is_alive(pid):
        return True

    def _no_start_time(pid):
        return None

    def _default_gateway_cmdline(pid):
        # PID 139 is now the live DEFAULT gateway (bare, no -p coder).
        return "hermes gateway run --replace"

    monkeypatch.setattr(status, "_pid_exists", _pid_is_alive)
    monkeypatch.setattr(status, "_get_process_start_time", _no_start_time)
    monkeypatch.setattr(status, "_read_process_cmdline", _default_gateway_cmdline)

    resolved_pid = status.get_runtime_status_running_pid(
        stale_record, expected_home=profile_home
    )
    assert resolved_pid is None
|
||||
|
||||
def test_runtime_status_running_pid_accepts_matching_profile_cmdline(self, monkeypatch):
    """A live named gateway whose command line identifies the expected
    profile (``-p <profile>``, ``--profile``, or an explicit home assignment)
    must be reported running for that profile."""
    live_record = {
        "pid": 139,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "start_time": 1000,
    }
    profile_home = Path("/opt/data/profiles/coder")

    # Every spelling below is expected to resolve to the ``coder`` profile.
    matching_cmdlines = (
        "hermes -p coder gateway run --replace",
        "/opt/hermes/.venv/bin/hermes --profile coder gateway run --replace",
        "hermes_home=/opt/data/profiles/coder hermes gateway run --replace",
    )

    monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
    # The live start time equals the recorded ``start_time`` (1000).
    monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)

    for cmdline in matching_cmdlines:
        # Bind the current value as a default to avoid late-binding closures.
        monkeypatch.setattr(
            status, "_read_process_cmdline", lambda pid, live=cmdline: live
        )
        resolved_pid = status.get_runtime_status_running_pid(
            live_record, expected_home=profile_home
        )
        assert resolved_pid == 139, cmdline
|
||||
|
||||
def test_runtime_status_running_pid_default_profile_rejects_named_cmdline(self, monkeypatch):
    """The default/root profile runs a bare gateway with no profile flag, so
    a recycled PID that now hosts a *named* profile's gateway must not be
    reported running for the default profile."""
    stale_record = {
        "pid": 139,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
    }
    root_home = Path("/opt/data")

    def _pid_is_alive(pid):
        return True

    def _no_start_time(pid):
        return None

    def _named_gateway_cmdline(pid):
        # The live process is the ``coder`` gateway, not the default one.
        return "hermes -p coder gateway run --replace"

    monkeypatch.setattr(status, "_pid_exists", _pid_is_alive)
    monkeypatch.setattr(status, "_get_process_start_time", _no_start_time)
    monkeypatch.setattr(status, "_read_process_cmdline", _named_gateway_cmdline)

    resolved_pid = status.get_runtime_status_running_pid(
        stale_record, expected_home=root_home
    )
    assert resolved_pid is None
|
||||
|
||||
def test_runtime_status_running_pid_default_profile_accepts_bare_cmdline(self, monkeypatch):
    """A bare ``hermes gateway run`` command line (the default/root gateway)
    is reported running when the expected home is the default profile."""
    live_record = {
        "pid": 139,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "start_time": 1000,
    }
    root_home = Path("/opt/data")

    def _pid_is_alive(pid):
        return True

    def _matching_start_time(pid):
        # Same value as the recorded ``start_time``.
        return 1000

    def _bare_gateway_cmdline(pid):
        return "hermes gateway run --replace"

    monkeypatch.setattr(status, "_pid_exists", _pid_is_alive)
    monkeypatch.setattr(status, "_get_process_start_time", _matching_start_time)
    monkeypatch.setattr(status, "_read_process_cmdline", _bare_gateway_cmdline)

    resolved_pid = status.get_runtime_status_running_pid(
        live_record, expected_home=root_home
    )
    assert resolved_pid == 139
|
||||
|
||||
def test_runtime_status_running_pid_profile_scope_falls_back_when_cmdline_unreadable(self, monkeypatch):
    """If the live command line cannot be read (Windows/permission), the
    profile scope cannot be applied; the persisted record is trusted instead
    so cross-platform behavior is preserved."""
    persisted_record = {
        "pid": 139,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "start_time": 1000,
    }
    profile_home = Path("/opt/data/profiles/coder")

    def _pid_is_alive(pid):
        return True

    def _matching_start_time(pid):
        # Same value as the recorded ``start_time``.
        return 1000

    def _unreadable_cmdline(pid):
        # Simulates a platform where the command line is not available.
        return None

    monkeypatch.setattr(status, "_pid_exists", _pid_is_alive)
    monkeypatch.setattr(status, "_get_process_start_time", _matching_start_time)
    monkeypatch.setattr(status, "_read_process_cmdline", _unreadable_cmdline)

    resolved_pid = status.get_runtime_status_running_pid(
        persisted_record, expected_home=profile_home
    )
    assert resolved_pid == 139
|
||||
|
||||
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
|
|
|
|||
|
|
@ -1483,8 +1483,15 @@ class TestEdgeCases:
|
|||
|
||||
# Primary pid-file/lock check returns None (no lock held by this reader),
|
||||
# exactly as it does for a separate-process dashboard. The fallback must
|
||||
# then read the state file and confirm the gateway is alive.
|
||||
with patch("gateway.status.get_running_pid", return_value=None):
|
||||
# then read the state file and confirm the gateway is alive by checking
|
||||
# the recorded PID's live command line. In the real separate-process
|
||||
# scenario that PID belongs to the live gateway, so mock its command
|
||||
# line to a bare ``gateway run`` (this is the default/root home, which
|
||||
# runs the gateway with no profile flag).
|
||||
with patch("gateway.status.get_running_pid", return_value=None), patch(
|
||||
"gateway.status._read_process_cmdline",
|
||||
return_value="hermes gateway run --replace",
|
||||
):
|
||||
assert _check_gateway_running(default_home) is True
|
||||
|
||||
def test_gateway_running_check_runtime_state_stopped(self, profile_env):
|
||||
|
|
@ -1511,6 +1518,77 @@ class TestEdgeCases:
|
|||
with patch("gateway.status.get_running_pid", return_value=None):
|
||||
assert _check_gateway_running(default_home) is False
|
||||
|
||||
def test_gateway_running_check_rejects_pid_reused_by_other_profile(self, profile_env):
    """Regression (user report): the dashboard showed a NAMED profile's
    gateway green while ``hermes -p <name> gateway status`` showed it stopped.

    Scenario (per-profile Docker supervision): the named profile ``coder``
    left a ``gateway_state=running`` record whose PID the OS later recycled
    onto a DIFFERENT live process (here the default profile's gateway). The
    ``_check_gateway_running`` fallback must scope the live PID to *this*
    profile's command line, so a recycled PID hosting another profile's
    gateway is not reported running for ``coder``.
    """
    from hermes_cli.profiles import _check_gateway_running

    coder_home = profile_env / ".hermes" / "profiles" / "coder"
    coder_home.mkdir(parents=True, exist_ok=True)

    # No ``start_time`` in the record, so the PID-reuse guard cannot catch
    # the recycled PID; the profile scope must.
    stale_state = {
        "pid": 139,
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "gateway_state": "running",
        "active_agents": 0,
    }
    state_file = coder_home / "gateway_state.json"
    state_file.write_text(json.dumps(stale_state), encoding="utf-8")

    no_lock_holder = patch("gateway.status.get_running_pid", return_value=None)
    pid_alive = patch("gateway.status._pid_exists", return_value=True)
    no_start_time = patch("gateway.status._get_process_start_time", return_value=None)
    # PID 139 is alive but is the DEFAULT gateway (bare, no -p coder).
    default_cmdline = patch(
        "gateway.status._read_process_cmdline",
        return_value="hermes gateway run --replace",
    )

    with no_lock_holder, pid_alive, no_start_time, default_cmdline:
        assert _check_gateway_running(coder_home) is False
|
||||
|
||||
def test_gateway_running_check_detects_matching_named_profile(self, profile_env):
    """A live named gateway whose command line carries ``-p coder`` is still
    reported running for the ``coder`` profile."""
    from hermes_cli.profiles import _check_gateway_running

    coder_home = profile_env / ".hermes" / "profiles" / "coder"
    coder_home.mkdir(parents=True, exist_ok=True)

    live_state = {
        "pid": 139,
        "kind": "hermes-gateway",
        "argv": ["hermes", "gateway", "run"],
        "start_time": 1000,
        "gateway_state": "running",
        "active_agents": 0,
    }
    state_file = coder_home / "gateway_state.json"
    state_file.write_text(json.dumps(live_state), encoding="utf-8")

    no_lock_holder = patch("gateway.status.get_running_pid", return_value=None)
    pid_alive = patch("gateway.status._pid_exists", return_value=True)
    # The live start time equals the recorded ``start_time`` (1000).
    same_start_time = patch("gateway.status._get_process_start_time", return_value=1000)
    coder_cmdline = patch(
        "gateway.status._read_process_cmdline",
        return_value="hermes -p coder gateway run --replace",
    )

    with no_lock_holder, pid_alive, same_start_time, coder_cmdline:
        assert _check_gateway_running(coder_home) is True
|
||||
|
||||
def test_profile_name_boundary_single_char(self):
|
||||
"""Single alphanumeric character is valid."""
|
||||
validate_profile_name("a")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue