mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-27 11:22:03 +00:00
fix(gateway): scope dashboard liveness fallback to the profile
PR #52151 hardened the runtime-status liveness check to trust a readable live process command line over stale gateway_state.json argv, so a recycled PID now owned by an s6 supervisor no longer counts as a running gateway. That fix is correct but incomplete for the reported symptom: the web dashboard showed a named profile's gateway green while `hermes -p <name> gateway status` showed it stopped. Two further issues: 1. Cross-profile PID reuse. In per-profile Docker supervision, one profile's stale `gateway_state.json` can record a PID the OS later recycled onto a DIFFERENT profile's live gateway. That PID's command line still `looks_like_gateway`, so the dead profile was reported running. The recorded argv has its `-p <name>` selector stripped in-process by `_apply_profile_override`, so it cannot disambiguate; the live `/proc` cmdline still carries it. `get_runtime_status_running_pid` now accepts an `expected_home` and validates the live command line belongs to THAT profile (mirroring `hermes_cli.gateway._matches_current_profile`, the logic the CLI scan path already uses — which is why the CLI was correct). `_check_gateway_running` passes the enumerated profile dir. 2. The existing regression test `test_gateway_running_check_falls_back_to_ runtime_state` used the live pytest PID with a gateway-shaped record; once the live cmdline became authoritative it no longer looked like a gateway. Updated to mock the live cmdline to the real separate-process scenario it describes. The active-profile path (`get_running_pid`) is intentionally left unscoped: it is lock-verified and any live gateway cmdline is acceptable there. Multiplex mode is unaffected — `running` state is only ever written to a gateway's own home, never a secondary served profile's. Adds coverage for: cross-profile PID reuse (named + default), matching profile cmdline (`-p`, `--profile`, explicit HERMES_HOME=), the bare default gateway, and the unreadable-cmdline cross-platform fallback. Each new cross-profile assertion fails without the profile scope and passes with it. Co-authored-by: helix4u <4317663+helix4u@users.noreply.github.com>
This commit is contained in:
parent
f1617a7ebb
commit
538c419d2e
4 changed files with 285 additions and 6 deletions
|
|
@ -304,17 +304,85 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
|
|||
return looks_like_gateway_runtime_command_line(cmdline)
|
||||
|
||||
|
||||
def _record_matches_live_gateway_pid(record: dict[str, Any], pid: int) -> bool:
|
||||
def _profile_name_for_home(profile_home: Path) -> Optional[str]:
|
||||
"""Return the profile id a HERMES_HOME directory represents, or None.
|
||||
|
||||
A named profile's home is ``<root>/profiles/<name>`` (immediate parent is
|
||||
``profiles``). The root/default home (``~/.hermes`` or ``$HERMES_HOME``)
|
||||
has no such parent, so it maps to the default profile (``None`` here, which
|
||||
callers treat as "the bare, flag-less gateway").
|
||||
"""
|
||||
if profile_home.parent.name == "profiles":
|
||||
return profile_home.name
|
||||
return None
|
||||
|
||||
|
||||
def _command_line_belongs_to_profile(command: str, profile_home: Path) -> bool:
|
||||
"""Return True when a gateway command line belongs to ``profile_home``.
|
||||
|
||||
Mirrors ``hermes_cli.gateway._matches_current_profile`` so the dashboard's
|
||||
cross-profile liveness fallback scopes a live PID to the *right* profile.
|
||||
In a per-profile container, one profile's stale ``gateway_state.json`` can
|
||||
record a PID that the OS has since recycled onto a DIFFERENT profile's live
|
||||
gateway. That recycled PID's command line still ``looks_like_gateway`` —
|
||||
so without a profile check the dead profile is reported running. A named
|
||||
profile gateway carries ``-p <name>``/``--profile <name>`` (or, rarely, an
|
||||
explicit ``HERMES_HOME=<path>``) on its argv; the default/root gateway runs
|
||||
bare with no profile flag.
|
||||
"""
|
||||
command_lc = command.lower()
|
||||
profile_name = _profile_name_for_home(profile_home)
|
||||
home_lc = str(profile_home).lower()
|
||||
|
||||
if profile_name is not None and profile_name != "default":
|
||||
profile_lc = profile_name.lower()
|
||||
return (
|
||||
f"--profile {profile_lc}" in command_lc
|
||||
or f"-p {profile_lc}" in command_lc
|
||||
or f"hermes_home={home_lc}" in command_lc
|
||||
)
|
||||
|
||||
# Default/root profile: the gateway runs with no profile flag. Accept unless
|
||||
# the command advertises *some other* profile (an explicit -p/--profile) or
|
||||
# a non-matching explicit HERMES_HOME= on the argv. HERMES_HOME is usually
|
||||
# passed via the environment (not visible on the command line), so its mere
|
||||
# absence is not disqualifying — only a conflicting explicit value is.
|
||||
if "--profile " in command_lc or " -p " in command_lc:
|
||||
return False
|
||||
if "hermes_home=" in command_lc and f"hermes_home={home_lc}" not in command_lc:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _record_matches_live_gateway_pid(
|
||||
record: dict[str, Any],
|
||||
pid: int,
|
||||
*,
|
||||
expected_home: Optional[Path] = None,
|
||||
) -> bool:
|
||||
"""Return True when a live PID still identifies as this gateway record.
|
||||
|
||||
Prefer the live command line whenever it is readable. Runtime status files
|
||||
can outlive the gateway process they describe; if PID reuse leaves the same
|
||||
PID occupied by an s6 supervisor/log process, the stale record's argv should
|
||||
not make that unrelated process count as a running gateway.
|
||||
|
||||
When ``expected_home`` is provided (the dashboard enumerating a specific
|
||||
profile's state file), the readable live command line must additionally
|
||||
belong to *that* profile — otherwise a PID recycled onto a different
|
||||
profile's live gateway would make the dead profile look alive. When the
|
||||
live command line cannot be read (Windows/permission), fall back to the
|
||||
persisted record so cross-platform behavior is preserved.
|
||||
"""
|
||||
live_cmdline = _read_process_cmdline(pid)
|
||||
if live_cmdline:
|
||||
return looks_like_gateway_runtime_command_line(live_cmdline)
|
||||
if not looks_like_gateway_runtime_command_line(live_cmdline):
|
||||
return False
|
||||
if expected_home is not None and not _command_line_belongs_to_profile(
|
||||
live_cmdline, expected_home
|
||||
):
|
||||
return False
|
||||
return True
|
||||
return _record_looks_like_gateway(record)
|
||||
|
||||
|
||||
|
|
@ -745,6 +813,8 @@ def derive_gateway_drainable(*, gateway_running: bool, gateway_state: Any) -> bo
|
|||
|
||||
def get_runtime_status_running_pid(
|
||||
runtime: Optional[dict[str, Any]] = None,
|
||||
*,
|
||||
expected_home: Optional[Path] = None,
|
||||
) -> Optional[int]:
|
||||
"""Return a live gateway PID from the runtime status record, if valid.
|
||||
|
||||
|
|
@ -753,6 +823,13 @@ def get_runtime_status_running_pid(
|
|||
a live process and a fresh ``gateway_state.json`` but no ``gateway.pid``; use
|
||||
this as a conservative fallback by checking both the persisted state and the
|
||||
OS process identity.
|
||||
|
||||
``expected_home`` scopes the OS-identity check to a specific profile's
|
||||
HERMES_HOME. Pass it when validating *another* profile's state file (the
|
||||
dashboard enumerating every profile): a stale record whose PID the OS has
|
||||
recycled onto a different profile's live gateway must not be reported
|
||||
running for the dead profile. Omit it (the default) for the active
|
||||
profile, where any live gateway command line is acceptable.
|
||||
"""
|
||||
payload = runtime if runtime is not None else read_runtime_status()
|
||||
if not isinstance(payload, dict):
|
||||
|
|
@ -773,7 +850,7 @@ def get_runtime_status_running_pid(
|
|||
):
|
||||
return None
|
||||
|
||||
if _record_matches_live_gateway_pid(payload, pid):
|
||||
if _record_matches_live_gateway_pid(payload, pid, expected_home=expected_home):
|
||||
return pid
|
||||
return None
|
||||
|
||||
|
|
|
|||
|
|
@ -639,7 +639,7 @@ def _check_gateway_running(profile_dir: Path) -> bool:
|
|||
read_runtime_status,
|
||||
)
|
||||
runtime = read_runtime_status(profile_dir / "gateway_state.json")
|
||||
return get_runtime_status_running_pid(runtime) is not None
|
||||
return get_runtime_status_running_pid(runtime, expected_home=profile_dir) is not None
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
|
|
|||
|
|
@ -397,6 +397,130 @@ class TestGatewayRuntimeStatus:
|
|||
|
||||
assert status.get_runtime_status_running_pid(payload) == 132
|
||||
|
||||
def test_runtime_status_running_pid_rejects_pid_reused_by_other_profile(self, monkeypatch):
|
||||
"""Regression (user report): a stale profile's recycled PID must not be
|
||||
reported running just because it now hosts a DIFFERENT profile's gateway.
|
||||
|
||||
Per-profile Docker supervision: ``coder``'s gateway died leaving a
|
||||
``gateway_state=running`` record at PID 139. The OS then recycled 139
|
||||
onto the live *default* gateway (``hermes gateway run``). The recorded
|
||||
``start_time`` is absent (older state file), so the start-time PID-reuse
|
||||
guard does not catch it. Without the profile scope the live command
|
||||
line still ``looks_like_gateway`` and ``coder`` is wrongly reported up.
|
||||
"""
|
||||
payload = {
|
||||
"pid": 139,
|
||||
"gateway_state": "running",
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
}
|
||||
coder_home = Path("/opt/data/profiles/coder")
|
||||
|
||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||
# PID 139 is now the live DEFAULT gateway (bare, no -p coder).
|
||||
monkeypatch.setattr(
|
||||
status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace"
|
||||
)
|
||||
|
||||
assert (
|
||||
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
|
||||
is None
|
||||
)
|
||||
|
||||
def test_runtime_status_running_pid_accepts_matching_profile_cmdline(self, monkeypatch):
|
||||
"""A genuinely-live named gateway carries ``-p <profile>`` / ``--profile``
|
||||
on its command line and must be reported running for that profile."""
|
||||
payload = {
|
||||
"pid": 139,
|
||||
"gateway_state": "running",
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"start_time": 1000,
|
||||
}
|
||||
coder_home = Path("/opt/data/profiles/coder")
|
||||
|
||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
|
||||
for cmdline in (
|
||||
"hermes -p coder gateway run --replace",
|
||||
"/opt/hermes/.venv/bin/hermes --profile coder gateway run --replace",
|
||||
"hermes_home=/opt/data/profiles/coder hermes gateway run --replace",
|
||||
):
|
||||
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid, c=cmdline: c)
|
||||
assert (
|
||||
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
|
||||
== 139
|
||||
), cmdline
|
||||
|
||||
def test_runtime_status_running_pid_default_profile_rejects_named_cmdline(self, monkeypatch):
|
||||
"""The default/root profile runs a bare gateway (no profile flag). A
|
||||
recycled PID now hosting a *named* profile gateway must not be reported
|
||||
running for the default profile."""
|
||||
payload = {
|
||||
"pid": 139,
|
||||
"gateway_state": "running",
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
}
|
||||
default_home = Path("/opt/data")
|
||||
|
||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||
monkeypatch.setattr(
|
||||
status, "_read_process_cmdline", lambda pid: "hermes -p coder gateway run --replace"
|
||||
)
|
||||
|
||||
assert (
|
||||
status.get_runtime_status_running_pid(payload, expected_home=default_home)
|
||||
is None
|
||||
)
|
||||
|
||||
def test_runtime_status_running_pid_default_profile_accepts_bare_cmdline(self, monkeypatch):
|
||||
"""The default/root gateway (bare ``hermes gateway run``) is reported
|
||||
running for the default profile."""
|
||||
payload = {
|
||||
"pid": 139,
|
||||
"gateway_state": "running",
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"start_time": 1000,
|
||||
}
|
||||
default_home = Path("/opt/data")
|
||||
|
||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
|
||||
monkeypatch.setattr(
|
||||
status, "_read_process_cmdline", lambda pid: "hermes gateway run --replace"
|
||||
)
|
||||
|
||||
assert (
|
||||
status.get_runtime_status_running_pid(payload, expected_home=default_home)
|
||||
== 139
|
||||
)
|
||||
|
||||
def test_runtime_status_running_pid_profile_scope_falls_back_when_cmdline_unreadable(self, monkeypatch):
|
||||
"""When the live command line is unreadable (Windows/permission), the
|
||||
profile scope cannot apply — fall back to the persisted record so the
|
||||
cross-platform behavior is preserved."""
|
||||
payload = {
|
||||
"pid": 139,
|
||||
"gateway_state": "running",
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"start_time": 1000,
|
||||
}
|
||||
coder_home = Path("/opt/data/profiles/coder")
|
||||
|
||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
|
||||
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
|
||||
|
||||
assert (
|
||||
status.get_runtime_status_running_pid(payload, expected_home=coder_home)
|
||||
== 139
|
||||
)
|
||||
|
||||
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
|
||||
|
|
|
|||
|
|
@ -1483,8 +1483,15 @@ class TestEdgeCases:
|
|||
|
||||
# Primary pid-file/lock check returns None (no lock held by this reader),
|
||||
# exactly as it does for a separate-process dashboard. The fallback must
|
||||
# then read the state file and confirm the gateway is alive.
|
||||
with patch("gateway.status.get_running_pid", return_value=None):
|
||||
# then read the state file and confirm the gateway is alive by checking
|
||||
# the recorded PID's live command line. In the real separate-process
|
||||
# scenario that PID belongs to the live gateway, so mock its command
|
||||
# line to a bare ``gateway run`` (this is the default/root home, which
|
||||
# runs the gateway with no profile flag).
|
||||
with patch("gateway.status.get_running_pid", return_value=None), patch(
|
||||
"gateway.status._read_process_cmdline",
|
||||
return_value="hermes gateway run --replace",
|
||||
):
|
||||
assert _check_gateway_running(default_home) is True
|
||||
|
||||
def test_gateway_running_check_runtime_state_stopped(self, profile_env):
|
||||
|
|
@ -1511,6 +1518,77 @@ class TestEdgeCases:
|
|||
with patch("gateway.status.get_running_pid", return_value=None):
|
||||
assert _check_gateway_running(default_home) is False
|
||||
|
||||
def test_gateway_running_check_rejects_pid_reused_by_other_profile(self, profile_env):
|
||||
"""Regression (user report): the dashboard showed a NAMED profile's
|
||||
gateway green while ``hermes -p <name> gateway status`` showed it
|
||||
stopped.
|
||||
|
||||
Per-profile Docker supervision: a named profile (``coder``) left a
|
||||
``gateway_state=running`` record whose PID the OS later recycled onto a
|
||||
DIFFERENT live process (here the default profile's gateway). The
|
||||
``_check_gateway_running`` fallback must scope the live PID to *this*
|
||||
profile's command line, so a recycled PID hosting another profile's
|
||||
gateway is not reported running for ``coder``.
|
||||
"""
|
||||
from hermes_cli.profiles import _check_gateway_running
|
||||
|
||||
tmp_path = profile_env
|
||||
coder_home = tmp_path / ".hermes" / "profiles" / "coder"
|
||||
coder_home.mkdir(parents=True, exist_ok=True)
|
||||
(coder_home / "gateway_state.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"pid": 139,
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"gateway_state": "running",
|
||||
"active_agents": 0,
|
||||
}
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
# PID 139 is alive but is the DEFAULT gateway (bare, no -p coder), not
|
||||
# coder's. start_time is absent so the PID-reuse guard cannot catch it;
|
||||
# the profile scope must.
|
||||
with patch("gateway.status.get_running_pid", return_value=None), patch(
|
||||
"gateway.status._pid_exists", return_value=True
|
||||
), patch("gateway.status._get_process_start_time", return_value=None), patch(
|
||||
"gateway.status._read_process_cmdline",
|
||||
return_value="hermes gateway run --replace",
|
||||
):
|
||||
assert _check_gateway_running(coder_home) is False
|
||||
|
||||
def test_gateway_running_check_detects_matching_named_profile(self, profile_env):
|
||||
"""A genuinely-live named gateway (``-p coder`` on its command line) is
|
||||
still reported running for that profile."""
|
||||
from hermes_cli.profiles import _check_gateway_running
|
||||
|
||||
tmp_path = profile_env
|
||||
coder_home = tmp_path / ".hermes" / "profiles" / "coder"
|
||||
coder_home.mkdir(parents=True, exist_ok=True)
|
||||
(coder_home / "gateway_state.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"pid": 139,
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["hermes", "gateway", "run"],
|
||||
"start_time": 1000,
|
||||
"gateway_state": "running",
|
||||
"active_agents": 0,
|
||||
}
|
||||
),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
with patch("gateway.status.get_running_pid", return_value=None), patch(
|
||||
"gateway.status._pid_exists", return_value=True
|
||||
), patch("gateway.status._get_process_start_time", return_value=1000), patch(
|
||||
"gateway.status._read_process_cmdline",
|
||||
return_value="hermes -p coder gateway run --replace",
|
||||
):
|
||||
assert _check_gateway_running(coder_home) is True
|
||||
|
||||
def test_profile_name_boundary_single_char(self):
|
||||
"""Single alphanumeric character is valid."""
|
||||
validate_profile_name("a")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue