fix(gateway): validate runtime status pid command line

This commit is contained in:
helix4u 2026-06-24 15:22:31 -06:00 committed by Ben Barclay
parent d335164833
commit f1617a7ebb
2 changed files with 54 additions and 2 deletions

View file

@ -304,6 +304,20 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
return looks_like_gateway_runtime_command_line(cmdline)
def _record_matches_live_gateway_pid(record: dict[str, Any], pid: int) -> bool:
    """Decide whether the process at ``pid`` is still the gateway in ``record``.

    The live command line is authoritative whenever it can be read. A runtime
    status file may outlast the gateway it describes, and PID reuse can hand
    the same PID to an unrelated process such as an s6 supervisor or logger.
    In that case the stale argv stored in the record must not be enough to
    report a running gateway.

    Args:
        record: Gateway record, consulted only when the live command line
            cannot be read.
        pid: PID of the process to inspect.

    Returns:
        The verdict for the live command line when one was read; otherwise
        the verdict for the record itself.
    """
    observed_cmdline = _read_process_cmdline(pid)
    if not observed_cmdline:
        # No readable command line for this PID: the record is the only
        # evidence available, so fall back to it.
        return _record_looks_like_gateway(record)
    return looks_like_gateway_runtime_command_line(observed_cmdline)
def _build_pid_record() -> dict:
return {
"pid": os.getpid(),
@ -759,7 +773,7 @@ def get_runtime_status_running_pid(
):
return None
if _looks_like_gateway_process(pid) or _record_looks_like_gateway(payload):
if _record_matches_live_gateway_pid(payload, pid):
return pid
return None
@ -1261,7 +1275,7 @@ def get_running_pid(
if recorded_start is not None and current_start is not None and current_start != recorded_start:
continue
if _looks_like_gateway_process(pid) or _record_looks_like_gateway(record):
if _record_matches_live_gateway_pid(record, pid):
return pid
_cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)

View file

@ -359,6 +359,44 @@ class TestGatewayRuntimeStatus:
assert payload["pid"] == os.getpid()
assert payload["start_time"] == 2000
def test_runtime_status_running_pid_rejects_stale_record_for_supervisor_pid(self, monkeypatch):
    """Regression: a stale runtime record must not make an s6 supervisor look live.

    Under Docker per-profile supervision a named profile can keep
    ``gateway_state=running`` metadata after the real gateway has exited,
    while the recorded PID has been reused by ``s6-supervise`` or ``s6-log``.
    When the live command line is readable it overrides the stale argv
    stored in the record.
    """
    # The PID exists and its start time matches the record, so only the
    # live command line can expose the mismatch.
    monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
    monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
    monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "s6-supervise gateway-coder")

    stale_record = {
        "pid": 132,
        "start_time": 123,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run", "--replace"],
    }

    reported_pid = status.get_runtime_status_running_pid(stale_record)
    assert reported_pid is None
def test_runtime_status_running_pid_uses_record_when_cmdline_unreadable(self, monkeypatch):
    """The record still decides on hosts where the live cmdline cannot be read."""
    # The PID exists with a matching start time, but no command line is
    # available, so the lookup has to rely on the record.
    monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
    monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
    monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)

    gateway_record = {
        "pid": 132,
        "start_time": 123,
        "gateway_state": "running",
        "kind": "hermes-gateway",
        "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run", "--replace"],
    }

    reported_pid = status.get_runtime_status_running_pid(gateway_record)
    assert reported_pid == 132
def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))