From f1617a7ebb979925741b99343d4f39dfcb139750 Mon Sep 17 00:00:00 2001
From: helix4u <4317663+helix4u@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:22:31 -0600
Subject: [PATCH] fix(gateway): validate runtime status pid command line

---
NOTE(review): this copy of the patch was recovered from a whitespace-collapsed
paste. Line breaks, indentation depth, and blank context lines were re-inferred
from the hunk headers; verify against commit f1617a7e before applying.

 gateway/status.py            | 18 +++++++++++++++--
 tests/gateway/test_status.py | 38 ++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/gateway/status.py b/gateway/status.py
index 9958e0d5553..cc27de23549 100644
--- a/gateway/status.py
+++ b/gateway/status.py
@@ -304,6 +304,20 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool:
     return looks_like_gateway_runtime_command_line(cmdline)
 
 
+def _record_matches_live_gateway_pid(record: dict[str, Any], pid: int) -> bool:
+    """Return True when a live PID still identifies as this gateway record.
+
+    Prefer the live command line whenever it is readable. Runtime status files
+    can outlive the gateway process they describe; if PID reuse leaves the same
+    PID occupied by an s6 supervisor/log process, the stale record's argv should
+    not make that unrelated process count as a running gateway.
+    """
+    live_cmdline = _read_process_cmdline(pid)
+    if live_cmdline:
+        return looks_like_gateway_runtime_command_line(live_cmdline)
+    return _record_looks_like_gateway(record)
+
+
 def _build_pid_record() -> dict:
     return {
         "pid": os.getpid(),
@@ -759,7 +773,7 @@ def get_runtime_status_running_pid(
     ):
         return None
 
-    if _looks_like_gateway_process(pid) or _record_looks_like_gateway(payload):
+    if _record_matches_live_gateway_pid(payload, pid):
         return pid
     return None
 
@@ -1261,7 +1275,7 @@ def get_running_pid(
         if recorded_start is not None and current_start is not None and current_start != recorded_start:
             continue
 
-        if _looks_like_gateway_process(pid) or _record_looks_like_gateway(record):
+        if _record_matches_live_gateway_pid(record, pid):
             return pid
 
     _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale)
diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py
index a70c028ca15..acb4c9850ab 100644
--- a/tests/gateway/test_status.py
+++ b/tests/gateway/test_status.py
@@ -359,6 +359,44 @@ class TestGatewayRuntimeStatus:
         assert payload["pid"] == os.getpid()
         assert payload["start_time"] == 2000
 
+    def test_runtime_status_running_pid_rejects_stale_record_for_supervisor_pid(self, monkeypatch):
+        """Regression: stale profile runtime state must not mark s6 supervisors live.
+
+        Docker per-profile supervision can leave a named profile with
+        ``gateway_state=running`` metadata while the real gateway process is gone
+        and the recorded PID now belongs to ``s6-supervise`` or ``s6-log``. If
+        the live command line is readable, it wins over the stale record argv.
+        """
+        payload = {
+            "pid": 132,
+            "start_time": 123,
+            "gateway_state": "running",
+            "kind": "hermes-gateway",
+            "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run", "--replace"],
+        }
+
+        monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
+        monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "s6-supervise gateway-coder")
+
+        assert status.get_runtime_status_running_pid(payload) is None
+
+    def test_runtime_status_running_pid_uses_record_when_cmdline_unreadable(self, monkeypatch):
+        """Keep the cross-platform fallback for hosts where cmdline is unavailable."""
+        payload = {
+            "pid": 132,
+            "start_time": 123,
+            "gateway_state": "running",
+            "kind": "hermes-gateway",
+            "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run", "--replace"],
+        }
+
+        monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
+        monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
+
+        assert status.get_runtime_status_running_pid(payload) == 132
+
     def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
         monkeypatch.setenv("HERMES_HOME", str(tmp_path))
 