From 402d048eb6c65d4e486d5b1b3900ed20f6d065fa Mon Sep 17 00:00:00 2001 From: Teknium Date: Wed, 22 Apr 2026 16:33:27 -0700 Subject: [PATCH] fix(gateway): also unlink stale PID + lock files on cleanup Follow-up for salvaged PR #14179. `_cleanup_invalid_pid_path` previously called `remove_pid_file()` for the default PID path, but that helper defensively refuses to delete a PID file whose pid field differs from `os.getpid()` (to protect --replace handoffs). Every realistic stale-PID scenario is exactly that case: a crashed/Ctrl+C'd gateway left behind a PID file owned by a now-dead foreign PID. Once `get_running_pid()` has confirmed the runtime lock is inactive, the on-disk metadata is known to belong to a dead process, so we can force-unlink both the PID file and the sibling `gateway.lock` directly instead of going through the defensive helper. Also adds a regression test with a dead foreign PID that would have failed against the previous cleanup logic. --- gateway/status.py | 18 ++++++++++++++---- tests/gateway/test_status.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/gateway/status.py b/gateway/status.py index 5682bfc02..55456b0a3 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -241,13 +241,23 @@ def _pid_from_record(record: Optional[dict[str, Any]]) -> Optional[int]: def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None: + """Delete a stale gateway PID file (and its sibling lock metadata). + + Called from ``get_running_pid()`` after the runtime lock has already been + confirmed inactive, so the on-disk metadata is known to belong to a dead + process. Unlike ``remove_pid_file()`` (which defensively refuses to delete + a PID file whose ``pid`` field differs from ``os.getpid()`` to protect + ``--replace`` handoffs), this path force-unlinks both files so the next + startup sees a clean slate. + """ if not cleanup_stale: return try: - if pid_path == _get_pid_path(): - remove_pid_file() - else: - pid_path.unlink(missing_ok=True) + pid_path.unlink(missing_ok=True) + except Exception: + pass + try: + _get_gateway_lock_path(pid_path).unlink(missing_ok=True) except Exception: pass diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 68445b1cb..f2b6b1b1f 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -150,6 +150,40 @@ class TestGatewayPidState: assert status.get_running_pid() is None assert not pid_path.exists() + def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch): + """Stale PID file from a *different* PID (crashed process) must still be cleaned. + + Regression for: ``remove_pid_file()`` defensively refuses to delete a + PID file whose pid != ``os.getpid()`` to protect ``--replace`` + handoffs. Stale-cleanup must not go through that path or real + crashed-process PID files never get removed. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + pid_path = tmp_path / "gateway.pid" + lock_path = tmp_path / "gateway.lock" + + # PID that is guaranteed not alive and not our own. + dead_foreign_pid = 999999 + assert dead_foreign_pid != os.getpid() + + pid_path.write_text(json.dumps({ + "pid": dead_foreign_pid, + "kind": "hermes-gateway", + "argv": ["python", "-m", "hermes_cli.main", "gateway"], + "start_time": 123, + })) + lock_path.write_text(json.dumps({ + "pid": dead_foreign_pid, + "kind": "hermes-gateway", + "argv": ["python", "-m", "hermes_cli.main", "gateway"], + "start_time": 123, + })) + + # No live lock holder → get_running_pid should clean both files. + assert status.get_running_pid() is None + assert not pid_path.exists() + assert not lock_path.exists() + def test_get_running_pid_falls_back_to_live_lock_record(self, tmp_path, monkeypatch): monkeypatch.setenv("HERMES_HOME", str(tmp_path)) pid_path = tmp_path / "gateway.pid"