diff --git a/gateway/status.py b/gateway/status.py index 5682bfc022..55456b0a35 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -241,13 +241,23 @@ def _pid_from_record(record: Optional[dict[str, Any]]) -> Optional[int]: def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None: + """Delete a stale gateway PID file (and its sibling lock metadata). + + Called from ``get_running_pid()`` after the runtime lock has already been + confirmed inactive, so the on-disk metadata is known to belong to a dead + process. Unlike ``remove_pid_file()`` (which defensively refuses to delete + a PID file whose ``pid`` field differs from ``os.getpid()`` to protect + ``--replace`` handoffs), this path force-unlinks both files so the next + startup sees a clean slate. + """ if not cleanup_stale: return try: - if pid_path == _get_pid_path(): - remove_pid_file() - else: - pid_path.unlink(missing_ok=True) + pid_path.unlink(missing_ok=True) + except Exception: + pass + try: + _get_gateway_lock_path(pid_path).unlink(missing_ok=True) except Exception: pass diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 68445b1cb9..f2b6b1b1f3 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -150,6 +150,40 @@ class TestGatewayPidState: assert status.get_running_pid() is None assert not pid_path.exists() + def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch): + """Stale PID file from a *different* PID (crashed process) must still be cleaned. + + Regression for: ``remove_pid_file()`` defensively refuses to delete a + PID file whose pid != ``os.getpid()`` to protect ``--replace`` + handoffs. Stale-cleanup must not go through that path or real + crashed-process PID files never get removed. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + pid_path = tmp_path / "gateway.pid" + lock_path = tmp_path / "gateway.lock" + + # PID that is guaranteed not alive and not our own. + dead_foreign_pid = 999999 + assert dead_foreign_pid != os.getpid() + + pid_path.write_text(json.dumps({ + "pid": dead_foreign_pid, + "kind": "hermes-gateway", + "argv": ["python", "-m", "hermes_cli.main", "gateway"], + "start_time": 123, + })) + lock_path.write_text(json.dumps({ + "pid": dead_foreign_pid, + "kind": "hermes-gateway", + "argv": ["python", "-m", "hermes_cli.main", "gateway"], + "start_time": 123, + })) + + # No live lock holder → get_running_pid should clean both files. + assert status.get_running_pid() is None + assert not pid_path.exists() + assert not lock_path.exists() + def test_get_running_pid_falls_back_to_live_lock_record(self, tmp_path, monkeypatch): monkeypatch.setenv("HERMES_HOME", str(tmp_path)) pid_path = tmp_path / "gateway.pid"