fix(gateway): also unlink stale PID + lock files on cleanup

Follow-up for salvaged PR #14179.

`_cleanup_invalid_pid_path` previously called `remove_pid_file()` for the
default PID path, but that helper defensively refuses to delete a PID file
whose pid field differs from `os.getpid()` (to protect --replace handoffs).
Every realistic stale-PID scenario is exactly that case: a crashed/Ctrl+C'd
gateway left behind a PID file owned by a now-dead foreign PID.

Once `get_running_pid()` has confirmed the runtime lock is inactive, the
on-disk metadata is known to belong to a dead process, so we can force-unlink
both the PID file and the sibling `gateway.lock` directly instead of going
through the defensive helper.

Also adds a regression test with a dead foreign PID that would have failed
against the previous cleanup logic.
This commit is contained in:
Teknium 2026-04-22 16:33:27 -07:00 committed by Teknium
parent b52123eb15
commit 402d048eb6
2 changed files with 48 additions and 4 deletions

View file

@ -241,13 +241,23 @@ def _pid_from_record(record: Optional[dict[str, Any]]) -> Optional[int]:
def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None:
"""Delete a stale gateway PID file (and its sibling lock metadata).
Called from ``get_running_pid()`` after the runtime lock has already been
confirmed inactive, so the on-disk metadata is known to belong to a dead
process. Unlike ``remove_pid_file()`` (which defensively refuses to delete
a PID file whose ``pid`` field differs from ``os.getpid()`` to protect
``--replace`` handoffs), this path force-unlinks both files so the next
startup sees a clean slate.
"""
if not cleanup_stale:
return
try:
if pid_path == _get_pid_path():
remove_pid_file()
else:
pid_path.unlink(missing_ok=True)
pid_path.unlink(missing_ok=True)
except Exception:
pass
try:
_get_gateway_lock_path(pid_path).unlink(missing_ok=True)
except Exception:
pass

View file

@ -150,6 +150,40 @@ class TestGatewayPidState:
assert status.get_running_pid() is None
assert not pid_path.exists()
def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch):
"""Stale PID file from a *different* PID (crashed process) must still be cleaned.
Regression for: ``remove_pid_file()`` defensively refuses to delete a
PID file whose pid != ``os.getpid()`` to protect ``--replace``
handoffs. Stale-cleanup must not go through that path or real
crashed-process PID files never get removed.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
pid_path = tmp_path / "gateway.pid"
lock_path = tmp_path / "gateway.lock"
# PID that is guaranteed not alive and not our own.
dead_foreign_pid = 999999
assert dead_foreign_pid != os.getpid()
pid_path.write_text(json.dumps({
"pid": dead_foreign_pid,
"kind": "hermes-gateway",
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
"start_time": 123,
}))
lock_path.write_text(json.dumps({
"pid": dead_foreign_pid,
"kind": "hermes-gateway",
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
"start_time": 123,
}))
# No live lock holder → get_running_pid should clean both files.
assert status.get_running_pid() is None
assert not pid_path.exists()
assert not lock_path.exists()
def test_get_running_pid_falls_back_to_live_lock_record(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
pid_path = tmp_path / "gateway.pid"