mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(gateway): also unlink stale PID + lock files on cleanup
Follow-up for salvaged PR #14179. `_cleanup_invalid_pid_path` previously called `remove_pid_file()` for the default PID path, but that helper defensively refuses to delete a PID file whose pid field differs from `os.getpid()` (to protect --replace handoffs). Every realistic stale-PID scenario is exactly that case: a crashed/Ctrl+C'd gateway left behind a PID file owned by a now-dead foreign PID. Once `get_running_pid()` has confirmed the runtime lock is inactive, the on-disk metadata is known to belong to a dead process, so we can force-unlink both the PID file and the sibling `gateway.lock` directly instead of going through the defensive helper. Also adds a regression test with a dead foreign PID that would have failed against the previous cleanup logic.
This commit is contained in:
parent
b52123eb15
commit
402d048eb6
2 changed files with 48 additions and 4 deletions
|
|
@ -241,13 +241,23 @@ def _pid_from_record(record: Optional[dict[str, Any]]) -> Optional[int]:
|
|||
|
||||
|
||||
def _cleanup_invalid_pid_path(pid_path: Path, *, cleanup_stale: bool) -> None:
|
||||
"""Delete a stale gateway PID file (and its sibling lock metadata).
|
||||
|
||||
Called from ``get_running_pid()`` after the runtime lock has already been
|
||||
confirmed inactive, so the on-disk metadata is known to belong to a dead
|
||||
process. Unlike ``remove_pid_file()`` (which defensively refuses to delete
|
||||
a PID file whose ``pid`` field differs from ``os.getpid()`` to protect
|
||||
``--replace`` handoffs), this path force-unlinks both files so the next
|
||||
startup sees a clean slate.
|
||||
"""
|
||||
if not cleanup_stale:
|
||||
return
|
||||
try:
|
||||
if pid_path == _get_pid_path():
|
||||
remove_pid_file()
|
||||
else:
|
||||
pid_path.unlink(missing_ok=True)
|
||||
pid_path.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
_get_gateway_lock_path(pid_path).unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
|
|||
|
|
@ -150,6 +150,40 @@ class TestGatewayPidState:
|
|||
assert status.get_running_pid() is None
|
||||
assert not pid_path.exists()
|
||||
|
||||
def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch):
|
||||
"""Stale PID file from a *different* PID (crashed process) must still be cleaned.
|
||||
|
||||
Regression for: ``remove_pid_file()`` defensively refuses to delete a
|
||||
PID file whose pid != ``os.getpid()`` to protect ``--replace``
|
||||
handoffs. Stale-cleanup must not go through that path or real
|
||||
crashed-process PID files never get removed.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
pid_path = tmp_path / "gateway.pid"
|
||||
lock_path = tmp_path / "gateway.lock"
|
||||
|
||||
# PID that is guaranteed not alive and not our own.
|
||||
dead_foreign_pid = 999999
|
||||
assert dead_foreign_pid != os.getpid()
|
||||
|
||||
pid_path.write_text(json.dumps({
|
||||
"pid": dead_foreign_pid,
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
|
||||
"start_time": 123,
|
||||
}))
|
||||
lock_path.write_text(json.dumps({
|
||||
"pid": dead_foreign_pid,
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["python", "-m", "hermes_cli.main", "gateway"],
|
||||
"start_time": 123,
|
||||
}))
|
||||
|
||||
# No live lock holder → get_running_pid should clean both files.
|
||||
assert status.get_running_pid() is None
|
||||
assert not pid_path.exists()
|
||||
assert not lock_path.exists()
|
||||
|
||||
def test_get_running_pid_falls_back_to_live_lock_record(self, tmp_path, monkeypatch):
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
pid_path = tmp_path / "gateway.pid"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue