mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-17 09:41:58 +00:00
fix(gateway): evict scoped lock when PID+start_time match but process is not a gateway
On Linux, systemd spawns core services (cron, nginx, sshd) with deterministic PIDs and jiffy start_times across reboots. A service can land on the exact same PID and start_time as a previous gateway, causing acquire_scoped_lock to mistake it for a live gateway and block startup. The existing stale-detection paths only covered: - start_times both non-None and different (clear mismatch) - start_times both None (macOS/Windows fallback to cmdline check) The boot-time collision falls through both: times are non-None and equal, so neither branch fired. Add a third check: when both start_times are known and match but the live process fails _looks_like_gateway_process, read its cmdline. If the cmdline is readable (non-None), we have positive evidence of an impostor and mark the lock stale. Requiring a readable cmdline keeps the check conservative — if cmdline is unreadable we do not evict.
This commit is contained in:
parent
a376ca0081
commit
ec05d2bc3e
2 changed files with 45 additions and 0 deletions
|
|
@ -643,6 +643,21 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
|||
live_cmdline = _read_process_cmdline(existing_pid)
|
||||
if live_cmdline is not None or not _record_looks_like_gateway(existing):
|
||||
stale = True
|
||||
# Secondary defence against boot-time PID+start_time collisions:
|
||||
# systemd spawns core services deterministically, so an unrelated
|
||||
# process (e.g. cron) can land on the exact same PID and jiffy
|
||||
# count as a previous gateway. If both start_times are known and
|
||||
# match but the live process is not a gateway, and we can confirm
|
||||
# that by reading its cmdline, the lock is stale.
|
||||
if (
|
||||
not stale
|
||||
and existing.get("start_time") is not None
|
||||
and current_start is not None
|
||||
and not _looks_like_gateway_process(existing_pid)
|
||||
):
|
||||
live_cmdline = _read_process_cmdline(existing_pid)
|
||||
if live_cmdline is not None:
|
||||
stale = True
|
||||
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
||||
# processes still appear alive to _pid_exists but are not
|
||||
# actually running. Treat them as stale so --replace works.
|
||||
|
|
|
|||
|
|
@ -636,6 +636,36 @@ class TestScopedLocks:
|
|||
assert removed == 0
|
||||
assert reused_pid_lock.exists()
|
||||
|
||||
def test_acquire_scoped_lock_replaces_reused_pid_even_with_matching_start_time(self, tmp_path, monkeypatch):
|
||||
"""Regression: boot-time PID+start_time collision must not block gateway startup.
|
||||
|
||||
On Linux, systemd assigns PIDs and jiffy start_times deterministically
|
||||
across reboots. A core service (e.g. cron) can land on the exact same
|
||||
PID and start_time as a previous gateway. The start_time check passes,
|
||||
but the live process is not a gateway — the lock must be evicted.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
||||
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
lock_path.write_text(json.dumps({
|
||||
"pid": 840,
|
||||
"start_time": 123,
|
||||
"kind": "hermes-gateway",
|
||||
"argv": ["/usr/bin/python", "-m", "hermes_cli.main", "gateway", "run"],
|
||||
}))
|
||||
|
||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
|
||||
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
|
||||
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/sbin/nginx")
|
||||
|
||||
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
||||
|
||||
assert acquired is True
|
||||
payload = json.loads(lock_path.read_text())
|
||||
assert payload["pid"] == os.getpid()
|
||||
assert payload["metadata"]["platform"] == "telegram"
|
||||
|
||||
|
||||
class TestTakeoverMarker:
|
||||
"""Tests for the --replace takeover marker.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue