From ec05d2bc3eb343968b9c2b1fc04b8195d48de40b Mon Sep 17 00:00:00 2001 From: Tharushka Dinujaya Date: Mon, 15 Jun 2026 16:48:14 +0530 Subject: [PATCH] fix(gateway): evict scoped lock when PID+start_time match but process is not a gateway MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Linux, systemd spawns core services (cron, nginx, sshd) with deterministic PIDs and jiffy start_times across reboots. A service can land on the exact same PID and start_time as a previous gateway, causing acquire_scoped_lock to mistake it for a live gateway and block startup. The existing stale-detection paths only covered: - start_times both non-None and different (clear mismatch) - start_times both None (macOS/Windows fallback to cmdline check) The boot-time collision falls through both: times are non-None and equal, so neither branch fired. Add a third check: when both start_times are known and match but the live process fails _looks_like_gateway_process, read its cmdline. If the cmdline is readable (non-None), we have positive evidence of an impostor and mark the lock stale. Requiring a readable cmdline keeps the check conservative — if cmdline is unreadable we do not evict. --- gateway/status.py | 15 +++++++++++++++ tests/gateway/test_status.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/gateway/status.py b/gateway/status.py index 8d2640af0f8..a49999e712f 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -643,6 +643,21 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, live_cmdline = _read_process_cmdline(existing_pid) if live_cmdline is not None or not _record_looks_like_gateway(existing): stale = True + # Secondary defence against boot-time PID+start_time collisions: + # systemd spawns core services deterministically, so an unrelated + # process (e.g. cron) can land on the exact same PID and jiffy + # count as a previous gateway. If both start_times are known and + # match but the live process is not a gateway, and we can confirm + # that by reading its cmdline, the lock is stale. + if ( + not stale + and existing.get("start_time") is not None + and current_start is not None + and not _looks_like_gateway_process(existing_pid) + ): + live_cmdline = _read_process_cmdline(existing_pid) + if live_cmdline is not None: + stale = True # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped # processes still appear alive to _pid_exists but are not # actually running. Treat them as stale so --replace works. diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index bbf9d95709e..e8d2f57485c 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -636,6 +636,36 @@ class TestScopedLocks: assert removed == 0 assert reused_pid_lock.exists() + def test_acquire_scoped_lock_replaces_reused_pid_even_with_matching_start_time(self, tmp_path, monkeypatch): + """Regression: boot-time PID+start_time collision must not block gateway startup. + + On Linux, systemd assigns PIDs and jiffy start_times deterministically + across reboots. A core service (e.g. cron) can land on the exact same + PID and start_time as a previous gateway. The start_time check passes, + but the live process is not a gateway — the lock must be evicted. + """ + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 840, + "start_time": 123, + "kind": "hermes-gateway", + "argv": ["/usr/bin/python", "-m", "hermes_cli.main", "gateway", "run"], + })) + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123) + monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False) + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/sbin/nginx") + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is True + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + assert payload["metadata"]["platform"] == "telegram" + class TestTakeoverMarker: """Tests for the --replace takeover marker.