diff --git a/gateway/status.py b/gateway/status.py index 8d2640af0f8..a49999e712f 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -643,6 +643,21 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, live_cmdline = _read_process_cmdline(existing_pid) if live_cmdline is not None or not _record_looks_like_gateway(existing): stale = True + # Secondary defence against boot-time PID+start_time collisions: + # systemd spawns core services deterministically, so an unrelated + # process (e.g. cron) can land on the exact same PID and jiffy + # count as a previous gateway. If both start_times are known and + # match but the live process is not a gateway, and we can confirm + # that by reading its cmdline, the lock is stale. + if ( + not stale + and existing.get("start_time") is not None + and current_start is not None + and not _looks_like_gateway_process(existing_pid) + ): + live_cmdline = _read_process_cmdline(existing_pid) + if live_cmdline is not None: + stale = True # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped # processes still appear alive to _pid_exists but are not # actually running. Treat them as stale so --replace works. diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index bbf9d95709e..e8d2f57485c 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -636,6 +636,36 @@ class TestScopedLocks: assert removed == 0 assert reused_pid_lock.exists() + def test_acquire_scoped_lock_replaces_reused_pid_even_with_matching_start_time(self, tmp_path, monkeypatch): + """Regression: boot-time PID+start_time collision must not block gateway startup. + + On Linux, systemd assigns PIDs and jiffy start_times deterministically + across reboots. A core service (e.g. cron) can land on the exact same + PID and start_time as a previous gateway. The start_time check passes, + but the live process is not a gateway — the lock must be evicted. + """ + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 840, + "start_time": 123, + "kind": "hermes-gateway", + "argv": ["/usr/bin/python", "-m", "hermes_cli.main", "gateway", "run"], + })) + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123) + monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False) + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/sbin/nginx") + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is True + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + assert payload["metadata"]["platform"] == "telegram" + class TestTakeoverMarker: """Tests for the --replace takeover marker.