fix(gateway): evict scoped lock when PID+start_time match but process is not a gateway

On Linux, systemd spawns core services (cron, nginx, sshd) with
deterministic PIDs and jiffy start_times across reboots. A service can
land on the exact same PID and start_time as a previous gateway, causing
acquire_scoped_lock to mistake it for a live gateway and block startup.

The existing stale-detection paths only covered:
  - start_times both non-None and different (clear mismatch)
  - start_times both None (macOS/Windows fallback to cmdline check)

The boot-time collision falls through both: times are non-None and
equal, so neither branch fired.

Add a third check: when both start_times are known and match but the
live process fails _looks_like_gateway_process, read its cmdline. If
the cmdline is readable (non-None), we have positive evidence of an
impostor and mark the lock stale. Requiring a readable cmdline keeps the
check conservative — if cmdline is unreadable we do not evict.
This commit is contained in:
Tharushka Dinujaya 2026-06-15 16:48:14 +05:30 committed by Teknium
parent a376ca0081
commit ec05d2bc3e
2 changed files with 45 additions and 0 deletions

View file

@ -643,6 +643,21 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
live_cmdline = _read_process_cmdline(existing_pid)
if live_cmdline is not None or not _record_looks_like_gateway(existing):
stale = True
# Secondary defence against boot-time PID+start_time collisions:
# systemd spawns core services deterministically, so an unrelated
# process (e.g. cron) can land on the exact same PID and jiffy
# count as a previous gateway. If both start_times are known and
# match but the live process is not a gateway, and we can confirm
# that by reading its cmdline, the lock is stale.
if (
not stale
and existing.get("start_time") is not None
and current_start is not None
and not _looks_like_gateway_process(existing_pid)
):
live_cmdline = _read_process_cmdline(existing_pid)
if live_cmdline is not None:
stale = True
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
# processes still appear alive to _pid_exists but are not
# actually running. Treat them as stale so --replace works.

View file

@ -636,6 +636,36 @@ class TestScopedLocks:
assert removed == 0
assert reused_pid_lock.exists()
def test_acquire_scoped_lock_replaces_reused_pid_even_with_matching_start_time(self, tmp_path, monkeypatch):
"""Regression: boot-time PID+start_time collision must not block gateway startup.
On Linux, systemd assigns PIDs and jiffy start_times deterministically
across reboots. A core service (e.g. cron) can land on the exact same
PID and start_time as a previous gateway. The start_time check passes,
but the live process is not a gateway the lock must be evicted.
"""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 840,
"start_time": 123,
"kind": "hermes-gateway",
"argv": ["/usr/bin/python", "-m", "hermes_cli.main", "gateway", "run"],
}))
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/sbin/nginx")
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
assert acquired is True
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
assert payload["metadata"]["platform"] == "telegram"
class TestTakeoverMarker:
"""Tests for the --replace takeover marker.