fix(gateway): consult lock record argv when cmdline unreadable in scoped-lock stale check

PR #24500 introduced stale-lock detection that calls
`_looks_like_gateway_process` to confirm a running PID is not an
unrelated process that reused the slot.  On Windows neither `/proc`
nor `ps` is available, so `_read_process_cmdline` always returns
`None` and `_looks_like_gateway_process` always returns `False` —
causing every valid Windows gateway lock to be marked stale and
immediately evicted.

Fix: after `_looks_like_gateway_process` returns `False`, call
`_read_process_cmdline` directly.  If the result is non-`None` the
live cmdline was readable and confirms the PID is foreign → stale.
If it is `None` (cmdline unreadable, e.g. Windows without ps), fall
back to `_record_looks_like_gateway` which validates the stored
`argv` the gateway wrote into the lock file at startup.  Both
oracles must say "not a gateway" before the lock is evicted — the
same two-oracle pattern already used in `get_running_pid` (line 941).

Adds a regression test that simulates a Windows host where
`_looks_like_gateway_process` returns `False` for every PID and
`_read_process_cmdline` returns `None`, confirming the lock is kept
when the record's argv identifies it as a gateway process.
This commit is contained in:
EloquentBrush 2026-05-13 02:15:57 +03:00 committed by Teknium
parent 24e2151cd6
commit f9559c39c4
2 changed files with 42 additions and 3 deletions

View file

@ -613,15 +613,20 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
stale = True stale = True
# When start_time comparison is unavailable (macOS / Windows # When start_time comparison is unavailable (macOS / Windows
# have no /proc, so both sides are None), fall back to # have no /proc, so both sides are None), fall back to
# checking the live process command line. If the PID was # checking the live process command line. When cmdline is
# reused by an unrelated process the lock is stale. # also unreadable (Windows has no ps), consult the lock
# record's own argv — the gateway writes it at startup and
# it's the only identity signal on platforms without ps.
# Both oracles must indicate "not a gateway" to mark stale.
if ( if (
not stale not stale
and existing.get("start_time") is None and existing.get("start_time") is None
and current_start is None and current_start is None
and not _looks_like_gateway_process(existing_pid) and not _looks_like_gateway_process(existing_pid)
): ):
stale = True live_cmdline = _read_process_cmdline(existing_pid)
if live_cmdline is not None or not _record_looks_like_gateway(existing):
stale = True
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
# processes still appear alive to _pid_exists but are not # processes still appear alive to _pid_exists but are not
# actually running. Treat them as stale so --replace works. # actually running. Treat them as stale so --replace works.

View file

@ -468,6 +468,9 @@ class TestScopedLocks:
monkeypatch.setattr(status, "_pid_exists", lambda pid: True) monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False) monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
# On macOS ``ps`` is available, so _read_process_cmdline returns the
# unrelated process's name. This confirms the PID was reused.
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/libexec/bluetoothuserd")
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
@ -476,6 +479,37 @@ class TestScopedLocks:
assert payload["pid"] == os.getpid() assert payload["pid"] == os.getpid()
assert payload["metadata"]["platform"] == "telegram" assert payload["metadata"]["platform"] == "telegram"
def test_acquire_scoped_lock_keeps_lock_when_cmdline_unreadable_but_record_is_gateway(self, tmp_path, monkeypatch):
"""Windows regression: ps unavailable so cmdline cannot be read.
When start_time is None on both sides and _looks_like_gateway_process
returns False because ps is missing (not because the PID belongs to an
unrelated process), the stale check must not delete a valid gateway
lock. Fall back to the lock record's own argv — written by the
gateway at startup before declaring the lock stale.
"""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 99999,
"start_time": None,
"kind": "hermes-gateway",
"argv": ["hermes_cli/main.py", "gateway", "run"],
}))
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
# Windows: ps not available, so _read_process_cmdline returns None
# and _looks_like_gateway_process returns False for every process.
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
assert acquired is False
assert existing["pid"] == 99999
def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch): def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch):
"""When start_time is None but the live PID still looks like a gateway, keep the lock.""" """When start_time is None but the live PID still looks like a gateway, keep the lock."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))