From f9559c39c4ee7cc7c40f79efb37a6530b2bf0e0e Mon Sep 17 00:00:00 2001 From: EloquentBrush <147827411+EloquentBrush@users.noreply.github.com> Date: Wed, 13 May 2026 02:15:57 +0300 Subject: [PATCH] fix(gateway): consult lock record argv when cmdline unreadable in scoped-lock stale check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #24500 introduced stale-lock detection that calls `_looks_like_gateway_process` to confirm a running PID is not an unrelated process that reused the slot. On Windows neither `/proc` nor `ps` is available, so `_read_process_cmdline` always returns `None` and `_looks_like_gateway_process` always returns `False` — causing every valid Windows gateway lock to be marked stale and immediately evicted. Fix: after `_looks_like_gateway_process` returns `False`, call `_read_process_cmdline` directly. If the result is non-`None` the live cmdline was readable and confirms the PID is foreign → stale. If it is `None` (cmdline unreadable, e.g. Windows without ps), fall back to `_record_looks_like_gateway` which validates the stored `argv` the gateway wrote into the lock file at startup. Both oracles must say "not a gateway" before the lock is evicted — the same two-oracle pattern already used in `get_running_pid` (line 941). Adds a regression test that simulates a Windows host where `_looks_like_gateway_process` returns `False` for every PID and `_read_process_cmdline` returns `None`, confirming the lock is kept when the record's argv identifies it as a gateway process. --- gateway/status.py | 11 ++++++++--- tests/gateway/test_status.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/gateway/status.py b/gateway/status.py index 0cc8abddb47..3c619856025 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -613,15 +613,20 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, stale = True # When start_time comparison is unavailable (macOS / Windows # have no /proc, so both sides are None), fall back to - # checking the live process command line. If the PID was - # reused by an unrelated process the lock is stale. + # checking the live process command line. When cmdline is + # also unreadable (Windows has no ps), consult the lock + # record's own argv — the gateway writes it at startup and + # it's the only identity signal on platforms without ps. + # Both oracles must indicate "not a gateway" to mark stale. if ( not stale and existing.get("start_time") is None and current_start is None and not _looks_like_gateway_process(existing_pid) ): - stale = True + live_cmdline = _read_process_cmdline(existing_pid) + if live_cmdline is not None or not _record_looks_like_gateway(existing): + stale = True # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped # processes still appear alive to _pid_exists but are not # actually running. Treat them as stale so --replace works. diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 91a52104ded..b92c0cd4d11 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -468,6 +468,9 @@ class TestScopedLocks: monkeypatch.setattr(status, "_pid_exists", lambda pid: True) monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False) + # On macOS ``ps`` is available, so _read_process_cmdline returns the + # unrelated process's name. This confirms the PID was reused. + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/libexec/bluetoothuserd") acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) @@ -476,6 +479,37 @@ class TestScopedLocks: assert payload["pid"] == os.getpid() assert payload["metadata"]["platform"] == "telegram" + def test_acquire_scoped_lock_keeps_lock_when_cmdline_unreadable_but_record_is_gateway(self, tmp_path, monkeypatch): + """Windows regression: ps unavailable so cmdline cannot be read. + + When start_time is None on both sides and _looks_like_gateway_process + returns False because ps is missing (not because the PID belongs to an + unrelated process), the stale check must not delete a valid gateway + lock. Fall back to the lock record's own argv — written by the + gateway at startup — before declaring the lock stale. + """ + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 99999, + "start_time": None, + "kind": "hermes-gateway", + "argv": ["hermes_cli/main.py", "gateway", "run"], + })) + + monkeypatch.setattr(status, "_pid_exists", lambda pid: True) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + # Windows: ps not available, so _read_process_cmdline returns None + # and _looks_like_gateway_process returns False for every process. + monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False) + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None) + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is False + assert existing["pid"] == 99999 + def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch): """When start_time is None but the live PID still looks like a gateway, keep the lock.""" monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))