mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(gateway): consult lock record argv when cmdline unreadable in scoped-lock stale check
PR #24500 introduced stale-lock detection that calls `_looks_like_gateway_process` to confirm a running PID is not an unrelated process that reused the slot. On Windows neither `/proc` nor `ps` is available, so `_read_process_cmdline` always returns `None` and `_looks_like_gateway_process` always returns `False` — causing every valid Windows gateway lock to be marked stale and immediately evicted. Fix: after `_looks_like_gateway_process` returns `False`, call `_read_process_cmdline` directly. If the result is non-`None` the live cmdline was readable and confirms the PID is foreign → stale. If it is `None` (cmdline unreadable, e.g. Windows without ps), fall back to `_record_looks_like_gateway` which validates the stored `argv` the gateway wrote into the lock file at startup. Both oracles must say "not a gateway" before the lock is evicted — the same two-oracle pattern already used in `get_running_pid` (line 941). Adds a regression test that simulates a Windows host where `_looks_like_gateway_process` returns `False` for every PID and `_read_process_cmdline` returns `None`, confirming the lock is kept when the record's argv identifies it as a gateway process.
This commit is contained in:
parent
24e2151cd6
commit
f9559c39c4
2 changed files with 42 additions and 3 deletions
|
|
@ -613,15 +613,20 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
||||||
stale = True
|
stale = True
|
||||||
# When start_time comparison is unavailable (macOS / Windows
|
# When start_time comparison is unavailable (macOS / Windows
|
||||||
# have no /proc, so both sides are None), fall back to
|
# have no /proc, so both sides are None), fall back to
|
||||||
# checking the live process command line. If the PID was
|
# checking the live process command line. When cmdline is
|
||||||
# reused by an unrelated process the lock is stale.
|
# also unreadable (Windows has no ps), consult the lock
|
||||||
|
# record's own argv — the gateway writes it at startup and
|
||||||
|
# it's the only identity signal on platforms without ps.
|
||||||
|
# Both oracles must indicate "not a gateway" to mark stale.
|
||||||
if (
|
if (
|
||||||
not stale
|
not stale
|
||||||
and existing.get("start_time") is None
|
and existing.get("start_time") is None
|
||||||
and current_start is None
|
and current_start is None
|
||||||
and not _looks_like_gateway_process(existing_pid)
|
and not _looks_like_gateway_process(existing_pid)
|
||||||
):
|
):
|
||||||
stale = True
|
live_cmdline = _read_process_cmdline(existing_pid)
|
||||||
|
if live_cmdline is not None or not _record_looks_like_gateway(existing):
|
||||||
|
stale = True
|
||||||
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
||||||
# processes still appear alive to _pid_exists but are not
|
# processes still appear alive to _pid_exists but are not
|
||||||
# actually running. Treat them as stale so --replace works.
|
# actually running. Treat them as stale so --replace works.
|
||||||
|
|
|
||||||
|
|
@ -468,6 +468,9 @@ class TestScopedLocks:
|
||||||
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||||
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
|
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
|
||||||
|
# On macOS ``ps`` is available, so _read_process_cmdline returns the
|
||||||
|
# unrelated process's name. This confirms the PID was reused.
|
||||||
|
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/libexec/bluetoothuserd")
|
||||||
|
|
||||||
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
||||||
|
|
||||||
|
|
@ -476,6 +479,37 @@ class TestScopedLocks:
|
||||||
assert payload["pid"] == os.getpid()
|
assert payload["pid"] == os.getpid()
|
||||||
assert payload["metadata"]["platform"] == "telegram"
|
assert payload["metadata"]["platform"] == "telegram"
|
||||||
|
|
||||||
|
def test_acquire_scoped_lock_keeps_lock_when_cmdline_unreadable_but_record_is_gateway(self, tmp_path, monkeypatch):
|
||||||
|
"""Windows regression: ps unavailable so cmdline cannot be read.
|
||||||
|
|
||||||
|
When start_time is None on both sides and _looks_like_gateway_process
|
||||||
|
returns False because ps is missing (not because the PID belongs to an
|
||||||
|
unrelated process), the stale check must not delete a valid gateway
|
||||||
|
lock. Fall back to the lock record's own argv — written by the
|
||||||
|
gateway at startup — before declaring the lock stale.
|
||||||
|
"""
|
||||||
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||||
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
||||||
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
lock_path.write_text(json.dumps({
|
||||||
|
"pid": 99999,
|
||||||
|
"start_time": None,
|
||||||
|
"kind": "hermes-gateway",
|
||||||
|
"argv": ["hermes_cli/main.py", "gateway", "run"],
|
||||||
|
}))
|
||||||
|
|
||||||
|
monkeypatch.setattr(status, "_pid_exists", lambda pid: True)
|
||||||
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||||
|
# Windows: ps not available, so _read_process_cmdline returns None
|
||||||
|
# and _looks_like_gateway_process returns False for every process.
|
||||||
|
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
|
||||||
|
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
|
||||||
|
|
||||||
|
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
||||||
|
|
||||||
|
assert acquired is False
|
||||||
|
assert existing["pid"] == 99999
|
||||||
|
|
||||||
def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch):
|
def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch):
|
||||||
"""When start_time is None but the live PID still looks like a gateway, keep the lock."""
|
"""When start_time is None but the live PID still looks like a gateway, keep the lock."""
|
||||||
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue