mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-25 05:52:34 +00:00
fix(gateway): detect stale scoped locks via cmdline when start_time is absent on macOS
On macOS (and Windows), /proc is unavailable so _get_process_start_time()
always returns None. When a gateway creates a scoped lock record with
start_time=None and then exits, macOS can reuse that PID for an unrelated
process. On restart, acquire_scoped_lock() sees:
1. os.kill(pid, 0) succeeds (PID is alive — but it's bluetoothuserd, not
the gateway)
2. existing.start_time is None and current_start is None, so the
start_time comparison is inconclusive
3. The lock is treated as active, blocking gateway startup with:
"Telegram bot token already in use (PID 873). Stop the other gateway
first."
Root cause: _read_process_cmdline() only reads /proc/<pid>/cmdline, which
doesn't exist on macOS. It always returns None, making
_looks_like_gateway_process() always return False, so the cmdline fallback
path in acquire_scoped_lock() was unreachable on macOS.
Fix (two parts):
1. _read_process_cmdline(): Add a ps(1) fallback for platforms without
/proc. When /proc/<pid>/cmdline doesn't exist, we now run
"ps -p <pid> -o command=" to retrieve the process command line. The
/proc path is tried first (preserving Linux performance); ps is only
invoked as a fallback.
2. acquire_scoped_lock(): When both the lock record's start_time and the
live process's start_time are None (the macOS case), fall back to
checking whether the live PID still looks like a Hermes gateway process
via _looks_like_gateway_process(). If it doesn't, the lock is stale.
Closes #16376
This commit is contained in:
parent
642768c5c7
commit
653d304290
2 changed files with 126 additions and 5 deletions
|
|
@ -124,16 +124,33 @@ def get_process_start_time(pid: int) -> Optional[int]:
|
||||||
|
|
||||||
|
|
||||||
def _read_process_cmdline(pid: int) -> Optional[str]:
|
def _read_process_cmdline(pid: int) -> Optional[str]:
|
||||||
"""Return the process command line as a space-separated string."""
|
"""Return the process command line as a space-separated string.
|
||||||
|
|
||||||
|
On Linux, reads /proc/<pid>/cmdline directly. On macOS and other
|
||||||
|
platforms without /proc, falls back to ``ps -p <pid> -o command=``.
|
||||||
|
"""
|
||||||
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
||||||
try:
|
try:
|
||||||
raw = cmdline_path.read_bytes()
|
raw = cmdline_path.read_bytes()
|
||||||
except (FileNotFoundError, PermissionError, OSError):
|
except (FileNotFoundError, PermissionError, OSError):
|
||||||
return None
|
pass
|
||||||
|
else:
|
||||||
|
if raw:
|
||||||
|
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
||||||
|
|
||||||
if not raw:
|
try:
|
||||||
return None
|
result = subprocess.run(
|
||||||
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
["ps", "-p", str(pid), "-o", "command="],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout.strip():
|
||||||
|
return result.stdout.strip()
|
||||||
|
except (OSError, subprocess.TimeoutExpired):
|
||||||
|
pass
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _looks_like_gateway_process(pid: int) -> bool:
|
def _looks_like_gateway_process(pid: int) -> bool:
|
||||||
|
|
@ -594,6 +611,17 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
|
||||||
and current_start != existing.get("start_time")
|
and current_start != existing.get("start_time")
|
||||||
):
|
):
|
||||||
stale = True
|
stale = True
|
||||||
|
# When start_time comparison is unavailable (macOS / Windows
|
||||||
|
# have no /proc, so both sides are None), fall back to
|
||||||
|
# checking the live process command line. If the PID was
|
||||||
|
# reused by an unrelated process the lock is stale.
|
||||||
|
if (
|
||||||
|
not stale
|
||||||
|
and existing.get("start_time") is None
|
||||||
|
and current_start is None
|
||||||
|
and not _looks_like_gateway_process(existing_pid)
|
||||||
|
):
|
||||||
|
stale = True
|
||||||
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
# Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped
|
||||||
# processes still appear alive to _pid_exists but are not
|
# processes still appear alive to _pid_exists but are not
|
||||||
# actually running. Treat them as stale so --replace works.
|
# actually running. Treat them as stale so --replace works.
|
||||||
|
|
|
||||||
|
|
@ -444,6 +444,56 @@ class TestScopedLocks:
|
||||||
assert acquired is False
|
assert acquired is False
|
||||||
assert existing["pid"] == 99999
|
assert existing["pid"] == 99999
|
||||||
|
|
||||||
|
def test_acquire_scoped_lock_replaces_pid_reused_by_unrelated_process(self, tmp_path, monkeypatch):
|
||||||
|
"""macOS regression: PID reused by an unrelated process with start_time=None.
|
||||||
|
|
||||||
|
On macOS /proc is unavailable, so both the lock record and the live
|
||||||
|
process report start_time=None. The live PID is alive (os.kill
|
||||||
|
succeeds) but belongs to a completely different program. The lock
|
||||||
|
must be treated as stale.
|
||||||
|
"""
|
||||||
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||||
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
||||||
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
lock_path.write_text(json.dumps({
|
||||||
|
"pid": 873,
|
||||||
|
"start_time": None,
|
||||||
|
"kind": "hermes-gateway",
|
||||||
|
"argv": ["/Users/user/.hermes/hermes-agent/hermes_cli/main.py", "gateway", "run", "--replace"],
|
||||||
|
}))
|
||||||
|
|
||||||
|
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
|
||||||
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||||
|
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False)
|
||||||
|
|
||||||
|
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
||||||
|
|
||||||
|
assert acquired is True
|
||||||
|
payload = json.loads(lock_path.read_text())
|
||||||
|
assert payload["pid"] == os.getpid()
|
||||||
|
assert payload["metadata"]["platform"] == "telegram"
|
||||||
|
|
||||||
|
def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch):
|
||||||
|
"""When start_time is None but the live PID still looks like a gateway, keep the lock."""
|
||||||
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||||
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
||||||
|
lock_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
lock_path.write_text(json.dumps({
|
||||||
|
"pid": 99999,
|
||||||
|
"start_time": None,
|
||||||
|
"kind": "hermes-gateway",
|
||||||
|
"argv": ["/Users/user/.hermes/hermes-agent/hermes_cli/main.py", "gateway", "run", "--replace"],
|
||||||
|
}))
|
||||||
|
|
||||||
|
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
|
||||||
|
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
|
||||||
|
monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: True)
|
||||||
|
|
||||||
|
acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"})
|
||||||
|
|
||||||
|
assert acquired is False
|
||||||
|
assert existing["pid"] == 99999
|
||||||
|
|
||||||
def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch):
|
def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch):
|
||||||
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
|
||||||
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock"
|
||||||
|
|
@ -811,3 +861,46 @@ class TestPlannedStopMarker:
|
||||||
ok = status.write_planned_stop_marker(target_pid=12345)
|
ok = status.write_planned_stop_marker(target_pid=12345)
|
||||||
|
|
||||||
assert ok is False
|
assert ok is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestReadProcessCmdlinePsFallback:
|
||||||
|
"""Tests for _read_process_cmdline falling back to ps on non-Linux."""
|
||||||
|
|
||||||
|
def test_ps_fallback_when_proc_unavailable(self, monkeypatch):
|
||||||
|
monkeypatch.setattr(status.Path, "read_bytes", lambda self: (_ for _ in ()).throw(FileNotFoundError))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
status.subprocess, "run",
|
||||||
|
lambda args, **kwargs: SimpleNamespace(returncode=0, stdout="/usr/libexec/bluetoothuserd\n"),
|
||||||
|
)
|
||||||
|
result = status._read_process_cmdline(873)
|
||||||
|
assert result == "/usr/libexec/bluetoothuserd"
|
||||||
|
|
||||||
|
def test_ps_fallback_returns_none_on_failure(self, monkeypatch):
|
||||||
|
monkeypatch.setattr(status.Path, "read_bytes", lambda self: (_ for _ in ()).throw(FileNotFoundError))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
status.subprocess, "run",
|
||||||
|
lambda args, **kwargs: SimpleNamespace(returncode=1, stdout=""),
|
||||||
|
)
|
||||||
|
result = status._read_process_cmdline(99999)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
def test_proc_cmdline_takes_priority_over_ps(self, monkeypatch):
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_read_bytes(self):
|
||||||
|
calls.append("proc")
|
||||||
|
return b"python\x00hermes_cli/main.py\x00gateway\x00"
|
||||||
|
|
||||||
|
monkeypatch.setattr(status.Path, "read_bytes", fake_read_bytes)
|
||||||
|
result = status._read_process_cmdline(12345)
|
||||||
|
assert "hermes_cli/main.py" in result
|
||||||
|
assert calls == ["proc"]
|
||||||
|
|
||||||
|
def test_ps_fallback_used_when_proc_returns_empty(self, monkeypatch):
|
||||||
|
monkeypatch.setattr(status.Path, "read_bytes", lambda self: b"")
|
||||||
|
monkeypatch.setattr(
|
||||||
|
status.subprocess, "run",
|
||||||
|
lambda args, **kwargs: SimpleNamespace(returncode=0, stdout="python hermes_cli/main.py gateway run\n"),
|
||||||
|
)
|
||||||
|
result = status._read_process_cmdline(12345)
|
||||||
|
assert "hermes_cli/main.py" in result
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue