fix(gateway): detect stale token locks on macOS via ps fallback

This commit is contained in:
konsisumer 2026-04-25 00:34:52 +02:00
parent 00c3d848d8
commit 9689793871
2 changed files with 238 additions and 7 deletions

View file

@ -104,13 +104,39 @@ def _get_scope_lock_path(scope: str, identity: str) -> Path:
def _get_process_start_time(pid: int) -> Optional[int]:
"""Return the kernel start time for a process when available."""
"""Return the kernel start time for a process when available.
On Linux, reads /proc/<pid>/stat (field 22, clock ticks since boot).
On macOS and other POSIX platforms, falls back to ``ps -p <pid> -o lstart=``
and converts the human-readable date to a Unix timestamp. Returns None when
the time cannot be determined (Windows, ps unavailable, process gone).
"""
stat_path = Path(f"/proc/{pid}/stat")
try:
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
return int(stat_path.read_text().split()[21])
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
return None
pass
# macOS / non-Linux POSIX: use ps to get the process start time.
if sys.platform != "win32":
try:
from email.utils import parsedate
import time as _time
result = subprocess.run(
["ps", "-p", str(pid), "-o", "lstart="],
capture_output=True, text=True, timeout=2,
)
if result.returncode == 0:
lstart = result.stdout.strip()
if lstart:
parsed = parsedate(lstart)
if parsed:
return int(_time.mktime(parsed))
except Exception:
pass
return None
def get_process_start_time(pid: int) -> Optional[int]:
@ -119,16 +145,33 @@ def get_process_start_time(pid: int) -> Optional[int]:
def _read_process_cmdline(pid: int) -> Optional[str]:
"""Return the process command line as a space-separated string."""
"""Return the process command line as a space-separated string.
On Linux, reads /proc/<pid>/cmdline. On macOS / other POSIX platforms,
falls back to ``ps -p <pid> -o command=``. Returns None on Windows or when
the command line cannot be read.
"""
cmdline_path = Path(f"/proc/{pid}/cmdline")
try:
raw = cmdline_path.read_bytes()
if raw:
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
except (FileNotFoundError, PermissionError, OSError):
return None
pass
if not raw:
return None
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
# macOS / non-Linux POSIX: use ps to get the command line.
if sys.platform != "win32":
try:
result = subprocess.run(
["ps", "-p", str(pid), "-o", "command="],
capture_output=True, text=True, timeout=2,
)
if result.returncode == 0:
return result.stdout.strip() or None
except Exception:
pass
return None
def _looks_like_gateway_process(pid: int) -> bool:
@ -527,6 +570,19 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str,
break
except (OSError, PermissionError):
pass
# Fallback for platforms where start_time is unavailable (e.g.
# macOS lock files written before this fix, or systems without ps).
# If the live PID's cmdline doesn't look like a gateway, the PID
# was reused by an unrelated process — the lock is stale.
if not stale and (
existing.get("start_time") is None or current_start is None
):
try:
live_cmdline = _read_process_cmdline(existing_pid)
if live_cmdline is not None and not _looks_like_gateway_process(existing_pid):
stale = True
except Exception:
pass
if stale:
try:
lock_path.unlink(missing_ok=True)

View file

@ -452,6 +452,181 @@ class TestScopedLocks:
assert reused_pid_lock.exists()
class TestProcessInfoHelpers:
"""Tests for cross-platform process info helpers (_get_process_start_time, _read_process_cmdline)."""
def test_get_process_start_time_returns_int_for_current_process(self):
"""Should return an integer for the current (live) process on any platform."""
result = status._get_process_start_time(os.getpid())
assert result is None or isinstance(result, int)
def test_get_process_start_time_ps_fallback_returns_int(self, monkeypatch):
"""When /proc is unavailable, ps lstart= fallback returns an integer."""
import types
from types import SimpleNamespace
fake_run_result = SimpleNamespace(returncode=0, stdout="Thu Apr 24 14:48:56 2026\n")
def fake_run(cmd, **kwargs):
return fake_run_result
# Force the /proc path to raise so the subprocess fallback runs.
monkeypatch.setattr(status.subprocess, "run", fake_run)
# Simulate non-Windows so the ps branch is entered.
monkeypatch.setattr(status.sys, "platform", "darwin")
# Directly test the fallback by calling with a PID whose /proc won't exist.
# On macOS the /proc branch is always skipped; on Linux we mock subprocess.
# We rely on _get_process_start_time gracefully handling the /proc miss.
result = status._get_process_start_time(99999999)
# The ps mock returns a valid date so we expect an int.
assert isinstance(result, int)
def test_get_process_start_time_ps_fallback_returns_none_when_ps_fails(self, monkeypatch):
"""ps returning non-zero (dead PID) → None."""
from types import SimpleNamespace
monkeypatch.setattr(
status.subprocess, "run",
lambda cmd, **kwargs: SimpleNamespace(returncode=1, stdout=""),
)
monkeypatch.setattr(status.sys, "platform", "darwin")
result = status._get_process_start_time(99999999)
assert result is None
def test_read_process_cmdline_ps_fallback(self, monkeypatch):
"""When /proc is unavailable, ps command= fallback returns the cmdline string."""
from types import SimpleNamespace
monkeypatch.setattr(
status.subprocess, "run",
lambda cmd, **kwargs: SimpleNamespace(
returncode=0, stdout="python -m hermes_cli.main gateway run\n"
),
)
monkeypatch.setattr(status.sys, "platform", "darwin")
result = status._read_process_cmdline(99999999)
assert result == "python -m hermes_cli.main gateway run"
def test_read_process_cmdline_ps_fallback_returns_none_on_dead_pid(self, monkeypatch):
from types import SimpleNamespace
monkeypatch.setattr(
status.subprocess, "run",
lambda cmd, **kwargs: SimpleNamespace(returncode=1, stdout=""),
)
monkeypatch.setattr(status.sys, "platform", "darwin")
result = status._read_process_cmdline(99999999)
assert result is None
class TestScopedLocksStalePidReuse:
"""Tests for the PID-reuse stale-detection fix (issue #15115).
On macOS, /proc does not exist so start_time is None in lock files written
by older Hermes versions. When a crashed gateway's PID is reused by a
non-gateway process, the lock must be detected as stale via cmdline check.
"""
def test_acquire_treats_as_stale_when_pid_reused_by_non_gateway(self, tmp_path, monkeypatch):
"""Lock with null start_time whose PID is held by a non-gateway process → stale."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
# Simulate a macOS-era lock: start_time is null because /proc wasn't available.
lock_path.write_text(json.dumps({
"pid": 78452,
"start_time": None,
"kind": "hermes-gateway",
}))
# PID 78452 is alive (reused by a non-gateway process).
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
# Cmdline returns something that is NOT a gateway.
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/bin/some_daemon --start")
acquired, _ = status.acquire_scoped_lock("weixin-bot-token", "secret", metadata={"platform": "weixin"})
assert acquired is True, "Lock with reused PID (non-gateway cmdline) should be treated as stale"
payload = json.loads(lock_path.read_text())
assert payload["pid"] == os.getpid()
def test_acquire_rejects_when_live_gateway_holds_lock_no_start_time(self, tmp_path, monkeypatch):
"""Lock with null start_time whose PID is a live gateway → NOT stale."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 78452,
"start_time": None,
"kind": "hermes-gateway",
}))
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None)
# Cmdline confirms this IS a live gateway process.
monkeypatch.setattr(
status,
"_read_process_cmdline",
lambda pid: "python -m hermes_cli.main gateway run --replace",
)
acquired, existing = status.acquire_scoped_lock("weixin-bot-token", "secret", metadata={"platform": "weixin"})
assert acquired is False, "Live gateway holding lock must NOT be evicted"
assert existing["pid"] == 78452
def test_acquire_treats_as_stale_when_cmdline_unreadable_but_start_time_mismatch(self, tmp_path, monkeypatch):
"""Start-time mismatch should still win even when cmdline is unreadable."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 78452,
"start_time": 1000,
"kind": "hermes-gateway",
}))
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
# Different start_time → PID was reused.
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999)
monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
acquired, _ = status.acquire_scoped_lock("weixin-bot-token", "secret", metadata={"platform": "weixin"})
assert acquired is True
def test_acquire_does_not_trigger_cmdline_check_when_start_times_match(self, tmp_path, monkeypatch):
"""When start times match, cmdline check must NOT run (live owner confirmed)."""
monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks"))
lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock"
lock_path.parent.mkdir(parents=True, exist_ok=True)
lock_path.write_text(json.dumps({
"pid": 78452,
"start_time": 1000,
"kind": "hermes-gateway",
}))
cmdline_calls = []
def fake_read_cmdline(pid):
cmdline_calls.append(pid)
return "/usr/bin/some_daemon" # non-gateway, but should not be reached
monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000)
monkeypatch.setattr(status, "_read_process_cmdline", fake_read_cmdline)
acquired, existing = status.acquire_scoped_lock("weixin-bot-token", "secret")
assert acquired is False
assert not cmdline_calls, "cmdline check should not run when start_times match"
class TestTakeoverMarker:
"""Tests for the --replace takeover marker.