diff --git a/gateway/status.py b/gateway/status.py index 7f7df182f5..8bcbb9e321 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -104,13 +104,39 @@ def _get_scope_lock_path(scope: str, identity: str) -> Path: def _get_process_start_time(pid: int) -> Optional[int]: - """Return the kernel start time for a process when available.""" + """Return the kernel start time for a process when available. + + On Linux, reads /proc//stat (field 22, clock ticks since boot). + On macOS and other POSIX platforms, falls back to ``ps -p -o lstart=`` + and converts the human-readable date to a Unix timestamp. Returns None when + the time cannot be determined (Windows, ps unavailable, process gone). + """ stat_path = Path(f"/proc/{pid}/stat") try: # Field 22 in /proc//stat is process start time (clock ticks). return int(stat_path.read_text().split()[21]) except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError): - return None + pass + + # macOS / non-Linux POSIX: use ps to get the process start time. + if sys.platform != "win32": + try: + from email.utils import parsedate + import time as _time + result = subprocess.run( + ["ps", "-p", str(pid), "-o", "lstart="], + capture_output=True, text=True, timeout=2, + ) + if result.returncode == 0: + lstart = result.stdout.strip() + if lstart: + parsed = parsedate(lstart) + if parsed: + return int(_time.mktime(parsed)) + except Exception: + pass + + return None def get_process_start_time(pid: int) -> Optional[int]: @@ -119,16 +145,33 @@ def get_process_start_time(pid: int) -> Optional[int]: def _read_process_cmdline(pid: int) -> Optional[str]: - """Return the process command line as a space-separated string.""" + """Return the process command line as a space-separated string. + + On Linux, reads /proc//cmdline. On macOS / other POSIX platforms, + falls back to ``ps -p -o command=``. Returns None on Windows or when + the command line cannot be read. + """ cmdline_path = Path(f"/proc/{pid}/cmdline") try: raw = cmdline_path.read_bytes() + if raw: + return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip() except (FileNotFoundError, PermissionError, OSError): - return None + pass - if not raw: - return None - return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip() + # macOS / non-Linux POSIX: use ps to get the command line. + if sys.platform != "win32": + try: + result = subprocess.run( + ["ps", "-p", str(pid), "-o", "command="], + capture_output=True, text=True, timeout=2, + ) + if result.returncode == 0: + return result.stdout.strip() or None + except Exception: + pass + + return None def _looks_like_gateway_process(pid: int) -> bool: @@ -527,6 +570,19 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, break except (OSError, PermissionError): pass + # Fallback for platforms where start_time is unavailable (e.g. + # macOS lock files written before this fix, or systems without ps). + # If the live PID's cmdline doesn't look like a gateway, the PID + # was reused by an unrelated process — the lock is stale. + if not stale and ( + existing.get("start_time") is None or current_start is None + ): + try: + live_cmdline = _read_process_cmdline(existing_pid) + if live_cmdline is not None and not _looks_like_gateway_process(existing_pid): + stale = True + except Exception: + pass if stale: try: lock_path.unlink(missing_ok=True) diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index e91bb6e419..1fff78bd31 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -452,6 +452,181 @@ class TestScopedLocks: assert reused_pid_lock.exists() +class TestProcessInfoHelpers: + """Tests for cross-platform process info helpers (_get_process_start_time, _read_process_cmdline).""" + + def test_get_process_start_time_returns_int_for_current_process(self): + """Should return an integer for the current (live) process on any platform.""" + result = status._get_process_start_time(os.getpid()) + assert result is None or isinstance(result, int) + + def test_get_process_start_time_ps_fallback_returns_int(self, monkeypatch): + """When /proc is unavailable, ps lstart= fallback returns an integer.""" + import types + from types import SimpleNamespace + + fake_run_result = SimpleNamespace(returncode=0, stdout="Thu Apr 24 14:48:56 2026\n") + + def fake_run(cmd, **kwargs): + return fake_run_result + + # Force the /proc path to raise so the subprocess fallback runs. + monkeypatch.setattr(status.subprocess, "run", fake_run) + # Simulate non-Windows so the ps branch is entered. + monkeypatch.setattr(status.sys, "platform", "darwin") + + # Directly test the fallback by calling with a PID whose /proc won't exist. + # On macOS the /proc branch is always skipped; on Linux we mock subprocess. + # We rely on _get_process_start_time gracefully handling the /proc miss. + result = status._get_process_start_time(99999999) + # The ps mock returns a valid date so we expect an int. + assert isinstance(result, int) + + def test_get_process_start_time_ps_fallback_returns_none_when_ps_fails(self, monkeypatch): + """ps returning non-zero (dead PID) → None.""" + from types import SimpleNamespace + + monkeypatch.setattr( + status.subprocess, "run", + lambda cmd, **kwargs: SimpleNamespace(returncode=1, stdout=""), + ) + monkeypatch.setattr(status.sys, "platform", "darwin") + + result = status._get_process_start_time(99999999) + assert result is None + + def test_read_process_cmdline_ps_fallback(self, monkeypatch): + """When /proc is unavailable, ps command= fallback returns the cmdline string.""" + from types import SimpleNamespace + + monkeypatch.setattr( + status.subprocess, "run", + lambda cmd, **kwargs: SimpleNamespace( + returncode=0, stdout="python -m hermes_cli.main gateway run\n" + ), + ) + monkeypatch.setattr(status.sys, "platform", "darwin") + + result = status._read_process_cmdline(99999999) + assert result == "python -m hermes_cli.main gateway run" + + def test_read_process_cmdline_ps_fallback_returns_none_on_dead_pid(self, monkeypatch): + from types import SimpleNamespace + + monkeypatch.setattr( + status.subprocess, "run", + lambda cmd, **kwargs: SimpleNamespace(returncode=1, stdout=""), + ) + monkeypatch.setattr(status.sys, "platform", "darwin") + + result = status._read_process_cmdline(99999999) + assert result is None + + +class TestScopedLocksStalePidReuse: + """Tests for the PID-reuse stale-detection fix (issue #15115). + + On macOS, /proc does not exist so start_time is None in lock files written + by older Hermes versions. When a crashed gateway's PID is reused by a + non-gateway process, the lock must be detected as stale via cmdline check. + """ + + def test_acquire_treats_as_stale_when_pid_reused_by_non_gateway(self, tmp_path, monkeypatch): + """Lock with null start_time whose PID is held by a non-gateway process → stale.""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + # Simulate a macOS-era lock: start_time is null because /proc wasn't available. + lock_path.write_text(json.dumps({ + "pid": 78452, + "start_time": None, + "kind": "hermes-gateway", + })) + + # PID 78452 is alive (reused by a non-gateway process). + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + # Cmdline returns something that is NOT a gateway. + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: "/usr/bin/some_daemon --start") + + acquired, _ = status.acquire_scoped_lock("weixin-bot-token", "secret", metadata={"platform": "weixin"}) + + assert acquired is True, "Lock with reused PID (non-gateway cmdline) should be treated as stale" + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + + def test_acquire_rejects_when_live_gateway_holds_lock_no_start_time(self, tmp_path, monkeypatch): + """Lock with null start_time whose PID is a live gateway → NOT stale.""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 78452, + "start_time": None, + "kind": "hermes-gateway", + })) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + # Cmdline confirms this IS a live gateway process. + monkeypatch.setattr( + status, + "_read_process_cmdline", + lambda pid: "python -m hermes_cli.main gateway run --replace", + ) + + acquired, existing = status.acquire_scoped_lock("weixin-bot-token", "secret", metadata={"platform": "weixin"}) + + assert acquired is False, "Live gateway holding lock must NOT be evicted" + assert existing["pid"] == 78452 + + def test_acquire_treats_as_stale_when_cmdline_unreadable_but_start_time_mismatch(self, tmp_path, monkeypatch): + """Start-time mismatch should still win even when cmdline is unreadable.""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 78452, + "start_time": 1000, + "kind": "hermes-gateway", + })) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + # Different start_time → PID was reused. + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 9999) + monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None) + + acquired, _ = status.acquire_scoped_lock("weixin-bot-token", "secret", metadata={"platform": "weixin"}) + + assert acquired is True + + def test_acquire_does_not_trigger_cmdline_check_when_start_times_match(self, tmp_path, monkeypatch): + """When start times match, cmdline check must NOT run (live owner confirmed).""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "weixin-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 78452, + "start_time": 1000, + "kind": "hermes-gateway", + })) + + cmdline_calls = [] + + def fake_read_cmdline(pid): + cmdline_calls.append(pid) + return "/usr/bin/some_daemon" # non-gateway, but should not be reached + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 1000) + monkeypatch.setattr(status, "_read_process_cmdline", fake_read_cmdline) + + acquired, existing = status.acquire_scoped_lock("weixin-bot-token", "secret") + + assert acquired is False + assert not cmdline_calls, "cmdline check should not run when start_times match" + + class TestTakeoverMarker: """Tests for the --replace takeover marker.