diff --git a/gateway/status.py b/gateway/status.py index 2849e775080..0cc8abddb47 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -124,16 +124,33 @@ def get_process_start_time(pid: int) -> Optional[int]: def _read_process_cmdline(pid: int) -> Optional[str]: - """Return the process command line as a space-separated string.""" + """Return the process command line as a space-separated string. + + On Linux, reads /proc/<pid>/cmdline directly. On macOS and other + platforms without /proc, falls back to ``ps -p <pid> -o command=``. + """ cmdline_path = Path(f"/proc/{pid}/cmdline") try: raw = cmdline_path.read_bytes() except (FileNotFoundError, PermissionError, OSError): - return None + pass + else: + if raw: + return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip() - if not raw: - return None - return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip() + try: + result = subprocess.run( + ["ps", "-p", str(pid), "-o", "command="], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout.strip(): + return result.stdout.strip() + except (OSError, subprocess.TimeoutExpired): + pass + + return None def _looks_like_gateway_process(pid: int) -> bool: @@ -594,6 +611,17 @@ def acquire_scoped_lock(scope: str, identity: str, metadata: Optional[dict[str, and current_start != existing.get("start_time") ): stale = True + # When start_time comparison is unavailable (macOS / Windows + # have no /proc, so both sides are None), fall back to + # checking the live process command line. If the PID was + # reused by an unrelated process the lock is stale. + if ( + not stale + and existing.get("start_time") is None + and current_start is None + and not _looks_like_gateway_process(existing_pid) + ): + stale = True # Check if process is stopped (Ctrl+Z / SIGTSTP) — stopped # processes still appear alive to _pid_exists but are not # actually running. Treat them as stale so --replace works. 
diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 3eed29758d7..8a603260205 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -444,6 +444,56 @@ class TestScopedLocks: assert acquired is False assert existing["pid"] == 99999 + def test_acquire_scoped_lock_replaces_pid_reused_by_unrelated_process(self, tmp_path, monkeypatch): + """macOS regression: PID reused by an unrelated process with start_time=None. + + On macOS /proc is unavailable, so both the lock record and the live + process report start_time=None. The live PID is alive (os.kill + succeeds) but belongs to a completely different program. The lock + must be treated as stale. + """ + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 873, + "start_time": None, + "kind": "hermes-gateway", + "argv": ["/Users/user/.hermes/hermes-agent/hermes_cli/main.py", "gateway", "run", "--replace"], + })) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: False) + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is True + payload = json.loads(lock_path.read_text()) + assert payload["pid"] == os.getpid() + assert payload["metadata"]["platform"] == "telegram" + + def test_acquire_scoped_lock_keeps_lock_when_pid_reused_by_gateway(self, tmp_path, monkeypatch): + """When start_time is None but the live PID still looks like a gateway, keep the lock.""" + monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) + lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" + 
lock_path.parent.mkdir(parents=True, exist_ok=True) + lock_path.write_text(json.dumps({ + "pid": 99999, + "start_time": None, + "kind": "hermes-gateway", + "argv": ["/Users/user/.hermes/hermes-agent/hermes_cli/main.py", "gateway", "run", "--replace"], + })) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: None) + monkeypatch.setattr(status, "_looks_like_gateway_process", lambda pid: True) + + acquired, existing = status.acquire_scoped_lock("telegram-bot-token", "secret", metadata={"platform": "telegram"}) + + assert acquired is False + assert existing["pid"] == 99999 + def test_acquire_scoped_lock_replaces_stale_record(self, tmp_path, monkeypatch): monkeypatch.setenv("HERMES_GATEWAY_LOCK_DIR", str(tmp_path / "locks")) lock_path = tmp_path / "locks" / "telegram-bot-token-2bb80d537b1da3e3.lock" @@ -811,3 +861,46 @@ class TestPlannedStopMarker: ok = status.write_planned_stop_marker(target_pid=12345) assert ok is False + + +class TestReadProcessCmdlinePsFallback: + """Tests for _read_process_cmdline falling back to ps on non-Linux.""" + + def test_ps_fallback_when_proc_unavailable(self, monkeypatch): + monkeypatch.setattr(status.Path, "read_bytes", lambda self: (_ for _ in ()).throw(FileNotFoundError)) + monkeypatch.setattr( + status.subprocess, "run", + lambda args, **kwargs: SimpleNamespace(returncode=0, stdout="/usr/libexec/bluetoothuserd\n"), + ) + result = status._read_process_cmdline(873) + assert result == "/usr/libexec/bluetoothuserd" + + def test_ps_fallback_returns_none_on_failure(self, monkeypatch): + monkeypatch.setattr(status.Path, "read_bytes", lambda self: (_ for _ in ()).throw(FileNotFoundError)) + monkeypatch.setattr( + status.subprocess, "run", + lambda args, **kwargs: SimpleNamespace(returncode=1, stdout=""), + ) + result = status._read_process_cmdline(99999) + assert result is None + + def test_proc_cmdline_takes_priority_over_ps(self, monkeypatch): + calls = [] + 
+ def fake_read_bytes(self): + calls.append("proc") + return b"python\x00hermes_cli/main.py\x00gateway\x00" + + monkeypatch.setattr(status.Path, "read_bytes", fake_read_bytes) + result = status._read_process_cmdline(12345) + assert "hermes_cli/main.py" in result + assert calls == ["proc"] + + def test_ps_fallback_used_when_proc_returns_empty(self, monkeypatch): + monkeypatch.setattr(status.Path, "read_bytes", lambda self: b"") + monkeypatch.setattr( + status.subprocess, "run", + lambda args, **kwargs: SimpleNamespace(returncode=0, stdout="python hermes_cli/main.py gateway run\n"), + ) + result = status._read_process_cmdline(12345) + assert "hermes_cli/main.py" in result