diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index ac44a3d19f..6ca7894ee1 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -76,6 +76,7 @@ import os import re import secrets import sqlite3 +import subprocess import sys import time from dataclasses import dataclass, field @@ -2141,16 +2142,16 @@ def _pid_alive(pid: Optional[int]) -> bool: Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess`` on Windows. Returns False for falsy PIDs or on any OS error. - **Zombie handling (Linux):** ``os.kill(pid, 0)`` succeeds against + **Zombie handling:** ``os.kill(pid, 0)`` succeeds against zombie processes (post-exit, pre-reap) because the process table entry still exists. A worker that exits without being reaped by its parent would stay "alive" to the dispatcher forever. Dispatcher workers are started via ``start_new_session=True`` + intentional Popen handle abandonment, so init reaps them quickly — but during the window between exit and reap, we'd otherwise see stale "alive" - signals. On Linux we additionally peek at ``/proc/<pid>/status`` - and treat ``State: Z`` as dead. On other POSIX or on Windows the - zombie check is a no-op. + signals. On Linux we peek at ``/proc/<pid>/status`` and treat + ``State: Z`` as dead. On macOS we ask ``ps`` for the BSD ``stat`` + field and treat values containing ``Z`` as dead. """ if not pid or pid <= 0: return False @@ -2164,7 +2165,8 @@ def _pid_alive(pid: Optional[int]) -> bool: return True except OSError: return False - # Still here → kill(0) succeeded. Check for zombie on Linux. + # Still here → kill(0) succeeded. Check for zombie on platforms + # where we have a cheap, deterministic process-state probe. if sys.platform == "linux": try: with open(f"/proc/{int(pid)}/status", "r") as f: @@ -2179,6 +2181,23 @@ def _pid_alive(pid: Optional[int]) -> bool: # PermissionError shouldn't happen for our own children but # be defensive. 
pass + elif sys.platform == "darwin": + try: + proc = subprocess.run( + ["ps", "-o", "stat=", "-p", str(int(pid))], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + text=True, + timeout=1, + check=False, + ) + if proc.returncode != 0: + return False + if "Z" in (proc.stdout or "").strip(): + return False + except (OSError, subprocess.SubprocessError, TimeoutError): + # If the secondary probe fails, keep the kill(0) answer. + pass return True diff --git a/tests/hermes_cli/test_kanban_core_functionality.py b/tests/hermes_cli/test_kanban_core_functionality.py index 3fe09086e5..6bc198ab99 100644 --- a/tests/hermes_cli/test_kanban_core_functionality.py +++ b/tests/hermes_cli/test_kanban_core_functionality.py @@ -13,9 +13,11 @@ from __future__ import annotations import argparse import json import os +import subprocess import threading import time from pathlib import Path +from types import SimpleNamespace from typing import Optional import pytest @@ -183,6 +185,20 @@ def test_pid_alive_helper(): assert not kb._pid_alive(2 ** 30) +def test_pid_alive_detects_darwin_zombie(monkeypatch): + monkeypatch.setattr(kb.sys, "platform", "darwin") + monkeypatch.setattr(kb.os, "kill", lambda pid, sig: None) + + def fake_run(args, **kwargs): + assert args == ["ps", "-o", "stat=", "-p", "123"] + assert kwargs["stdout"] is subprocess.PIPE + return SimpleNamespace(returncode=0, stdout="Z+\n") + + monkeypatch.setattr(kb.subprocess, "run", fake_run) + + assert kb._pid_alive(123) is False + + def test_detect_crashed_workers_reclaims(kanban_home): """A running task whose pid vanished gets dropped to ready with a ``crashed`` event, independent of the claim TTL."""