fix(kanban): detect darwin zombie workers

This commit is contained in:
LeonSGP43 2026-05-05 11:16:38 +08:00 committed by Teknium
parent f6b68f0f50
commit 1a03e3b1c6
2 changed files with 40 additions and 5 deletions

View file

@ -76,6 +76,7 @@ import os
import re import re
import secrets import secrets
import sqlite3 import sqlite3
import subprocess
import sys import sys
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
@ -2141,16 +2142,16 @@ def _pid_alive(pid: Optional[int]) -> bool:
Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess`` Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess``
on Windows. Returns False for falsy PIDs or on any OS error. on Windows. Returns False for falsy PIDs or on any OS error.
**Zombie handling (Linux):** ``os.kill(pid, 0)`` succeeds against **Zombie handling:** ``os.kill(pid, 0)`` succeeds against
zombie processes (post-exit, pre-reap) because the process table zombie processes (post-exit, pre-reap) because the process table
entry still exists. A worker that exits without being reaped by its entry still exists. A worker that exits without being reaped by its
parent would stay "alive" to the dispatcher forever. Dispatcher parent would stay "alive" to the dispatcher forever. Dispatcher
workers are started via ``start_new_session=True`` + intentional workers are started via ``start_new_session=True`` + intentional
Popen handle abandonment, so init reaps them quickly but during Popen handle abandonment, so init reaps them quickly but during
the window between exit and reap, we'd otherwise see stale "alive" the window between exit and reap, we'd otherwise see stale "alive"
signals. On Linux we additionally peek at ``/proc/<pid>/status`` signals. On Linux we peek at ``/proc/<pid>/status`` and treat
and treat ``State: Z`` as dead. On other POSIX or on Windows the ``State: Z`` as dead. On macOS we ask ``ps`` for the BSD ``stat``
zombie check is a no-op. field and treat values containing ``Z`` as dead.
""" """
if not pid or pid <= 0: if not pid or pid <= 0:
return False return False
@ -2164,7 +2165,8 @@ def _pid_alive(pid: Optional[int]) -> bool:
return True return True
except OSError: except OSError:
return False return False
# Still here → kill(0) succeeded. Check for zombie on Linux. # Still here → kill(0) succeeded. Check for zombie on platforms
# where we have a cheap, deterministic process-state probe.
if sys.platform == "linux": if sys.platform == "linux":
try: try:
with open(f"/proc/{int(pid)}/status", "r") as f: with open(f"/proc/{int(pid)}/status", "r") as f:
@ -2179,6 +2181,23 @@ def _pid_alive(pid: Optional[int]) -> bool:
# PermissionError shouldn't happen for our own children but # PermissionError shouldn't happen for our own children but
# be defensive. # be defensive.
pass pass
elif sys.platform == "darwin":
try:
proc = subprocess.run(
["ps", "-o", "stat=", "-p", str(int(pid))],
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
text=True,
timeout=1,
check=False,
)
if proc.returncode != 0:
return False
if "Z" in (proc.stdout or "").strip():
return False
except (OSError, subprocess.SubprocessError, TimeoutError):
# If the secondary probe fails, keep the kill(0) answer.
pass
return True return True

View file

@ -13,9 +13,11 @@ from __future__ import annotations
import argparse import argparse
import json import json
import os import os
import subprocess
import threading import threading
import time import time
from pathlib import Path from pathlib import Path
from types import SimpleNamespace
from typing import Optional from typing import Optional
import pytest import pytest
@ -183,6 +185,20 @@ def test_pid_alive_helper():
assert not kb._pid_alive(2 ** 30) assert not kb._pid_alive(2 ** 30)
def test_pid_alive_detects_darwin_zombie(monkeypatch):
monkeypatch.setattr(kb.sys, "platform", "darwin")
monkeypatch.setattr(kb.os, "kill", lambda pid, sig: None)
def fake_run(args, **kwargs):
assert args == ["ps", "-o", "stat=", "-p", "123"]
assert kwargs["stdout"] is subprocess.PIPE
return SimpleNamespace(returncode=0, stdout="Z+\n")
monkeypatch.setattr(kb.subprocess, "run", fake_run)
assert kb._pid_alive(123) is False
def test_detect_crashed_workers_reclaims(kanban_home): def test_detect_crashed_workers_reclaims(kanban_home):
"""A running task whose pid vanished gets dropped to ready with a """A running task whose pid vanished gets dropped to ready with a
``crashed`` event, independent of the claim TTL.""" ``crashed`` event, independent of the claim TTL."""