mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-15 04:12:25 +00:00
fix(kanban): detect darwin zombie workers
This commit is contained in:
parent
f6b68f0f50
commit
1a03e3b1c6
2 changed files with 40 additions and 5 deletions
|
|
@ -76,6 +76,7 @@ import os
|
||||||
import re
|
import re
|
||||||
import secrets
|
import secrets
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
@ -2141,16 +2142,16 @@ def _pid_alive(pid: Optional[int]) -> bool:
|
||||||
Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess``
|
Cross-platform: uses ``os.kill(pid, 0)`` on POSIX and ``OpenProcess``
|
||||||
on Windows. Returns False for falsy PIDs or on any OS error.
|
on Windows. Returns False for falsy PIDs or on any OS error.
|
||||||
|
|
||||||
**Zombie handling (Linux):** ``os.kill(pid, 0)`` succeeds against
|
**Zombie handling:** ``os.kill(pid, 0)`` succeeds against
|
||||||
zombie processes (post-exit, pre-reap) because the process table
|
zombie processes (post-exit, pre-reap) because the process table
|
||||||
entry still exists. A worker that exits without being reaped by its
|
entry still exists. A worker that exits without being reaped by its
|
||||||
parent would stay "alive" to the dispatcher forever. Dispatcher
|
parent would stay "alive" to the dispatcher forever. Dispatcher
|
||||||
workers are started via ``start_new_session=True`` + intentional
|
workers are started via ``start_new_session=True`` + intentional
|
||||||
Popen handle abandonment, so init reaps them quickly — but during
|
Popen handle abandonment, so init reaps them quickly — but during
|
||||||
the window between exit and reap, we'd otherwise see stale "alive"
|
the window between exit and reap, we'd otherwise see stale "alive"
|
||||||
signals. On Linux we additionally peek at ``/proc/<pid>/status``
|
signals. On Linux we peek at ``/proc/<pid>/status`` and treat
|
||||||
and treat ``State: Z`` as dead. On other POSIX or on Windows the
|
``State: Z`` as dead. On macOS we ask ``ps`` for the BSD ``stat``
|
||||||
zombie check is a no-op.
|
field and treat values containing ``Z`` as dead.
|
||||||
"""
|
"""
|
||||||
if not pid or pid <= 0:
|
if not pid or pid <= 0:
|
||||||
return False
|
return False
|
||||||
|
|
@ -2164,7 +2165,8 @@ def _pid_alive(pid: Optional[int]) -> bool:
|
||||||
return True
|
return True
|
||||||
except OSError:
|
except OSError:
|
||||||
return False
|
return False
|
||||||
# Still here → kill(0) succeeded. Check for zombie on Linux.
|
# Still here → kill(0) succeeded. Check for zombie on platforms
|
||||||
|
# where we have a cheap, deterministic process-state probe.
|
||||||
if sys.platform == "linux":
|
if sys.platform == "linux":
|
||||||
try:
|
try:
|
||||||
with open(f"/proc/{int(pid)}/status", "r") as f:
|
with open(f"/proc/{int(pid)}/status", "r") as f:
|
||||||
|
|
@ -2179,6 +2181,23 @@ def _pid_alive(pid: Optional[int]) -> bool:
|
||||||
# PermissionError shouldn't happen for our own children but
|
# PermissionError shouldn't happen for our own children but
|
||||||
# be defensive.
|
# be defensive.
|
||||||
pass
|
pass
|
||||||
|
elif sys.platform == "darwin":
|
||||||
|
try:
|
||||||
|
proc = subprocess.run(
|
||||||
|
["ps", "-o", "stat=", "-p", str(int(pid))],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.DEVNULL,
|
||||||
|
text=True,
|
||||||
|
timeout=1,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
if proc.returncode != 0:
|
||||||
|
return False
|
||||||
|
if "Z" in (proc.stdout or "").strip():
|
||||||
|
return False
|
||||||
|
except (OSError, subprocess.SubprocessError, TimeoutError):
|
||||||
|
# If the secondary probe fails, keep the kill(0) answer.
|
||||||
|
pass
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,9 +13,11 @@ from __future__ import annotations
|
||||||
import argparse
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import subprocess
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
@ -183,6 +185,20 @@ def test_pid_alive_helper():
|
||||||
assert not kb._pid_alive(2 ** 30)
|
assert not kb._pid_alive(2 ** 30)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pid_alive_detects_darwin_zombie(monkeypatch):
|
||||||
|
monkeypatch.setattr(kb.sys, "platform", "darwin")
|
||||||
|
monkeypatch.setattr(kb.os, "kill", lambda pid, sig: None)
|
||||||
|
|
||||||
|
def fake_run(args, **kwargs):
|
||||||
|
assert args == ["ps", "-o", "stat=", "-p", "123"]
|
||||||
|
assert kwargs["stdout"] is subprocess.PIPE
|
||||||
|
return SimpleNamespace(returncode=0, stdout="Z+\n")
|
||||||
|
|
||||||
|
monkeypatch.setattr(kb.subprocess, "run", fake_run)
|
||||||
|
|
||||||
|
assert kb._pid_alive(123) is False
|
||||||
|
|
||||||
|
|
||||||
def test_detect_crashed_workers_reclaims(kanban_home):
|
def test_detect_crashed_workers_reclaims(kanban_home):
|
||||||
"""A running task whose pid vanished gets dropped to ready with a
|
"""A running task whose pid vanished gets dropped to ready with a
|
||||||
``crashed`` event, independent of the claim TTL."""
|
``crashed`` event, independent of the claim TTL."""
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue