fix(process): SIGKILL the whole tree on escalation, not just wait_procs survivors

Live testing against a real SIGTERM-ignoring process TREE (parent + children,
the agent-browser daemon + renderer shape) revealed psutil.wait_procs's
gone/alive partition mis-handles a parent/child tree: it reaps via
Process.wait() and could mark targets gone/alive inconsistently across the
tree, leaving survivors un-killed (flaky — sometimes the parent lived,
sometimes a child). Replace it with: sleep out the grace window, then
directly re-probe every captured target (_proc_alive, treating zombies as
dead) and SIGKILL any that's still running. Add a multi-child-tree regression
test. 6/6 escalation tests green across repeated runs; the real-tree E2E now
kills the full tree 6/6 runs.
This commit is contained in:
teknium1 2026-06-21 18:09:58 -07:00 committed by Teknium
parent 8cbb34b2bf
commit 8cfcbd327d
2 changed files with 76 additions and 5 deletions

View file

@ -1537,3 +1537,50 @@ class TestSigkillEscalation:
monkeypatch.setattr(cfg_mod, "read_raw_config",
lambda: {"terminal": {"daemon_term_grace_seconds": -5}})
assert ProcessRegistry._daemon_term_grace_seconds() == 0.0
def test_entire_tree_is_sigkilled_not_just_parent(self, monkeypatch):
"""A SIGTERM-ignoring parent + children are ALL force-killed.
Regression: an earlier implementation trusted psutil.wait_procs's
gone/alive partition, which mis-partitioned across a parent/child tree
and left survivors un-killed (flaky sometimes the parent lived,
sometimes a child). The escalation now re-probes every target directly.
"""
import psutil
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
staticmethod(lambda: 1.0))
# Parent spawns 2 children; all trap SIGTERM. Parent prints child pids
# after the handler is installed.
parent_src = (
"import signal, subprocess, sys, time;"
"child='import signal,time\\nsignal.signal(signal.SIGTERM, signal.SIG_IGN)\\n"
"[time.sleep(0.2) for _ in iter(int,1)]';"
"kids=[subprocess.Popen([sys.executable,'-c',child]) for _ in range(2)];"
"signal.signal(signal.SIGTERM, signal.SIG_IGN);"
"sys.stdout.write(' '.join(str(k.pid) for k in kids)+'\\n'); sys.stdout.flush();"
"[time.sleep(0.2) for _ in iter(int,1)]"
)
parent = subprocess.Popen([sys.executable, "-c", parent_src],
stdout=subprocess.PIPE, text=True)
child_pids = [int(x) for x in parent.stdout.readline().split()]
all_pids = [parent.pid] + child_pids
try:
ProcessRegistry._terminate_host_pid(parent.pid)
def _all_dead():
return not any(
psutil.pid_exists(p)
and ProcessRegistry._proc_alive(psutil.Process(p))
for p in all_pids
)
assert _wait_until(_all_dead, timeout=4.0), (
"entire SIGTERM-ignoring tree (parent + children) must be SIGKILLed"
)
finally:
for p in all_pids:
try:
os.kill(p, signal.SIGKILL)
except (ProcessLookupError, PermissionError, OSError):
pass
parent.wait()

View file

@ -483,6 +483,20 @@ class ProcessRegistry:
self._move_to_finished(session)
return session
@staticmethod
def _proc_alive(proc) -> bool:
"""True if a psutil.Process is running and not a zombie.
A zombie is already dead (just unreaped), so there's nothing to SIGKILL.
"""
try:
import psutil
if not proc.is_running():
return False
return proc.status() != psutil.STATUS_ZOMBIE
except Exception:
return False
@staticmethod
def _daemon_term_grace_seconds() -> float:
"""Grace window (s) between SIGTERM and escalated SIGKILL.
@ -603,12 +617,22 @@ class ProcessRegistry:
grace = cls._daemon_term_grace_seconds()
if grace <= 0:
return
try:
_gone, alive = psutil.wait_procs(targets, timeout=grace)
except (psutil.Error, OSError):
alive = []
for proc in alive:
# Sleep out the grace window, then independently re-probe every target
# and SIGKILL any survivor. We deliberately do NOT trust
# ``psutil.wait_procs``'s gone/alive partition here: it reaps via
# ``Process.wait()`` and can mis-partition when a target transitions
# through a zombie state or when reaping is racy across a parent/child
# tree, which left survivors un-killed. A direct liveness re-probe is
# deterministic.
deadline = time.monotonic() + grace
while time.monotonic() < deadline:
if not any(cls._proc_alive(_p) for _p in targets):
break
time.sleep(0.05)
for proc in targets:
try:
if not cls._proc_alive(proc):
continue
proc.kill() # SIGKILL on POSIX
logger.info(
"Escalated to SIGKILL for pid %d (ignored SIGTERM within "