mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(process): SIGKILL the whole tree on escalation, not just wait_procs survivors
Live testing against a real SIGTERM-ignoring process TREE (parent + children, the agent-browser daemon + renderer shape) revealed psutil.wait_procs's gone/alive partition mis-handles a parent/child tree: it reaps via Process.wait() and could mark targets gone/alive inconsistently across the tree, leaving survivors un-killed (flaky — sometimes the parent lived, sometimes a child). Replace it with: sleep out the grace window, then directly re-probe every captured target (_proc_alive, treating zombies as dead) and SIGKILL any that's still running. Add a multi-child-tree regression test. 6/6 escalation tests green across repeated runs; the real-tree E2E now kills the full tree 6/6 runs.
This commit is contained in:
parent
8cbb34b2bf
commit
8cfcbd327d
2 changed files with 76 additions and 5 deletions
|
|
@ -1537,3 +1537,50 @@ class TestSigkillEscalation:
|
|||
monkeypatch.setattr(cfg_mod, "read_raw_config",
|
||||
lambda: {"terminal": {"daemon_term_grace_seconds": -5}})
|
||||
assert ProcessRegistry._daemon_term_grace_seconds() == 0.0
|
||||
|
||||
def test_entire_tree_is_sigkilled_not_just_parent(self, monkeypatch):
|
||||
"""A SIGTERM-ignoring parent + children are ALL force-killed.
|
||||
|
||||
Regression: an earlier implementation trusted psutil.wait_procs's
|
||||
gone/alive partition, which mis-partitioned across a parent/child tree
|
||||
and left survivors un-killed (flaky — sometimes the parent lived,
|
||||
sometimes a child). The escalation now re-probes every target directly.
|
||||
"""
|
||||
import psutil
|
||||
monkeypatch.setattr(ProcessRegistry, "_daemon_term_grace_seconds",
|
||||
staticmethod(lambda: 1.0))
|
||||
# Parent spawns 2 children; all trap SIGTERM. Parent prints child pids
|
||||
# after the handler is installed.
|
||||
parent_src = (
|
||||
"import signal, subprocess, sys, time;"
|
||||
"child='import signal,time\\nsignal.signal(signal.SIGTERM, signal.SIG_IGN)\\n"
|
||||
"[time.sleep(0.2) for _ in iter(int,1)]';"
|
||||
"kids=[subprocess.Popen([sys.executable,'-c',child]) for _ in range(2)];"
|
||||
"signal.signal(signal.SIGTERM, signal.SIG_IGN);"
|
||||
"sys.stdout.write(' '.join(str(k.pid) for k in kids)+'\\n'); sys.stdout.flush();"
|
||||
"[time.sleep(0.2) for _ in iter(int,1)]"
|
||||
)
|
||||
parent = subprocess.Popen([sys.executable, "-c", parent_src],
|
||||
stdout=subprocess.PIPE, text=True)
|
||||
child_pids = [int(x) for x in parent.stdout.readline().split()]
|
||||
all_pids = [parent.pid] + child_pids
|
||||
try:
|
||||
ProcessRegistry._terminate_host_pid(parent.pid)
|
||||
|
||||
def _all_dead():
|
||||
return not any(
|
||||
psutil.pid_exists(p)
|
||||
and ProcessRegistry._proc_alive(psutil.Process(p))
|
||||
for p in all_pids
|
||||
)
|
||||
|
||||
assert _wait_until(_all_dead, timeout=4.0), (
|
||||
"entire SIGTERM-ignoring tree (parent + children) must be SIGKILLed"
|
||||
)
|
||||
finally:
|
||||
for p in all_pids:
|
||||
try:
|
||||
os.kill(p, signal.SIGKILL)
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
pass
|
||||
parent.wait()
|
||||
|
|
|
|||
|
|
@ -483,6 +483,20 @@ class ProcessRegistry:
|
|||
self._move_to_finished(session)
|
||||
return session
|
||||
|
||||
@staticmethod
|
||||
def _proc_alive(proc) -> bool:
|
||||
"""True if a psutil.Process is running and not a zombie.
|
||||
|
||||
A zombie is already dead (just unreaped), so there's nothing to SIGKILL.
|
||||
"""
|
||||
try:
|
||||
import psutil
|
||||
if not proc.is_running():
|
||||
return False
|
||||
return proc.status() != psutil.STATUS_ZOMBIE
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def _daemon_term_grace_seconds() -> float:
|
||||
"""Grace window (s) between SIGTERM and escalated SIGKILL.
|
||||
|
|
@ -603,12 +617,22 @@ class ProcessRegistry:
|
|||
grace = cls._daemon_term_grace_seconds()
|
||||
if grace <= 0:
|
||||
return
|
||||
try:
|
||||
_gone, alive = psutil.wait_procs(targets, timeout=grace)
|
||||
except (psutil.Error, OSError):
|
||||
alive = []
|
||||
for proc in alive:
|
||||
# Sleep out the grace window, then independently re-probe every target
|
||||
# and SIGKILL any survivor. We deliberately do NOT trust
|
||||
# ``psutil.wait_procs``'s gone/alive partition here: it reaps via
|
||||
# ``Process.wait()`` and can mis-partition when a target transitions
|
||||
# through a zombie state or when reaping is racy across a parent/child
|
||||
# tree, which left survivors un-killed. A direct liveness re-probe is
|
||||
# deterministic.
|
||||
deadline = time.monotonic() + grace
|
||||
while time.monotonic() < deadline:
|
||||
if not any(cls._proc_alive(_p) for _p in targets):
|
||||
break
|
||||
time.sleep(0.05)
|
||||
for proc in targets:
|
||||
try:
|
||||
if not cls._proc_alive(proc):
|
||||
continue
|
||||
proc.kill() # SIGKILL on POSIX
|
||||
logger.info(
|
||||
"Escalated to SIGKILL for pid %d (ignored SIGTERM within "
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue