hermes-agent/tests/tools/test_local_interrupt_cleanup.py
alt-glitch 4b16341975 refactor(restructure): rewrite all imports for hermes_agent package
Rewrite all import statements, patch() targets, sys.modules keys,
importlib.import_module() strings, and subprocess -m references to use
hermes_agent.* paths.

Strip sys.path.insert hacks from production code (rely on editable install).
Update COMPONENT_PREFIXES for logger filtering.
Fix 3 hardcoded getLogger() calls to use __name__.
Update transport and tool registry discovery paths.
Update plugin module path strings.
Add legacy process-name patterns for gateway PID detection.
Add main() to skills_sync for console_script entry point.
Fix _get_bundled_dir() path traversal after move.

Part of #14182, #14183
2026-04-23 08:35:34 +05:30

145 lines
6.1 KiB
Python

"""Regression tests for _wait_for_process subprocess cleanup on exception exit.
When the poll loop exits via KeyboardInterrupt or SystemExit (SIGTERM via
cli.py signal handler, SIGINT on the main thread in non-interactive -q mode,
or explicit sys.exit from some caller), the child subprocess must be killed
before the exception propagates — otherwise the local backend's use of
os.setsid leaves an orphan with PPID=1.
The live repro that motivated this: hermes chat -q ... 'sleep 300', SIGTERM
to the python process, sleep 300 survived with PPID=1 for the full 300 s
because _wait_for_process never got to call _kill_process before python
died. See commit message for full context.
"""
import os
import signal
import subprocess
import threading
import time
import pytest
from hermes_agent.backends.local import LocalEnvironment
@pytest.fixture(autouse=True)
def _isolate_hermes_home(tmp_path, monkeypatch):
    """Point HERMES_HOME at a per-test temporary directory.

    Registered with ``autouse=True``, so it applies to every test in this
    module without being requested explicitly. A ``logs`` sub-directory is
    created inside the temporary home as well.
    """
    logs_dir = tmp_path / "logs"
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    logs_dir.mkdir(exist_ok=True)
def _pgid_still_alive(pgid: int) -> bool:
    """Return True if any process in the given process group is still alive.

    The group is probed with signal 0, which performs the existence and
    permission checks without actually delivering a signal.

    Args:
        pgid: Process-group id to probe.

    Returns:
        ``False`` only when the OS reports that no such group exists;
        ``True`` otherwise, including when the group exists but this
        process is not permitted to signal it.
    """
    try:
        os.killpg(pgid, 0)  # signal 0 = existence check
    except ProcessLookupError:
        # ESRCH: nothing is left in that process group.
        return False
    except PermissionError:
        # EPERM: the group exists, we just may not signal it — still alive.
        return True
    return True
def _ps_descendant_pid(marker):
    """Find a descendant of this process via ``ps`` whose command has *marker*.

    Fallback used when ``psutil`` is not installed. Only processes whose
    parent chain leads back to the current process are considered, so an
    unrelated process elsewhere on the host that happens to run the same
    command is never returned.

    Returns the matching pid, or ``None`` when nothing matches.
    """
    # ``args`` is the POSIX name for the full command line column.
    listing = subprocess.run(
        ["ps", "-eo", "pid,ppid,args"], capture_output=True, text=True,
    )
    parent_of = {}
    command_of = {}
    for row in listing.stdout.splitlines():
        fields = row.split(None, 2)
        # The header row and any malformed rows fail the digit check.
        if len(fields) < 3 or not (fields[0].isdigit() and fields[1].isdigit()):
            continue
        pid = int(fields[0])
        parent_of[pid] = int(fields[1])
        command_of[pid] = fields[2]

    own_pid = os.getpid()
    for pid, command in command_of.items():
        if marker not in command:
            continue
        # Walk up the parent chain looking for ourselves.
        visited = set()
        ancestor = parent_of.get(pid)
        while ancestor is not None and ancestor not in visited:
            if ancestor == own_pid:
                return pid
            visited.add(ancestor)
            ancestor = parent_of.get(ancestor)
    return None


def _scan_descendants(marker):
    """Return the pid of one descendant whose command line contains *marker*.

    Uses ``psutil`` when it is importable and falls back to parsing ``ps``
    output otherwise. Returns ``None`` when no descendant matches.
    """
    try:
        import psutil  # optional — fall back to ps if absent
    except ImportError:
        return _ps_descendant_pid(marker)
    for child in psutil.Process(os.getpid()).children(recursive=True):
        try:
            if marker in " ".join(child.cmdline()):
                return child.pid
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process vanished or is unreadable; keep looking.
            continue
    return None


def _find_descendant_pid(marker, timeout=5.0):
    """Poll until a descendant running *marker* appears, or *timeout* elapses.

    Args:
        marker: Substring to look for in the descendant's command line.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        The pid of a matching descendant, or ``None`` on timeout.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        pid = _scan_descendants(marker)
        if pid is not None:
            return pid
        time.sleep(0.1)
    return None


def test_wait_for_process_kills_subprocess_on_keyboardinterrupt():
    """When KeyboardInterrupt arrives mid-poll, the subprocess group must be
    killed before the exception is re-raised."""
    env = LocalEnvironment(cwd="/tmp")
    try:
        result_holder = {}

        # Drive execute() on a separate thread so the interrupt can be
        # injected into that thread without killing the test process.
        def worker():
            # env.execute(...) is the normal path that goes through
            # _wait_for_process; 'sleep 30' stays alive long enough for
            # the cleanup to be observed.
            try:
                result_holder["result"] = env.execute("sleep 30", timeout=60)
            except BaseException as e:  # noqa: BLE001 — we want to observe it
                result_holder["exception"] = type(e).__name__

        t = threading.Thread(target=worker, daemon=True)
        t.start()

        # Wait until the subprocess actually exists: the command is started
        # asynchronously on the worker thread, so poll our descendants until
        # one running 'sleep 30' is visible.
        target_pid = _find_descendant_pid("sleep 30", timeout=5.0)
        assert target_pid is not None, (
            "test setup: couldn't find 'sleep 30' subprocess after 5 s"
        )
        pgid = os.getpgid(target_pid)
        assert _pgid_still_alive(pgid), "sanity: subprocess should be alive"

        # Inject a KeyboardInterrupt into the worker thread using
        # PyThreadState_SetAsyncExc, which schedules an exception to be
        # raised asynchronously in the thread with the given ident.
        import ctypes

        # The C API wants the thread ident, not the Thread object.
        tid = t.ident
        assert tid is not None
        ret = ctypes.pythonapi.PyThreadState_SetAsyncExc(
            ctypes.c_ulong(tid), ctypes.py_object(KeyboardInterrupt),
        )
        # The call returns the number of thread states modified.
        assert ret == 1, f"SetAsyncExc returned {ret}, expected 1"

        # Give the worker a moment to hit the exception, run its cleanup,
        # and exit.
        t.join(timeout=5.0)
        assert not t.is_alive(), "worker didn't exit within 5 s of the interrupt"

        # The critical assertion: the whole subprocess GROUP must be dead,
        # not just the wrapper. Allow a short grace period for the kill
        # escalation to complete before checking.
        deadline = time.monotonic() + 3.0
        while time.monotonic() < deadline:
            if not _pgid_still_alive(pgid):
                break
            time.sleep(0.1)
        assert not _pgid_still_alive(pgid), (
            f"subprocess group {pgid} is STILL ALIVE after worker received "
            f"KeyboardInterrupt — orphan bug regressed. This is the "
            f"sleep-300-survives-SIGTERM scenario from Physikal's Apr 2026 "
            f"report. See tools/environments/base.py _wait_for_process "
            f"except-block."
        )

        # And the worker should have observed the KeyboardInterrupt (i.e.
        # it was re-raised cleanly, not silently swallowed).
        assert result_holder.get("exception") == "KeyboardInterrupt", (
            f"worker result: {result_holder!r} — expected KeyboardInterrupt "
            f"propagation after cleanup"
        )
    finally:
        # Best-effort teardown: a cleanup failure must not mask the real
        # test outcome.
        try:
            env.cleanup()
        except Exception:
            pass