mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 08:32:09 +00:00
The per-file test runner re-runs a file once when pytest exits 4 ("file or
directory not found") while the file exists on disk — a transient seen on
loaded shared CI runners where the planner collects a file (--collect-only
counts its tests) but the per-file subprocess fails to stat it moments later.
A single immediate retry could land in the same brief high-load window and
fail again, and the retry was gated on one Path.exists() check that can itself
be a flaky stat under that load — so a freshly-added test file that LPT pins to
one shard would deterministically red that shard on every run (no actual test
failure; the file just never executes).
- Extract the subprocess spawn/communicate/process-tree-kill logic into a
shared _spawn_pytest_once() helper (removes ~90 lines of duplication between
the primary run and the retry).
- Replace the single-shot retry with a bounded backoff loop
(_EXIT4_RETRY_ATTEMPTS, escalating sleep) that re-runs while the file is
present on disk.
- Add _file_present() which re-checks existence across a few spaced stats, so a
single flaky negative stat doesn't wrongly conclude the file is missing. A
genuinely-missing file (typo/deleted) still fails fast — exit 4 is not
swallowed when the file truly does not exist.
- Tests: transient-then-pass recovery, genuinely-missing fails fast with no
retry, give-up after max attempts, and _file_present transient/missing cases.
295 lines
11 KiB
Python
295 lines
11 KiB
Python
"""Verify scripts/run_tests_parallel.py kills test-spawned grandchildren.
|
|
|
|
Setup
|
|
-----
|
|
A test in this file spawns a long-lived Python grandchild that writes
|
|
its PID + a nonce to a tempfile, then exits without cleaning up.
|
|
With the old ``subprocess.run`` runner, that grandchild would orphan
|
|
and outlive the test (and the whole runner). With the current Popen +
|
|
``start_new_session`` + ``_kill_tree`` runner, the grandchild gets
|
|
SIGKILL'd via process-group kill when its file's pytest exits.
|
|
|
|
The leaker test always passes — its only job is to spawn a grandchild
|
|
and walk away. The verifier runs the runner over the leaker file in a
|
|
subprocess, then waits for the grandchild PID to disappear from the
|
|
kernel's process table.
|
|
|
|
POSIX-only: Windows has its own grandchild lifecycle (no shared session,
|
|
``taskkill /F /T`` semantics). Marked accordingly.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
import textwrap
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
|
|
# Both tests share the same handoff file: the leaker writes here, the
|
|
# verifier reads here. We park it in $TMPDIR with a unique-per-run name
|
|
# so concurrent invocations of the suite don't clobber each other.
|
|
_HANDOFF_DIR = Path(os.environ.get("TMPDIR", "/tmp")) / "hermes-isolation-probe"
|
|
_HANDOFF_DIR.mkdir(exist_ok=True)
|
|
|
|
|
|
def _handoff_path_for(nonce: str) -> Path:
|
|
return _HANDOFF_DIR / f"grandchild-{nonce}.json"
|
|
|
|
|
|
def _pid_alive(pid: int) -> bool:
|
|
"""POSIX: send signal 0 to probe whether ``pid`` is still alive.
|
|
|
|
``os.kill(pid, 0)`` raises ``ProcessLookupError`` if the process is
|
|
gone, ``PermissionError`` if it exists but we can't signal it
|
|
(someone else's pid). We treat PermissionError as "alive" because
|
|
the process exists and that's all we need to know.
|
|
"""
|
|
if sys.platform == "win32": # pragma: no cover — POSIX-only test
|
|
# On Windows we'd use OpenProcess + GetExitCodeProcess; this
|
|
# test is skipped on Windows so the path is unreachable.
|
|
raise RuntimeError("_pid_alive POSIX-only")
|
|
try:
|
|
os.kill(pid, 0)
|
|
except ProcessLookupError:
|
|
return False
|
|
except PermissionError:
|
|
return True
|
|
return True
|
|
|
|
|
|
@pytest.mark.skipif(sys.platform == "win32", reason="POSIX-only probe")
|
|
@pytest.mark.live_system_guard_bypass
|
|
def test_grandchild_leak_is_killed_by_runner(tmp_path: Path) -> None:
|
|
"""Run the parallel runner over a probe file and verify cleanup.
|
|
|
|
1. Materialize a probe file that spawns a long-lived grandchild and
|
|
writes its PID to disk before exiting.
|
|
2. Invoke ``scripts/run_tests_parallel.py`` against the probe file.
|
|
3. Wait for the grandchild PID to vanish (poll for ~5s).
|
|
4. Assert the runner exited cleanly AND the grandchild is dead.
|
|
"""
|
|
repo_root = Path(__file__).resolve().parent.parent
|
|
runner = repo_root / "scripts" / "run_tests_parallel.py"
|
|
assert runner.exists(), f"runner missing at {runner}"
|
|
|
|
# Probe lives in a temp dir, NOT under tests/, so the regular suite
|
|
# never picks it up — only our explicit invocation does.
|
|
probe_dir = tmp_path / "probe"
|
|
probe_dir.mkdir()
|
|
probe = probe_dir / "test_probe_leaker.py"
|
|
nonce = f"{os.getpid()}-{int(time.time() * 1000)}"
|
|
handoff = _handoff_path_for(nonce)
|
|
if handoff.exists():
|
|
handoff.unlink()
|
|
|
|
probe_src = textwrap.dedent(f"""
|
|
import json, os, subprocess, sys, time
|
|
from pathlib import Path
|
|
|
|
HANDOFF = Path({str(handoff)!r})
|
|
|
|
def test_spawns_grandchild_and_walks_away():
|
|
# Long-lived grandchild: detached, ignores SIGTERM (we want
|
|
# SIGKILL or process-group kill to be the only thing that
|
|
# works, simulating a misbehaving server).
|
|
child = subprocess.Popen(
|
|
[
|
|
sys.executable, "-c",
|
|
"import os, signal, sys, time; "
|
|
"signal.signal(signal.SIGTERM, signal.SIG_IGN); "
|
|
"sys.stdout.write(f'gc-pgid={{os.getpgid(0)}} gc-pid={{os.getpid()}}\\\\n'); "
|
|
"sys.stdout.flush(); "
|
|
"time.sleep(600)",
|
|
],
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT,
|
|
# IMPORTANT: do NOT pass start_new_session here. We want
|
|
# the grandchild to inherit the pytest subprocess's
|
|
# process group, so when the runner kills the group the
|
|
# grandchild dies too.
|
|
)
|
|
# Read the first line so we can record gc's pgid in the
|
|
# handoff, then walk away — don't close the pipe (would
|
|
# signal EOF and let the child see SIGPIPE on next write).
|
|
first_line = child.stdout.readline().decode().strip()
|
|
HANDOFF.write_text(json.dumps({{
|
|
"pid": child.pid,
|
|
"diag": first_line,
|
|
"test_pid": os.getpid(),
|
|
"test_pgid": os.getpgid(0),
|
|
}}))
|
|
assert child.pid > 0
|
|
""").strip()
|
|
probe.write_text(probe_src + "\n")
|
|
|
|
# Run the parallel runner against just the probe file. The runner
|
|
# discovers under ``tests/`` by default, so we override via --paths.
|
|
proc = subprocess.run(
|
|
[
|
|
sys.executable,
|
|
str(runner),
|
|
"--paths",
|
|
str(probe_dir),
|
|
"-j",
|
|
"1",
|
|
# Tight per-file timeout: the probe finishes in <1s, no
|
|
# need for 10min.
|
|
"--file-timeout",
|
|
"30",
|
|
],
|
|
cwd=repo_root,
|
|
stdout=subprocess.PIPE,
|
|
stderr=subprocess.STDOUT,
|
|
text=True,
|
|
timeout=60,
|
|
)
|
|
|
|
assert handoff.exists(), (
|
|
f"probe never wrote handoff file; runner output:\n{proc.stdout}"
|
|
)
|
|
handoff_data = json.loads(handoff.read_text())
|
|
grandchild_pid = handoff_data["pid"]
|
|
diag = handoff_data.get("diag", "(no diag)")
|
|
test_pid = handoff_data.get("test_pid")
|
|
test_pgid = handoff_data.get("test_pgid")
|
|
handoff.unlink()
|
|
|
|
# The runner must have exited cleanly (probe test passes).
|
|
assert proc.returncode == 0, (
|
|
f"runner exited {proc.returncode}; output:\n{proc.stdout}"
|
|
)
|
|
|
|
# The grandchild must be gone. Poll for a bit because process-group
|
|
# SIGKILL + reaping isn't synchronous; on a loaded box it can take
|
|
# a beat.
|
|
deadline = time.monotonic() + 5.0
|
|
while time.monotonic() < deadline:
|
|
if not _pid_alive(grandchild_pid):
|
|
break
|
|
time.sleep(0.05)
|
|
else:
|
|
# Test cleanup: kill the leaked grandchild ourselves so a
|
|
# FAILED assertion doesn't leave a sleep(600) running.
|
|
try:
|
|
os.kill(grandchild_pid, 9)
|
|
except ProcessLookupError:
|
|
pass
|
|
pytest.fail(
|
|
f"grandchild PID {grandchild_pid} survived runner exit; "
|
|
f"diag={diag!r} test_pid={test_pid} test_pgid={test_pgid}; "
|
|
f"runner output:\n{proc.stdout}"
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# exit-4 retry loop (transient "file or directory not found" on loaded runners)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
import importlib.util as _importlib_util # noqa: E402
|
|
|
|
|
|
def _load_runner_module():
|
|
"""Import scripts/run_tests_parallel.py as a module for in-process tests."""
|
|
repo_root = Path(__file__).resolve().parent.parent
|
|
path = repo_root / "scripts" / "run_tests_parallel.py"
|
|
spec = _importlib_util.spec_from_file_location("_rtp_under_test", path)
|
|
mod = _importlib_util.module_from_spec(spec)
|
|
spec.loader.exec_module(mod)
|
|
return mod
|
|
|
|
|
|
def test_exit4_retry_recovers_when_file_exists(tmp_path, monkeypatch):
|
|
"""A file that exits 4 transiently then passes must be retried and recover.
|
|
|
|
Simulates the loaded-CI transient: the per-file pytest subprocess reports
|
|
"file or directory not found" (exit 4) on the first attempts even though
|
|
the file is on disk, then succeeds. The runner must retry and report pass.
|
|
"""
|
|
rtp = _load_runner_module()
|
|
f = tmp_path / "test_transient.py"
|
|
f.write_text("def test_ok():\n assert True\n")
|
|
|
|
calls = {"n": 0}
|
|
|
|
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
|
|
calls["n"] += 1
|
|
# First two attempts: transient exit-4. Third: success.
|
|
if calls["n"] < 3:
|
|
return 4, "ERROR: file or directory not found\nno tests ran in 0.00s"
|
|
return 0, "1 passed"
|
|
|
|
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
|
|
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0) # no real sleep
|
|
|
|
file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0)
|
|
assert rc == 0, f"expected recovery to pass, got rc={rc}, output={output!r}"
|
|
assert calls["n"] == 3, f"expected 3 attempts (1 + 2 retries), got {calls['n']}"
|
|
|
|
|
|
def test_exit4_no_retry_when_file_genuinely_missing(tmp_path, monkeypatch):
|
|
"""Exit 4 on a file that does NOT exist must fail fast without retrying.
|
|
|
|
Guards the narrowing: we only retry while the file is present on disk, so a
|
|
real typo / deleted file surfaces immediately instead of looping.
|
|
"""
|
|
rtp = _load_runner_module()
|
|
missing = tmp_path / "test_does_not_exist.py" # never created
|
|
|
|
calls = {"n": 0}
|
|
|
|
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
|
|
calls["n"] += 1
|
|
return 4, "ERROR: file or directory not found"
|
|
|
|
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
|
|
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0)
|
|
|
|
file, rc, output, summary, _wall = rtp._run_one_file(missing, [], tmp_path, 30.0)
|
|
assert rc == 4, f"genuinely-missing file should keep rc=4, got {rc}"
|
|
assert calls["n"] == 1, f"missing file must NOT be retried, got {calls['n']} calls"
|
|
|
|
|
|
def test_exit4_retry_gives_up_after_max_attempts(tmp_path, monkeypatch):
|
|
"""If the transient never clears, we stop after the bounded attempt count."""
|
|
rtp = _load_runner_module()
|
|
f = tmp_path / "test_persistent_transient.py"
|
|
f.write_text("def test_ok():\n assert True\n")
|
|
|
|
calls = {"n": 0}
|
|
|
|
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
|
|
calls["n"] += 1
|
|
return 4, "ERROR: file or directory not found"
|
|
|
|
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
|
|
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0)
|
|
|
|
file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0)
|
|
assert rc == 4
|
|
# 1 initial + _EXIT4_RETRY_ATTEMPTS retries.
|
|
assert calls["n"] == 1 + rtp._EXIT4_RETRY_ATTEMPTS
|
|
|
|
|
|
def test_file_present_tolerates_transient_negative(tmp_path, monkeypatch):
|
|
"""_file_present must not conclude 'missing' on a single flaky stat."""
|
|
rtp = _load_runner_module()
|
|
f = tmp_path / "test_flaky_stat.py"
|
|
f.write_text("x = 1\n")
|
|
|
|
seq = iter([False, False, True]) # first two stats flake, third succeeds
|
|
monkeypatch.setattr(rtp.Path, "exists", lambda self: next(seq))
|
|
assert rtp._file_present(f, attempts=3, delay=0.0) is True
|
|
|
|
|
|
def test_file_present_reports_truly_missing(tmp_path, monkeypatch):
|
|
"""_file_present returns False when the file is absent across all checks."""
|
|
rtp = _load_runner_module()
|
|
f = tmp_path / "nope.py"
|
|
monkeypatch.setattr(rtp.Path, "exists", lambda self: False)
|
|
assert rtp._file_present(f, attempts=3, delay=0.0) is False
|