mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-19 10:02:16 +00:00
fix(ci): make parallel runner's exit-4 retry robust for newly-added test files (#42994)
The per-file test runner re-runs a file once when pytest exits 4 ("file or
directory not found") while the file exists on disk — a transient seen on
loaded shared CI runners where the planner collects a file (--collect-only
counts its tests) but the per-file subprocess fails to stat it moments later.
A single immediate retry could land in the same brief high-load window and
fail again, and the retry was gated on one Path.exists() check that can itself
be a flaky stat under that load — so a freshly-added test file that LPT pins to
one shard would deterministically red that shard on every run (no actual test
failure; the file just never executes).
- Extract the subprocess spawn/communicate/process-tree-kill logic into a
shared _spawn_pytest_once() helper (removes ~90 lines of duplication between
the primary run and the retry).
- Replace the single-shot retry with a bounded backoff loop
(_EXIT4_RETRY_ATTEMPTS, escalating sleep) that re-runs while the file is
present on disk.
- Add _file_present() which re-checks existence across a few spaced stats, so a
single flaky negative stat doesn't wrongly conclude the file is missing. A
genuinely-missing file (typo/deleted) still fails fast — exit 4 is not
swallowed when the file truly does not exist.
- Tests: transient-then-pass recovery, genuinely-missing fails fast with no
retry, give-up after max attempts, and _file_present transient/missing cases.
This commit is contained in:
parent
6b330522e1
commit
f082b4ec5c
2 changed files with 220 additions and 96 deletions
|
|
@ -246,6 +246,98 @@ def _kill_tree(proc: "subprocess.Popen", pgid: int | None = None) -> None:
|
|||
pass
|
||||
|
||||
|
||||
def _spawn_pytest_once(
|
||||
cmd: List[str],
|
||||
repo_root: Path,
|
||||
file_timeout: float,
|
||||
*,
|
||||
timeout_note: str = "per-file timeout",
|
||||
) -> Tuple[int, str]:
|
||||
"""Run one ``pytest`` subprocess to completion and return ``(rc, output)``.
|
||||
|
||||
Spawns the child in its own process group / session so a hung file and
|
||||
its grandchildren (uvicorn servers, async runtimes, etc.) can be SIGKILL'd
|
||||
as a tree on timeout rather than orphaning onto PID 1. Shared by the
|
||||
primary per-file run and the exit-4 retry loop so the lifecycle/cleanup
|
||||
logic lives in exactly one place.
|
||||
"""
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
cwd=repo_root,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
# POSIX: place the child at the head of its own process group so
|
||||
# _kill_tree can SIGKILL the group atomically.
|
||||
# Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+;
|
||||
# _kill_tree handles the Windows path via taskkill /F /T.
|
||||
start_new_session=True,
|
||||
)
|
||||
|
||||
# Capture the pgid NOW, before the leader can exit and be reaped. Once
|
||||
# the leader is reaped, os.getpgid(proc.pid) raises ProcessLookupError
|
||||
# even though grandchildren in that group are still alive — defeating
|
||||
# the whole cleanup. None on Windows where the pgid concept doesn't apply.
|
||||
pgid: int | None = None
|
||||
if sys.platform != "win32":
|
||||
try:
|
||||
pgid = os.getpgid(proc.pid)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
pgid = None
|
||||
|
||||
try:
|
||||
output, _ = proc.communicate(timeout=file_timeout)
|
||||
rc = proc.returncode
|
||||
except subprocess.TimeoutExpired:
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
try:
|
||||
output, _ = proc.communicate(timeout=10)
|
||||
except subprocess.TimeoutExpired:
|
||||
output = "(file timeout exceeded; output unavailable)"
|
||||
rc = 124 # de facto convention for "killed by timeout".
|
||||
output = (
|
||||
f"({timeout_note}: {file_timeout:.0f}s exceeded; "
|
||||
f"process tree SIGKILL'd)\n{output}"
|
||||
)
|
||||
except BaseException:
|
||||
# KeyboardInterrupt / runner crash — make sure no zombie
|
||||
# grandchildren outlive us.
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
raise
|
||||
else:
|
||||
# Happy path: pytest exited on its own. Kill the group anyway in
|
||||
# case it left grandchildren behind; already-dead is a no-op.
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
|
||||
return rc, output
|
||||
|
||||
|
||||
# How many times to re-run a file that exits 4 ("file or directory not found")
|
||||
# while the file demonstrably exists on disk. On loaded shared CI runners the
|
||||
# planner can enumerate a file (tests counted via --collect-only) but the
|
||||
# per-file subprocess fail to stat it moments later — and a SINGLE immediate
|
||||
# retry can land in the same brief high-load window and fail again. We retry a
|
||||
# few times with a short backoff so transient I/O pressure has time to settle.
|
||||
_EXIT4_RETRY_ATTEMPTS = 3
|
||||
_EXIT4_RETRY_BACKOFF_SECONDS = 0.5
|
||||
|
||||
|
||||
def _file_present(file: Path, *, attempts: int = 3, delay: float = 0.2) -> bool:
|
||||
"""Return True if ``file`` exists, re-checking a few times.
|
||||
|
||||
``Path.exists()`` itself issues a ``stat`` that can transiently fail under
|
||||
the same load that makes pytest report "file or directory not found", so a
|
||||
single negative check is not authoritative. Only conclude the file is
|
||||
genuinely missing if it's absent across several spaced checks.
|
||||
"""
|
||||
for i in range(attempts):
|
||||
if file.exists():
|
||||
return True
|
||||
if i < attempts - 1:
|
||||
time.sleep(delay)
|
||||
return False
|
||||
|
||||
|
||||
def _run_one_file(
|
||||
file: Path,
|
||||
pytest_args: List[str],
|
||||
|
|
@ -280,104 +372,28 @@ def _run_one_file(
|
|||
"""
|
||||
cmd = [sys.executable, "-m", "pytest", str(file), *pytest_args]
|
||||
subproc_start = time.monotonic()
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
cwd=repo_root,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
# POSIX: place the child at the head of its own process group so
|
||||
# _kill_tree can SIGKILL the group atomically.
|
||||
# Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+;
|
||||
# _kill_tree handles the Windows path via taskkill /F /T.
|
||||
start_new_session=True,
|
||||
)
|
||||
rc, output = _spawn_pytest_once(cmd, repo_root, file_timeout)
|
||||
|
||||
# Capture the pgid NOW, before the leader can exit and be reaped.
|
||||
# Once the leader is reaped, os.getpgid(proc.pid) raises
|
||||
# ProcessLookupError even though grandchildren in that group are
|
||||
# still alive — defeating the whole cleanup. None on Windows where
|
||||
# the pgid concept doesn't apply (taskkill walks ppid chain instead).
|
||||
pgid: int | None = None
|
||||
if sys.platform != "win32":
|
||||
try:
|
||||
pgid = os.getpgid(proc.pid)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
# Astonishingly fast child? Already dead. _kill_tree's
|
||||
# fallback will handle this case as a no-op.
|
||||
pgid = None
|
||||
|
||||
try:
|
||||
output, _ = proc.communicate(timeout=file_timeout)
|
||||
rc = proc.returncode
|
||||
except subprocess.TimeoutExpired:
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
# Drain whatever the child wrote before we killed it so we have
|
||||
# something to surface in the failure dump.
|
||||
try:
|
||||
output, _ = proc.communicate(timeout=10)
|
||||
except subprocess.TimeoutExpired:
|
||||
output = "(file timeout exceeded; output unavailable)"
|
||||
rc = 124 # de facto convention for "killed by timeout".
|
||||
output = (
|
||||
f"(per-file timeout: {file_timeout:.0f}s exceeded; "
|
||||
f"process tree SIGKILL'd)\n{output}"
|
||||
# pytest exit 4 = "file or directory not found" at exec time. On loaded
|
||||
# shared CI runners we have seen the planner enumerate a file (its tests
|
||||
# counted via --collect-only) but the per-file subprocess fail to stat it
|
||||
# moments later — a transient the deterministic LPT slicer otherwise
|
||||
# reproduces on every rerun (same file set → same shard). Re-run the file a
|
||||
# few times with a short backoff so the I/O pressure has time to settle,
|
||||
# but ONLY while the file demonstrably exists on disk. A single immediate
|
||||
# retry (the old behaviour) could land in the same brief high-load window
|
||||
# and fail again; a single Path.exists() check could itself be a flaky stat
|
||||
# under that load, so we re-check existence across spaced attempts.
|
||||
# We do NOT widen the exit-5 rule: exit 4 on a file that genuinely does not
|
||||
# exist must still fail.
|
||||
attempt = 0
|
||||
while rc == 4 and attempt < _EXIT4_RETRY_ATTEMPTS and _file_present(file):
|
||||
attempt += 1
|
||||
time.sleep(_EXIT4_RETRY_BACKOFF_SECONDS * attempt)
|
||||
rc, output = _spawn_pytest_once(
|
||||
cmd, repo_root, file_timeout,
|
||||
timeout_note=f"per-file timeout on exit-4 retry {attempt}",
|
||||
)
|
||||
except BaseException:
|
||||
# KeyboardInterrupt / runner crash — make sure no zombie
|
||||
# grandchildren outlive us.
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
raise
|
||||
else:
|
||||
# Happy path: pytest exited on its own. The child process already
|
||||
# cleaned up its grandchildren if it's well-behaved, but
|
||||
# well-behaved is not universal — kill the group anyway. Already-
|
||||
# dead processes are a no-op.
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
|
||||
if rc == 4 and Path(file).exists():
|
||||
# pytest exit 4 = "file or directory not found" at exec time, yet the
|
||||
# file is present on disk now. On loaded shared CI runners we have seen
|
||||
# the planner enumerate a file (its tests counted via --collect-only)
|
||||
# but the per-file subprocess fail to stat it moments later — a
|
||||
# transient the deterministic LPT slicer otherwise reproduces on every
|
||||
# rerun (same file set → same shard). Retry the file ONCE before
|
||||
# surfacing it as a hard failure. We do NOT widen the exit-5 rule:
|
||||
# exit 4 on a file that genuinely does not exist must still fail.
|
||||
retry_proc = subprocess.Popen(
|
||||
cmd,
|
||||
cwd=repo_root,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
start_new_session=True,
|
||||
)
|
||||
retry_pgid: int | None = None
|
||||
if sys.platform != "win32":
|
||||
try:
|
||||
retry_pgid = os.getpgid(retry_proc.pid)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
retry_pgid = None
|
||||
try:
|
||||
retry_output, _ = retry_proc.communicate(timeout=file_timeout)
|
||||
retry_rc = retry_proc.returncode
|
||||
except subprocess.TimeoutExpired:
|
||||
_kill_tree(retry_proc, pgid=retry_pgid)
|
||||
try:
|
||||
retry_output, _ = retry_proc.communicate(timeout=10)
|
||||
except subprocess.TimeoutExpired:
|
||||
retry_output = "(file timeout exceeded on retry; output unavailable)"
|
||||
retry_rc = 124
|
||||
retry_output = (
|
||||
f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; "
|
||||
f"process tree SIGKILL'd)\n{retry_output}"
|
||||
)
|
||||
except BaseException:
|
||||
_kill_tree(retry_proc, pgid=retry_pgid)
|
||||
raise
|
||||
else:
|
||||
_kill_tree(retry_proc, pgid=retry_pgid)
|
||||
rc, output = retry_rc, retry_output
|
||||
|
||||
if rc == 5:
|
||||
# No tests collected — every test in the file was filtered out.
|
||||
|
|
|
|||
|
|
@ -185,3 +185,111 @@ def test_grandchild_leak_is_killed_by_runner(tmp_path: Path) -> None:
|
|||
f"diag={diag!r} test_pid={test_pid} test_pgid={test_pgid}; "
|
||||
f"runner output:\n{proc.stdout}"
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# exit-4 retry loop (transient "file or directory not found" on loaded runners)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
import importlib.util as _importlib_util # noqa: E402
|
||||
|
||||
|
||||
def _load_runner_module():
|
||||
"""Import scripts/run_tests_parallel.py as a module for in-process tests."""
|
||||
repo_root = Path(__file__).resolve().parent.parent
|
||||
path = repo_root / "scripts" / "run_tests_parallel.py"
|
||||
spec = _importlib_util.spec_from_file_location("_rtp_under_test", path)
|
||||
mod = _importlib_util.module_from_spec(spec)
|
||||
spec.loader.exec_module(mod)
|
||||
return mod
|
||||
|
||||
|
||||
def test_exit4_retry_recovers_when_file_exists(tmp_path, monkeypatch):
|
||||
"""A file that exits 4 transiently then passes must be retried and recover.
|
||||
|
||||
Simulates the loaded-CI transient: the per-file pytest subprocess reports
|
||||
"file or directory not found" (exit 4) on the first attempts even though
|
||||
the file is on disk, then succeeds. The runner must retry and report pass.
|
||||
"""
|
||||
rtp = _load_runner_module()
|
||||
f = tmp_path / "test_transient.py"
|
||||
f.write_text("def test_ok():\n assert True\n")
|
||||
|
||||
calls = {"n": 0}
|
||||
|
||||
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
|
||||
calls["n"] += 1
|
||||
# First two attempts: transient exit-4. Third: success.
|
||||
if calls["n"] < 3:
|
||||
return 4, "ERROR: file or directory not found\nno tests ran in 0.00s"
|
||||
return 0, "1 passed"
|
||||
|
||||
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
|
||||
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0) # no real sleep
|
||||
|
||||
file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0)
|
||||
assert rc == 0, f"expected recovery to pass, got rc={rc}, output={output!r}"
|
||||
assert calls["n"] == 3, f"expected 3 attempts (1 + 2 retries), got {calls['n']}"
|
||||
|
||||
|
||||
def test_exit4_no_retry_when_file_genuinely_missing(tmp_path, monkeypatch):
|
||||
"""Exit 4 on a file that does NOT exist must fail fast without retrying.
|
||||
|
||||
Guards the narrowing: we only retry while the file is present on disk, so a
|
||||
real typo / deleted file surfaces immediately instead of looping.
|
||||
"""
|
||||
rtp = _load_runner_module()
|
||||
missing = tmp_path / "test_does_not_exist.py" # never created
|
||||
|
||||
calls = {"n": 0}
|
||||
|
||||
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
|
||||
calls["n"] += 1
|
||||
return 4, "ERROR: file or directory not found"
|
||||
|
||||
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
|
||||
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0)
|
||||
|
||||
file, rc, output, summary, _wall = rtp._run_one_file(missing, [], tmp_path, 30.0)
|
||||
assert rc == 4, f"genuinely-missing file should keep rc=4, got {rc}"
|
||||
assert calls["n"] == 1, f"missing file must NOT be retried, got {calls['n']} calls"
|
||||
|
||||
|
||||
def test_exit4_retry_gives_up_after_max_attempts(tmp_path, monkeypatch):
|
||||
"""If the transient never clears, we stop after the bounded attempt count."""
|
||||
rtp = _load_runner_module()
|
||||
f = tmp_path / "test_persistent_transient.py"
|
||||
f.write_text("def test_ok():\n assert True\n")
|
||||
|
||||
calls = {"n": 0}
|
||||
|
||||
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
|
||||
calls["n"] += 1
|
||||
return 4, "ERROR: file or directory not found"
|
||||
|
||||
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
|
||||
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0)
|
||||
|
||||
file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0)
|
||||
assert rc == 4
|
||||
# 1 initial + _EXIT4_RETRY_ATTEMPTS retries.
|
||||
assert calls["n"] == 1 + rtp._EXIT4_RETRY_ATTEMPTS
|
||||
|
||||
|
||||
def test_file_present_tolerates_transient_negative(tmp_path, monkeypatch):
|
||||
"""_file_present must not conclude 'missing' on a single flaky stat."""
|
||||
rtp = _load_runner_module()
|
||||
f = tmp_path / "test_flaky_stat.py"
|
||||
f.write_text("x = 1\n")
|
||||
|
||||
seq = iter([False, False, True]) # first two stats flake, third succeeds
|
||||
monkeypatch.setattr(rtp.Path, "exists", lambda self: next(seq))
|
||||
assert rtp._file_present(f, attempts=3, delay=0.0) is True
|
||||
|
||||
|
||||
def test_file_present_reports_truly_missing(tmp_path, monkeypatch):
|
||||
"""_file_present returns False when the file is absent across all checks."""
|
||||
rtp = _load_runner_module()
|
||||
f = tmp_path / "nope.py"
|
||||
monkeypatch.setattr(rtp.Path, "exists", lambda self: False)
|
||||
assert rtp._file_present(f, attempts=3, delay=0.0) is False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue