fix(ci): make parallel runner's exit-4 retry robust for newly-added test files (#42994)

The per-file test runner re-runs a file once when pytest exits 4 ("file or
directory not found") while the file exists on disk — a transient seen on
loaded shared CI runners where the planner collects a file (--collect-only
counts its tests) but the per-file subprocess fails to stat it moments later.

A single immediate retry could land in the same brief high-load window and
fail again, and the retry was gated on one Path.exists() check that can itself
be a flaky stat under that load — so a freshly-added test file that LPT pins to
one shard would deterministically red that shard on every run (no actual test
failure; the file just never executes).

- Extract the subprocess spawn/communicate/process-tree-kill logic into a
  shared _spawn_pytest_once() helper (removes ~90 lines of duplication between
  the primary run and the retry).
- Replace the single-shot retry with a bounded backoff loop
  (_EXIT4_RETRY_ATTEMPTS, escalating sleep) that re-runs while the file is
  present on disk.
- Add _file_present() which re-checks existence across a few spaced stats, so a
  single flaky negative stat doesn't wrongly conclude the file is missing. A
  genuinely-missing file (typo/deleted) still fails fast — exit 4 is not
  swallowed when the file truly does not exist.
- Tests: transient-then-pass recovery, genuinely-missing fails fast with no
  retry, give-up after max attempts, and _file_present transient/missing cases.
This commit is contained in:
Teknium 2026-06-09 21:39:09 -07:00 committed by GitHub
parent 6b330522e1
commit f082b4ec5c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 220 additions and 96 deletions

View file

@ -246,6 +246,98 @@ def _kill_tree(proc: "subprocess.Popen", pgid: int | None = None) -> None:
pass
def _spawn_pytest_once(
cmd: List[str],
repo_root: Path,
file_timeout: float,
*,
timeout_note: str = "per-file timeout",
) -> Tuple[int, str]:
"""Run one ``pytest`` subprocess to completion and return ``(rc, output)``.
Spawns the child in its own process group / session so a hung file and
its grandchildren (uvicorn servers, async runtimes, etc.) can be SIGKILL'd
as a tree on timeout rather than orphaning onto PID 1. Shared by the
primary per-file run and the exit-4 retry loop so the lifecycle/cleanup
logic lives in exactly one place.
"""
proc = subprocess.Popen(
cmd,
cwd=repo_root,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
# POSIX: place the child at the head of its own process group so
# _kill_tree can SIGKILL the group atomically.
# Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+;
# _kill_tree handles the Windows path via taskkill /F /T.
start_new_session=True,
)
# Capture the pgid NOW, before the leader can exit and be reaped. Once
# the leader is reaped, os.getpgid(proc.pid) raises ProcessLookupError
# even though grandchildren in that group are still alive — defeating
# the whole cleanup. None on Windows where the pgid concept doesn't apply.
pgid: int | None = None
if sys.platform != "win32":
try:
pgid = os.getpgid(proc.pid)
except (ProcessLookupError, PermissionError):
pgid = None
try:
output, _ = proc.communicate(timeout=file_timeout)
rc = proc.returncode
except subprocess.TimeoutExpired:
_kill_tree(proc, pgid=pgid)
try:
output, _ = proc.communicate(timeout=10)
except subprocess.TimeoutExpired:
output = "(file timeout exceeded; output unavailable)"
rc = 124 # de facto convention for "killed by timeout".
output = (
f"({timeout_note}: {file_timeout:.0f}s exceeded; "
f"process tree SIGKILL'd)\n{output}"
)
except BaseException:
# KeyboardInterrupt / runner crash — make sure no zombie
# grandchildren outlive us.
_kill_tree(proc, pgid=pgid)
raise
else:
# Happy path: pytest exited on its own. Kill the group anyway in
# case it left grandchildren behind; already-dead is a no-op.
_kill_tree(proc, pgid=pgid)
return rc, output
# How many times to re-run a file that exits 4 ("file or directory not found")
# while the file demonstrably exists on disk. On loaded shared CI runners the
# planner can enumerate a file (tests counted via --collect-only) but the
# per-file subprocess fail to stat it moments later — and a SINGLE immediate
# retry can land in the same brief high-load window and fail again. We retry a
# few times with a short backoff so transient I/O pressure has time to settle.
_EXIT4_RETRY_ATTEMPTS = 3
_EXIT4_RETRY_BACKOFF_SECONDS = 0.5
def _file_present(file: Path, *, attempts: int = 3, delay: float = 0.2) -> bool:
"""Return True if ``file`` exists, re-checking a few times.
``Path.exists()`` itself issues a ``stat`` that can transiently fail under
the same load that makes pytest report "file or directory not found", so a
single negative check is not authoritative. Only conclude the file is
genuinely missing if it's absent across several spaced checks.
"""
for i in range(attempts):
if file.exists():
return True
if i < attempts - 1:
time.sleep(delay)
return False
def _run_one_file(
file: Path,
pytest_args: List[str],
@ -280,104 +372,28 @@ def _run_one_file(
"""
cmd = [sys.executable, "-m", "pytest", str(file), *pytest_args]
subproc_start = time.monotonic()
proc = subprocess.Popen(
cmd,
cwd=repo_root,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
# POSIX: place the child at the head of its own process group so
# _kill_tree can SIGKILL the group atomically.
# Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+;
# _kill_tree handles the Windows path via taskkill /F /T.
start_new_session=True,
)
rc, output = _spawn_pytest_once(cmd, repo_root, file_timeout)
# Capture the pgid NOW, before the leader can exit and be reaped.
# Once the leader is reaped, os.getpgid(proc.pid) raises
# ProcessLookupError even though grandchildren in that group are
# still alive — defeating the whole cleanup. None on Windows where
# the pgid concept doesn't apply (taskkill walks ppid chain instead).
pgid: int | None = None
if sys.platform != "win32":
try:
pgid = os.getpgid(proc.pid)
except (ProcessLookupError, PermissionError):
# Astonishingly fast child? Already dead. _kill_tree's
# fallback will handle this case as a no-op.
pgid = None
try:
output, _ = proc.communicate(timeout=file_timeout)
rc = proc.returncode
except subprocess.TimeoutExpired:
_kill_tree(proc, pgid=pgid)
# Drain whatever the child wrote before we killed it so we have
# something to surface in the failure dump.
try:
output, _ = proc.communicate(timeout=10)
except subprocess.TimeoutExpired:
output = "(file timeout exceeded; output unavailable)"
rc = 124 # de facto convention for "killed by timeout".
output = (
f"(per-file timeout: {file_timeout:.0f}s exceeded; "
f"process tree SIGKILL'd)\n{output}"
# pytest exit 4 = "file or directory not found" at exec time. On loaded
# shared CI runners we have seen the planner enumerate a file (its tests
# counted via --collect-only) but the per-file subprocess fail to stat it
# moments later — a transient the deterministic LPT slicer otherwise
# reproduces on every rerun (same file set → same shard). Re-run the file a
# few times with a short backoff so the I/O pressure has time to settle,
# but ONLY while the file demonstrably exists on disk. A single immediate
# retry (the old behaviour) could land in the same brief high-load window
# and fail again; a single Path.exists() check could itself be a flaky stat
# under that load, so we re-check existence across spaced attempts.
# We do NOT widen the exit-5 rule: exit 4 on a file that genuinely does not
# exist must still fail.
attempt = 0
while rc == 4 and attempt < _EXIT4_RETRY_ATTEMPTS and _file_present(file):
attempt += 1
time.sleep(_EXIT4_RETRY_BACKOFF_SECONDS * attempt)
rc, output = _spawn_pytest_once(
cmd, repo_root, file_timeout,
timeout_note=f"per-file timeout on exit-4 retry {attempt}",
)
except BaseException:
# KeyboardInterrupt / runner crash — make sure no zombie
# grandchildren outlive us.
_kill_tree(proc, pgid=pgid)
raise
else:
# Happy path: pytest exited on its own. The child process already
# cleaned up its grandchildren if it's well-behaved, but
# well-behaved is not universal — kill the group anyway. Already-
# dead processes are a no-op.
_kill_tree(proc, pgid=pgid)
if rc == 4 and Path(file).exists():
# pytest exit 4 = "file or directory not found" at exec time, yet the
# file is present on disk now. On loaded shared CI runners we have seen
# the planner enumerate a file (its tests counted via --collect-only)
# but the per-file subprocess fail to stat it moments later — a
# transient the deterministic LPT slicer otherwise reproduces on every
# rerun (same file set → same shard). Retry the file ONCE before
# surfacing it as a hard failure. We do NOT widen the exit-5 rule:
# exit 4 on a file that genuinely does not exist must still fail.
retry_proc = subprocess.Popen(
cmd,
cwd=repo_root,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
start_new_session=True,
)
retry_pgid: int | None = None
if sys.platform != "win32":
try:
retry_pgid = os.getpgid(retry_proc.pid)
except (ProcessLookupError, PermissionError):
retry_pgid = None
try:
retry_output, _ = retry_proc.communicate(timeout=file_timeout)
retry_rc = retry_proc.returncode
except subprocess.TimeoutExpired:
_kill_tree(retry_proc, pgid=retry_pgid)
try:
retry_output, _ = retry_proc.communicate(timeout=10)
except subprocess.TimeoutExpired:
retry_output = "(file timeout exceeded on retry; output unavailable)"
retry_rc = 124
retry_output = (
f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; "
f"process tree SIGKILL'd)\n{retry_output}"
)
except BaseException:
_kill_tree(retry_proc, pgid=retry_pgid)
raise
else:
_kill_tree(retry_proc, pgid=retry_pgid)
rc, output = retry_rc, retry_output
if rc == 5:
# No tests collected — every test in the file was filtered out.

View file

@ -185,3 +185,111 @@ def test_grandchild_leak_is_killed_by_runner(tmp_path: Path) -> None:
f"diag={diag!r} test_pid={test_pid} test_pgid={test_pgid}; "
f"runner output:\n{proc.stdout}"
)
# ---------------------------------------------------------------------------
# exit-4 retry loop (transient "file or directory not found" on loaded runners)
# ---------------------------------------------------------------------------
import importlib.util as _importlib_util # noqa: E402
def _load_runner_module():
"""Import scripts/run_tests_parallel.py as a module for in-process tests."""
repo_root = Path(__file__).resolve().parent.parent
path = repo_root / "scripts" / "run_tests_parallel.py"
spec = _importlib_util.spec_from_file_location("_rtp_under_test", path)
mod = _importlib_util.module_from_spec(spec)
spec.loader.exec_module(mod)
return mod
def test_exit4_retry_recovers_when_file_exists(tmp_path, monkeypatch):
"""A file that exits 4 transiently then passes must be retried and recover.
Simulates the loaded-CI transient: the per-file pytest subprocess reports
"file or directory not found" (exit 4) on the first attempts even though
the file is on disk, then succeeds. The runner must retry and report pass.
"""
rtp = _load_runner_module()
f = tmp_path / "test_transient.py"
f.write_text("def test_ok():\n assert True\n")
calls = {"n": 0}
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
calls["n"] += 1
# First two attempts: transient exit-4. Third: success.
if calls["n"] < 3:
return 4, "ERROR: file or directory not found\nno tests ran in 0.00s"
return 0, "1 passed"
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0) # no real sleep
file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0)
assert rc == 0, f"expected recovery to pass, got rc={rc}, output={output!r}"
assert calls["n"] == 3, f"expected 3 attempts (1 + 2 retries), got {calls['n']}"
def test_exit4_no_retry_when_file_genuinely_missing(tmp_path, monkeypatch):
"""Exit 4 on a file that does NOT exist must fail fast without retrying.
Guards the narrowing: we only retry while the file is present on disk, so a
real typo / deleted file surfaces immediately instead of looping.
"""
rtp = _load_runner_module()
missing = tmp_path / "test_does_not_exist.py" # never created
calls = {"n": 0}
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
calls["n"] += 1
return 4, "ERROR: file or directory not found"
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0)
file, rc, output, summary, _wall = rtp._run_one_file(missing, [], tmp_path, 30.0)
assert rc == 4, f"genuinely-missing file should keep rc=4, got {rc}"
assert calls["n"] == 1, f"missing file must NOT be retried, got {calls['n']} calls"
def test_exit4_retry_gives_up_after_max_attempts(tmp_path, monkeypatch):
"""If the transient never clears, we stop after the bounded attempt count."""
rtp = _load_runner_module()
f = tmp_path / "test_persistent_transient.py"
f.write_text("def test_ok():\n assert True\n")
calls = {"n": 0}
def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"):
calls["n"] += 1
return 4, "ERROR: file or directory not found"
monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn)
monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0)
file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0)
assert rc == 4
# 1 initial + _EXIT4_RETRY_ATTEMPTS retries.
assert calls["n"] == 1 + rtp._EXIT4_RETRY_ATTEMPTS
def test_file_present_tolerates_transient_negative(tmp_path, monkeypatch):
"""_file_present must not conclude 'missing' on a single flaky stat."""
rtp = _load_runner_module()
f = tmp_path / "test_flaky_stat.py"
f.write_text("x = 1\n")
seq = iter([False, False, True]) # first two stats flake, third succeeds
monkeypatch.setattr(rtp.Path, "exists", lambda self: next(seq))
assert rtp._file_present(f, attempts=3, delay=0.0) is True
def test_file_present_reports_truly_missing(tmp_path, monkeypatch):
"""_file_present returns False when the file is absent across all checks."""
rtp = _load_runner_module()
f = tmp_path / "nope.py"
monkeypatch.setattr(rtp.Path, "exists", lambda self: False)
assert rtp._file_present(f, attempts=3, delay=0.0) is False