diff --git a/scripts/run_tests_parallel.py b/scripts/run_tests_parallel.py index be8bba8ad20..f1b437d7860 100755 --- a/scripts/run_tests_parallel.py +++ b/scripts/run_tests_parallel.py @@ -246,6 +246,98 @@ def _kill_tree(proc: "subprocess.Popen", pgid: int | None = None) -> None: pass +def _spawn_pytest_once( + cmd: List[str], + repo_root: Path, + file_timeout: float, + *, + timeout_note: str = "per-file timeout", +) -> Tuple[int, str]: + """Run one ``pytest`` subprocess to completion and return ``(rc, output)``. + + Spawns the child in its own process group / session so a hung file and + its grandchildren (uvicorn servers, async runtimes, etc.) can be SIGKILL'd + as a tree on timeout rather than orphaning onto PID 1. Shared by the + primary per-file run and the exit-4 retry loop so the lifecycle/cleanup + logic lives in exactly one place. + """ + proc = subprocess.Popen( + cmd, + cwd=repo_root, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + # POSIX: place the child at the head of its own process group so + # _kill_tree can SIGKILL the group atomically. + # Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+; + # _kill_tree handles the Windows path via taskkill /F /T. + start_new_session=True, + ) + + # Capture the pgid NOW, before the leader can exit and be reaped. Once + # the leader is reaped, os.getpgid(proc.pid) raises ProcessLookupError + # even though grandchildren in that group are still alive — defeating + # the whole cleanup. None on Windows where the pgid concept doesn't apply. + pgid: int | None = None + if sys.platform != "win32": + try: + pgid = os.getpgid(proc.pid) + except (ProcessLookupError, PermissionError): + pgid = None + + try: + output, _ = proc.communicate(timeout=file_timeout) + rc = proc.returncode + except subprocess.TimeoutExpired: + _kill_tree(proc, pgid=pgid) + try: + output, _ = proc.communicate(timeout=10) + except subprocess.TimeoutExpired: + output = "(file timeout exceeded; output unavailable)" + rc = 124 # de facto convention for "killed by timeout". + output = ( + f"({timeout_note}: {file_timeout:.0f}s exceeded; " + f"process tree SIGKILL'd)\n{output}" + ) + except BaseException: + # KeyboardInterrupt / runner crash — make sure no zombie + # grandchildren outlive us. + _kill_tree(proc, pgid=pgid) + raise + else: + # Happy path: pytest exited on its own. Kill the group anyway in + # case it left grandchildren behind; already-dead is a no-op. + _kill_tree(proc, pgid=pgid) + + return rc, output + + +# How many times to re-run a file that exits 4 ("file or directory not found") +# while the file demonstrably exists on disk. On loaded shared CI runners the +# planner can enumerate a file (tests counted via --collect-only) but the +# per-file subprocess fail to stat it moments later — and a SINGLE immediate +# retry can land in the same brief high-load window and fail again. We retry a +# few times with a short backoff so transient I/O pressure has time to settle. +_EXIT4_RETRY_ATTEMPTS = 3 +_EXIT4_RETRY_BACKOFF_SECONDS = 0.5 + + +def _file_present(file: Path, *, attempts: int = 3, delay: float = 0.2) -> bool: + """Return True if ``file`` exists, re-checking a few times. + + ``Path.exists()`` itself issues a ``stat`` that can transiently fail under + the same load that makes pytest report "file or directory not found", so a + single negative check is not authoritative. Only conclude the file is + genuinely missing if it's absent across several spaced checks. + """ + for i in range(attempts): + if file.exists(): + return True + if i < attempts - 1: + time.sleep(delay) + return False + + def _run_one_file( file: Path, pytest_args: List[str], @@ -280,104 +372,28 @@ def _run_one_file( """ cmd = [sys.executable, "-m", "pytest", str(file), *pytest_args] subproc_start = time.monotonic() - proc = subprocess.Popen( - cmd, - cwd=repo_root, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - # POSIX: place the child at the head of its own process group so - # _kill_tree can SIGKILL the group atomically. - # Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+; - # _kill_tree handles the Windows path via taskkill /F /T. - start_new_session=True, - ) + rc, output = _spawn_pytest_once(cmd, repo_root, file_timeout) - # Capture the pgid NOW, before the leader can exit and be reaped. - # Once the leader is reaped, os.getpgid(proc.pid) raises - # ProcessLookupError even though grandchildren in that group are - # still alive — defeating the whole cleanup. None on Windows where - # the pgid concept doesn't apply (taskkill walks ppid chain instead). - pgid: int | None = None - if sys.platform != "win32": - try: - pgid = os.getpgid(proc.pid) - except (ProcessLookupError, PermissionError): - # Astonishingly fast child? Already dead. _kill_tree's - # fallback will handle this case as a no-op. - pgid = None - - try: - output, _ = proc.communicate(timeout=file_timeout) - rc = proc.returncode - except subprocess.TimeoutExpired: - _kill_tree(proc, pgid=pgid) - # Drain whatever the child wrote before we killed it so we have - # something to surface in the failure dump. - try: - output, _ = proc.communicate(timeout=10) - except subprocess.TimeoutExpired: - output = "(file timeout exceeded; output unavailable)" - rc = 124 # de facto convention for "killed by timeout". - output = ( - f"(per-file timeout: {file_timeout:.0f}s exceeded; " - f"process tree SIGKILL'd)\n{output}" + # pytest exit 4 = "file or directory not found" at exec time. On loaded + # shared CI runners we have seen the planner enumerate a file (its tests + # counted via --collect-only) but the per-file subprocess fail to stat it + # moments later — a transient the deterministic LPT slicer otherwise + # reproduces on every rerun (same file set → same shard). Re-run the file a + # few times with a short backoff so the I/O pressure has time to settle, + # but ONLY while the file demonstrably exists on disk. A single immediate + # retry (the old behaviour) could land in the same brief high-load window + # and fail again; a single Path.exists() check could itself be a flaky stat + # under that load, so we re-check existence across spaced attempts. + # We do NOT widen the exit-5 rule: exit 4 on a file that genuinely does not + # exist must still fail. + attempt = 0 + while rc == 4 and attempt < _EXIT4_RETRY_ATTEMPTS and _file_present(file): + attempt += 1 + time.sleep(_EXIT4_RETRY_BACKOFF_SECONDS * attempt) + rc, output = _spawn_pytest_once( + cmd, repo_root, file_timeout, + timeout_note=f"per-file timeout on exit-4 retry {attempt}", ) - except BaseException: - # KeyboardInterrupt / runner crash — make sure no zombie - # grandchildren outlive us. - _kill_tree(proc, pgid=pgid) - raise - else: - # Happy path: pytest exited on its own. The child process already - # cleaned up its grandchildren if it's well-behaved, but - # well-behaved is not universal — kill the group anyway. Already- - # dead processes are a no-op. - _kill_tree(proc, pgid=pgid) - - if rc == 4 and Path(file).exists(): - # pytest exit 4 = "file or directory not found" at exec time, yet the - # file is present on disk now. On loaded shared CI runners we have seen - # the planner enumerate a file (its tests counted via --collect-only) - # but the per-file subprocess fail to stat it moments later — a - # transient the deterministic LPT slicer otherwise reproduces on every - # rerun (same file set → same shard). Retry the file ONCE before - # surfacing it as a hard failure. We do NOT widen the exit-5 rule: - # exit 4 on a file that genuinely does not exist must still fail. - retry_proc = subprocess.Popen( - cmd, - cwd=repo_root, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - start_new_session=True, - ) - retry_pgid: int | None = None - if sys.platform != "win32": - try: - retry_pgid = os.getpgid(retry_proc.pid) - except (ProcessLookupError, PermissionError): - retry_pgid = None - try: - retry_output, _ = retry_proc.communicate(timeout=file_timeout) - retry_rc = retry_proc.returncode - except subprocess.TimeoutExpired: - _kill_tree(retry_proc, pgid=retry_pgid) - try: - retry_output, _ = retry_proc.communicate(timeout=10) - except subprocess.TimeoutExpired: - retry_output = "(file timeout exceeded on retry; output unavailable)" - retry_rc = 124 - retry_output = ( - f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; " - f"process tree SIGKILL'd)\n{retry_output}" - ) - except BaseException: - _kill_tree(retry_proc, pgid=retry_pgid) - raise - else: - _kill_tree(retry_proc, pgid=retry_pgid) - rc, output = retry_rc, retry_output if rc == 5: # No tests collected — every test in the file was filtered out. diff --git a/tests/test_run_tests_parallel.py b/tests/test_run_tests_parallel.py index 743ba792189..d21e5e01eb5 100644 --- a/tests/test_run_tests_parallel.py +++ b/tests/test_run_tests_parallel.py @@ -185,3 +185,111 @@ def test_grandchild_leak_is_killed_by_runner(tmp_path: Path) -> None: f"diag={diag!r} test_pid={test_pid} test_pgid={test_pgid}; " f"runner output:\n{proc.stdout}" ) + + +# --------------------------------------------------------------------------- +# exit-4 retry loop (transient "file or directory not found" on loaded runners) +# --------------------------------------------------------------------------- + +import importlib.util as _importlib_util # noqa: E402 + + +def _load_runner_module(): + """Import scripts/run_tests_parallel.py as a module for in-process tests.""" + repo_root = Path(__file__).resolve().parent.parent + path = repo_root / "scripts" / "run_tests_parallel.py" + spec = _importlib_util.spec_from_file_location("_rtp_under_test", path) + mod = _importlib_util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def test_exit4_retry_recovers_when_file_exists(tmp_path, monkeypatch): + """A file that exits 4 transiently then passes must be retried and recover. + + Simulates the loaded-CI transient: the per-file pytest subprocess reports + "file or directory not found" (exit 4) on the first attempts even though + the file is on disk, then succeeds. The runner must retry and report pass. + """ + rtp = _load_runner_module() + f = tmp_path / "test_transient.py" + f.write_text("def test_ok():\n assert True\n") + + calls = {"n": 0} + + def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"): + calls["n"] += 1 + # First two attempts: transient exit-4. Third: success. + if calls["n"] < 3: + return 4, "ERROR: file or directory not found\nno tests ran in 0.00s" + return 0, "1 passed" + + monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn) + monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0) # no real sleep + + file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0) + assert rc == 0, f"expected recovery to pass, got rc={rc}, output={output!r}" + assert calls["n"] == 3, f"expected 3 attempts (1 + 2 retries), got {calls['n']}" + + +def test_exit4_no_retry_when_file_genuinely_missing(tmp_path, monkeypatch): + """Exit 4 on a file that does NOT exist must fail fast without retrying. + + Guards the narrowing: we only retry while the file is present on disk, so a + real typo / deleted file surfaces immediately instead of looping. + """ + rtp = _load_runner_module() + missing = tmp_path / "test_does_not_exist.py" # never created + + calls = {"n": 0} + + def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"): + calls["n"] += 1 + return 4, "ERROR: file or directory not found" + + monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn) + monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0) + + file, rc, output, summary, _wall = rtp._run_one_file(missing, [], tmp_path, 30.0) + assert rc == 4, f"genuinely-missing file should keep rc=4, got {rc}" + assert calls["n"] == 1, f"missing file must NOT be retried, got {calls['n']} calls" + + +def test_exit4_retry_gives_up_after_max_attempts(tmp_path, monkeypatch): + """If the transient never clears, we stop after the bounded attempt count.""" + rtp = _load_runner_module() + f = tmp_path / "test_persistent_transient.py" + f.write_text("def test_ok():\n assert True\n") + + calls = {"n": 0} + + def fake_spawn(cmd, repo_root, file_timeout, *, timeout_note="per-file timeout"): + calls["n"] += 1 + return 4, "ERROR: file or directory not found" + + monkeypatch.setattr(rtp, "_spawn_pytest_once", fake_spawn) + monkeypatch.setattr(rtp, "_EXIT4_RETRY_BACKOFF_SECONDS", 0.0) + + file, rc, output, summary, _wall = rtp._run_one_file(f, [], tmp_path, 30.0) + assert rc == 4 + # 1 initial + _EXIT4_RETRY_ATTEMPTS retries. + assert calls["n"] == 1 + rtp._EXIT4_RETRY_ATTEMPTS + + +def test_file_present_tolerates_transient_negative(tmp_path, monkeypatch): + """_file_present must not conclude 'missing' on a single flaky stat.""" + rtp = _load_runner_module() + f = tmp_path / "test_flaky_stat.py" + f.write_text("x = 1\n") + + seq = iter([False, False, True]) # first two stats flake, third succeeds + monkeypatch.setattr(rtp.Path, "exists", lambda self: next(seq)) + assert rtp._file_present(f, attempts=3, delay=0.0) is True + + +def test_file_present_reports_truly_missing(tmp_path, monkeypatch): + """_file_present returns False when the file is absent across all checks.""" + rtp = _load_runner_module() + f = tmp_path / "nope.py" + monkeypatch.setattr(rtp.Path, "exists", lambda self: False) + assert rtp._file_present(f, attempts=3, delay=0.0) is False