diff --git a/scripts/run_tests_parallel.py b/scripts/run_tests_parallel.py index 7fe0b57947a..be8bba8ad20 100755 --- a/scripts/run_tests_parallel.py +++ b/scripts/run_tests_parallel.py @@ -335,6 +335,50 @@ def _run_one_file( # dead processes are a no-op. _kill_tree(proc, pgid=pgid) + if rc == 4 and Path(file).exists(): + # pytest exit 4 = "file or directory not found" at exec time, yet the + # file is present on disk now. On loaded shared CI runners we have seen + # the planner enumerate a file (its tests counted via --collect-only) + # but the per-file subprocess fail to stat it moments later — a + # transient the deterministic LPT slicer otherwise reproduces on every + # rerun (same file set → same shard). Retry the file ONCE before + # surfacing it as a hard failure. We do NOT widen the exit-5 rule: + # exit 4 on a file that genuinely does not exist must still fail. + retry_proc = subprocess.Popen( + cmd, + cwd=repo_root, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + start_new_session=True, + ) + retry_pgid: int | None = None + if sys.platform != "win32": + try: + retry_pgid = os.getpgid(retry_proc.pid) + except (ProcessLookupError, PermissionError): + retry_pgid = None + try: + retry_output, _ = retry_proc.communicate(timeout=file_timeout) + retry_rc = retry_proc.returncode + except subprocess.TimeoutExpired: + _kill_tree(retry_proc, pgid=retry_pgid) + try: + retry_output, _ = retry_proc.communicate(timeout=10) + except subprocess.TimeoutExpired: + retry_output = "(file timeout exceeded on retry; output unavailable)" + retry_rc = 124 + retry_output = ( + f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; " + f"process tree SIGKILL'd)\n{retry_output}" + ) + except BaseException: + _kill_tree(retry_proc, pgid=retry_pgid) + raise + else: + _kill_tree(retry_proc, pgid=retry_pgid) + rc, output = retry_rc, retry_output + if rc == 5: # No tests collected — every test in the file was filtered out. # Treat as a pass; surface info in a slightly distinct status