From 754154a9c2faaff9e00932fa3c9e32b3ed936fb4 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Mon, 8 Jun 2026 13:32:19 -0700 Subject: [PATCH] fix(tests): retry per-file pytest subprocess once on exit-4 when the file exists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parallel test runner sharded a present, tracked test file (tests/plugins/platforms/photon/test_inbound.py) onto a slice that then reported 'file or directory not found' (pytest exit 4) at exec time — even though the planner had just enumerated the file via --collect-only ('5269 passed, 0 failed' in the same run). On loaded shared CI runners the per-file subprocess can fail to stat a file the planner already saw; the deterministic LPT slicer then reproduces it on every rerun because the same file set lands on the same shard. Fix: when a per-file run exits 4 AND the file still exists on disk, retry the subprocess once before surfacing it as a hard failure. This kills the shard-flake class for everyone, not just this PR. Does NOT widen the exit-5-is-pass rule — exit 4 on a genuinely missing file still fails (verified). Retry reuses the same pgroup-kill cleanup as the primary run so no grandchildren orphan. Validation: photon dir runs green through scripts/run_tests_parallel.py; unit-level negative case confirms a nonexistent file still returns rc=4. --- scripts/run_tests_parallel.py | 44 +++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/scripts/run_tests_parallel.py b/scripts/run_tests_parallel.py index 7fe0b57947a..be8bba8ad20 100755 --- a/scripts/run_tests_parallel.py +++ b/scripts/run_tests_parallel.py @@ -335,6 +335,50 @@ def _run_one_file( # dead processes are a no-op. _kill_tree(proc, pgid=pgid) + if rc == 4 and Path(file).exists(): + # pytest exit 4 = usage error ("file or directory not found" at exec time), + # yet the file is present on disk now.
On loaded shared CI runners we have seen + # the planner enumerate a file (its tests counted via --collect-only) + # but the per-file subprocess fail to stat it moments later — a + # transient the deterministic LPT slicer otherwise reproduces on every + # rerun (same file set → same shard). Retry the file ONCE before + # surfacing it as a hard failure. We do NOT widen the exit-5 rule: + # exit 4 on a file that genuinely does not exist must still fail. + retry_proc = subprocess.Popen( + cmd, + cwd=repo_root, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + start_new_session=True, + ) + retry_pgid: int | None = None + if sys.platform != "win32": + try: + retry_pgid = os.getpgid(retry_proc.pid) + except (ProcessLookupError, PermissionError): + retry_pgid = None + try: + retry_output, _ = retry_proc.communicate(timeout=file_timeout) + retry_rc = retry_proc.returncode + except subprocess.TimeoutExpired: + _kill_tree(retry_proc, pgid=retry_pgid) + try: + retry_output, _ = retry_proc.communicate(timeout=10) + except subprocess.TimeoutExpired: + retry_output = "(file timeout exceeded on retry; output unavailable)" + retry_rc = 124 + retry_output = ( + f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; " + f"process tree SIGKILL'd)\n{retry_output}" + ) + except BaseException: + _kill_tree(retry_proc, pgid=retry_pgid) + raise + else: + _kill_tree(retry_proc, pgid=retry_pgid) + rc, output = retry_rc, retry_output + if rc == 5: # No tests collected — every test in the file was filtered out. # Treat as a pass; surface info in a slightly distinct status