mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(tests): retry per-file pytest subprocess once on exit-4 when the file exists
The parallel test runner sharded a present, tracked test file
(tests/plugins/platforms/photon/test_inbound.py) onto a slice that then
reported 'file or directory not found' (pytest exit 4) at exec time —
even though the planner had just enumerated the file via --collect-only
('5269 passed, 0 failed' in the same run). On loaded shared CI runners
the per-file subprocess can fail to stat a file the planner already saw;
the deterministic LPT slicer then reproduces it on every rerun because
the same file set lands on the same shard.
Fix: when a per-file run exits 4 AND the file still exists on disk, retry
the subprocess once before surfacing it as a hard failure. This kills the
shard-flake class for everyone, not just this PR.
Does NOT widen the exit-5-is-pass rule — exit 4 on a genuinely missing
file still fails (verified). The retry reuses the same process-group kill
cleanup as the primary run, so no grandchild processes are orphaned.
Validation: photon dir runs green through scripts/run_tests_parallel.py;
unit-level negative case confirms a nonexistent file still returns rc=4.
This commit is contained in:
parent
1866518574
commit
754154a9c2
1 changed file with 44 additions and 0 deletions
|
|
@@ -335,6 +335,50 @@ def _run_one_file(
|
|||
# dead processes are a no-op.
|
||||
_kill_tree(proc, pgid=pgid)
|
||||
|
||||
if rc == 4 and Path(file).exists():
|
||||
# pytest exit 4 = "file or directory not found" at exec time, yet the
|
||||
# file is present on disk now. On loaded shared CI runners we have seen
|
||||
# the planner enumerate a file (its tests counted via --collect-only)
|
||||
# but the per-file subprocess fail to stat it moments later — a
|
||||
# transient the deterministic LPT slicer otherwise reproduces on every
|
||||
# rerun (same file set → same shard). Retry the file ONCE before
|
||||
# surfacing it as a hard failure. We do NOT widen the exit-5 rule:
|
||||
# exit 4 on a file that genuinely does not exist must still fail.
|
||||
retry_proc = subprocess.Popen(
|
||||
cmd,
|
||||
cwd=repo_root,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
start_new_session=True,
|
||||
)
|
||||
retry_pgid: int | None = None
|
||||
if sys.platform != "win32":
|
||||
try:
|
||||
retry_pgid = os.getpgid(retry_proc.pid)
|
||||
except (ProcessLookupError, PermissionError):
|
||||
retry_pgid = None
|
||||
try:
|
||||
retry_output, _ = retry_proc.communicate(timeout=file_timeout)
|
||||
retry_rc = retry_proc.returncode
|
||||
except subprocess.TimeoutExpired:
|
||||
_kill_tree(retry_proc, pgid=retry_pgid)
|
||||
try:
|
||||
retry_output, _ = retry_proc.communicate(timeout=10)
|
||||
except subprocess.TimeoutExpired:
|
||||
retry_output = "(file timeout exceeded on retry; output unavailable)"
|
||||
retry_rc = 124
|
||||
retry_output = (
|
||||
f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; "
|
||||
f"process tree SIGKILL'd)\n{retry_output}"
|
||||
)
|
||||
except BaseException:
|
||||
_kill_tree(retry_proc, pgid=retry_pgid)
|
||||
raise
|
||||
else:
|
||||
_kill_tree(retry_proc, pgid=retry_pgid)
|
||||
rc, output = retry_rc, retry_output
|
||||
|
||||
if rc == 5:
|
||||
# No tests collected — every test in the file was filtered out.
|
||||
# Treat as a pass; surface info in a slightly distinct status
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue