diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0f339747d16..f74d574fa27 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,25 +21,7 @@ jobs: name: "Generate slices" runs-on: ubuntu-latest outputs: - slices: ${{ steps.matrix.outputs.slices }} - slice_count: ${{ steps.matrix.outputs.slice_count }} - steps: - - name: Generate test slices - id: matrix - run: | - COUNT="${{ inputs.slice_count }}" - SLICES=$(python3 -c "import json; print(json.dumps({'slice': list(range(1, $COUNT + 1))}))") - echo "slices=$SLICES" >> "$GITHUB_OUTPUT" - echo "slice_count=$COUNT" >> "$GITHUB_OUTPUT" - - test: - name: Run tests slice - needs: generate - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.generate.outputs.slices) }} + matrix: ${{ steps.matrix.outputs.matrix }} steps: - name: Checkout code uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -48,13 +30,26 @@ jobs: uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: path: test_durations.json - # main always writes a new suffix, but jobs pick the latest one with the same prefix - # quote from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses - # If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys. - # If there are no exact matches, the action searches for partial matches of the restore keys. - # When the action finds a partial match, the most recent cache is restored to the path directory. key: test-durations + - name: Generate test slices + id: matrix + run: | + MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }}) + echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT" + + test: + name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }} + needs: generate + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.generate.outputs.matrix) }} + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Install ripgrep (prebuilt binary) run: | set -euo pipefail @@ -99,33 +94,19 @@ jobs: # re-download, keeping the persisted cache small and fast to restore. run: uv cache prune --ci - - name: Run tests (slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }}) - # Per-file isolation via scripts/run_tests.sh: discovers - # every test_*.py file under tests/ (excluding integration/ + e2e/), - # then runs `python -m pytest ` in a freshly-spawned subprocess + - name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}) + # Per-file isolation via scripts/run_tests.sh: each test file runs + # in its own freshly-spawned `python -m pytest ` subprocess # with bounded parallelism. No xdist, no shared workers, no # module-level state leakage between files. # - # Why per-file (not per-test): per-test spawn cost (~250ms × 17k - # tests = 70min CPU minimum) blew the wall-clock budget. Per-file - # spawn (~250ms × ~850 files = ~3.5min) fits while still giving - # every file a fresh interpreter — the only isolation boundary - # that matters in practice (cross-file leakage was the original - # flake source; intra-file is the test author's responsibility). - # - # Why drop xdist entirely: xdist's persistent workers accumulate - # state across files, which is exactly the leakage we wanted to - # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does - # the job with cleaner semantics. - # - # Matrix slicing (--slice I/N): files are distributed across N - # jobs by cached duration (LPT algorithm) so each job gets - # roughly equal wall time. Without a cache, files default to 2s - # estimate and get split roughly evenly by count — still correct, - # just not perfectly balanced. + # File list is pre-computed by the generate job (--generate-slices) + # which runs LPT distribution once and passes the file list to each + # matrix job via --files. Previously each job re-discovered files and + # re-ran LPT independently — redundant N times. run: | source .venv/bin/activate - scripts/run_tests.sh --slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }} + scripts/run_tests.sh --files '${{ matrix.slice.files }}' env: # Ensure tests don't accidentally call real APIs OPENROUTER_API_KEY: "" @@ -135,7 +116,7 @@ jobs: - name: Upload per-slice durations uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: test-durations-slice-${{ matrix.slice }} + name: test-durations-slice-${{ matrix.slice.index }} path: test_durations.json retention-days: 1 diff --git a/scripts/run_tests_parallel.py b/scripts/run_tests_parallel.py index d2febba3632..0128bcfbef9 100755 --- a/scripts/run_tests_parallel.py +++ b/scripts/run_tests_parallel.py @@ -493,38 +493,27 @@ def _save_durations( path.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n") -def _slice_files( +def _compute_lpt_slices( files: List[Path], - slice_index: int, slice_count: int, durations: dict[str, float], repo_root: Path, -) -> List[Path]: - """Return the subset of *files* belonging to slice *slice_index*. +) -> List[List[Path]]: + """Distribute files across N slices using LPT (Longest Processing Time first). - Uses **Longest Processing Time first** (LPT) distribution: sort files - by estimated duration descending, then greedily assign each file to - the slice with the smallest accumulated time so far. This minimizes - the makespan (max slice duration) and keeps CI jobs balanced. + Sorts files by estimated duration descending, then greedily assigns each + file to the slice with the smallest accumulated time so far. This + minimizes the makespan (max slice duration) and keeps CI jobs balanced. Files with no cached duration get a default estimate of 2.0s (roughly - the P50 from profiling). This means first-time ``--slice`` runs - (no cache) still get reasonable distribution, and new files don't - all land in one slice. + the P50 from profiling). This means first-time runs (no cache) still + get reasonable distribution, and new files don't all land in one slice. - ``slice_index`` is 1-indexed (1..slice_count) for ergonomics — - ``--slice 1/4`` reads more naturally than ``--slice 0/4``. + Returns a list of N file-lists, one per slice (0-indexed). """ if slice_count < 2: - return files - if not (1 <= slice_index <= slice_count): - print( - f"error: --slice index must be 1..{slice_count}, got {slice_index}", - file=sys.stderr, - ) - sys.exit(2) + return [files] - # Build (file, estimated_duration) pairs. default_dur = 2.0 file_durs: List[Tuple[Path, float]] = [] for f in files: @@ -541,15 +530,47 @@ def _slice_files( bucket_totals: List[float] = [0.0] * slice_count for f, dur in file_durs: - # Find the least-loaded bucket. min_idx = min(range(slice_count), key=lambda i: bucket_totals[i]) bucket_files[min_idx].append(f) bucket_totals[min_idx] += dur - # Print slice summary for visibility. + return bucket_files + + +def _slice_files( + files: List[Path], + slice_index: int, + slice_count: int, + durations: dict[str, float], + repo_root: Path, +) -> List[Path]: + """Return the subset of *files* belonging to slice *slice_index*. + + Uses :func:`_compute_lpt_slices` for LPT distribution. + + ``slice_index`` is 1-indexed (1..slice_count) for ergonomics — + ``--slice 1/4`` reads more naturally than ``--slice 0/4``. + """ + if slice_count < 2: + return files + if not (1 <= slice_index <= slice_count): + print( + f"error: --slice index must be 1..{slice_count}, got {slice_index}", + file=sys.stderr, + ) + sys.exit(2) + + bucket_files = _compute_lpt_slices(files, slice_count, durations, repo_root) + target = bucket_files[slice_index - 1] - target_dur = bucket_totals[slice_index - 1] - total_dur = sum(bucket_totals) + target_dur = sum( + durations.get(_format_file(f, repo_root), 2.0) for f in target + ) + total_dur = sum( + durations.get(_format_file(f, repo_root), 2.0) + for bucket in bucket_files + for f in bucket + ) print( f"Slice {slice_index}/{slice_count}: {len(target)} files " f"(~{target_dur:.0f}s estimated of {total_dur:.0f}s total)", @@ -604,6 +625,27 @@ def main() -> int: "Env: HERMES_TEST_SLICE (format: I/N)." ), ) + parser.add_argument( + "--generate-slices", + metavar="N", + type=int, + help=( + "Discover test files, distribute them across N slices using " + "LPT on cached durations, and print a JSON matrix to stdout " + "then exit (no tests run). The JSON has the shape " + "'{\"slices\": [{\"index\": 1, \"files\": [\"tests/foo.py\", ...]}, ...]}' " + "so the CI generate job can feed it directly into a matrix." + ), + ) + parser.add_argument( + "--files", + metavar="LIST", + help=( + "Explicit colon-separated list of test files to run. Bypasses " + "discovery entirely — used by CI matrix jobs that receive their " + "file list from the generate job." + ), + ) parser.add_argument( "paths_positional", nargs="*", @@ -642,26 +684,48 @@ def main() -> int: repo_root = Path(__file__).resolve().parent.parent - # Resolve discovery roots: positional path args override --paths if any - # were supplied, otherwise --paths (which itself defaults to 'tests'). - if args.paths_positional: - # Positionals can be directories OR explicit .py files. Either is - # fine — _discover_files handles both via rglob('test_*.py') for - # dirs and direct inclusion for files. - roots = [repo_root / p for p in args.paths_positional] + # --files: explicit file list from the CI generate job — skip discovery. + if args.files: + files = [repo_root / f for f in args.files.split(":") if f.strip()] + roots = [] else: - roots = [repo_root / p for p in args.paths.split(":") if p] + # Resolve discovery roots: positional path args override --paths if any + # were supplied, otherwise --paths (which itself defaults to 'tests'). + if args.paths_positional: + roots = [repo_root / p for p in args.paths_positional] + else: + roots = [repo_root / p for p in args.paths.split(":") if p] - if args.include_integration: - # Caller takes responsibility — typically used via explicit -k filter. - global _SKIP_PARTS # noqa: PLW0603 — config knob - _SKIP_PARTS = set() + if args.include_integration: + # Caller takes responsibility — typically used via explicit -k filter. + global _SKIP_PARTS # noqa: PLW0603 — config knob + _SKIP_PARTS = set() + + files = _discover_files(roots) - files = _discover_files(roots) if not files: - print(f"No test files discovered under {[str(r) for r in roots]}", file=sys.stderr) + print(f"No test files to run", file=sys.stderr) return 1 + # --generate-slices: compute LPT distribution and emit JSON, then exit. + if args.generate_slices is not None: + durations = _load_durations(repo_root) + slices = _compute_lpt_slices( + files, args.generate_slices, durations, repo_root + ) + matrix = { + "slice": [ + { + "index": i + 1, + "files": ":".join(_format_file(f, repo_root) for f in bucket), + } + for i, bucket in enumerate(slices) + ] + } + # Print to stdout so the CI step can capture it with $(). + print(json.dumps(matrix)) + return 0 + # Count individual tests per file test_counts = _approximately_count_tests(files, repo_root) approx_total_tests = sum(test_counts.values()) @@ -675,12 +739,19 @@ def main() -> int: test_counts = {f: test_counts[f] for f in files if f in test_counts} approx_total_tests = sum(test_counts.values()) - print( - f"Discovered {len(files)} test files (~{approx_total_tests} tests) under " - f"{[str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]}; " - f"running with -j {args.jobs}", - flush=True, - ) + if roots: + roots_str = [str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots] + print( + f"Discovered {len(files)} test files (~{approx_total_tests} tests) under " + f"{roots_str}; running with -j {args.jobs}", + flush=True, + ) + else: + print( + f"Running {len(files)} test files (~{approx_total_tests} tests) " + f"with -j {args.jobs}", + flush=True, + ) # Capture and print on completion (out-of-order is fine — keeps the # terminal clean rather than interleaving N parallel pytest outputs).