change(ci): slice files in matrix job

avoid duplicating work, avoid file discovery on each job
2026-07-01 12:02:05 +00:00 · 2026-06-26 21:52:44 -04:00 · 2026-06-26 21:52:44 -04:00 · dd0e4ab81a
commit dd0e4ab81a
parent 1a75387fa8
2 changed files with 144 additions and 92 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -21,25 +21,7 @@ jobs:
    name: "Generate slices"
    runs-on: ubuntu-latest
    outputs:
-      slices: ${{ steps.matrix.outputs.slices }}
-      slice_count: ${{ steps.matrix.outputs.slice_count }}
-    steps:
-      - name: Generate test slices
-        id: matrix
-        run: |
-          COUNT="${{ inputs.slice_count }}"
-          SLICES=$(python3 -c "import json; print(json.dumps({'slice': list(range(1, $COUNT + 1))}))")
-          echo "slices=$SLICES" >> "$GITHUB_OUTPUT"
-          echo "slice_count=$COUNT" >> "$GITHUB_OUTPUT"
-
-  test:
-    name: Run tests slice
-    needs: generate
-    runs-on: ubuntu-latest
-    timeout-minutes: 30
-    strategy:
-      fail-fast: false
-      matrix: ${{ fromJSON(needs.generate.outputs.slices) }}
+      matrix: ${{ steps.matrix.outputs.matrix }}
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@ -48,13 +30,26 @@ jobs:
        uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
        with:
          path: test_durations.json
-          # main always writes a new suffix, but jobs pick the latest one with the same prefix
-          # quote from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses
-          # If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys.
-          # If there are no exact matches, the action searches for partial matches of the restore keys.
-          # When the action finds a partial match, the most recent cache is restored to the path directory.
          key: test-durations

+      - name: Generate test slices
+        id: matrix
+        run: |
+          MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }})
+          echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
+
+  test:
+    name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}
+    needs: generate
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJSON(needs.generate.outputs.matrix) }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
      - name: Install ripgrep (prebuilt binary)
        run: |
          set -euo pipefail
@ -99,33 +94,19 @@ jobs:
        # re-download, keeping the persisted cache small and fast to restore.
        run: uv cache prune --ci

-      - name: Run tests (slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }})
-        # Per-file isolation via scripts/run_tests.sh: discovers
-        # every test_*.py file under tests/ (excluding integration/ + e2e/),
-        # then runs `python -m pytest <file>` in a freshly-spawned subprocess
+      - name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }})
+        # Per-file isolation via scripts/run_tests.sh: each test file runs
+        # in its own freshly-spawned `python -m pytest <file>` subprocess
        # with bounded parallelism. No xdist, no shared workers, no
        # module-level state leakage between files.
        #
-        # Why per-file (not per-test): per-test spawn cost (~250ms × 17k
-        # tests = 70min CPU minimum) blew the wall-clock budget. Per-file
-        # spawn (~250ms × ~850 files = ~3.5min) fits while still giving
-        # every file a fresh interpreter — the only isolation boundary
-        # that matters in practice (cross-file leakage was the original
-        # flake source; intra-file is the test author's responsibility).
-        #
-        # Why drop xdist entirely: xdist's persistent workers accumulate
-        # state across files, which is exactly the leakage we wanted to
-        # fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
-        # the job with cleaner semantics.
-        #
-        # Matrix slicing (--slice I/N): files are distributed across N
-        # jobs by cached duration (LPT algorithm) so each job gets
-        # roughly equal wall time. Without a cache, files default to 2s
-        # estimate and get split roughly evenly by count — still correct,
-        # just not perfectly balanced.
+        # File list is pre-computed by the generate job (--generate-slices)
+        # which runs LPT distribution once and passes the file list to each
+        # matrix job via --files. Previously each job re-discovered files and
+        # re-ran LPT independently — redundant N times.
        run: |
          source .venv/bin/activate
-          scripts/run_tests.sh --slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }}
+          scripts/run_tests.sh --files '${{ matrix.slice.files }}'
        env:
          # Ensure tests don't accidentally call real APIs
          OPENROUTER_API_KEY: ""
@ -135,7 +116,7 @@ jobs:
      - name: Upload per-slice durations
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
-          name: test-durations-slice-${{ matrix.slice }}
+          name: test-durations-slice-${{ matrix.slice.index }}
          path: test_durations.json
          retention-days: 1

--- a/scripts/run_tests_parallel.py
+++ b/scripts/run_tests_parallel.py
@ -493,38 +493,27 @@ def _save_durations(
    path.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n")


-def _slice_files(
+def _compute_lpt_slices(
    files: List[Path],
-    slice_index: int,
    slice_count: int,
    durations: dict[str, float],
    repo_root: Path,
-) -> List[Path]:
-    """Return the subset of *files* belonging to slice *slice_index*.
+) -> List[List[Path]]:
+    """Distribute files across N slices using LPT (Longest Processing Time first).

-    Uses **Longest Processing Time first** (LPT) distribution: sort files
-    by estimated duration descending, then greedily assign each file to
-    the slice with the smallest accumulated time so far. This minimizes
-    the makespan (max slice duration) and keeps CI jobs balanced.
+    Sorts files by estimated duration descending, then greedily assigns each
+    file to the slice with the smallest accumulated time so far. This
+    minimizes the makespan (max slice duration) and keeps CI jobs balanced.

    Files with no cached duration get a default estimate of 2.0s (roughly
-    the P50 from profiling). This means first-time ``--slice`` runs
-    (no cache) still get reasonable distribution, and new files don't
-    all land in one slice.
+    the P50 from profiling). This means first-time runs (no cache) still
+    get reasonable distribution, and new files don't all land in one slice.

-    ``slice_index`` is 1-indexed (1..slice_count) for ergonomics —
-    ``--slice 1/4`` reads more naturally than ``--slice 0/4``.
+    Returns a list of N file-lists, one per slice (0-indexed).
    """
    if slice_count < 2:
-        return files
-    if not (1 <= slice_index <= slice_count):
-        print(
-            f"error: --slice index must be 1..{slice_count}, got {slice_index}",
-            file=sys.stderr,
-        )
-        sys.exit(2)
+        return [files]

-    # Build (file, estimated_duration) pairs.
    default_dur = 2.0
    file_durs: List[Tuple[Path, float]] = []
    for f in files:
@ -541,15 +530,47 @@ def _slice_files(
    bucket_totals: List[float] = [0.0] * slice_count

    for f, dur in file_durs:
-        # Find the least-loaded bucket.
        min_idx = min(range(slice_count), key=lambda i: bucket_totals[i])
        bucket_files[min_idx].append(f)
        bucket_totals[min_idx] += dur

-    # Print slice summary for visibility.
+    return bucket_files
+
+
+def _slice_files(
+    files: List[Path],
+    slice_index: int,
+    slice_count: int,
+    durations: dict[str, float],
+    repo_root: Path,
+) -> List[Path]:
+    """Return the subset of *files* belonging to slice *slice_index*.
+
+    Uses :func:`_compute_lpt_slices` for LPT distribution.
+
+    ``slice_index`` is 1-indexed (1..slice_count) for ergonomics —
+    ``--slice 1/4`` reads more naturally than ``--slice 0/4``.
+    """
+    if slice_count < 2:
+        return files
+    if not (1 <= slice_index <= slice_count):
+        print(
+            f"error: --slice index must be 1..{slice_count}, got {slice_index}",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
+    bucket_files = _compute_lpt_slices(files, slice_count, durations, repo_root)
+
    target = bucket_files[slice_index - 1]
-    target_dur = bucket_totals[slice_index - 1]
-    total_dur = sum(bucket_totals)
+    target_dur = sum(
+        durations.get(_format_file(f, repo_root), 2.0) for f in target
+    )
+    total_dur = sum(
+        durations.get(_format_file(f, repo_root), 2.0)
+        for bucket in bucket_files
+        for f in bucket
+    )
    print(
        f"Slice {slice_index}/{slice_count}: {len(target)} files "
        f"(~{target_dur:.0f}s estimated of {total_dur:.0f}s total)",
@ -604,6 +625,27 @@ def main() -> int:
            "Env: HERMES_TEST_SLICE (format: I/N)."
        ),
    )
+    parser.add_argument(
+        "--generate-slices",
+        metavar="N",
+        type=int,
+        help=(
+            "Discover test files, distribute them across N slices using "
+            "LPT on cached durations, and print a JSON matrix to stdout "
+            "then exit (no tests run). The JSON has the shape "
+            "'{\"slices\": [{\"index\": 1, \"files\": [\"tests/foo.py\", ...]}, ...]}' "
+            "so the CI generate job can feed it directly into a matrix."
+        ),
+    )
+    parser.add_argument(
+        "--files",
+        metavar="LIST",
+        help=(
+            "Explicit colon-separated list of test files to run. Bypasses "
+            "discovery entirely — used by CI matrix jobs that receive their "
+            "file list from the generate job."
+        ),
+    )
    parser.add_argument(
        "paths_positional",
        nargs="*",
@ -642,26 +684,48 @@ def main() -> int:

    repo_root = Path(__file__).resolve().parent.parent

-    # Resolve discovery roots: positional path args override --paths if any
-    # were supplied, otherwise --paths (which itself defaults to 'tests').
-    if args.paths_positional:
-        # Positionals can be directories OR explicit .py files. Either is
-        # fine — _discover_files handles both via rglob('test_*.py') for
-        # dirs and direct inclusion for files.
-        roots = [repo_root / p for p in args.paths_positional]
+    # --files: explicit file list from the CI generate job — skip discovery.
+    if args.files:
+        files = [repo_root / f for f in args.files.split(":") if f.strip()]
+        roots = []
    else:
-        roots = [repo_root / p for p in args.paths.split(":") if p]
+        # Resolve discovery roots: positional path args override --paths if any
+        # were supplied, otherwise --paths (which itself defaults to 'tests').
+        if args.paths_positional:
+            roots = [repo_root / p for p in args.paths_positional]
+        else:
+            roots = [repo_root / p for p in args.paths.split(":") if p]

-    if args.include_integration:
-        # Caller takes responsibility — typically used via explicit -k filter.
-        global _SKIP_PARTS  # noqa: PLW0603 — config knob
-        _SKIP_PARTS = set()
+        if args.include_integration:
+            # Caller takes responsibility — typically used via explicit -k filter.
+            global _SKIP_PARTS  # noqa: PLW0603 — config knob
+            _SKIP_PARTS = set()
+
+        files = _discover_files(roots)

-    files = _discover_files(roots)
    if not files:
-        print(f"No test files discovered under {[str(r) for r in roots]}", file=sys.stderr)
+        print(f"No test files to run", file=sys.stderr)
        return 1

+    # --generate-slices: compute LPT distribution and emit JSON, then exit.
+    if args.generate_slices is not None:
+        durations = _load_durations(repo_root)
+        slices = _compute_lpt_slices(
+            files, args.generate_slices, durations, repo_root
+        )
+        matrix = {
+            "slice": [
+                {
+                    "index": i + 1,
+                    "files": ":".join(_format_file(f, repo_root) for f in bucket),
+                }
+                for i, bucket in enumerate(slices)
+            ]
+        }
+        # Print to stdout so the CI step can capture it with $().
+        print(json.dumps(matrix))
+        return 0
+
    # Count individual tests per file
    test_counts = _approximately_count_tests(files, repo_root)
    approx_total_tests = sum(test_counts.values())
@ -675,12 +739,19 @@ def main() -> int:
        test_counts = {f: test_counts[f] for f in files if f in test_counts}
        approx_total_tests = sum(test_counts.values())

-    print(
-        f"Discovered {len(files)} test files (~{approx_total_tests} tests) under "
-        f"{[str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]}; "
-        f"running with -j {args.jobs}",
-        flush=True,
-    )
+    if roots:
+        roots_str = [str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]
+        print(
+            f"Discovered {len(files)} test files (~{approx_total_tests} tests) under "
+            f"{roots_str}; running with -j {args.jobs}",
+            flush=True,
+        )
+    else:
+        print(
+            f"Running {len(files)} test files (~{approx_total_tests} tests) "
+            f"with -j {args.jobs}",
+            flush=True,
+        )

    # Capture and print on completion (out-of-order is fine — keeps the
    # terminal clean rather than interleaving N parallel pytest outputs).