change(ci): slice files in matrix job

avoid duplicating work, avoid file discovery on each job
This commit is contained in:
ethernet 2026-06-26 21:52:44 -04:00
parent 1a75387fa8
commit dd0e4ab81a
2 changed files with 144 additions and 92 deletions

View file

@ -21,25 +21,7 @@ jobs:
name: "Generate slices"
runs-on: ubuntu-latest
outputs:
slices: ${{ steps.matrix.outputs.slices }}
slice_count: ${{ steps.matrix.outputs.slice_count }}
steps:
- name: Generate test slices
id: matrix
run: |
COUNT="${{ inputs.slice_count }}"
SLICES=$(python3 -c "import json; print(json.dumps({'slice': list(range(1, $COUNT + 1))}))")
echo "slices=$SLICES" >> "$GITHUB_OUTPUT"
echo "slice_count=$COUNT" >> "$GITHUB_OUTPUT"
test:
name: Run tests slice
needs: generate
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix: ${{ fromJSON(needs.generate.outputs.slices) }}
matrix: ${{ steps.matrix.outputs.matrix }}
steps:
- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@ -48,13 +30,26 @@ jobs:
uses: actions/cache/restore@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
with:
path: test_durations.json
# main always writes a new suffix, but jobs pick the latest one with the same prefix
# quote from https://docs.github.com/en/actions/reference/workflows-and-actions/dependency-caching#cache-hits-and-misses
# If you provide restore-keys, the cache action sequentially searches for any caches that match the list of restore-keys.
# If there are no exact matches, the action searches for partial matches of the restore keys.
# When the action finds a partial match, the most recent cache is restored to the path directory.
key: test-durations
- name: Generate test slices
id: matrix
run: |
MATRIX=$(python3 scripts/run_tests_parallel.py --generate-slices ${{ inputs.slice_count }})
echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"
test:
name: Run tests slice ${{ matrix.slice.index }}/${{ inputs.slice_count }}
needs: generate
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
fail-fast: false
matrix: ${{ fromJSON(needs.generate.outputs.matrix) }}
steps:
- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Install ripgrep (prebuilt binary)
run: |
set -euo pipefail
@ -99,33 +94,19 @@ jobs:
# re-download, keeping the persisted cache small and fast to restore.
run: uv cache prune --ci
- name: Run tests (slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }})
# Per-file isolation via scripts/run_tests.sh: discovers
# every test_*.py file under tests/ (excluding integration/ + e2e/),
# then runs `python -m pytest <file>` in a freshly-spawned subprocess
- name: Run tests (slice ${{ matrix.slice.index }}/${{ inputs.slice_count }})
# Per-file isolation via scripts/run_tests.sh: each test file runs
# in its own freshly-spawned `python -m pytest <file>` subprocess
# with bounded parallelism. No xdist, no shared workers, no
# module-level state leakage between files.
#
# Why per-file (not per-test): per-test spawn cost (~250ms × 17k
# tests = 70min CPU minimum) blew the wall-clock budget. Per-file
# spawn (~250ms × ~850 files = ~3.5min) fits while still giving
# every file a fresh interpreter — the only isolation boundary
# that matters in practice (cross-file leakage was the original
# flake source; intra-file is the test author's responsibility).
#
# Why drop xdist entirely: xdist's persistent workers accumulate
# state across files, which is exactly the leakage we wanted to
# fix. ThreadPoolExecutor + subprocess.run is ~60 lines and does
# the job with cleaner semantics.
#
# Matrix slicing (--slice I/N): files are distributed across N
# jobs by cached duration (LPT algorithm) so each job gets
# roughly equal wall time. Without a cache, files default to 2s
# estimate and get split roughly evenly by count — still correct,
# just not perfectly balanced.
# File list is pre-computed by the generate job (--generate-slices)
# which runs LPT distribution once and passes the file list to each
# matrix job via --files. Previously each job re-discovered files and
# re-ran LPT independently — redundant N times.
run: |
source .venv/bin/activate
scripts/run_tests.sh --slice ${{ matrix.slice }}/${{ needs.generate.outputs.slice_count }}
scripts/run_tests.sh --files '${{ matrix.slice.files }}'
env:
# Ensure tests don't accidentally call real APIs
OPENROUTER_API_KEY: ""
@ -135,7 +116,7 @@ jobs:
- name: Upload per-slice durations
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: test-durations-slice-${{ matrix.slice }}
name: test-durations-slice-${{ matrix.slice.index }}
path: test_durations.json
retention-days: 1

View file

@ -493,38 +493,27 @@ def _save_durations(
path.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n")
def _slice_files(
def _compute_lpt_slices(
files: List[Path],
slice_index: int,
slice_count: int,
durations: dict[str, float],
repo_root: Path,
) -> List[Path]:
"""Return the subset of *files* belonging to slice *slice_index*.
) -> List[List[Path]]:
"""Distribute files across N slices using LPT (Longest Processing Time first).
Uses **Longest Processing Time first** (LPT) distribution: sort files
by estimated duration descending, then greedily assign each file to
the slice with the smallest accumulated time so far. This minimizes
the makespan (max slice duration) and keeps CI jobs balanced.
Sorts files by estimated duration descending, then greedily assigns each
file to the slice with the smallest accumulated time so far. This
minimizes the makespan (max slice duration) and keeps CI jobs balanced.
Files with no cached duration get a default estimate of 2.0s (roughly
the P50 from profiling). This means first-time ``--slice`` runs
(no cache) still get reasonable distribution, and new files don't
all land in one slice.
the P50 from profiling). This means first-time runs (no cache) still
get reasonable distribution, and new files don't all land in one slice.
``slice_index`` is 1-indexed (1..slice_count) for ergonomics
``--slice 1/4`` reads more naturally than ``--slice 0/4``.
Returns a list of N file-lists, one per slice (0-indexed).
"""
if slice_count < 2:
return files
if not (1 <= slice_index <= slice_count):
print(
f"error: --slice index must be 1..{slice_count}, got {slice_index}",
file=sys.stderr,
)
sys.exit(2)
return [files]
# Build (file, estimated_duration) pairs.
default_dur = 2.0
file_durs: List[Tuple[Path, float]] = []
for f in files:
@ -541,15 +530,47 @@ def _slice_files(
bucket_totals: List[float] = [0.0] * slice_count
for f, dur in file_durs:
# Find the least-loaded bucket.
min_idx = min(range(slice_count), key=lambda i: bucket_totals[i])
bucket_files[min_idx].append(f)
bucket_totals[min_idx] += dur
# Print slice summary for visibility.
return bucket_files
def _slice_files(
files: List[Path],
slice_index: int,
slice_count: int,
durations: dict[str, float],
repo_root: Path,
) -> List[Path]:
"""Return the subset of *files* belonging to slice *slice_index*.
Uses :func:`_compute_lpt_slices` for LPT distribution.
``slice_index`` is 1-indexed (1..slice_count) for ergonomics
``--slice 1/4`` reads more naturally than ``--slice 0/4``.
"""
if slice_count < 2:
return files
if not (1 <= slice_index <= slice_count):
print(
f"error: --slice index must be 1..{slice_count}, got {slice_index}",
file=sys.stderr,
)
sys.exit(2)
bucket_files = _compute_lpt_slices(files, slice_count, durations, repo_root)
target = bucket_files[slice_index - 1]
target_dur = bucket_totals[slice_index - 1]
total_dur = sum(bucket_totals)
target_dur = sum(
durations.get(_format_file(f, repo_root), 2.0) for f in target
)
total_dur = sum(
durations.get(_format_file(f, repo_root), 2.0)
for bucket in bucket_files
for f in bucket
)
print(
f"Slice {slice_index}/{slice_count}: {len(target)} files "
f"(~{target_dur:.0f}s estimated of {total_dur:.0f}s total)",
@ -604,6 +625,27 @@ def main() -> int:
"Env: HERMES_TEST_SLICE (format: I/N)."
),
)
parser.add_argument(
"--generate-slices",
metavar="N",
type=int,
help=(
"Discover test files, distribute them across N slices using "
"LPT on cached durations, and print a JSON matrix to stdout "
"then exit (no tests run). The JSON has the shape "
"'{\"slices\": [{\"index\": 1, \"files\": [\"tests/foo.py\", ...]}, ...]}' "
"so the CI generate job can feed it directly into a matrix."
),
)
parser.add_argument(
"--files",
metavar="LIST",
help=(
"Explicit colon-separated list of test files to run. Bypasses "
"discovery entirely — used by CI matrix jobs that receive their "
"file list from the generate job."
),
)
parser.add_argument(
"paths_positional",
nargs="*",
@ -642,26 +684,48 @@ def main() -> int:
repo_root = Path(__file__).resolve().parent.parent
# Resolve discovery roots: positional path args override --paths if any
# were supplied, otherwise --paths (which itself defaults to 'tests').
if args.paths_positional:
# Positionals can be directories OR explicit .py files. Either is
# fine — _discover_files handles both via rglob('test_*.py') for
# dirs and direct inclusion for files.
roots = [repo_root / p for p in args.paths_positional]
# --files: explicit file list from the CI generate job — skip discovery.
if args.files:
files = [repo_root / f for f in args.files.split(":") if f.strip()]
roots = []
else:
roots = [repo_root / p for p in args.paths.split(":") if p]
# Resolve discovery roots: positional path args override --paths if any
# were supplied, otherwise --paths (which itself defaults to 'tests').
if args.paths_positional:
roots = [repo_root / p for p in args.paths_positional]
else:
roots = [repo_root / p for p in args.paths.split(":") if p]
if args.include_integration:
# Caller takes responsibility — typically used via explicit -k filter.
global _SKIP_PARTS # noqa: PLW0603 — config knob
_SKIP_PARTS = set()
if args.include_integration:
# Caller takes responsibility — typically used via explicit -k filter.
global _SKIP_PARTS # noqa: PLW0603 — config knob
_SKIP_PARTS = set()
files = _discover_files(roots)
files = _discover_files(roots)
if not files:
print(f"No test files discovered under {[str(r) for r in roots]}", file=sys.stderr)
print(f"No test files to run", file=sys.stderr)
return 1
# --generate-slices: compute LPT distribution and emit JSON, then exit.
if args.generate_slices is not None:
durations = _load_durations(repo_root)
slices = _compute_lpt_slices(
files, args.generate_slices, durations, repo_root
)
matrix = {
"slice": [
{
"index": i + 1,
"files": ":".join(_format_file(f, repo_root) for f in bucket),
}
for i, bucket in enumerate(slices)
]
}
# Print to stdout so the CI step can capture it with $().
print(json.dumps(matrix))
return 0
# Count individual tests per file
test_counts = _approximately_count_tests(files, repo_root)
approx_total_tests = sum(test_counts.values())
@ -675,12 +739,19 @@ def main() -> int:
test_counts = {f: test_counts[f] for f in files if f in test_counts}
approx_total_tests = sum(test_counts.values())
print(
f"Discovered {len(files)} test files (~{approx_total_tests} tests) under "
f"{[str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]}; "
f"running with -j {args.jobs}",
flush=True,
)
if roots:
roots_str = [str(r.relative_to(repo_root)) if r.is_relative_to(repo_root) else str(r) for r in roots]
print(
f"Discovered {len(files)} test files (~{approx_total_tests} tests) under "
f"{roots_str}; running with -j {args.jobs}",
flush=True,
)
else:
print(
f"Running {len(files)} test files (~{approx_total_tests} tests) "
f"with -j {args.jobs}",
flush=True,
)
# Capture and print on completion (out-of-order is fine — keeps the
# terminal clean rather than interleaving N parallel pytest outputs).