From 56b4ef74a631bdca0bd5cc58bd43369fc227ea83 Mon Sep 17 00:00:00 2001
From: Brooklyn Nicholson
Date: Fri, 19 Jun 2026 17:05:34 -0500
Subject: [PATCH] ci: make dependency installs resilient to transient flakes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`npm ci` / `uv sync` / toolchain header fetches occasionally die on
transient network blips — e.g. node-pty's node-gyp fetching Node headers
(an undici assert) during the typecheck job's `npm ci`, which killed the
job before `tsc` ever ran. "Re-run and it goes green" is exactly what CI
should do itself.

- New reusable `.github/actions/retry` composite action wraps a command
  and retries on failure (3x / 10s by default, command passed via env so
  it can't inject). `attempts` / `delay` are validated as integers up
  front so a typo cannot turn the loop into an unbounded retry, and the
  command's own exit status is surfaced on final failure. Applied to
  every PR-path network install: npm ci (typecheck, desktop build, docs
  site), uv sync (tests, e2e), uv tool install (lint), pip install (docs
  site).
- typecheck now runs `npm ci --ignore-scripts`: `tsc` needs only sources
  + type defs, so skipping install scripts drops node-pty's native
  rebuild (whose header fetch was the flake) and is faster. Validated
  locally — tsc passes for ui-tui, apps/shared, and apps/desktop with
  scripts skipped.
- ripgrep download uses `curl --retry`.

Docker (main-only) and the release/windows workflows are intentionally
left for a follow-up.
---
Note: action.yml was amended after this patch was first generated, so its
`index` blob-id line is omitted rather than left stale.

 .github/actions/retry/action.yml       | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 .github/workflows/docs-site-checks.yml | 10 +++++++---
 .github/workflows/lint.yml             | 10 ++++++----
 .github/workflows/tests.yml            | 12 ++++++++----
 .github/workflows/typecheck.yml        | 14 ++++++++++++--
 5 files changed, 100 insertions(+), 13 deletions(-)
 create mode 100644 .github/actions/retry/action.yml

diff --git a/.github/actions/retry/action.yml b/.github/actions/retry/action.yml
new file mode 100644
--- /dev/null
+++ b/.github/actions/retry/action.yml
@@ -0,0 +1,67 @@
+name: Retry a flaky command
+description: >-
+  Run a shell command, retrying on non-zero exit. For dependency installs
+  (npm ci, uv sync) whose only failures are transient network/toolchain
+  flakes — a node-gyp header fetch, a registry blip — so CI self-heals
+  instead of needing a manual re-run.
+
+inputs:
+  command:
+    description: Shell command to run (and retry).
+    required: true
+  attempts:
+    description: Max attempts before giving up (non-negative integer).
+    default: "3"
+  delay:
+    description: Seconds to wait between attempts (non-negative integer).
+    default: "10"
+  working-directory:
+    description: Directory to run in.
+    default: "."
+
+runs:
+  using: composite
+  steps:
+    - shell: bash
+      working-directory: ${{ inputs.working-directory }}
+      # command goes through env, never interpolated into the script body, so
+      # a command with quotes/specials can't break or inject into the runner.
+      env:
+        _CMD: ${{ inputs.command }}
+        _ATTEMPTS: ${{ inputs.attempts }}
+        _DELAY: ${{ inputs.delay }}
+      run: |
+        set -uo pipefail
+        # Reject non-integer inputs up front: `[ -ge ]` errors (status 2) on
+        # a non-integer, which the loop below would treat as "retry" and spin
+        # until the job timeout.
+        require_int() {
+          case "$2" in
+            '' | *[!0-9]*)
+              echo "::error::$1 must be a non-negative integer, got '$2'"
+              exit 2
+              ;;
+          esac
+        }
+        require_int attempts "$_ATTEMPTS"
+        require_int delay "$_DELAY"
+        n=0
+        while :; do
+          n=$((n + 1))
+          echo "::group::attempt $n/$_ATTEMPTS: $_CMD"
+          # -e/pipefail: a multi-line command fails on its first failing line,
+          # as in a regular `run:` step, rather than only on its last.
+          rc=0
+          bash -eo pipefail -c "$_CMD" || rc=$?
+          echo "::endgroup::"
+          if [ "$rc" -eq 0 ]; then
+            exit 0
+          fi
+          if [ "$n" -ge "$_ATTEMPTS" ]; then
+            echo "::error::failed after $n attempts (exit $rc): $_CMD"
+            # Surface the command's own status instead of a generic 1.
+            exit "$rc"
+          fi
+          echo "::warning::attempt $n failed (exit $rc); retrying in ${_DELAY}s: $_CMD"
+          sleep "$_DELAY"
+        done
diff --git a/.github/workflows/docs-site-checks.yml b/.github/workflows/docs-site-checks.yml
index 53f8dce93f0..3ffe51ec744 100644
--- a/.github/workflows/docs-site-checks.yml
+++ b/.github/workflows/docs-site-checks.yml
@@ -36,8 +36,10 @@ jobs:
 
       - name: Install website dependencies
         if: steps.changes.outputs.site == 'true'
-        run: npm ci
-        working-directory: website
+        uses: ./.github/actions/retry
+        with:
+          command: npm ci
+          working-directory: website
 
       - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
         if: steps.changes.outputs.site == 'true'
@@ -46,7 +48,9 @@ jobs:
 
       - name: Install ascii-guard
         if: steps.changes.outputs.site == 'true'
-        run: python -m pip install ascii-guard==2.3.0 pyyaml==6.0.3
+        uses: ./.github/actions/retry
+        with:
+          command: python -m pip install ascii-guard==2.3.0 pyyaml==6.0.3
 
       - name: Extract skill metadata for dashboard
         if: steps.changes.outputs.site == 'true'
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 30e0ca68f8e..a9e496fcd4d 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -54,9 +54,9 @@ jobs:
 
       - name: Install ruff + ty
         if: steps.changes.outputs.python == 'true'
-        run: |
-          uv tool install ruff
-          uv tool install ty
+        uses: ./.github/actions/retry
+        with:
+          command: uv tool install ruff && uv tool install ty
 
       - name: Determine base ref
         id: base
@@ -194,7 +194,9 @@ jobs:
 
       - name: Install ruff
         if: steps.changes.outputs.python == 'true'
-        run: uv tool install ruff
+        uses: ./.github/actions/retry
+        with:
+          command: uv tool install ruff
 
       - name: ruff check .
         if: steps.changes.outputs.python == 'true'
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c4dae1166dd..d40212bbcac 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -60,7 +60,7 @@ jobs:
           RG_VERSION=15.1.0
           RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
           RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
-          curl -sSfL -o "$RG_TARBALL" \
+          curl -sSfL --retry 3 --retry-delay 5 -o "$RG_TARBALL" \
             "https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
           echo "${RG_SHA256}  ${RG_TARBALL}" | sha256sum -c -
           tar -xzf "$RG_TARBALL"
@@ -92,7 +92,9 @@ jobs:
         # fails if the lock is out of sync with pyproject.toml), giving a
         # reproducible env. It also creates .venv itself, so no separate
         # `uv venv` step is needed.
-        run: uv sync --locked --python 3.11 --extra all --extra dev
+        uses: ./.github/actions/retry
+        with:
+          command: uv sync --locked --python 3.11 --extra all --extra dev
 
       - name: Minimize uv cache
         if: steps.changes.outputs.python == 'true'
@@ -195,7 +197,7 @@ jobs:
           RG_VERSION=15.1.0
           RG_SHA256=1c9297be4a084eea7ecaedf93eb03d058d6faae29bbc57ecdaf5063921491599
           RG_TARBALL=ripgrep-${RG_VERSION}-x86_64-unknown-linux-musl.tar.gz
-          curl -sSfL -o "$RG_TARBALL" \
+          curl -sSfL --retry 3 --retry-delay 5 -o "$RG_TARBALL" \
             "https://github.com/BurntSushi/ripgrep/releases/download/${RG_VERSION}/${RG_TARBALL}"
           echo "${RG_SHA256}  ${RG_TARBALL}" | sha256sum -c -
           tar -xzf "$RG_TARBALL"
@@ -227,7 +229,9 @@ jobs:
         # fails if the lock is out of sync with pyproject.toml), giving a
         # reproducible env. It also creates .venv itself, so no separate
         # `uv venv` step is needed.
-        run: uv sync --locked --python 3.11 --extra all --extra dev
+        uses: ./.github/actions/retry
+        with:
+          command: uv sync --locked --python 3.11 --extra all --extra dev
 
       - name: Minimize uv cache
         if: steps.changes.outputs.python == 'true'
diff --git a/.github/workflows/typecheck.yml b/.github/workflows/typecheck.yml
index aeb7c35cdc8..b52161d3121 100644
--- a/.github/workflows/typecheck.yml
+++ b/.github/workflows/typecheck.yml
@@ -32,8 +32,14 @@ jobs:
         with:
           node-version: 22
           cache: npm
+      # --ignore-scripts: typecheck only needs the TS sources + type defs, not
+      # native builds. Skipping install scripts drops node-pty's node-gyp
+      # header fetch — the transient flake that killed this job pre-`tsc` — and
+      # is faster. retry covers the remaining registry blips.
       - if: steps.changes.outputs.frontend == 'true'
-        run: npm ci
+        uses: ./.github/actions/retry
+        with:
+          command: npm ci --ignore-scripts
       - if: steps.changes.outputs.frontend == 'true'
         run: npm run --prefix ${{ matrix.package }} typecheck
 
@@ -56,7 +62,11 @@ jobs:
         with:
           node-version: 22
           cache: npm
+      # Keep install scripts here: the production build may need node-pty's
+      # native binary. retry handles the transient install-time fetch flakes.
       - if: steps.changes.outputs.frontend == 'true'
-        run: npm ci
+        uses: ./.github/actions/retry
+        with:
+          command: npm ci
       - if: steps.changes.outputs.frontend == 'true'
         run: npm run --prefix apps/desktop build