From dd86deef137a39aed934175bf80c9189ad88a94f Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 13 Apr 2026 21:13:08 -0700
Subject: [PATCH] feat(ci): add contributor attribution check on PRs (#9376)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a CI workflow that blocks PRs introducing commits with unmapped
author emails. Checks each new commit's author email against AUTHOR_MAP
in scripts/release.py — GitHub noreply emails auto-pass, but
personal/work emails must be mapped.

Also adds --strict and --diff-base flags to contributor_audit.py for
programmatic use. --strict exits 1 when new unmapped emails are found;
--diff-base scopes the check to only flag emails from commits after a
given ref (grandfathers existing unknowns).

Prevention for the 97-unmapped-email gap found in the April 2026
contributor audit.
---
 .github/workflows/contributor-check.yml | 81 +++++++++++++++++++++++++
 scripts/contributor_audit.py            | 44 ++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 .github/workflows/contributor-check.yml

diff --git a/.github/workflows/contributor-check.yml b/.github/workflows/contributor-check.yml
new file mode 100644
index 000000000..f8d65a3ea
--- /dev/null
+++ b/.github/workflows/contributor-check.yml
@@ -0,0 +1,81 @@
+name: Contributor Attribution Check
+
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      # Only run when code files change (not docs-only PRs)
+      - '*.py'
+      - '**/*.py'
+      - '.github/workflows/contributor-check.yml'
+
+# Least privilege: this job only reads the checked-out repository.
+permissions:
+  contents: read
+
+jobs:
+  check-attribution:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Full history needed for git log
+
+      - name: Check for unmapped contributor emails
+        run: |
+          # Get the merge base between this PR and main
+          MERGE_BASE=$(git merge-base origin/main HEAD)
+
+          # Find any new author emails in this PR's commits
+          NEW_EMAILS=$(git log "${MERGE_BASE}..HEAD" --format='%ae' --no-merges | sort -u)
+
+          if [ -z "$NEW_EMAILS" ]; then
+            echo "No new commits to check."
+            exit 0
+          fi
+
+          # Check each email against AUTHOR_MAP in release.py. Emails and
+          # author names come from the PR's commits, so they are collected in
+          # arrays and printed via printf '%s' rather than echo -e, which
+          # would expand backslash escapes embedded in them.
+          MISSING_EMAILS=()
+          MISSING_AUTHORS=()
+          while IFS= read -r email; do
+            # Skip teknium and bot emails; GitHub "+" noreply emails
+            # auto-resolve, so they pass as well.
+            case "$email" in
+              *teknium*|*noreply@github.com*|*dependabot*|*github-actions*|*anthropic.com*|*cursor.com*)
+                continue ;;
+              *+*@users.noreply.github.com*)
+                continue ;;
+            esac
+
+            # Look for the email as a double-quoted literal in release.py
+            if ! grep -qF "\"${email}\"" scripts/release.py 2>/dev/null; then
+              # Resolve the display name from this PR's commits only, matching
+              # "<email>" as a fixed string instead of a regex over all history.
+              AUTHOR=$(git log "${MERGE_BASE}..HEAD" --no-merges --fixed-strings \
+                --author="<${email}>" --format='%an' -1)
+              MISSING_EMAILS+=("$email")
+              MISSING_AUTHORS+=("$AUTHOR")
+            fi
+          done <<< "$NEW_EMAILS"
+
+          if [ "${#MISSING_EMAILS[@]}" -gt 0 ]; then
+            echo ""
+            echo "⚠️ New contributor email(s) not in AUTHOR_MAP:"
+            for i in "${!MISSING_EMAILS[@]}"; do
+              printf ' %s (%s)\n' "${MISSING_EMAILS[$i]}" "${MISSING_AUTHORS[$i]}"
+            done
+            echo ""
+            echo "Please add mappings to scripts/release.py AUTHOR_MAP:"
+            for email in "${MISSING_EMAILS[@]}"; do
+              printf ' "%s": "",\n' "$email"
+            done
+            echo ""
+            echo "To find the GitHub username for an email:"
+            echo " gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'"
+            exit 1
+          else
+            echo "✅ All contributor emails are mapped in AUTHOR_MAP."
+          fi
diff --git a/scripts/contributor_audit.py b/scripts/contributor_audit.py
index 5d39f8316..474b0d52b 100644
--- a/scripts/contributor_audit.py
+++ b/scripts/contributor_audit.py
@@ -333,6 +333,16 @@ def main():
         default=None,
         help="Path to a release notes file to check for missing contributors",
     )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Exit with code 1 if new unmapped emails are found (for CI)",
+    )
+    parser.add_argument(
+        "--diff-base",
+        default=None,
+        help="Git ref to diff against (with --strict: only flag emails from commits after this ref)",
+    )
     args = parser.parse_args()
 
     print(f"=== Contributor Audit: {args.since_tag}..{args.until} ===")
@@ -398,6 +408,37 @@ def main():
         for email, name in sorted(all_unknowns.items()):
             print(f' "{email}": "{name}",')
 
+    # ---- Strict mode: fail CI if new unmapped emails are introduced ----
+    # Existing unknowns are grandfathered when --diff-base is given: only
+    # emails authored in commits after that ref count as new. Without
+    # --diff-base every unknown email fails the check.
+    strict_failed = False
+    if args.strict and all_unknowns:
+        new_unknowns = all_unknowns
+        if args.diff_base:
+            new_commits_output = git(
+                "log", f"{args.diff_base}..HEAD",
+                "--format=%ae", "--no-merges",
+            )
+            new_emails = set(new_commits_output.splitlines()) if new_commits_output else set()
+            new_unknowns = {
+                email: name
+                for email, name in all_unknowns.items()
+                if email in new_emails
+            }
+
+        if new_unknowns:
+            print()
+            print(f"=== STRICT MODE FAILURE: {len(new_unknowns)} new unmapped email(s) ===")
+            print("Add these to AUTHOR_MAP in scripts/release.py before merging:")
+            print()
+            for email in sorted(new_unknowns):
+                print(f' "{email}": "",')
+            print()
+            print("To find the GitHub username:")
+            print(" gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'")
+            strict_failed = True
+
     # ---- Release file comparison ----
     if args.release_file:
         print()
@@ -419,6 +460,9 @@ def main():
     print()
     print("Done.")
 
+    if strict_failed:
+        sys.exit(1)
+
 
 if __name__ == "__main__":
     main()