From def8b959b814d158c80e66d70f501adc3b6e9384 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:31:27 -0700 Subject: [PATCH] fix: add contributor audit script + fix missed contributors (#9264) Three problems fixed: 1. bobashopcashier missing from v0.9.0 contributor list despite authoring the gateway drain PR (#7290, salvaged into #7503). Their email (kennyx102@gmail.com) was missing from AUTHOR_MAP. 2. release.py only scanned git commit authors, missing Co-authored-by trailers. Now parse_coauthors() extracts trailers from commit bodies. 3. No mechanism to detect contributors from salvaged PRs (where original author only appears in PR description, not git log). Changes: - scripts/release.py: add kennyx102@gmail.com to AUTHOR_MAP, enhance get_commits() to parse Co-authored-by trailers, filter AI assistants (Claude, Copilot, Cursor Agent) from co-author lists - scripts/contributor_audit.py: new script that cross-references git authors, co-author trailers, and salvaged PR descriptions. Reports unknown emails and contributors missing from release notes. 
- RELEASE_v0.9.0.md: add bobashopcashier to community contributors Usage: python scripts/contributor_audit.py --since-tag v2026.4.8 python scripts/contributor_audit.py --since-tag v2026.4.8 --release-file RELEASE_v0.9.0.md --- RELEASE_v0.9.0.md | 1 + scripts/contributor_audit.py | 424 +++++++++++++++++++++++++++++++++++ scripts/release.py | 53 ++++- 3 files changed, 473 insertions(+), 5 deletions(-) create mode 100644 scripts/contributor_audit.py diff --git a/RELEASE_v0.9.0.md b/RELEASE_v0.9.0.md index e895d818b..15d5b84b4 100644 --- a/RELEASE_v0.9.0.md +++ b/RELEASE_v0.9.0.md @@ -318,6 +318,7 @@ - **@JiayuuWang** — CLI uninstall import fix - **@HiddenPuppy** — Docker procps installation - **@dsocolobsky** — Test suite fixes +- **@bobashopcashier** (1 PR) — Graceful gateway drain before restart (salvaged into #7503 from #7290) - **@benbarclay** — Docker image tag simplification - **@sosyz** — Shallow git clone for faster install - **@devorun** — Nix setupSecrets optional diff --git a/scripts/contributor_audit.py b/scripts/contributor_audit.py new file mode 100644 index 000000000..5d39f8316 --- /dev/null +++ b/scripts/contributor_audit.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +"""Contributor Audit Script + +Cross-references git authors, Co-authored-by trailers, and salvaged PR +descriptions to find any contributors missing from the release notes. 
# ---------------------------------------------------------------------------
# AI assistants, bots, and machine accounts to exclude from contributor lists
# ---------------------------------------------------------------------------
# Each pattern is applied with ``search`` to a contributor's display name or
# handle; the ``^`` / ``$`` anchors decide prefix vs. whole-name matching.
IGNORED_PATTERNS = [
    re.compile(r"^Claude", re.IGNORECASE),
    re.compile(r"^Copilot$", re.IGNORECASE),
    re.compile(r"^Cursor\s+Agent$", re.IGNORECASE),
    re.compile(r"^GitHub\s*Actions?$", re.IGNORECASE),
    re.compile(r"^dependabot", re.IGNORECASE),
    re.compile(r"^renovate", re.IGNORECASE),
    re.compile(r"^Hermes\s+(Agent|Audit)$", re.IGNORECASE),
    re.compile(r"^Ubuntu$", re.IGNORECASE),
    # Catch-all for app/bot identities written as "<name>[bot]"
    # (e.g. "github-actions[bot]"), which none of the patterns above match.
    re.compile(r"\[bot\]$", re.IGNORECASE),
]

# Compared against a lower-cased email, so every entry must be lower-case.
IGNORED_EMAILS = {
    "noreply@anthropic.com",
    "noreply@github.com",
    "cursoragent@cursor.com",
    "hermes@nousresearch.com",
    "hermes-audit@example.com",
    "hermes@habibilabs.dev",
}


def is_ignored(handle: str, email: str = "") -> bool:
    """Return True if this contributor is a bot/AI/machine account.

    Args:
        handle: Display name or GitHub handle. ``None`` is treated as "".
        email: Optional email address. Compared case-insensitively, with
            surrounding whitespace ignored, against ``IGNORED_EMAILS``.

    Returns:
        True when the email is in ``IGNORED_EMAILS`` or the handle matches
        any entry of ``IGNORED_PATTERNS``; False otherwise.
    """
    # Email addresses are effectively case-insensitive; normalise before the
    # set lookup so "NoReply@Anthropic.com" is still recognised.
    if (email or "").strip().lower() in IGNORED_EMAILS:
        return True
    name = (handle or "").strip()
    return any(pattern.search(name) for pattern in IGNORED_PATTERNS)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def git(*args, cwd=None):
    """Run a git command and return its stripped stdout.

    Args:
        *args: Arguments passed to ``git`` verbatim (no shell is involved).
        cwd: Working directory; defaults to ``REPO_ROOT`` when falsy.

    Returns:
        The command's stdout with surrounding whitespace stripped, or "" when
        git cannot be started or exits non-zero. A warning is printed to
        stderr in both failure cases.
    """
    try:
        result = subprocess.run(
            ["git", *args],
            capture_output=True,
            text=True,
            cwd=cwd or str(REPO_ROOT),
        )
    except FileNotFoundError as exc:
        # Raised when the git executable (or the cwd) does not exist. Degrade
        # to "" like any other git failure instead of crashing the audit.
        print(f" [warn] git {' '.join(args)} could not be started: {exc}", file=sys.stderr)
        return ""
    if result.returncode != 0:
        print(f" [warn] git {' '.join(args)} failed: {result.stderr.strip()}", file=sys.stderr)
        return ""
    return result.stdout.strip()


def gh_pr_list(repo="NousResearch/hermes-agent", limit=300):
    """Fetch merged PRs from GitHub using the gh CLI.

    Args:
        repo: ``owner/name`` of the repository to query.
        limit: Maximum number of PRs to request from ``gh pr list``.

    Returns:
        A list of dicts with keys: number, title, body, author, mergedAt.
        Returns an empty list if gh is not available, times out, exits
        non-zero, or prints something that is not valid JSON.
    """
    command = [
        "gh", "pr", "list",
        "--repo", repo,
        "--state", "merged",
        "--json", "number,title,body,author,mergedAt",
        "--limit", str(limit),
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=60)
    except FileNotFoundError:
        print(" [warn] 'gh' CLI not found — skipping salvaged PR scan.", file=sys.stderr)
        return []
    except subprocess.TimeoutExpired:
        print(" [warn] gh pr list timed out — skipping salvaged PR scan.", file=sys.stderr)
        return []

    if result.returncode != 0:
        print(f" [warn] gh pr list failed: {result.stderr.strip()}", file=sys.stderr)
        return []

    try:
        prs = json.loads(result.stdout)
    except json.JSONDecodeError:
        print(" [warn] gh pr list returned invalid JSON — skipping salvaged PR scan.", file=sys.stderr)
        return []

    # A full page means older merged PRs were probably cut off; say so rather
    # than silently auditing an incomplete set.
    if isinstance(prs, list) and len(prs) >= limit:
        print(
            f" [warn] gh pr list returned {len(prs)} PRs (limit {limit}); older PRs may be missing.",
            file=sys.stderr,
        )
    return prs
# ---------------------------------------------------------------------------
# Contributor collection
# ---------------------------------------------------------------------------

# Patterns that indicate salvaged/cherry-picked/co-authored work in PR bodies.
# Group 1 is a GitHub handle, except for the "#123" form where it is a PR number.
SALVAGE_PATTERNS = [
    # "Salvaged from @username" or "Salvaged from #123"
    re.compile(r"[Ss]alvaged\s+from\s+@(\w[\w-]*)"),
    re.compile(r"[Ss]alvaged\s+from\s+#(\d+)"),
    # "Cherry-picked from @username"
    re.compile(r"[Cc]herry[- ]?picked\s+from\s+@(\w[\w-]*)"),
    # "Based on work by @username"
    re.compile(r"[Bb]ased\s+on\s+work\s+by\s+@(\w[\w-]*)"),
    # "Original PR by @username"
    re.compile(r"[Oo]riginal\s+PR\s+by\s+@(\w[\w-]*)"),
    # "Co-authored with @username"
    re.compile(r"[Cc]o[- ]?authored\s+with\s+@(\w[\w-]*)"),
]

# Pattern for Co-authored-by trailers in commit messages:
# group 1 is the display name, group 2 the email address.
CO_AUTHORED_RE = re.compile(
    r"Co-authored-by:\s*(.+?)\s*<([^>]+)>",
    re.IGNORECASE,
)


def collect_commit_authors(since_tag, until="HEAD"):
    """Collect contributors from git commit authors in ``since_tag..until``.

    Merge commits are excluded. Authors whose email is listed in
    ``IGNORED_EMAILS`` are skipped before resolution, so a bot address that
    also has an ``AUTHOR_MAP`` entry is not reported as a contributor.

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        unknown_emails: dict mapping email -> git name (for authors that
            ``resolve_author`` did not turn into an "@handle")
    """
    # Only name and email are needed. NUL-separate them so an author name
    # containing "|" (or any other printable delimiter) cannot break parsing.
    log = git(
        "log", f"{since_tag}..{until}",
        "--format=%an%x00%ae",
        "--no-merges",
    )

    contributors = defaultdict(set)
    unknown_emails = {}

    for line in log.split("\n"):
        name, separator, email = line.partition("\0")
        if not separator:
            # Blank or malformed line (also the single "" from an empty log).
            continue
        if email.strip().lower() in IGNORED_EMAILS:
            continue

        handle = resolve_author(name, email)
        # resolve_author returns "@handle" or plain name
        if handle.startswith("@"):
            contributors[handle[1:]].add("commit")
        else:
            # Could not resolve — record as unknown
            contributors[handle].add("commit")
            unknown_emails[email] = name

    return contributors, unknown_emails
def collect_co_authors(since_tag, until="HEAD"):
    """Collect contributors from Co-authored-by trailers in commit messages.

    Scans the body of every non-merge commit in ``since_tag..until`` line by
    line. A line counts only when it *starts* with the trailer (leading
    whitespace allowed), so prose that merely mentions "Co-authored-by: ..."
    mid-sentence is not credited. Co-authors whose email is listed in
    ``IGNORED_EMAILS`` are skipped before resolution.

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        unknown_emails: dict mapping email -> git name (for co-authors that
            ``resolve_author`` did not turn into an "@handle")
    """
    # Only the bodies are needed; trailers are found per line.
    log = git(
        "log", f"{since_tag}..{until}",
        "--format=%b",
        "--no-merges",
    )

    contributors = defaultdict(set)
    unknown_emails = {}

    for line in log.split("\n"):
        # match() anchors at the start of the stripped line.
        trailer = CO_AUTHORED_RE.match(line.strip())
        if trailer is None:
            continue
        name = trailer.group(1).strip()
        email = trailer.group(2).strip()
        if email.lower() in IGNORED_EMAILS:
            continue

        handle = resolve_author(name, email)
        # resolve_author returns "@handle" or plain name
        if handle.startswith("@"):
            contributors[handle[1:]].add("co-author")
        else:
            contributors[handle].add("co-author")
            unknown_emails[email] = name

    return contributors, unknown_emails
def _parse_iso_timestamp(value):
    """Parse an ISO 8601 timestamp into an aware datetime, or None if unusable."""
    from datetime import datetime, timezone

    text = (value or "").strip()
    if not text:
        return None
    # datetime.fromisoformat() only accepts a trailing "Z" on newer Pythons;
    # spell it as an explicit UTC offset so older interpreters parse it too.
    if text[-1] in "Zz":
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        # Treat an offset-less timestamp as UTC so comparisons never mix
        # naive and aware values.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def collect_salvaged_contributors(since_tag, until="HEAD"):
    """Scan merged PR bodies for salvage/cherry-pick/co-author attribution.

    Uses the gh CLI to fetch PRs, keeps those whose ``mergedAt`` lies between
    the commit dates of ``since_tag`` and ``until``, and scans their bodies
    with ``SALVAGE_PATTERNS``. Timestamps are compared as timezone-aware
    datetimes: git reports a local UTC offset while gh reports UTC ("Z"), so
    comparing the raw strings would order them incorrectly.

    PRs without a usable ``mergedAt`` are not date-filtered. Matches that are
    PR numbers ("#123") rather than handles are skipped, because resolving
    them to an author would need another API call.

    Returns:
        contributors: dict mapping github_handle -> set of source labels
        pr_refs: dict mapping github_handle -> list of PR numbers where found
            (each PR listed once per handle)
    """
    contributors = defaultdict(set)
    pr_refs = defaultdict(list)

    # Committer date is when the commit landed on the branch, which is the
    # moment comparable to a PR's merge time.
    since_date = _parse_iso_timestamp(git("log", "-1", "--format=%cI", since_tag))
    until_date = _parse_iso_timestamp(git("log", "-1", "--format=%cI", until))

    if since_date is None:
        print(f" [warn] Could not resolve date for {since_tag}", file=sys.stderr)
        return contributors, pr_refs

    for pr in gh_pr_list():
        # Filter by merge date if available
        merged_at = _parse_iso_timestamp(pr.get("mergedAt") or "")
        if merged_at is not None:
            if merged_at < since_date:
                continue
            if until_date is not None and merged_at > until_date:
                continue

        body = pr.get("body") or ""
        pr_number = pr.get("number", "?")

        for pattern in SALVAGE_PATTERNS:
            for match in pattern.finditer(body):
                value = match.group(1)
                # If it's a number, it's a PR reference — skip for now
                # (would need another API call to resolve PR author)
                if value.isdigit():
                    continue
                contributors[value].add("salvage")
                # Several patterns can hit the same handle in one PR body.
                if pr_number not in pr_refs[value]:
                    pr_refs[value].append(pr_number)

    return contributors, pr_refs
# ---------------------------------------------------------------------------
# Release file comparison
# ---------------------------------------------------------------------------

def check_release_file(release_file, all_contributors):
    """Check which contributors are mentioned in the release file.

    A contributor counts as mentioned when the handle (with or without a
    leading "@") appears as a whole token, case-insensitively. "Whole token"
    means it is not directly preceded or followed by a word character or "-",
    so handle "ben" is NOT satisfied by "@benbarclay".

    Args:
        release_file: Path to the release notes file (read as UTF-8).
        all_contributors: Iterable of handles / names to look for.

    Returns:
        mentioned: set of handles found in the file
        missing: set of handles NOT found in the file. If the file does not
            exist an error is printed to stderr and every contributor is
            reported as missing.
    """
    try:
        content = Path(release_file).read_text(encoding="utf-8")
    except FileNotFoundError:
        print(f" [error] Release file not found: {release_file}", file=sys.stderr)
        return set(), set(all_contributors)

    mentioned = set()
    missing = set()

    for handle in all_contributors:
        # re.escape keeps handles/names containing regex metacharacters literal.
        token = re.compile(
            r"(?<![\w-])@?" + re.escape(handle) + r"(?![\w-])",
            re.IGNORECASE,
        )
        if token.search(content):
            mentioned.add(handle)
        else:
            missing.add(handle)

    return mentioned, missing
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point for the contributor audit.

    Parses ``--since-tag`` (required), ``--until`` (default "HEAD") and
    ``--release-file`` (optional), gathers contributors from commit authors,
    Co-authored-by trailers and salvaged PR descriptions, drops bot/AI
    accounts, and prints the merged list, the unresolved emails, and — when a
    release file is given — which contributors it does not mention.
    """
    cli = argparse.ArgumentParser(
        description="Audit contributors across git history, co-author trailers, and salvaged PRs.",
    )
    cli.add_argument("--since-tag", required=True, help="Git tag to start from (e.g., v2026.4.8)")
    cli.add_argument("--until", default="HEAD", help="Git ref to end at (default: HEAD)")
    cli.add_argument(
        "--release-file",
        default=None,
        help="Path to a release notes file to check for missing contributors",
    )
    args = cli.parse_args()
    since, until = args.since_tag, args.until

    print(f"=== Contributor Audit: {since}..{until} ===")
    print()

    # Step 1: authors recorded on the commits themselves.
    print("[1/3] Scanning git commit authors...")
    commit_contribs, commit_unknowns = collect_commit_authors(since, until)
    print(f" Found {len(commit_contribs)} contributor(s) from commits.")

    # Step 2: people credited through Co-authored-by trailers.
    print("[2/3] Scanning Co-authored-by trailers...")
    coauthor_contribs, coauthor_unknowns = collect_co_authors(since, until)
    print(f" Found {len(coauthor_contribs)} contributor(s) from co-author trailers.")

    # Step 3: people credited only in PR descriptions.
    print("[3/3] Scanning salvaged/cherry-picked PR descriptions...")
    salvage_contribs, salvage_pr_refs = collect_salvaged_contributors(since, until)
    print(f" Found {len(salvage_contribs)} contributor(s) from salvaged PRs.")

    # Union the three handle -> sources maps.
    all_contributors = defaultdict(set)
    for source_map in (commit_contribs, coauthor_contribs, salvage_contribs):
        for handle, sources in source_map.items():
            all_contributors[handle].update(sources)

    # Later maps win on duplicate emails, i.e. co-author entries override
    # commit-author entries.
    all_unknowns = {**commit_unknowns, **coauthor_unknowns}

    # Drop AI assistants, bots, and machine accounts. Collect the handles
    # first so the dict is not mutated while it is being iterated.
    for bot_handle in [h for h in all_contributors if is_ignored(h)]:
        del all_contributors[bot_handle]
    all_unknowns = {
        email: name
        for email, name in all_unknowns.items()
        if not is_ignored(name, email)
    }

    # ---- Report: merged contributor list ----
    print()
    print(f"=== All Contributors ({len(all_contributors)}) ===")
    print()

    for handle in sorted(all_contributors, key=str.lower):
        labels = ", ".join(sorted(all_contributors[handle]))
        suffix = ""
        if handle in salvage_pr_refs:
            pr_list = ", ".join(f"#{n}" for n in salvage_pr_refs[handle])
            suffix = f" (PRs: {pr_list})"
        print(f" @{handle} [{labels}]{suffix}")

    # ---- Report: emails that did not resolve to a handle ----
    if all_unknowns:
        print()
        print(f"=== Unknown Emails ({len(all_unknowns)}) ===")
        print("These emails are not in AUTHOR_MAP and should be added:")
        print()
        for email, name in sorted(all_unknowns.items()):
            print(f' "{email}": "{name}",')

    # ---- Report: comparison against the release notes ----
    if args.release_file:
        print()
        print(f"=== Release File Check: {args.release_file} ===")
        print()
        mentioned, missing = check_release_file(args.release_file, all_contributors.keys())
        print(f" Mentioned in release notes: {len(mentioned)}")
        print(f" Missing from release notes: {len(missing)}")
        print()
        if not missing:
            print(" All contributors are mentioned in the release file!")
        else:
            print(" Contributors NOT mentioned in the release file:")
            for handle in sorted(missing, key=str.lower):
                labels = ", ".join(sorted(all_contributors[handle]))
                print(f" @{handle} [{labels}]")

    print()
    print("Done.")


if __name__ == "__main__":
    main()
# One ``Co-authored-by: Name <email>`` trailer. Anchored to the start of a
# line (MULTILINE) so prose that mentions a trailer mid-sentence is ignored;
# ``[ \t]`` instead of ``\s`` keeps a match from spilling across lines.
_COAUTHOR_TRAILER_RE = re.compile(
    r"^[ \t]*Co-authored-by:[ \t]*(.+?)[ \t]*<([^>\n]+)>",
    re.IGNORECASE | re.MULTILINE,
)

# AI/bot emails to ignore in co-author trailers. Entries are lower-case and
# compared against a lower-cased address; the list mirrors IGNORED_EMAILS in
# scripts/contributor_audit.py.
_IGNORED_COAUTHOR_EMAILS = frozenset({
    "noreply@anthropic.com",
    "noreply@github.com",
    "cursoragent@cursor.com",
    "hermes@nousresearch.com",
    "hermes-audit@example.com",
    "hermes@habibilabs.dev",
})

# AI/bot display-name prefixes to ignore in co-author trailers.
_IGNORED_COAUTHOR_NAMES = re.compile(
    r"^(Claude|Copilot|Cursor Agent|GitHub Actions?|dependabot|renovate|Hermes\s+(Agent|Audit))",
    re.IGNORECASE,
)


def parse_coauthors(body: str) -> list:
    """Extract Co-authored-by trailers from a commit message body.

    Args:
        body: Commit message body; ``None`` or "" yields an empty list.

    Returns:
        A list of {'name': ..., 'email': ...} dicts in order of first
        appearance. AI assistants and bots (Claude, Copilot, Cursor, etc.)
        are filtered out, and a co-author repeated within the same body
        (same email, compared case-insensitively) is returned only once so
        callers do not count them twice for one commit.
    """
    if not body:
        return []

    results = []
    seen_emails = set()
    for trailer in _COAUTHOR_TRAILER_RE.finditer(body):
        name = trailer.group(1).strip()
        email = trailer.group(2).strip()
        email_key = email.lower()
        if email_key in _IGNORED_COAUTHOR_EMAILS or _IGNORED_COAUTHOR_NAMES.match(name):
            continue
        if email_key in seen_emails:
            continue
        seen_emails.add(email_key)
        results.append({"name": name, "email": email})
    return results
get_commits(since_tag=None): "subject": subject, "category": categorize_commit(subject), "github_author": resolve_author(name, email), + "coauthors": coauthors, }) return commits @@ -389,6 +426,9 @@ def generate_changelog(commits, tag_name, semver, repo_url="https://github.com/N author = commit["github_author"] if author not in teknium_aliases: all_authors.add(author) + for coauthor in commit.get("coauthors", []): + if coauthor not in teknium_aliases: + all_authors.add(coauthor) # Category display order and emoji category_order = [ @@ -437,6 +477,9 @@ def generate_changelog(commits, tag_name, semver, repo_url="https://github.com/N author = commit["github_author"] if author not in teknium_aliases: author_counts[author] += 1 + for coauthor in commit.get("coauthors", []): + if coauthor not in teknium_aliases: + author_counts[coauthor] += 1 sorted_authors = sorted(author_counts.items(), key=lambda x: -x[1])