#!/usr/bin/env python3
"""Build a structured findings.json with evidence chains (stdlib-only).

Aggregates cross_links.csv (entity_resolution output) and an optional
timing.json (timing_analysis output) into a single evidence-chain document.

Output structure:
    {
      "metadata": {...},
      "findings": [
        {
          "id": "F0001",
          "title": "...",
          "severity": "HIGH|MEDIUM|LOW",
          "confidence": "high|medium|low",
          "summary": "...",
          "evidence": [
            {"source": "cross_links.csv", "row": 12, "fields": {...}},
            ...
          ],
          "sources": ["cross_links.csv", "timing.json"]
        }
      ]
    }

Every finding traces to specific source rows. No naked claims.
"""

from __future__ import annotations

import argparse
import csv
import json
from collections import defaultdict
from pathlib import Path

# Sort ranks: a lower value sorts first (most confident / most severe).
CONFIDENCE_ORDER = {"high": 0, "medium": 1, "low": 2}
SEVERITY_ORDER = {"HIGH": 0, "MEDIUM": 1, "LOW": 2}


def _read_cross_links(path: str) -> list[dict[str, str]]:
    """Return every data row of the CSV at *path* as a header-keyed dict.

    Note that ``csv.DictReader`` fills columns missing from a short row with
    ``None``, so callers must not assume every value is a string.
    """
    with open(path, newline="", encoding="utf-8") as fh:
        return list(csv.DictReader(fh))


def _match_findings(
    indexed_rows: list[tuple[int, dict[str, str]]],
    start_id: int,
) -> list[dict]:
    """Build one finding per distinct (left_normalized, right_normalized) pair."""
    grouped: dict[tuple[str, str], list[tuple[int, dict[str, str]]]] = defaultdict(list)
    for index, row in indexed_rows:
        left_norm = row.get("left_normalized", "")
        right_norm = row.get("right_normalized", "")
        # Rows lacking either normalized name cannot be attributed to a pair.
        if not left_norm or not right_norm:
            continue
        grouped[(left_norm, right_norm)].append((index, row))

    findings: list[dict] = []
    for offset, members in enumerate(grouped.values()):
        # The highest-confidence record sets the finding's overall confidence;
        # min() keeps the earliest row on ties.
        _, best = min(
            members,
            key=lambda pair: CONFIDENCE_ORDER.get(pair[1].get("confidence", "low"), 2),
        )
        evidence = [
            {
                "source": "cross_links.csv",
                "row": index,
                "fields": {
                    "match_type": row.get("match_type", ""),
                    "confidence": row.get("confidence", ""),
                    "left_name": row.get("left_name", ""),
                    "right_name": row.get("right_name", ""),
                    "overlap_ratio": row.get("overlap_ratio", ""),
                    "shared_tokens": row.get("shared_tokens", ""),
                },
            }
            for index, row in members
        ]
        findings.append(
            {
                "id": f"F{start_id + offset:04d}",
                "title": f"Entity match: {best.get('left_name', '')} ↔ {best.get('right_name', '')}",
                "severity": "MEDIUM" if best.get("confidence") == "high" else "LOW",
                "confidence": best.get("confidence", "low"),
                "summary": (
                    f"{len(members)} cross-link record(s) tie "
                    f"'{best.get('left_name', '')}' to "
                    f"'{best.get('right_name', '')}' "
                    f"(best tier: {best.get('match_type', '')})."
                ),
                "evidence": evidence,
                "sources": ["cross_links.csv"],
            }
        )
    return findings


def _bundled_findings(
    indexed_rows: list[tuple[int, dict[str, str]]],
    start_id: int,
    bundled_threshold: int,
) -> list[dict]:
    """Flag right-side entities linked to many distinct raw left names."""
    distinct_lefts: dict[str, set[str]] = defaultdict(set)
    rows_by_right: dict[str, list[tuple[int, dict[str, str]]]] = defaultdict(list)
    for index, row in indexed_rows:
        right_norm = row.get("right_normalized", "")
        # `or ""` guards against the None that DictReader stores for columns
        # missing from a short row; .get()'s default does not apply there.
        left_raw = (row.get("left_name") or "").strip()
        if right_norm and left_raw:
            distinct_lefts[right_norm].add(left_raw)
            rows_by_right[right_norm].append((index, row))

    findings: list[dict] = []
    for right_norm, lefts in distinct_lefts.items():
        if len(lefts) < bundled_threshold:
            continue
        members = rows_by_right[right_norm]
        right_raw = members[0][1].get("right_name", "")
        findings.append(
            {
                "id": f"F{start_id + len(findings):04d}",
                "title": f"Bundled cross-links: {len(lefts)} distinct left entities ↔ '{right_raw}'",
                "severity": "HIGH",
                "confidence": "medium",
                "summary": (
                    f"{len(lefts)} distinct left-side entities link to "
                    f"'{right_raw}'. Pattern suggests coordinated relationship "
                    f"(e.g. bundled donations, multi-vendor employer)."
                ),
                "evidence": [
                    {
                        "source": "cross_links.csv",
                        "row": index,
                        "fields": {
                            "left_name": row.get("left_name", ""),
                            "match_type": row.get("match_type", ""),
                        },
                    }
                    for index, row in members
                ],
                "sources": ["cross_links.csv"],
            }
        )
    return findings


def _timing_findings(timing_path: str, start_id: int) -> list[dict]:
    """Build one finding per result flagged ``significant`` in timing.json."""
    # Decode explicitly as UTF-8 (as the CSV reader does) rather than relying
    # on the platform default encoding.
    timing = json.loads(Path(timing_path).read_text(encoding="utf-8"))
    findings: list[dict] = []
    for result in timing.get("results", []):
        if not result.get("significant"):
            continue
        findings.append(
            {
                "id": f"F{start_id + len(findings):04d}",
                "title": (
                    f"Donation timing significantly clusters near awards: "
                    f"{result['donor']} ↔ {result['recipient']}"
                ),
                "severity": "HIGH" if result["p_value"] < 0.01 else "MEDIUM",
                "confidence": "medium",
                "summary": (
                    f"Mean nearest-award distance {result['observed_mean_days']} days "
                    f"(null {result['null_mean_days']} days). p={result['p_value']}, "
                    f"effect size {result['effect_size_sd']} SD. "
                    f"{result['n_donations']} donations, {result['n_award_dates']} awards."
                ),
                "evidence": [
                    {
                        "source": "timing.json",
                        "row": None,
                        "fields": result,
                    }
                ],
                "sources": ["timing.json"],
            }
        )
    return findings


def build_findings(
    cross_links_path: str,
    timing_path: str | None = None,
    out_path: str = "findings.json",
    bundled_threshold: int = 3,
) -> dict:
    """Aggregate cross-link and timing evidence into a findings document.

    Three kinds of findings are produced, with ids assigned in this order:

    1. One per distinct (left_normalized, right_normalized) pair.
    2. One per right-side entity linked to at least *bundled_threshold*
       distinct raw left names.
    3. One per ``significant`` result in the timing file.

    Args:
        cross_links_path: CSV of cross-link records (read as UTF-8).
        timing_path: Optional timing JSON; skipped when ``None`` or when the
            file does not exist.
        out_path: Destination for the JSON document.
        bundled_threshold: Minimum number of distinct left names that link to
            the same right entity for a bundled finding to be emitted.

    Returns:
        The payload written to *out_path*: ``{"metadata": ..., "findings": ...}``
        with findings sorted by severity, then confidence, then id. Evidence
        ``row`` values for cross_links.csv are zero-based data-row indexes
        (the header line is not counted).

    Raises:
        KeyError: If a significant timing result lacks an expected field.
    """
    indexed_rows = list(enumerate(_read_cross_links(cross_links_path)))

    findings: list[dict] = []
    findings += _match_findings(indexed_rows, start_id=len(findings) + 1)
    findings += _bundled_findings(
        indexed_rows,
        start_id=len(findings) + 1,
        bundled_threshold=bundled_threshold,
    )
    if timing_path and Path(timing_path).exists():
        findings += _timing_findings(timing_path, start_id=len(findings) + 1)

    # Sort: severity → confidence → id. Unknown labels rank after known ones.
    findings.sort(
        key=lambda f: (
            SEVERITY_ORDER.get(f["severity"], 3),
            CONFIDENCE_ORDER.get(f["confidence"], 3),
            f["id"],
        )
    )

    payload = {
        "metadata": {
            "n_findings": len(findings),
            "cross_links_path": cross_links_path,
            "timing_path": timing_path,
            "bundled_threshold": bundled_threshold,
        },
        "findings": findings,
    }
    Path(out_path).write_text(json.dumps(payload, indent=2), encoding="utf-8")
    return payload


def main() -> int:
    """Parse command-line arguments, build the findings file, return exit code 0."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--cross-links", required=True)
    parser.add_argument("--timing", help="Optional timing.json from timing_analysis.py")
    parser.add_argument("--out", default="findings.json")
    parser.add_argument(
        "--bundled-threshold",
        type=int,
        default=3,
        help="Minimum distinct left entities to flag as bundled (default 3)",
    )
    args = parser.parse_args()
    payload = build_findings(
        cross_links_path=args.cross_links,
        timing_path=args.timing,
        out_path=args.out,
        bundled_threshold=args.bundled_threshold,
    )
    print(f"Wrote {payload['metadata']['n_findings']} findings to {args.out}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())