#!/usr/bin/env python3
"""Cross-source entity resolution (stdlib-only).

Given two CSV files with name columns, find candidate matches using three
tiers of normalization:

  1. exact          — normalized strings equal
  2. fuzzy          — sorted-token (word-bag) match
  3. token_overlap  — >=60% Jaccard overlap on >=4-char tokens, >=2 shared

Adapted from ShinMegamiBoson/OpenPlanter (MIT) but generalized: no Boston-
specific record types, no contribution-code filters, no fixed schemas.

Output CSV columns:
  match_type, confidence, left_name, right_name, left_normalized,
  right_normalized, left_row, right_row, overlap_ratio, shared_tokens
"""
from __future__ import annotations

import argparse
import csv
import sys
from pathlib import Path

# Allow running directly or as a module.
sys.path.insert(0, str(Path(__file__).parent))
from _normalize import (  # noqa: E402
    normalize_name,
    normalize_aggressive,
    token_overlap_ratio,
)

# Confidence label written to the output for each match tier.
CONFIDENCE = {
    "exact": "high",
    "fuzzy": "medium",
    "token_overlap": "low",
}

# Column order of the output CSV (matches the list in the module docstring).
_OUT_FIELDS = [
    "match_type",
    "confidence",
    "left_name",
    "right_name",
    "left_normalized",
    "right_normalized",
    "left_row",
    "right_row",
    "overlap_ratio",
    "shared_tokens",
]


def _name_of(row: dict[str, str], col: str) -> str:
    """Return the name cell of *row*, mapping a missing or None cell to "".

    csv.DictReader fills cells that are absent from a short row with None,
    so a plain ``row.get(col, "")`` can still hand None to the normalizers.
    """
    return row.get(col) or ""


def _read_csv(path: str, name_col: str) -> list[dict[str, str]]:
    """Load every data row of the CSV at *path* as a dict.

    Each returned row gains a ``"__row__"`` key holding its 0-based index
    among the data rows (the header is not counted), as a string.

    Raises:
        SystemExit: if *name_col* is not one of the CSV's header fields.
    """
    rows = []
    # utf-8-sig strips an optional BOM so the first header name is not
    # silently prefixed with U+FEFF; BOM-less UTF-8 decodes unchanged.
    with open(path, newline="", encoding="utf-8-sig") as fh:
        reader = csv.DictReader(fh)
        if name_col not in (reader.fieldnames or []):
            raise SystemExit(
                f"Column {name_col!r} not in {path}. "
                f"Available: {reader.fieldnames}"
            )
        for i, row in enumerate(reader):
            row["__row__"] = str(i)
            rows.append(row)
    return rows


def _build_index(
    rows: list[dict[str, str]], name_col: str
) -> tuple[dict[str, list[dict[str, str]]], dict[str, list[dict[str, str]]]]:
    """Index by exact-normalized and aggressive (sorted-token) form.

    Returns a pair ``(exact, aggressive)``; each maps a normalized key to the
    list of rows producing that key. Rows whose key is falsy are left out of
    the corresponding index.
    """
    exact: dict[str, list[dict[str, str]]] = {}
    aggressive: dict[str, list[dict[str, str]]] = {}
    for row in rows:
        raw = _name_of(row, name_col)
        n = normalize_name(raw)
        if n:
            exact.setdefault(n, []).append(row)
        a = normalize_aggressive(raw)
        if a:
            aggressive.setdefault(a, []).append(row)
    return exact, aggressive


def _emit(
    out_rows: list[dict[str, str]],
    seen: set[tuple],
    match_type: str,
    left_row: dict[str, str],
    right_row: dict[str, str],
    left_col: str,
    right_col: str,
    ratio: float = 0.0,
    shared: int = 0,
):
    """Append one match record to *out_rows* unless it was already emitted.

    Deduplication is keyed on (left row index, right row index, match_type),
    recorded in *seen*.
    NOTE(review): because the tier is part of the key, the same pair can be
    written once per tier it qualifies for — confirm this is intended.

    *ratio* and *shared* are written as empty strings when they are zero.
    """
    key = (left_row["__row__"], right_row["__row__"], match_type)
    if key in seen:
        return
    seen.add(key)

    left_raw = _name_of(left_row, left_col)
    right_raw = _name_of(right_row, right_col)
    out_rows.append(
        {
            "match_type": match_type,
            "confidence": CONFIDENCE[match_type],
            "left_name": left_raw,
            "right_name": right_raw,
            "left_normalized": normalize_name(left_raw),
            "right_normalized": normalize_name(right_raw),
            "left_row": left_row["__row__"],
            "right_row": right_row["__row__"],
            "overlap_ratio": f"{ratio:.3f}" if ratio else "",
            "shared_tokens": str(shared) if shared else "",
        }
    )


def resolve(
    left_path: str,
    left_col: str,
    right_path: str,
    right_col: str,
    out_path: str,
    overlap_threshold: float = 0.60,
    min_shared: int = 2,
    skip_overlap: bool = False,
) -> int:
    """Match names between two CSVs and write the candidates to *out_path*.

    Args:
        left_path, right_path: input CSV paths.
        left_col, right_col: name column in the respective CSV.
        out_path: output CSV path (overwritten).
        overlap_threshold: minimum overlap ratio for the token_overlap tier.
        min_shared: minimum shared-token count for the token_overlap tier.
        skip_overlap: when true, skip the pairwise token_overlap pass.

    Returns:
        The number of match rows written.

    Raises:
        SystemExit: if a name column is missing from its CSV.
    """
    left_rows = _read_csv(left_path, left_col)
    right_rows = _read_csv(right_path, right_col)
    right_exact, right_aggressive = _build_index(right_rows, right_col)

    out_rows: list[dict[str, str]] = []
    seen: set[tuple] = set()

    # Pass 1+2: exact / fuzzy via index lookup.
    for lrow in left_rows:
        raw = _name_of(lrow, left_col)
        n = normalize_name(raw)
        if not n:
            continue
        for rrow in right_exact.get(n, []):
            _emit(out_rows, seen, "exact", lrow, rrow, left_col, right_col)
        a = normalize_aggressive(raw)
        if a:
            for rrow in right_aggressive.get(a, []):
                _emit(out_rows, seen, "fuzzy", lrow, rrow, left_col, right_col)

    if not skip_overlap:
        # Pass 3: token overlap (O(N*M) — expensive; allow opt-out).
        for lrow in left_rows:
            l_raw = _name_of(lrow, left_col)
            if not normalize_name(l_raw):
                continue
            for rrow in right_rows:
                ratio, shared = token_overlap_ratio(
                    l_raw, _name_of(rrow, right_col)
                )
                if ratio >= overlap_threshold and shared >= min_shared:
                    _emit(
                        out_rows,
                        seen,
                        "token_overlap",
                        lrow,
                        rrow,
                        left_col,
                        right_col,
                        ratio=ratio,
                        shared=shared,
                    )

    with open(out_path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=_OUT_FIELDS)
        writer.writeheader()
        writer.writerows(out_rows)
    return len(out_rows)


def main() -> int:
    """Parse command-line arguments, run resolve(), and report the count."""
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--left", required=True, help="Left CSV path")
    p.add_argument(
        "--left-name-col", required=True, help="Name column in left CSV"
    )
    p.add_argument("--right", required=True, help="Right CSV path")
    p.add_argument(
        "--right-name-col",
        required=True,
        help="Name column in right CSV",
    )
    p.add_argument("--out", required=True, help="Output CSV path")
    p.add_argument(
        "--overlap-threshold",
        type=float,
        default=0.60,
        help="Jaccard overlap threshold for token_overlap tier (default 0.60)",
    )
    p.add_argument(
        "--min-shared",
        type=int,
        default=2,
        help="Minimum shared tokens for token_overlap tier (default 2)",
    )
    p.add_argument(
        "--skip-overlap",
        action="store_true",
        help="Skip the O(N*M) token_overlap pass (much faster on large CSVs)",
    )
    args = p.parse_args()

    count = resolve(
        left_path=args.left,
        left_col=args.left_name_col,
        right_path=args.right,
        right_col=args.right_name_col,
        out_path=args.out,
        overlap_threshold=args.overlap_threshold,
        min_shared=args.min_shared,
        skip_overlap=args.skip_overlap,
    )
    print(f"Wrote {count} match rows to {args.out}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())