#!/usr/bin/env python3 """Search the Internet Archive Wayback Machine via the CDX server. The CDX API indexes ~900B+ archived web pages. Anonymous read access, no auth required. Useful for finding deleted / changed pages by URL, domain, or substring match. """ from __future__ import annotations import argparse import csv import sys import urllib.parse from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) from _http import get_json # noqa: E402 BASE = "https://web.archive.org/cdx/search/cdx" COLUMNS = [ "url", "timestamp", "wayback_url", "mimetype", "status", "digest", "length", ] def fetch( url_or_host: str, match_type: str, from_date: str | None, to_date: str | None, status: str | None, mime: str | None, collapse: str | None, limit: int, out_path: str, ) -> int: params: dict[str, str] = { "url": url_or_host, "matchType": match_type, "output": "json", "limit": str(limit), } if from_date: params["from"] = from_date.replace("-", "") if to_date: params["to"] = to_date.replace("-", "") if status: params["filter"] = f"statuscode:{status}" if mime: params.setdefault("filter", "") # Multiple filters: CDX accepts repeated filter params via urlencode list params["filter"] = f"mimetype:{mime}" if collapse: params["collapse"] = collapse url = f"{BASE}?{urllib.parse.urlencode(params)}" try: payload = get_json(url) except Exception as e: # noqa: BLE001 print(f"Wayback CDX error: {e}", file=sys.stderr) payload = [] rows: list[dict[str, str]] = [] if isinstance(payload, list) and len(payload) > 1: header = payload[0] idx = {h: i for i, h in enumerate(header)} for entry in payload[1:]: ts = entry[idx["timestamp"]] if "timestamp" in idx else "" orig = entry[idx["original"]] if "original" in idx else "" rows.append( { "url": orig, "timestamp": ts, "wayback_url": f"https://web.archive.org/web/{ts}/{orig}" if ts and orig else "", "mimetype": entry[idx["mimetype"]] if "mimetype" in idx else "", "status": entry[idx["statuscode"]] if "statuscode" in idx else "", "digest": entry[idx["digest"]] if "digest" in idx else "", "length": entry[idx["length"]] if "length" in idx else "", } ) Path(out_path).parent.mkdir(parents=True, exist_ok=True) with open(out_path, "w", newline="", encoding="utf-8") as fh: w = csv.DictWriter(fh, fieldnames=COLUMNS) w.writeheader() w.writerows(rows) if not rows: print( f"Wayback Machine: 0 captures for {url_or_host!r} matchType={match_type}.", file=sys.stderr, ) return len(rows) def main() -> int: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument("--url", required=True, help="URL or host to look up in the archive") p.add_argument( "--match", default="exact", choices=["exact", "prefix", "host", "domain"], help=( "exact: this URL only. " "prefix: this URL's path-prefix. " "host: any URL on this host. " "domain: any URL on this domain or subdomains." ), ) p.add_argument("--from-date", help="Earliest capture YYYY-MM-DD") p.add_argument("--to-date", help="Latest capture YYYY-MM-DD") p.add_argument("--status", help="HTTP status filter (e.g. 200)") p.add_argument("--mime", help="MIME type filter (e.g. text/html)") p.add_argument( "--collapse", help="Collapse adjacent identical entries (e.g. 'digest' for unique-content captures)", ) p.add_argument("--limit", type=int, default=200) p.add_argument("--out", required=True) a = p.parse_args() n = fetch( url_or_host=a.url, match_type=a.match, from_date=a.from_date, to_date=a.to_date, status=a.status, mime=a.mime, collapse=a.collapse, limit=a.limit, out_path=a.out, ) print(f"Wrote {n} Wayback capture rows to {a.out}") return 0 if __name__ == "__main__": raise SystemExit(main())