#!/usr/bin/env python3 """Search the GDELT 2.0 DOC API for news mentions. GDELT monitors world news in 100+ languages and indexes the full text. Free, anonymous, ~15-minute update frequency. Covers ~2015→present. Useful for surfacing news mentions of a person, company, or topic across international media — much wider net than Google News. """ from __future__ import annotations import argparse import csv import json import sys import time import urllib.parse from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) from _http import get_json # noqa: E402 BASE = "https://api.gdeltproject.org/api/v2/doc/doc" COLUMNS = [ "title", "url", "seen_date", "domain", "language", "source_country", "tone", "social_image", ] def fetch( query: str, mode: str, timespan: str | None, start_datetime: str | None, end_datetime: str | None, source_country: str | None, source_lang: str | None, limit: int, out_path: str, ) -> int: params: dict[str, str] = { "query": query, "mode": mode, "format": "json", "maxrecords": str(min(limit, 250)), "sort": "datedesc", } if timespan: params["timespan"] = timespan if start_datetime: params["startdatetime"] = start_datetime.replace("-", "").replace(":", "").replace(" ", "") if end_datetime: params["enddatetime"] = end_datetime.replace("-", "").replace(":", "").replace(" ", "") if source_country: params["sourcecountry"] = source_country if source_lang: params["sourcelang"] = source_lang url = f"{BASE}?{urllib.parse.urlencode(params)}" payload: dict | list = {} for attempt in range(3): try: payload = get_json(url) break except RuntimeError as e: # GDELT requires 1 request per 5 seconds; back off and retry. if "429" in str(e) and attempt < 2: print( f"GDELT throttle hit; sleeping 6s before retry " f"(attempt {attempt + 1}/3)", file=sys.stderr, ) time.sleep(6) continue print(f"GDELT error: {e}", file=sys.stderr) payload = {} break except Exception as e: # noqa: BLE001 print(f"GDELT error: {e}", file=sys.stderr) payload = {} break rows: list[dict[str, str]] = [] if isinstance(payload, dict): articles = payload.get("articles", []) or [] for a in articles[:limit]: seen = (a.get("seendate") or "") # GDELT format: 20260319T083000Z → 2026-03-19 08:30:00Z if len(seen) == 16 and "T" in seen: seen = f"{seen[0:4]}-{seen[4:6]}-{seen[6:8]} {seen[9:11]}:{seen[11:13]}:{seen[13:15]}Z" rows.append( { "title": (a.get("title") or "").replace("\n", " ").strip(), "url": a.get("url") or "", "seen_date": seen, "domain": a.get("domain") or "", "language": a.get("language") or "", "source_country": a.get("sourcecountry") or "", "tone": str(a.get("tone") or ""), "social_image": a.get("socialimage") or "", } ) Path(out_path).parent.mkdir(parents=True, exist_ok=True) with open(out_path, "w", newline="", encoding="utf-8") as fh: w = csv.DictWriter(fh, fieldnames=COLUMNS) w.writeheader() w.writerows(rows) if not rows: print( f"GDELT: 0 articles for query={query!r}. " "GDELT indexes ~2015→present. Try widening the timespan or " "checking the query syntax (https://blog.gdeltproject.org/gdelt-doc-2-0-api-debuts/).", file=sys.stderr, ) return len(rows) def main() -> int: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument("--query", required=True, help='Search query (supports GDELT operators: quoted phrases, AND/OR/NOT, sourcecountry:, theme:)') p.add_argument( "--mode", default="ArtList", choices=["ArtList", "ImageCollage", "TimelineVol", "TimelineTone", "ToneChart"], help="GDELT mode (default ArtList for article list)", ) p.add_argument( "--timespan", help="Relative window: e.g. '1d', '1w', '1m', '3m', '1y' (overrides start/end)", ) p.add_argument("--start", help="Absolute start YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS") p.add_argument("--end", help="Absolute end YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS") p.add_argument("--source-country", help="2-letter source country (e.g. US, UK)") p.add_argument("--source-lang", help="Source language (e.g. English, Spanish)") p.add_argument("--limit", type=int, default=100) p.add_argument("--out", required=True) a = p.parse_args() n = fetch( query=a.query, mode=a.mode, timespan=a.timespan, start_datetime=a.start, end_datetime=a.end, source_country=a.source_country, source_lang=a.source_lang, limit=a.limit, out_path=a.out, ) print(f"Wrote {n} GDELT article rows to {a.out}") return 0 if __name__ == "__main__": raise SystemExit(main())