#!/usr/bin/env python3 """ Grep-based checker for Windows cross-platform footguns. Flags common patterns that break silently on Windows. Run before PRs — cheap, fast, catches regressions in a codebase that runs on three OSes. Usage: # Scan staged changes (default when run from a git checkout) python scripts/check-windows-footguns.py # Scan the full tree (full-repo audit) python scripts/check-windows-footguns.py --all # Scan a specific file or directory python scripts/check-windows-footguns.py path/to/file.py path/to/dir/ # Scan only modified files vs. main python scripts/check-windows-footguns.py --diff main Exit status: 0 — no Windows footguns found (or all matches suppressed) 1 — at least one unsuppressed match Suppress an intentional use (e.g. tests or platform-gated code) with: os.kill(pid, 0) # windows-footgun: ok — only called on POSIX """ from __future__ import annotations import argparse import os import re import subprocess import sys from dataclasses import dataclass from pathlib import Path from typing import Iterable REPO_ROOT = Path(__file__).resolve().parent.parent SUPPRESS_MARKER = re.compile(r"#\s*windows-footgun\s*:\s*ok\b", re.IGNORECASE) # Line-level guard hints. If a line contains any of these tokens, we assume # the programmer wrote the line in full awareness of the Windows pitfall — # e.g. `if hasattr(os, 'setsid'): ... os.setsid()`, or the classic # `getattr(signal, 'SIGKILL', signal.SIGTERM)`, or `shutil.which("wmic")`. # False negatives are fine here — the inline `# windows-footgun: ok` marker # is still the authoritative suppression. This is just to reduce the noise # floor on obviously-guarded lines so the signal-to-noise stays useful. GUARD_HINTS = ( "hasattr(os,", "hasattr(signal,", "getattr(os,", "getattr(signal,", "shutil.which(", "if platform.system() != \"Windows\"", "if platform.system() != 'Windows'", "if sys.platform == \"win32\"", "if sys.platform != \"win32\"", "if sys.platform == 'win32'", "if sys.platform != 'win32'", "IS_WINDOWS", "is_windows", ) # Dirs we never scan. EXCLUDED_DIRS = { ".git", "node_modules", "venv", ".venv", "__pycache__", "build", "dist", ".tox", ".mypy_cache", ".pytest_cache", "site-packages", "website/build", "optional-skills", # external skills } # File globs we never scan (beyond the dirs above). EXCLUDED_SUFFIXES = { ".pyc", ".pyo", ".so", ".dll", ".exe", ".png", ".jpg", ".gif", ".ico", ".svg", ".mp4", ".mp3", ".wav", ".pdf", ".zip", ".tar", ".gz", ".whl", ".lock", ".min.js", ".min.css", } # Files we never scan (self-referential — this script mentions the # patterns it detects — and the CONTRIBUTING docs that list them). EXCLUDED_FILES = { "scripts/check-windows-footguns.py", "CONTRIBUTING.md", } @dataclass class Footgun: """A Windows cross-platform footgun pattern.""" name: str pattern: re.Pattern message: str fix: str # If set, matches in files/paths containing any of these substrings are # silently ignored (e.g. tests that legitimately exercise the footgun # behind a platform guard). Prefer `# windows-footgun: ok` inline # suppression over this list; only use path_allowlist for whole files # that are inherently tests of the footgun itself. path_allowlist: tuple[str, ...] = () # Optional post-match predicate. Takes the re.Match and returns True # if the match is a REAL footgun (not a false positive). Use this when # the regex can't fully distinguish (e.g. open() where mode may contain # "b" for binary, or the line may have `encoding=` elsewhere). post_filter: "callable | None" = None FOOTGUNS: list[Footgun] = [ Footgun( name="open() without encoding= on text mode", # Match builtins.open() specifically — NOT os.open(), .open() # method calls (Path.open, tarfile.open, zf.open, webbrowser.open, # Image.open, wave.open, etc), or `async def open()` method # definitions. The pattern requires a start-of-identifier boundary # before `open(` so `os.open`, `.open`, `def open` are all skipped. # Note: Path.open() is ALSO affected by the encoding default, but # rather than flagging all `.open(` (huge noise), we require an # explicit builtins-style open() call. Path.open() is rare in the # codebase compared to open() and can be audited separately. pattern=re.compile( r"""(?:^|[\s\(,;=])(?[^'"]*)['"])?""" ), message=( "open() without an explicit encoding= uses the platform default " "(UTF-8 on POSIX, cp1252/mbcs on Windows) — files round-tripped " "between hosts get mojibake. Always pass encoding='utf-8' for " "text files, or use open(path, 'rb')/'wb' for binary." ), fix=( "open(path, 'r', encoding='utf-8') # or 'utf-8-sig' if the " "file may have a BOM" ), # Filter: only flag if mode is missing-or-text AND the line doesn't # already pass encoding=. Skip binary mode (contains "b"). post_filter=lambda m, line: ( "b" not in (m.group("mode") or "") and "encoding=" not in line and "encoding =" not in line # Skip `def open(` and `async def open(` (method definitions) and not line.lstrip().startswith("def ") and not line.lstrip().startswith("async def ") # Skip open(path, **kwargs) patterns — encoding may be in the dict. # Too expensive to trace; require the author to set encoding in # the dict and trust them (or they can add a # windows-footgun: ok). and "**" not in line ), ), Footgun( name="os.kill(pid, 0)", pattern=re.compile(r"\bos\.kill\s*\(\s*[^,]+,\s*0\s*\)"), message=( "os.kill(pid, 0) is NOT a no-op on Windows — it sends " "CTRL_C_EVENT to the target's console process group, " "hard-killing the target and potentially unrelated siblings. " "See bpo-14484." ), fix=( "Use psutil.pid_exists(pid) (psutil is a core dependency). " "Or gateway.status._pid_exists(pid) for the hermes wrapper " "with a stdlib fallback." ), ), Footgun( name="bare os.setsid", pattern=re.compile(r"(? bool: """Return True if this file is in scope for the checker.""" # Skip the excluded dirs parts = set(path.parts) if parts & EXCLUDED_DIRS: return False # Skip excluded suffixes for suffix in EXCLUDED_SUFFIXES: if str(path).endswith(suffix): return False # Skip self and docs that intentionally mention the patterns rel = path.relative_to(REPO_ROOT).as_posix() if rel in EXCLUDED_FILES: return False # Only scan text files (rough heuristic — .py, .md, .sh, .ps1, .yaml, etc.) if path.suffix in {".py", ".pyw", ".pyi"}: return True # Other file types are read but only Python-specific patterns would match; # that's fine and cheap to skip. return False def iter_files(paths: Iterable[Path]) -> Iterable[Path]: for p in paths: if p.is_file(): if should_scan_file(p): yield p elif p.is_dir(): for root, dirs, files in os.walk(p): # prune excluded dirs in-place for speed dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS] for fname in files: fpath = Path(root) / fname if should_scan_file(fpath): yield fpath def _strip_code(line: str) -> str: """Return just the code portion of a line — strip trailing comments and skip lines that are entirely inside a string literal or comment. Heuristic only (we don't parse Python); good enough to avoid flagging our own `# ``os.kill(pid, 0)`` is NOT a no-op` docstring-style comments. """ stripped = line.lstrip() # Line starts with # — entirely a comment. if stripped.startswith("#"): return "" # Remove trailing "# ..." inline comment. Naive — doesn't handle `#` # inside strings — but on balance reduces noise far more than it adds. hash_idx = _find_unquoted_hash(line) if hash_idx is not None: return line[:hash_idx] return line def _find_unquoted_hash(line: str) -> int | None: """Index of the first `#` not inside a single/double/triple-quoted string. Simple state machine — good enough for the 99% case of "code, then optional trailing comment." """ i = 0 n = len(line) in_s = False # single-quote string in_d = False # double-quote string while i < n: c = line[i] if c == "\\" and (in_s or in_d) and i + 1 < n: i += 2 continue if not in_d and c == "'": in_s = not in_s elif not in_s and c == '"': in_d = not in_d elif c == "#" and not in_s and not in_d: return i i += 1 return None def scan_file(path: Path, footguns: list[Footgun]) -> list[tuple[int, str, Footgun]]: """Return a list of (line_number, line, footgun) for unsuppressed matches.""" try: text = path.read_text(encoding="utf-8", errors="replace") except OSError: return [] matches: list[tuple[int, str, Footgun]] = [] # Track whether we're inside a triple-quoted string (docstring/raw block). # Simple state machine — handles both ''' and """, toggled by the FIRST # triple-quote we see; we don't try to handle nested or f-string cases. in_triple: str | None = None # None, "'''", or '"""' for i, line in enumerate(text.splitlines(), start=1): # Update triple-quote state based on this line's occurrences. code_for_scan = line if in_triple: # We're inside a docstring — skip the whole line's scan. # Check if it closes here. if in_triple in line: # Find the closing delimiter; anything after it is real code. after = line.split(in_triple, 1)[1] in_triple = None code_for_scan = after else: continue # Now check for docstring-open in the (possibly after-triple) portion. # Scan for the first unescaped '''/""" in the current code_for_scan. stripped = code_for_scan.strip() for delim in ('"""', "'''"): if delim in code_for_scan: # Count occurrences — even count means single-line docstring, # odd means we've entered a multi-line one. count = code_for_scan.count(delim) if count % 2 == 1: # Odd — we're now inside the triple-quoted block. # Scan only the part BEFORE the opening delimiter. before = code_for_scan.split(delim, 1)[0] code_for_scan = before in_triple = delim break else: # Even — entire docstring fits on one line. Strip it # from the scan text to avoid matching on prose. parts = code_for_scan.split(delim) # Keep the "outside" parts (every other chunk, starting # with index 0) as code, drop the "inside" parts. code_for_scan = "".join(parts[::2]) break if SUPPRESS_MARKER.search(line): continue # Skip if the line has an obvious guard — e.g. hasattr/getattr/ # shutil.which or a platform check. False negatives are acceptable; # the inline suppression marker is the authoritative override. if any(hint in line for hint in GUARD_HINTS): continue code = _strip_code(code_for_scan) if not code.strip(): continue for fg in footguns: if fg.path_allowlist and any(s in str(path) for s in fg.path_allowlist): continue match = fg.pattern.search(code) if not match: continue if fg.post_filter is not None: try: if not fg.post_filter(match, line): continue except (IndexError, AttributeError): # Post-filter assumed a named group that isn't there — skip. continue matches.append((i, line.rstrip(), fg)) return matches def get_staged_files() -> list[Path]: """Return paths staged in the current git index. Empty on non-git trees.""" try: out = subprocess.check_output( ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR"], cwd=REPO_ROOT, stderr=subprocess.DEVNULL, text=True, ) except (subprocess.CalledProcessError, FileNotFoundError): return [] return [REPO_ROOT / f for f in out.splitlines() if f.strip()] def get_diff_files(ref: str) -> list[Path]: """Return paths modified vs. the given git ref.""" try: out = subprocess.check_output( ["git", "diff", f"{ref}...HEAD", "--name-only", "--diff-filter=ACMR"], cwd=REPO_ROOT, stderr=subprocess.DEVNULL, text=True, ) except (subprocess.CalledProcessError, FileNotFoundError): return [] return [REPO_ROOT / f for f in out.splitlines() if f.strip()] def parse_args(argv: list[str]) -> argparse.Namespace: p = argparse.ArgumentParser( description="Flag Windows cross-platform footguns in Python code." ) p.add_argument( "paths", nargs="*", type=Path, help="Specific files/dirs to scan (default: staged changes).", ) p.add_argument( "--all", action="store_true", help="Scan the full repository (hermes_cli/, gateway/, tools/, cron/, etc.).", ) p.add_argument( "--diff", metavar="REF", help="Scan files changed vs. the given git ref (e.g. --diff main).", ) p.add_argument( "--list", action="store_true", help="List all known footgun rules and exit.", ) return p.parse_args(argv) def print_rules() -> None: print("Known Windows footguns checked by this script:\n") for i, fg in enumerate(FOOTGUNS, start=1): print(f"{i:2}. {fg.name}") print(f" {fg.message}") print(f" Fix: {fg.fix}") print() def main(argv: list[str]) -> int: args = parse_args(argv) if args.list: print_rules() return 0 if args.all: # Scan main Python packages + scripts roots = [ REPO_ROOT / "hermes_cli", REPO_ROOT / "gateway", REPO_ROOT / "tools", REPO_ROOT / "cron", REPO_ROOT / "agent", REPO_ROOT / "plugins", REPO_ROOT / "scripts", REPO_ROOT / "acp_adapter", REPO_ROOT / "acp_registry", ] roots = [r for r in roots if r.exists()] elif args.diff: roots = get_diff_files(args.diff) elif args.paths: roots = [p.resolve() for p in args.paths] else: # Default: staged changes roots = get_staged_files() if not roots: print( "No staged files to scan. Pass --all for a full-repo scan, " "--diff for a range diff, or paths explicitly.", file=sys.stderr, ) return 0 total_matches = 0 files_scanned = 0 for path in iter_files(roots): files_scanned += 1 matches = scan_file(path, FOOTGUNS) for lineno, line, fg in matches: rel = path.relative_to(REPO_ROOT).as_posix() print(f"{rel}:{lineno}: [{fg.name}]") print(f" {line.strip()}") print(f" — {fg.message}") print(f" Fix: {fg.fix.splitlines()[0]}") print() total_matches += 1 if total_matches: print( f"\n✗ {total_matches} Windows footgun(s) found across " f"{files_scanned} file(s) scanned.", file=sys.stderr, ) print( " If an individual match is a false positive or intentionally " "platform-gated, suppress it with `# windows-footgun: ok` on " "the same line.\n Run with --list to see all rules.", file=sys.stderr, ) return 1 print( f"✓ No Windows footguns found ({files_scanned} file(s) scanned)." ) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:]))