refactor(skills): slim AST diagnostic to single entry point

Trim ~600 LOC off the original contribution while keeping the same operator-facing surface and detection coverage.

- Collapse three entry points (file / dir / bundle) into one ast_scan_path(path) that handles both files and directories.
- Drop AstFinding dataclass + severity field — replaced with plain (file, line, pattern_id, description) tuples. Severity ordering was display-only for a diagnostic that explicitly disclaims security verdicts, so the field added bookkeeping without earning its place.
- Replace Rich-markup formatter with plain text grouped by file.
- Drop the 'inspect --ast-deep' surface — same scanner, same output as 'audit --deep', single CLI entry is enough. Operators audit after install; pre-install inspection signal isn't worth the second surface.
- Trim test file to the cases that earn their place: bypass payload, syntax error survival, RecursionError survival, false-positive guard (importer lookalike), literal-arg false-positive guard, non-.py ignored, directory recursion + cache-dir skipping, missing-path, getattr/__dict__ detection, formatter empty + populated.

Net: tools/skills_ast_audit.py 353 -> 133 LOC, tests/tools/test_skills_ast_audit.py 299 -> 103 LOC, full diff +704/-12 -> +264/-6.

No change to tools/skills_guard.py — Skills Guard verdicts remain untouched per SECURITY.md §2.4.
2026-06-07 08:02:23 +00:00 · 2026-05-23 16:36:37 -07:00 · 2026-05-23 16:36:37 -07:00 · 4254f7dd17
commit 4254f7dd17
parent 7255050c99
4 changed files with 175 additions and 609 deletions
--- a/tools/skills_ast_audit.py
+++ b/tools/skills_ast_audit.py
@@ -1,353 +1,133 @@
 """
 AST-level deep audit for skill Python files — opt-in diagnostic, not a security gate.

-This is a standalone diagnostic tool per SECURITY.md spirit: it helps operators
-inspect skill code for patterns that *could* enable dynamic import/access
-obfuscation, but it is NOT a security boundary. Every pattern flagged here has
-legitimate uses. Use your judgment.
+Per SECURITY.md §2.4, Skills Guard is in-process heuristics ("useful — not
+boundaries"). This module is a separate opt-in diagnostic that flags dynamic
+import / dynamic attribute access patterns operators may want to eyeball when
+reviewing third-party skill code. Every pattern flagged here has legitimate
+uses; findings are hints for human review, not verdicts.

-Usage::
-
-    from tools.skills_ast_audit import ast_scan_skill, format_ast_report
-
-    findings = ast_scan_skill(Path("~/.hermes/skills/some-skill"))
-    if findings:
-        print(format_ast_report(findings))
-
-CLI integration: ``hermes skills audit --deep``
+CLI: ``hermes skills audit --deep``
 """

 from __future__ import annotations

 import ast
-from dataclasses import dataclass
 from pathlib import Path
-from typing import Mapping, List, Optional, Union
+from typing import List, Tuple
+
+# (file, line, pattern_id, description)
+Finding = Tuple[str, int, str, str]
+
+_IGNORED_DIRS = {"__pycache__", ".venv", "venv", "node_modules"}


-# ---------------------------------------------------------------------------
-# Data model
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class AstFinding:
-    """A single finding from AST-level analysis."""
-
-    pattern_id: str
-    """Short identifier for deduplication and grouping (e.g. 'ast_importlib_import')."""
-
-    severity: str
-    """One of 'high', 'medium', 'low' — for display only, not a security claim."""
-
-    category: str
-    """Grouping label — currently always 'obfuscation'."""
-
-    file: str
-    """Relative path to the file containing the finding."""
-
-    line: int
-    """1-based line number."""
-
-    match: str
-    """The matched source construct (human-readable snippet)."""
-
-    description: str
-    """Why this pattern is worth reviewing."""
-
-
-# ---------------------------------------------------------------------------
-# Scanner
-# ---------------------------------------------------------------------------
-
-def _ast_scan_python(content: str, rel_path: str) -> List[AstFinding]:
-    """Detect obfuscation via dynamic imports, attribute access, and string construction.
-
-    Hostile or pathological input (deeply-nested expressions, malformed source)
-    must not crash the scan. Both ``ast.parse`` and the visitor traversal are
-    guarded so parse/visit failures degrade gracefully to "no AST findings"
-    rather than raising.
-    """
+def _scan_source(content: str, rel_path: str) -> List[Finding]:
    try:
        tree = ast.parse(content)
    except (SyntaxError, ValueError, RecursionError):
        return []

-    findings: List[AstFinding] = []
+    findings: List[Finding] = []

-    class _Visitor(ast.NodeVisitor):
+    class V(ast.NodeVisitor):
        def visit_Call(self, node):
-            # Detect importlib.import_module(...)
-            if (
-                isinstance(node.func, ast.Attribute)
-                and node.func.attr == "import_module"
-            ):
-                findings.append(
-                    AstFinding(
-                        pattern_id="ast_dynamic_import",
-                        severity="high",
-                        category="obfuscation",
-                        file=rel_path,
-                        line=node.lineno,
-                        match="importlib.import_module()",
-                        description="dynamic import via importlib — can load arbitrary modules at runtime",
-                    )
-                )
-            # Detect __import__ with non-literal argument
-            if isinstance(node.func, ast.Name) and node.func.id == "__import__":
+            f = node.func
+            # importlib.import_module(...)
+            if isinstance(f, ast.Attribute) and f.attr == "import_module":
+                findings.append((rel_path, node.lineno, "dynamic_import",
+                                 "importlib.import_module() — loads arbitrary modules at runtime"))
+            # __import__(<computed>)
+            elif isinstance(f, ast.Name) and f.id == "__import__":
                if node.args and not isinstance(node.args[0], ast.Constant):
-                    findings.append(
-                        AstFinding(
-                            pattern_id="ast_dynamic_import_computed",
-                            severity="high",
-                            category="obfuscation",
-                            file=rel_path,
-                            line=node.lineno,
-                            match="__import__(<computed>)",
-                            description="__import__ with dynamically constructed module name",
-                        )
-                    )
-            # Detect getattr with computed attribute name
-            if isinstance(node.func, ast.Name) and node.func.id == "getattr":
-                if len(node.args) >= 2 and not isinstance(
-                    node.args[1], ast.Constant
-                ):
-                    findings.append(
-                        AstFinding(
-                            pattern_id="ast_dynamic_getattr",
-                            severity="medium",
-                            category="obfuscation",
-                            file=rel_path,
-                            line=node.lineno,
-                            match="getattr(<obj>, <computed>)",
-                            description="getattr with dynamically constructed attribute name",
-                        )
-                    )
+                    findings.append((rel_path, node.lineno, "dynamic_import_computed",
+                                     "__import__ with non-literal module name"))
+            # getattr(obj, <computed>)
+            elif isinstance(f, ast.Name) and f.id == "getattr":
+                if len(node.args) >= 2 and not isinstance(node.args[1], ast.Constant):
+                    findings.append((rel_path, node.lineno, "dynamic_getattr",
+                                     "getattr with non-literal attribute name"))
            self.generic_visit(node)

        def visit_Subscript(self, node):
-            # Detect obj.__dict__[<computed>]
-            if (
-                isinstance(node.value, ast.Attribute)
-                and node.value.attr == "__dict__"
-            ):
-                if not isinstance(node.slice, ast.Constant):
-                    findings.append(
-                        AstFinding(
-                            pattern_id="ast_dict_access",
-                            severity="high",
-                            category="obfuscation",
-                            file=rel_path,
-                            line=node.lineno,
-                            match="__dict__[<computed>]",
-                            description="dynamic attribute access via __dict__ with computed key",
-                        )
-                    )
+            # obj.__dict__[<computed>]
+            if (isinstance(node.value, ast.Attribute)
+                    and node.value.attr == "__dict__"
+                    and not isinstance(node.slice, ast.Constant)):
+                findings.append((rel_path, node.lineno, "dict_access",
+                                 "__dict__[<computed>] — dynamic attribute access"))
            self.generic_visit(node)

        def visit_Import(self, node):
-            # Flag importlib and any importlib.* submodule.
-            for alias in node.names:
-                if alias.name == "importlib" or alias.name.startswith(
-                    "importlib."
-                ):
-                    findings.append(
-                        AstFinding(
-                            pattern_id="ast_importlib_import",
-                            severity="medium",
-                            category="obfuscation",
-                            file=rel_path,
-                            line=node.lineno,
-                            match=f"import {alias.name}",
-                            description="importlib imported — enables dynamic module loading",
-                        )
-                    )
+            for a in node.names:
+                if a.name == "importlib" or a.name.startswith("importlib."):
+                    findings.append((rel_path, node.lineno, "importlib_import",
+                                     f"import {a.name} — enables dynamic module loading"))
            self.generic_visit(node)

        def visit_ImportFrom(self, node):
-            module = node.module or ""
-            if module == "importlib" or module.startswith("importlib."):
-                findings.append(
-                    AstFinding(
-                        pattern_id="ast_importlib_import",
-                        severity="medium",
-                        category="obfuscation",
-                        file=rel_path,
-                        line=node.lineno,
-                        match=f"from {module} import ...",
-                        description="importlib imported — enables dynamic module loading",
-                    )
-                )
+            m = node.module or ""
+            if m == "importlib" or m.startswith("importlib."):
+                findings.append((rel_path, node.lineno, "importlib_import",
+                                 f"from {m} import ... — enables dynamic module loading"))
            self.generic_visit(node)

    try:
-        _Visitor().visit(tree)
+        V().visit(tree)
    except (RecursionError, ValueError, RuntimeError):
-        # Visitor traversal can fail on hostile input even when ast.parse
-        # succeeded (e.g. deeply-nested call/attribute chains). Return
-        # whatever findings we collected before the failure.
-        return findings
+        # Hostile/pathological input: return what we collected so far.
+        pass

    return findings


-def ast_scan_file(file_path: Path, rel_path: Optional[str] = None) -> List[AstFinding]:
-    """Scan a single Python file and return AST-level findings.
+def ast_scan_path(path: Path) -> List[Finding]:
+    """Scan a single .py file or recursively scan all .py under a directory.

-    Args:
-        file_path: Absolute path to the .py file.
-        rel_path: Relative path for display (defaults to file_path.name).
-
-    Returns:
-        List of :class:`AstFinding` — empty if the file isn't Python or scan yields nothing.
+    Returns a list of (file, line, pattern_id, description) tuples. Empty for
+    non-Python paths, missing paths, or paths with no matching patterns.
    """
-    if file_path.suffix.lower() != ".py":
+    if path.is_file():
+        if path.suffix.lower() != ".py":
+            return []
+        try:
+            content = path.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            return []
+        return _scan_source(content, path.name)
+
+    if not path.is_dir():
        return []

-    if rel_path is None:
-        rel_path = file_path.name
-
-    try:
-        content = file_path.read_text(encoding="utf-8", errors="replace")
-    except (OSError, UnicodeDecodeError):
-        return []
-
-    return _ast_scan_python(content, rel_path)
-
-
-def ast_scan_skill(skill_path: Path) -> List[AstFinding]:
-    """Recursively scan all Python files in a skill directory.
-
-    Args:
-        skill_path: Path to the installed skill directory.
-
-    Returns:
-        Combined list of :class:`AstFinding` across all .py files.
-    """
-    if not skill_path.is_dir():
-        return []
-
-    all_findings: List[AstFinding] = []
-
-    for py_file in sorted(skill_path.rglob("*.py")):
-        # Skip __pycache__ and .venv/venv directories
-        parts = set(py_file.parent.parts)
-        if parts & {"__pycache__", ".venv", "venv", "node_modules"}:
+    out: List[Finding] = []
+    for py in sorted(path.rglob("*.py")):
+        if set(py.parent.parts) & _IGNORED_DIRS:
            continue
        try:
-            rel = py_file.relative_to(skill_path).as_posix()
+            content = py.read_text(encoding="utf-8", errors="replace")
+        except OSError:
+            continue
+        try:
+            rel = py.relative_to(path).as_posix()
        except ValueError:
-            rel = py_file.name
-        all_findings.extend(ast_scan_file(py_file, rel))
-
-    return all_findings
+            rel = py.name
+        out.extend(_scan_source(content, rel))
+    return out


-def ast_scan_bundle_files(
-    files: Mapping[str, Union[str, bytes]],
-) -> List[AstFinding]:
-    """Scan Python files from an in-memory skill bundle.
-
-    This powers ``hermes skills inspect --ast-deep`` so operators can review
-    a skill before installing it. The input is the bundle's filename -> content
-    mapping, as returned by the skills hub source adapters.
-    """
-    all_findings: List[AstFinding] = []
-
-    for rel_path, content in sorted(files.items()):
-        path = Path(rel_path)
-        if path.suffix.lower() != ".py":
-            continue
-        if set(path.parts) & {"__pycache__", ".venv", "venv", "node_modules"}:
-            continue
-        if isinstance(content, bytes):
-            text = content.decode("utf-8", errors="replace")
-        else:
-            text = str(content)
-        all_findings.extend(_ast_scan_python(text, rel_path))
-
-    return all_findings
-
-
-# ---------------------------------------------------------------------------
-# Rich formatting
-# ---------------------------------------------------------------------------
-
-
-def format_ast_report(
-    findings: List[AstFinding],
-    skill_name: str = "",
-) -> str:
-    """Format AST findings as a Rich-markup string.
-
-    Args:
-        findings: List of findings from :func:`ast_scan_skill`.
-        skill_name: Optional skill name for the report header.
-
-    Returns:
-        Rich-markup string suitable for ``console.print()``.
-    """
+def format_ast_report(findings: List[Finding], skill_name: str = "") -> str:
+    """Plain-text report (Rich-markup-free) grouped by file."""
+    header = f"AST deep scan: {skill_name}" if skill_name else "AST deep scan"
    if not findings:
-        header = (
-            f"[bold]AST Deep Scan: {skill_name}[/]"
-            if skill_name
-            else "[bold]AST Deep Scan[/]"
-        )
-        return f"{header}\n[dim green]No AST-level patterns detected.[/]"
+        return f"{header}\n  No dynamic import/access patterns detected."

-    lines: List[str] = []
-    severity_order = {"high": 0, "medium": 1, "low": 2}
-    findings_sorted = sorted(
-        findings,
-        key=lambda f: (
-            severity_order.get(f.severity, 99),
-            f.file,
-            f.line,
-        ),
-    )
-
-    if skill_name:
-        lines.append(f"[bold]AST Deep Scan: {skill_name}[/]")
-    else:
-        lines.append("[bold]AST Deep Scan[/]")
-
-    total = len(findings_sorted)
-    high_count = sum(1 for f in findings_sorted if f.severity == "high")
-    med_count = sum(1 for f in findings_sorted if f.severity == "medium")
-    low_count = sum(1 for f in findings_sorted if f.severity == "low")
-
-    summary_parts = []
-    if high_count:
-        summary_parts.append(f"[bold red]{high_count} high[/]")
-    if med_count:
-        summary_parts.append(f"[yellow]{med_count} medium[/]")
-    if low_count:
-        summary_parts.append(f"[dim]{low_count} low[/]")
-    lines.append(
-        f"[dim]{total} finding(s)[/] — "
-        + ", ".join(summary_parts)
-        if summary_parts
-        else f"[dim]{total} finding(s)[/]"
-    )
+    lines = [header, f"  {len(findings)} finding(s):"]
+    current = None
+    for f, line, pid, desc in sorted(findings):
+        if f != current:
+            current = f
+            lines.append(f"  {f}")
+        lines.append(f"    L{line}  {pid}  — {desc}")
    lines.append("")
-
-    current_file = None
-    for f in findings_sorted:
-        if f.file != current_file:
-            current_file = f.file
-            lines.append(f"  [bold cyan]{f.file}[/]")
-        sev_color = {"high": "bold red", "medium": "yellow", "low": "dim"}.get(
-            f.severity, "dim"
-        )
-        lines.append(
-            f"    L{f.line:>4} [{sev_color}]{f.severity:6}[/] {f.description}"
-        )
-        lines.append(f"          [dim]{f.match}[/]")
-
-    lines.append("")
-    lines.append(
-        "[dim]Note: AST findings are diagnostic hints, not security verdicts. "
-        "Review each pattern in context.[/]"
-    )
-
+    lines.append("  Note: diagnostic hints for human review, not security verdicts.")
    return "\n".join(lines)