feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)

Replaces the blanket 'always allow' change from the previous commit with an opt-in config flag, so users who want belt-and-suspenders security can still get the keyword scan on skill_manage output.

## Default behavior (flag off)

skill_manage(action='create'|'edit'|'patch') no longer runs the keyword scanner. The agent can write skills that mention risky keywords in prose (documenting what reviewers should watch for, describing cache-bust semantics in a PR-review skill, referencing AGENTS.md, etc.) without getting blocked.

Rationale: the agent can already execute the same code paths via terminal() with no gate, so the scan adds friction without meaningful security against a compromised or malicious agent.

## Opt-in behavior (flag on)

Set skills.guard_agent_created: true in config.yaml to get the original behavior back. The scanner runs on every skill_manage write; dangerous verdicts surface as a tool error the agent can react to (e.g. by retrying without the flagged content).

## External hub installs unaffected

Trusted/community sources (hermes skills install) always get scanned regardless of this flag. The gate is specifically for skill_manage, which only agents call.

## Changes

- hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG
- tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag; _security_scan_skill() short-circuits to None when the flag is off
- tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] = ('allow', 'allow', 'ask') so the scan remains strict when it does run
- tests/tools/test_skills_guard.py: restore original ask/force tests
- tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class covering both flag states + config error handling

## Validation

- tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass
- E2E: a flagged-keyword skill is created with the default config and blocked with the flag on
2026-04-25 00:51:20 +00:00 · 2026-04-23 06:20:19 -07:00 · 2026-04-23 06:20:19 -07:00 · ce089169d5
commit ce089169d5
parent e3c0084140
5 changed files with 134 additions and 22 deletions
--- a/tools/skill_manager_tool.py
+++ b/tools/skill_manager_tool.py
@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple

 logger = logging.getLogger(__name__)

-# Import security scanner — agent-created skills get the same scrutiny as
-# community hub installs.
+# Import security scanner — external hub installs always get scanned;
+# agent-created skills only get scanned when skills.guard_agent_created is on.
 try:
    from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
    _GUARD_AVAILABLE = True
@ -53,10 +53,31 @@ except ImportError:
    _GUARD_AVAILABLE = False


+def _guard_agent_created_enabled() -> bool:
+    """Read skills.guard_agent_created from config (default False).
+
+    Off by default because the agent can already execute the same code
+    paths via terminal() with no gate, so the scan adds friction without
+    meaningful security.  Users who want belt-and-suspenders can turn it
+    on via `hermes config set skills.guard_agent_created true`.
+    """
+    try:
+        from hermes_cli.config import load_config
+        cfg = load_config()
+        return bool(cfg.get("skills", {}).get("guard_agent_created", False))
+    except Exception:
+        return False
+
+
 def _security_scan_skill(skill_dir: Path) -> Optional[str]:
-    """Scan a skill directory after write. Returns error string if blocked, else None."""
+    """Scan a skill directory after write. Returns error string if blocked, else None.
+
+    No-op when skills.guard_agent_created is disabled (the default).
+    """
    if not _GUARD_AVAILABLE:
        return None
+    if not _guard_agent_created_enabled():
+        return None
    try:
        result = scan_skill(skill_dir, source="agent-created")
        allowed, reason = should_allow_install(result)
@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]:
            return f"Security scan blocked this skill ({reason}):\n{report}"
        if allowed is None:
            # "ask" verdict — for agent-created skills this means dangerous
-            # findings were detected.  Block the skill and include the report.
+            # findings were detected.  Surface as an error so the agent can
+            # retry with the flagged content removed.
            report = format_scan_report(result)
            logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
            return f"Security scan blocked this skill ({reason}):\n{report}"
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@ -43,11 +43,11 @@ INSTALL_POLICY = {
    "builtin":       ("allow",  "allow",   "allow"),
    "trusted":       ("allow",  "allow",   "block"),
    "community":     ("allow",  "block",   "block"),
-    # Agent-created skills run in the same process as the agent that
-    # wrote them — the agent could already execute the same code via
-    # terminal(), so a dangerous-pattern gate on skill_manage adds
-    # friction without meaningful security. Allow all verdicts.
-    "agent-created": ("allow",  "allow",   "allow"),
+    # Agent-created: "ask" on dangerous surfaces as an error to the agent,
+    # which can retry without the flagged content. This gate only runs when
+    # skills.guard_agent_created is enabled (off by default) — see
+    # tools/skill_manager_tool.py::_guard_agent_created_enabled.
+    "agent-created": ("allow",  "allow",   "ask"),
 }

 VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}