feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)

Replaces the blanket 'always allow' change from the previous commit with an opt-in config flag so users who want belt-and-suspenders security can still get the keyword scan on skill_manage output. ## Default behavior (flag off) skill_manage(action='create'|'edit'|'patch') no longer runs the keyword scanner. The agent can write skills that mention risky keywords in prose (documenting what reviewers should watch for, describing cache-bust semantics in a PR-review skill, referencing AGENTS.md, etc.) without getting blocked. Rationale: the agent can already execute the same code paths via terminal() with no gate, so the scan adds friction without meaningful security against a compromised or malicious agent. ## Opt-in behavior (flag on) Set skills.guard_agent_created: true in config.yaml to get the original behavior back. Scanner runs on every skill_manage write; dangerous verdicts surface as a tool error the agent can react to (retry without the flagged content). ## External hub installs unaffected trusted/community sources (hermes skills install) always get scanned regardless of this flag. The gate is specifically for skill_manage, which only agents call. ## Changes - hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG - tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag; _security_scan_skill() short-circuits to None when the flag is off - tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] = ('allow', 'allow', 'ask') so the scan remains strict when it does run - tests/tools/test_skills_guard.py: restore original ask/force tests - tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class covering both flag states + config error handling ## Validation - tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass - E2E: flagged-keyword skill creates with default config, blocks with flag on
2026-04-25 00:51:20 +00:00 · 2026-04-23 06:20:19 -07:00 · 2026-04-23 06:20:19 -07:00 · ce089169d5
commit ce089169d5
parent e3c0084140
5 changed files with 134 additions and 22 deletions
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -760,6 +760,17 @@ DEFAULT_CONFIG = {
        "inline_shell": False,
        # Timeout (seconds) for each !`cmd` snippet when inline_shell is on.
        "inline_shell_timeout": 10,
+        # Run the keyword/pattern security scanner on skills the agent
+        # writes via skill_manage (create/edit/patch).  Off by default
+        # because the agent can already execute the same code paths via
+        # terminal() with no gate, so the scan adds friction (blocks
+        # skills that mention risky keywords in prose) without meaningful
+        # security.  Turn on if you want the belt-and-suspenders — a
+        # dangerous verdict will then surface as a tool error to the
+        # agent, which can retry with the flagged content removed.
+        # External hub installs (trusted/community sources) are always
+        # scanned regardless of this setting.
+        "guard_agent_created": False,
    },

    # Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth.
--- a/tests/tools/test_skill_manager_tool.py
+++ b/tests/tools/test_skill_manager_tool.py
@ -484,3 +484,85 @@ class TestSkillManageDispatcher:
            raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
        result = json.loads(raw)
        assert result["success"] is True
+
+
+class TestSecurityScanGate:
+    """_security_scan_skill is gated by skills.guard_agent_created config flag."""
+
+    def test_scan_noop_when_flag_off(self, tmp_path):
+        """Default config (flag off) short-circuits before running scan_skill."""
+        from tools.skill_manager_tool import _security_scan_skill
+
+        with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=False), \
+             patch("tools.skill_manager_tool.scan_skill") as mock_scan:
+            result = _security_scan_skill(tmp_path)
+
+        assert result is None
+        mock_scan.assert_not_called()  # scan never ran
+
+    def test_scan_runs_when_flag_on(self, tmp_path):
+        """When flag is on, scan_skill is invoked and its verdict is honored."""
+        from tools.skill_manager_tool import _security_scan_skill
+        from tools.skills_guard import ScanResult
+
+        # Fake a safe scan result — caller should return None (allow)
+        fake_result = ScanResult(
+            skill_name="test",
+            source="agent-created",
+            trust_level="agent-created",
+            verdict="safe",
+            findings=[],
+            summary="ok",
+        )
+        with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
+             patch("tools.skill_manager_tool.scan_skill", return_value=fake_result) as mock_scan:
+            result = _security_scan_skill(tmp_path)
+
+        assert result is None
+        mock_scan.assert_called_once()
+
+    def test_scan_blocks_dangerous_when_flag_on(self, tmp_path):
+        """Dangerous verdict + flag on → returns an error string for the agent."""
+        from tools.skill_manager_tool import _security_scan_skill
+        from tools.skills_guard import ScanResult, Finding
+
+        finding = Finding(
+            pattern_id="test", severity="critical", category="exfiltration",
+            file="SKILL.md", line=1, match="curl $TOKEN", description="test",
+        )
+        fake_result = ScanResult(
+            skill_name="test",
+            source="agent-created",
+            trust_level="agent-created",
+            verdict="dangerous",
+            findings=[finding],
+            summary="dangerous",
+        )
+        with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
+             patch("tools.skill_manager_tool.scan_skill", return_value=fake_result):
+            result = _security_scan_skill(tmp_path)
+
+        assert result is not None
+        assert "Security scan blocked" in result
+
+    def test_guard_flag_reads_config_default_false(self):
+        """_guard_agent_created_enabled returns False when config doesn't set it."""
+        from tools.skill_manager_tool import _guard_agent_created_enabled
+
+        with patch("hermes_cli.config.load_config", return_value={"skills": {}}):
+            assert _guard_agent_created_enabled() is False
+
+    def test_guard_flag_reads_config_when_set(self):
+        """_guard_agent_created_enabled returns True when user explicitly enables."""
+        from tools.skill_manager_tool import _guard_agent_created_enabled
+
+        with patch("hermes_cli.config.load_config",
+                   return_value={"skills": {"guard_agent_created": True}}):
+            assert _guard_agent_created_enabled() is True
+
+    def test_guard_flag_handles_config_error(self):
+        """If load_config raises, _guard_agent_created_enabled defaults to False (fail-safe off)."""
+        from tools.skill_manager_tool import _guard_agent_created_enabled
+
+        with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")):
+            assert _guard_agent_created_enabled() is False
--- a/tests/tools/test_skills_guard.py
+++ b/tests/tools/test_skills_guard.py
@ -174,27 +174,24 @@ class TestShouldAllowInstall:
        assert allowed is True
        assert "agent-created" in reason

-    def test_dangerous_agent_created_allowed(self):
-        """Agent-created skills bypass verdict gating — agent can already
-        execute the same code via terminal(), so skill_manage allows all
-        verdicts. This prevents friction when the agent writes skills that
-        mention risky keywords in prose (e.g. describing cache-busting or
-        persistence semantics in a PR-review skill)."""
+    def test_dangerous_agent_created_asks(self):
+        """Agent-created skills with dangerous verdict return None (ask for confirmation)
+        when the scan runs. The caller (_security_scan_skill) surfaces this as an error
+        to the agent, who can retry without the flagged content.
+
+        This gate only runs when skills.guard_agent_created is enabled (off by default)."""
        f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")]
        allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f))
-        assert allowed is True
-        assert "agent-created" in reason
+        assert allowed is None
+        assert "Requires confirmation" in reason

-    def test_force_noop_for_agent_created_dangerous(self):
-        """With agent-created dangerous mapped to 'allow', force becomes a
-        no-op — the allow branch returns first. Force still works for any
-        trust level that maps to block (community/trusted)."""
+    def test_force_overrides_dangerous_for_agent_created(self):
        f = [Finding("x", "critical", "c", "f", 1, "m", "d")]
        allowed, reason = should_allow_install(
            self._result("agent-created", "dangerous", f), force=True
        )
        assert allowed is True
-        assert "agent-created" in reason
+        assert "Force-installed" in reason


 # ---------------------------------------------------------------------------
--- a/tools/skill_manager_tool.py
+++ b/tools/skill_manager_tool.py
@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple

 logger = logging.getLogger(__name__)

-# Import security scanner — agent-created skills get the same scrutiny as
-# community hub installs.
+# Import security scanner — external hub installs always get scanned;
+# agent-created skills only get scanned when skills.guard_agent_created is on.
 try:
    from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
    _GUARD_AVAILABLE = True
@ -53,10 +53,31 @@ except ImportError:
    _GUARD_AVAILABLE = False


+def _guard_agent_created_enabled() -> bool:
+    """Read skills.guard_agent_created from config (default False).
+
+    Off by default because the agent can already execute the same code
+    paths via terminal() with no gate, so the scan adds friction without
+    meaningful security.  Users who want belt-and-suspenders can turn it
+    on via `hermes config set skills.guard_agent_created true`.
+    """
+    try:
+        from hermes_cli.config import load_config
+        cfg = load_config()
+        return bool(cfg.get("skills", {}).get("guard_agent_created", False))
+    except Exception:
+        return False
+
+
 def _security_scan_skill(skill_dir: Path) -> Optional[str]:
-    """Scan a skill directory after write. Returns error string if blocked, else None."""
+    """Scan a skill directory after write. Returns error string if blocked, else None.
+
+    No-op when skills.guard_agent_created is disabled (the default).
+    """
    if not _GUARD_AVAILABLE:
        return None
+    if not _guard_agent_created_enabled():
+        return None
    try:
        result = scan_skill(skill_dir, source="agent-created")
        allowed, reason = should_allow_install(result)
@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]:
            return f"Security scan blocked this skill ({reason}):\n{report}"
        if allowed is None:
            # "ask" verdict — for agent-created skills this means dangerous
-            # findings were detected.  Block the skill and include the report.
+            # findings were detected.  Surface as an error so the agent can
+            # retry with the flagged content removed.
            report = format_scan_report(result)
            logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
            return f"Security scan blocked this skill ({reason}):\n{report}"
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@ -43,11 +43,11 @@ INSTALL_POLICY = {
    "builtin":       ("allow",  "allow",   "allow"),
    "trusted":       ("allow",  "allow",   "block"),
    "community":     ("allow",  "block",   "block"),
-    # Agent-created skills run in the same process as the agent that
-    # wrote them — the agent could already execute the same code via
-    # terminal(), so a dangerous-pattern gate on skill_manage adds
-    # friction without meaningful security. Allow all verdicts.
-    "agent-created": ("allow",  "allow",   "allow"),
+    # Agent-created: "ask" on dangerous surfaces as an error to the agent,
+    # which can retry without the flagged content. This gate only runs when
+    # skills.guard_agent_created is enabled (off by default) — see
+    # tools/skill_manager_tool.py::_guard_agent_created_enabled.
+    "agent-created": ("allow",  "allow",   "ask"),
 }

 VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}