mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)
Replaces the blanket 'always allow' change from the previous commit with
an opt-in config flag so users who want belt-and-suspenders security can
still get the keyword scan on skill_manage output.
## Default behavior (flag off)
skill_manage(action='create'|'edit'|'patch') no longer runs the keyword
scanner. The agent can write skills that mention risky keywords in prose
(documenting what reviewers should watch for, describing cache-bust
semantics in a PR-review skill, referencing AGENTS.md, etc.) without
getting blocked.
Rationale: the agent can already execute the same code paths via
terminal() with no gate, so the scan adds friction without meaningful
security against a compromised or malicious agent.
## Opt-in behavior (flag on)
Set skills.guard_agent_created: true in config.yaml to get the original
behavior back. Scanner runs on every skill_manage write; dangerous
verdicts surface as a tool error the agent can react to (retry without
the flagged content).
## External hub installs unaffected
Trusted/community sources (hermes skills install) always get scanned
regardless of this flag. The gate is specifically for skill_manage,
which only agents call.
## Changes
- hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG
- tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag;
_security_scan_skill() short-circuits to None when the flag is off
- tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] =
('allow', 'allow', 'ask') so the scan remains strict when it does run
- tests/tools/test_skills_guard.py: restore original ask/force tests
- tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class
covering both flag states + config error handling
## Validation
- tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass
- E2E: flagged-keyword skill creates with default config, blocks with flag on
This commit is contained in:
parent
e3c0084140
commit
ce089169d5
5 changed files with 134 additions and 22 deletions
|
|
@ -760,6 +760,17 @@ DEFAULT_CONFIG = {
|
||||||
"inline_shell": False,
|
"inline_shell": False,
|
||||||
# Timeout (seconds) for each !`cmd` snippet when inline_shell is on.
|
# Timeout (seconds) for each !`cmd` snippet when inline_shell is on.
|
||||||
"inline_shell_timeout": 10,
|
"inline_shell_timeout": 10,
|
||||||
|
# Run the keyword/pattern security scanner on skills the agent
|
||||||
|
# writes via skill_manage (create/edit/patch). Off by default
|
||||||
|
# because the agent can already execute the same code paths via
|
||||||
|
# terminal() with no gate, so the scan adds friction (blocks
|
||||||
|
# skills that mention risky keywords in prose) without meaningful
|
||||||
|
# security. Turn on if you want the belt-and-suspenders — a
|
||||||
|
# dangerous verdict will then surface as a tool error to the
|
||||||
|
# agent, which can retry with the flagged content removed.
|
||||||
|
# External hub installs (trusted/community sources) are always
|
||||||
|
# scanned regardless of this setting.
|
||||||
|
"guard_agent_created": False,
|
||||||
},
|
},
|
||||||
|
|
||||||
# Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth.
|
# Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth.
|
||||||
|
|
|
||||||
|
|
@ -484,3 +484,85 @@ class TestSkillManageDispatcher:
|
||||||
raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
|
raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
|
||||||
result = json.loads(raw)
|
result = json.loads(raw)
|
||||||
assert result["success"] is True
|
assert result["success"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestSecurityScanGate:
|
||||||
|
"""_security_scan_skill is gated by skills.guard_agent_created config flag."""
|
||||||
|
|
||||||
|
def test_scan_noop_when_flag_off(self, tmp_path):
|
||||||
|
"""Default config (flag off) short-circuits before running scan_skill."""
|
||||||
|
from tools.skill_manager_tool import _security_scan_skill
|
||||||
|
|
||||||
|
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=False), \
|
||||||
|
patch("tools.skill_manager_tool.scan_skill") as mock_scan:
|
||||||
|
result = _security_scan_skill(tmp_path)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
mock_scan.assert_not_called() # scan never ran
|
||||||
|
|
||||||
|
def test_scan_runs_when_flag_on(self, tmp_path):
|
||||||
|
"""When flag is on, scan_skill is invoked and its verdict is honored."""
|
||||||
|
from tools.skill_manager_tool import _security_scan_skill
|
||||||
|
from tools.skills_guard import ScanResult
|
||||||
|
|
||||||
|
# Fake a safe scan result — caller should return None (allow)
|
||||||
|
fake_result = ScanResult(
|
||||||
|
skill_name="test",
|
||||||
|
source="agent-created",
|
||||||
|
trust_level="agent-created",
|
||||||
|
verdict="safe",
|
||||||
|
findings=[],
|
||||||
|
summary="ok",
|
||||||
|
)
|
||||||
|
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
|
||||||
|
patch("tools.skill_manager_tool.scan_skill", return_value=fake_result) as mock_scan:
|
||||||
|
result = _security_scan_skill(tmp_path)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
mock_scan.assert_called_once()
|
||||||
|
|
||||||
|
def test_scan_blocks_dangerous_when_flag_on(self, tmp_path):
|
||||||
|
"""Dangerous verdict + flag on → returns an error string for the agent."""
|
||||||
|
from tools.skill_manager_tool import _security_scan_skill
|
||||||
|
from tools.skills_guard import ScanResult, Finding
|
||||||
|
|
||||||
|
finding = Finding(
|
||||||
|
pattern_id="test", severity="critical", category="exfiltration",
|
||||||
|
file="SKILL.md", line=1, match="curl $TOKEN", description="test",
|
||||||
|
)
|
||||||
|
fake_result = ScanResult(
|
||||||
|
skill_name="test",
|
||||||
|
source="agent-created",
|
||||||
|
trust_level="agent-created",
|
||||||
|
verdict="dangerous",
|
||||||
|
findings=[finding],
|
||||||
|
summary="dangerous",
|
||||||
|
)
|
||||||
|
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
|
||||||
|
patch("tools.skill_manager_tool.scan_skill", return_value=fake_result):
|
||||||
|
result = _security_scan_skill(tmp_path)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "Security scan blocked" in result
|
||||||
|
|
||||||
|
def test_guard_flag_reads_config_default_false(self):
|
||||||
|
"""_guard_agent_created_enabled returns False when config doesn't set it."""
|
||||||
|
from tools.skill_manager_tool import _guard_agent_created_enabled
|
||||||
|
|
||||||
|
with patch("hermes_cli.config.load_config", return_value={"skills": {}}):
|
||||||
|
assert _guard_agent_created_enabled() is False
|
||||||
|
|
||||||
|
def test_guard_flag_reads_config_when_set(self):
|
||||||
|
"""_guard_agent_created_enabled returns True when user explicitly enables."""
|
||||||
|
from tools.skill_manager_tool import _guard_agent_created_enabled
|
||||||
|
|
||||||
|
with patch("hermes_cli.config.load_config",
|
||||||
|
return_value={"skills": {"guard_agent_created": True}}):
|
||||||
|
assert _guard_agent_created_enabled() is True
|
||||||
|
|
||||||
|
def test_guard_flag_handles_config_error(self):
|
||||||
|
"""If load_config raises, _guard_agent_created_enabled defaults to False (fail-safe off)."""
|
||||||
|
from tools.skill_manager_tool import _guard_agent_created_enabled
|
||||||
|
|
||||||
|
with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")):
|
||||||
|
assert _guard_agent_created_enabled() is False
|
||||||
|
|
|
||||||
|
|
@ -174,27 +174,24 @@ class TestShouldAllowInstall:
|
||||||
assert allowed is True
|
assert allowed is True
|
||||||
assert "agent-created" in reason
|
assert "agent-created" in reason
|
||||||
|
|
||||||
def test_dangerous_agent_created_allowed(self):
|
def test_dangerous_agent_created_asks(self):
|
||||||
"""Agent-created skills bypass verdict gating — agent can already
|
"""Agent-created skills with dangerous verdict return None (ask for confirmation)
|
||||||
execute the same code via terminal(), so skill_manage allows all
|
when the scan runs. The caller (_security_scan_skill) surfaces this as an error
|
||||||
verdicts. This prevents friction when the agent writes skills that
|
to the agent, who can retry without the flagged content.
|
||||||
mention risky keywords in prose (e.g. describing cache-busting or
|
|
||||||
persistence semantics in a PR-review skill)."""
|
This gate only runs when skills.guard_agent_created is enabled (off by default)."""
|
||||||
f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")]
|
f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")]
|
||||||
allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f))
|
allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f))
|
||||||
assert allowed is True
|
assert allowed is None
|
||||||
assert "agent-created" in reason
|
assert "Requires confirmation" in reason
|
||||||
|
|
||||||
def test_force_noop_for_agent_created_dangerous(self):
|
def test_force_overrides_dangerous_for_agent_created(self):
|
||||||
"""With agent-created dangerous mapped to 'allow', force becomes a
|
|
||||||
no-op — the allow branch returns first. Force still works for any
|
|
||||||
trust level that maps to block (community/trusted)."""
|
|
||||||
f = [Finding("x", "critical", "c", "f", 1, "m", "d")]
|
f = [Finding("x", "critical", "c", "f", 1, "m", "d")]
|
||||||
allowed, reason = should_allow_install(
|
allowed, reason = should_allow_install(
|
||||||
self._result("agent-created", "dangerous", f), force=True
|
self._result("agent-created", "dangerous", f), force=True
|
||||||
)
|
)
|
||||||
assert allowed is True
|
assert allowed is True
|
||||||
assert "agent-created" in reason
|
assert "Force-installed" in reason
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Import security scanner — agent-created skills get the same scrutiny as
|
# Import security scanner — external hub installs always get scanned;
|
||||||
# community hub installs.
|
# agent-created skills only get scanned when skills.guard_agent_created is on.
|
||||||
try:
|
try:
|
||||||
from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
|
from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
|
||||||
_GUARD_AVAILABLE = True
|
_GUARD_AVAILABLE = True
|
||||||
|
|
@ -53,10 +53,31 @@ except ImportError:
|
||||||
_GUARD_AVAILABLE = False
|
_GUARD_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
def _guard_agent_created_enabled() -> bool:
|
||||||
|
"""Read skills.guard_agent_created from config (default False).
|
||||||
|
|
||||||
|
Off by default because the agent can already execute the same code
|
||||||
|
paths via terminal() with no gate, so the scan adds friction without
|
||||||
|
meaningful security. Users who want belt-and-suspenders can turn it
|
||||||
|
on via `hermes config set skills.guard_agent_created true`.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
cfg = load_config()
|
||||||
|
return bool(cfg.get("skills", {}).get("guard_agent_created", False))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _security_scan_skill(skill_dir: Path) -> Optional[str]:
|
def _security_scan_skill(skill_dir: Path) -> Optional[str]:
|
||||||
"""Scan a skill directory after write. Returns error string if blocked, else None."""
|
"""Scan a skill directory after write. Returns error string if blocked, else None.
|
||||||
|
|
||||||
|
No-op when skills.guard_agent_created is disabled (the default).
|
||||||
|
"""
|
||||||
if not _GUARD_AVAILABLE:
|
if not _GUARD_AVAILABLE:
|
||||||
return None
|
return None
|
||||||
|
if not _guard_agent_created_enabled():
|
||||||
|
return None
|
||||||
try:
|
try:
|
||||||
result = scan_skill(skill_dir, source="agent-created")
|
result = scan_skill(skill_dir, source="agent-created")
|
||||||
allowed, reason = should_allow_install(result)
|
allowed, reason = should_allow_install(result)
|
||||||
|
|
@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]:
|
||||||
return f"Security scan blocked this skill ({reason}):\n{report}"
|
return f"Security scan blocked this skill ({reason}):\n{report}"
|
||||||
if allowed is None:
|
if allowed is None:
|
||||||
# "ask" verdict — for agent-created skills this means dangerous
|
# "ask" verdict — for agent-created skills this means dangerous
|
||||||
# findings were detected. Block the skill and include the report.
|
# findings were detected. Surface as an error so the agent can
|
||||||
|
# retry with the flagged content removed.
|
||||||
report = format_scan_report(result)
|
report = format_scan_report(result)
|
||||||
logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
|
logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
|
||||||
return f"Security scan blocked this skill ({reason}):\n{report}"
|
return f"Security scan blocked this skill ({reason}):\n{report}"
|
||||||
|
|
|
||||||
|
|
@ -43,11 +43,11 @@ INSTALL_POLICY = {
|
||||||
"builtin": ("allow", "allow", "allow"),
|
"builtin": ("allow", "allow", "allow"),
|
||||||
"trusted": ("allow", "allow", "block"),
|
"trusted": ("allow", "allow", "block"),
|
||||||
"community": ("allow", "block", "block"),
|
"community": ("allow", "block", "block"),
|
||||||
# Agent-created skills run in the same process as the agent that
|
# Agent-created: "ask" on dangerous surfaces as an error to the agent,
|
||||||
# wrote them — the agent could already execute the same code via
|
# which can retry without the flagged content. This gate only runs when
|
||||||
# terminal(), so a dangerous-pattern gate on skill_manage adds
|
# skills.guard_agent_created is enabled (off by default) — see
|
||||||
# friction without meaningful security. Allow all verdicts.
|
# tools/skill_manager_tool.py::_guard_agent_created_enabled.
|
||||||
"agent-created": ("allow", "allow", "allow"),
|
"agent-created": ("allow", "allow", "ask"),
|
||||||
}
|
}
|
||||||
|
|
||||||
VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}
|
VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue