feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)

Replaces the blanket 'always allow' change from the previous commit with an opt-in config flag so users who want belt-and-suspenders security can still get the keyword scan on skill_manage output. ## Default behavior (flag off) skill_manage(action='create'|'edit'|'patch') no longer runs the keyword scanner. The agent can write skills that mention risky keywords in prose (documenting what reviewers should watch for, describing cache-bust semantics in a PR-review skill, referencing AGENTS.md, etc.) without getting blocked. Rationale: the agent can already execute the same code paths via terminal() with no gate, so the scan adds friction without meaningful security against a compromised or malicious agent. ## Opt-in behavior (flag on) Set skills.guard_agent_created: true in config.yaml to get the original behavior back. Scanner runs on every skill_manage write; dangerous verdicts surface as a tool error the agent can react to (retry without the flagged content). ## External hub installs unaffected trusted/community sources (hermes skills install) always get scanned regardless of this flag. The gate is specifically for skill_manage, which only agents call. ## Changes - hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG - tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag; _security_scan_skill() short-circuits to None when the flag is off - tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] = ('allow', 'allow', 'ask') so the scan remains strict when it does run - tests/tools/test_skills_guard.py: restore original ask/force tests - tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class covering both flag states + config error handling ## Validation - tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass - E2E: flagged-keyword skill creates with default config, blocks with flag on
2026-04-29 01:31:41 +00:00 · 2026-04-23 06:20:19 -07:00 · 2026-04-23 06:20:19 -07:00 · ce089169d5
commit ce089169d5
parent e3c0084140
5 changed files with 134 additions and 22 deletions
--- a/tests/tools/test_skill_manager_tool.py
+++ b/tests/tools/test_skill_manager_tool.py
@ -484,3 +484,85 @@ class TestSkillManageDispatcher:
            raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
        result = json.loads(raw)
        assert result["success"] is True
+
+
+class TestSecurityScanGate:
+    """_security_scan_skill is gated by skills.guard_agent_created config flag."""
+
+    def test_scan_noop_when_flag_off(self, tmp_path):
+        """Default config (flag off) short-circuits before running scan_skill."""
+        from tools.skill_manager_tool import _security_scan_skill
+
+        with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=False), \
+             patch("tools.skill_manager_tool.scan_skill") as mock_scan:
+            result = _security_scan_skill(tmp_path)
+
+        assert result is None
+        mock_scan.assert_not_called()  # scan never ran
+
+    def test_scan_runs_when_flag_on(self, tmp_path):
+        """When flag is on, scan_skill is invoked and its verdict is honored."""
+        from tools.skill_manager_tool import _security_scan_skill
+        from tools.skills_guard import ScanResult
+
+        # Fake a safe scan result — caller should return None (allow)
+        fake_result = ScanResult(
+            skill_name="test",
+            source="agent-created",
+            trust_level="agent-created",
+            verdict="safe",
+            findings=[],
+            summary="ok",
+        )
+        with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
+             patch("tools.skill_manager_tool.scan_skill", return_value=fake_result) as mock_scan:
+            result = _security_scan_skill(tmp_path)
+
+        assert result is None
+        mock_scan.assert_called_once()
+
+    def test_scan_blocks_dangerous_when_flag_on(self, tmp_path):
+        """Dangerous verdict + flag on → returns an error string for the agent."""
+        from tools.skill_manager_tool import _security_scan_skill
+        from tools.skills_guard import ScanResult, Finding
+
+        finding = Finding(
+            pattern_id="test", severity="critical", category="exfiltration",
+            file="SKILL.md", line=1, match="curl $TOKEN", description="test",
+        )
+        fake_result = ScanResult(
+            skill_name="test",
+            source="agent-created",
+            trust_level="agent-created",
+            verdict="dangerous",
+            findings=[finding],
+            summary="dangerous",
+        )
+        with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
+             patch("tools.skill_manager_tool.scan_skill", return_value=fake_result):
+            result = _security_scan_skill(tmp_path)
+
+        assert result is not None
+        assert "Security scan blocked" in result
+
+    def test_guard_flag_reads_config_default_false(self):
+        """_guard_agent_created_enabled returns False when config doesn't set it."""
+        from tools.skill_manager_tool import _guard_agent_created_enabled
+
+        with patch("hermes_cli.config.load_config", return_value={"skills": {}}):
+            assert _guard_agent_created_enabled() is False
+
+    def test_guard_flag_reads_config_when_set(self):
+        """_guard_agent_created_enabled returns True when user explicitly enables."""
+        from tools.skill_manager_tool import _guard_agent_created_enabled
+
+        with patch("hermes_cli.config.load_config",
+                   return_value={"skills": {"guard_agent_created": True}}):
+            assert _guard_agent_created_enabled() is True
+
+    def test_guard_flag_handles_config_error(self):
+        """If load_config raises, _guard_agent_created_enabled defaults to False (fail-safe off)."""
+        from tools.skill_manager_tool import _guard_agent_created_enabled
+
+        with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")):
+            assert _guard_agent_created_enabled() is False