feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)

Replaces the blanket 'always allow' change from the previous commit with
an opt-in config flag so users who want belt-and-suspenders security can
still get the keyword scan on skill_manage output.

## Default behavior (flag off)
skill_manage(action='create'|'edit'|'patch') no longer runs the keyword
scanner. The agent can write skills that mention risky keywords in prose
(documenting what reviewers should watch for, describing cache-bust
semantics in a PR-review skill, referencing AGENTS.md, etc.) without
getting blocked.

Rationale: the agent can already execute the same code paths via
terminal() with no gate, so the scan adds friction without meaningful
security against a compromised or malicious agent.

## Opt-in behavior (flag on)
Set skills.guard_agent_created: true in config.yaml to restore the original
behavior. The scanner then runs on every skill_manage write, and dangerous
verdicts surface as a tool error the agent can react to (for example, by
retrying without the flagged content).

## External hub installs unaffected
Skills installed from trusted/community sources (hermes skills install) are
always scanned, regardless of this flag. The gate applies only to
skill_manage, which only agents call.

## Changes
- hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG
- tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag;
  _security_scan_skill() short-circuits to None when the flag is off
- tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] =
  ('allow', 'allow', 'ask') so the scan remains strict when it does run
- tests/tools/test_skills_guard.py: restore original ask/force tests
- tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class
  covering both flag states + config error handling

## Validation
- tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass
- E2E: flagged-keyword skill creates with default config, blocks with flag on
This commit is contained in:
Teknium 2026-04-23 06:20:19 -07:00 committed by Teknium
parent e3c0084140
commit ce089169d5
5 changed files with 134 additions and 22 deletions

View file

@ -484,3 +484,85 @@ class TestSkillManageDispatcher:
raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
result = json.loads(raw)
assert result["success"] is True
class TestSecurityScanGate:
    """Check that _security_scan_skill obeys the skills.guard_agent_created flag."""

    # Dotted paths of the names replaced with mocks in the tests below.
    _FLAG_TARGET = "tools.skill_manager_tool._guard_agent_created_enabled"
    _SCAN_TARGET = "tools.skill_manager_tool.scan_skill"
    _CONFIG_TARGET = "hermes_cli.config.load_config"

    @staticmethod
    def _agent_scan_result(verdict, findings, summary):
        """Build a ScanResult for an agent-created skill named "test"."""
        from tools.skills_guard import ScanResult

        return ScanResult(
            skill_name="test",
            source="agent-created",
            trust_level="agent-created",
            verdict=verdict,
            findings=findings,
            summary=summary,
        )

    def test_scan_noop_when_flag_off(self, tmp_path):
        """Flag off: the gate returns None and scan_skill is never called."""
        from tools.skill_manager_tool import _security_scan_skill

        flag_patch = patch(self._FLAG_TARGET, return_value=False)
        scan_patch = patch(self._SCAN_TARGET)
        with flag_patch, scan_patch as mock_scan:
            outcome = _security_scan_skill(tmp_path)

        assert outcome is None
        mock_scan.assert_not_called()  # the scanner was bypassed entirely

    def test_scan_runs_when_flag_on(self, tmp_path):
        """Flag on + safe verdict: scan_skill is called once and None is returned."""
        from tools.skill_manager_tool import _security_scan_skill

        # A "safe" verdict means the caller is expected to allow the write.
        safe_result = self._agent_scan_result("safe", [], "ok")

        flag_patch = patch(self._FLAG_TARGET, return_value=True)
        scan_patch = patch(self._SCAN_TARGET, return_value=safe_result)
        with flag_patch, scan_patch as mock_scan:
            outcome = _security_scan_skill(tmp_path)

        assert outcome is None
        mock_scan.assert_called_once()

    def test_scan_blocks_dangerous_when_flag_on(self, tmp_path):
        """Flag on + dangerous verdict: an error string is returned to the agent."""
        from tools.skill_manager_tool import _security_scan_skill
        from tools.skills_guard import Finding

        critical_finding = Finding(
            pattern_id="test",
            severity="critical",
            category="exfiltration",
            file="SKILL.md",
            line=1,
            match="curl $TOKEN",
            description="test",
        )
        dangerous_result = self._agent_scan_result(
            "dangerous", [critical_finding], "dangerous"
        )

        flag_patch = patch(self._FLAG_TARGET, return_value=True)
        scan_patch = patch(self._SCAN_TARGET, return_value=dangerous_result)
        with flag_patch, scan_patch:
            outcome = _security_scan_skill(tmp_path)

        assert outcome is not None
        assert "Security scan blocked" in outcome

    def test_guard_flag_reads_config_default_false(self):
        """A config whose "skills" section lacks the key yields False."""
        from tools.skill_manager_tool import _guard_agent_created_enabled

        empty_skills_config = {"skills": {}}
        with patch(self._CONFIG_TARGET, return_value=empty_skills_config):
            enabled = _guard_agent_created_enabled()

        assert enabled is False

    def test_guard_flag_reads_config_when_set(self):
        """An explicit guard_agent_created: True in config yields True."""
        from tools.skill_manager_tool import _guard_agent_created_enabled

        opted_in_config = {"skills": {"guard_agent_created": True}}
        with patch(self._CONFIG_TARGET, return_value=opted_in_config):
            enabled = _guard_agent_created_enabled()

        assert enabled is True

    def test_guard_flag_handles_config_error(self):
        """If load_config raises, the flag falls back to False (fail-safe off)."""
        from tools.skill_manager_tool import _guard_agent_created_enabled

        with patch(self._CONFIG_TARGET, side_effect=RuntimeError("boom")):
            enabled = _guard_agent_created_enabled()

        assert enabled is False