feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)

Replaces the blanket 'always allow' change from the previous commit with
an opt-in config flag so users who want belt-and-suspenders security can
still get the keyword scan on skill_manage output.

## Default behavior (flag off)
skill_manage(action='create'|'edit'|'patch') no longer runs the keyword
scanner. The agent can write skills that mention risky keywords in prose
(documenting what reviewers should watch for, describing cache-bust
semantics in a PR-review skill, referencing AGENTS.md, etc.) without
getting blocked.

Rationale: the agent can already execute the same code paths via
terminal() with no gate, so the scan adds friction without meaningful
security against a compromised or malicious agent.

## Opt-in behavior (flag on)
Set skills.guard_agent_created: true in config.yaml to get the original
behavior back. Scanner runs on every skill_manage write; dangerous
verdicts surface as a tool error the agent can react to (retry without
the flagged content).

## External hub installs unaffected
trusted/community sources (hermes skills install) always get scanned
regardless of this flag. The gate is specifically for skill_manage,
which only agents call.

## Changes
- hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG
- tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag;
  _security_scan_skill() short-circuits to None when the flag is off
- tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] =
  ('allow', 'allow', 'ask') so the scan remains strict when it does run
- tests/tools/test_skills_guard.py: restore original ask/force tests
- tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class
  covering both flag states + config error handling

## Validation
- tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass
- E2E: flagged-keyword skill creates with default config, blocks with flag on
This commit is contained in:
Teknium 2026-04-23 06:20:19 -07:00 committed by Teknium
parent e3c0084140
commit ce089169d5
5 changed files with 134 additions and 22 deletions

View file

@ -760,6 +760,17 @@ DEFAULT_CONFIG = {
"inline_shell": False,
# Timeout (seconds) for each !`cmd` snippet when inline_shell is on.
"inline_shell_timeout": 10,
# Run the keyword/pattern security scanner on skills the agent
# writes via skill_manage (create/edit/patch). Off by default
# because the agent can already execute the same code paths via
# terminal() with no gate, so the scan adds friction (blocks
# skills that mention risky keywords in prose) without meaningful
# security. Turn on if you want the belt-and-suspenders — a
# dangerous verdict will then surface as a tool error to the
# agent, which can retry with the flagged content removed.
# External hub installs (trusted/community sources) are always
# scanned regardless of this setting.
"guard_agent_created": False,
},
# Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth.

View file

@ -484,3 +484,85 @@ class TestSkillManageDispatcher:
raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
result = json.loads(raw)
assert result["success"] is True
class TestSecurityScanGate:
"""_security_scan_skill is gated by skills.guard_agent_created config flag."""
def test_scan_noop_when_flag_off(self, tmp_path):
"""Default config (flag off) short-circuits before running scan_skill."""
from tools.skill_manager_tool import _security_scan_skill
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=False), \
patch("tools.skill_manager_tool.scan_skill") as mock_scan:
result = _security_scan_skill(tmp_path)
assert result is None
mock_scan.assert_not_called() # scan never ran
def test_scan_runs_when_flag_on(self, tmp_path):
"""When flag is on, scan_skill is invoked and its verdict is honored."""
from tools.skill_manager_tool import _security_scan_skill
from tools.skills_guard import ScanResult
# Fake a safe scan result — caller should return None (allow)
fake_result = ScanResult(
skill_name="test",
source="agent-created",
trust_level="agent-created",
verdict="safe",
findings=[],
summary="ok",
)
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
patch("tools.skill_manager_tool.scan_skill", return_value=fake_result) as mock_scan:
result = _security_scan_skill(tmp_path)
assert result is None
mock_scan.assert_called_once()
def test_scan_blocks_dangerous_when_flag_on(self, tmp_path):
"""Dangerous verdict + flag on → returns an error string for the agent."""
from tools.skill_manager_tool import _security_scan_skill
from tools.skills_guard import ScanResult, Finding
finding = Finding(
pattern_id="test", severity="critical", category="exfiltration",
file="SKILL.md", line=1, match="curl $TOKEN", description="test",
)
fake_result = ScanResult(
skill_name="test",
source="agent-created",
trust_level="agent-created",
verdict="dangerous",
findings=[finding],
summary="dangerous",
)
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
patch("tools.skill_manager_tool.scan_skill", return_value=fake_result):
result = _security_scan_skill(tmp_path)
assert result is not None
assert "Security scan blocked" in result
def test_guard_flag_reads_config_default_false(self):
"""_guard_agent_created_enabled returns False when config doesn't set it."""
from tools.skill_manager_tool import _guard_agent_created_enabled
with patch("hermes_cli.config.load_config", return_value={"skills": {}}):
assert _guard_agent_created_enabled() is False
def test_guard_flag_reads_config_when_set(self):
"""_guard_agent_created_enabled returns True when user explicitly enables."""
from tools.skill_manager_tool import _guard_agent_created_enabled
with patch("hermes_cli.config.load_config",
return_value={"skills": {"guard_agent_created": True}}):
assert _guard_agent_created_enabled() is True
def test_guard_flag_handles_config_error(self):
"""If load_config raises, _guard_agent_created_enabled defaults to False (fail-safe off)."""
from tools.skill_manager_tool import _guard_agent_created_enabled
with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")):
assert _guard_agent_created_enabled() is False

View file

@ -174,27 +174,24 @@ class TestShouldAllowInstall:
assert allowed is True
assert "agent-created" in reason
def test_dangerous_agent_created_allowed(self):
"""Agent-created skills bypass verdict gating — agent can already
execute the same code via terminal(), so skill_manage allows all
verdicts. This prevents friction when the agent writes skills that
mention risky keywords in prose (e.g. describing cache-busting or
persistence semantics in a PR-review skill)."""
def test_dangerous_agent_created_asks(self):
"""Agent-created skills with dangerous verdict return None (ask for confirmation)
when the scan runs. The caller (_security_scan_skill) surfaces this as an error
to the agent, who can retry without the flagged content.
This gate only runs when skills.guard_agent_created is enabled (off by default)."""
f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")]
allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f))
assert allowed is True
assert "agent-created" in reason
assert allowed is None
assert "Requires confirmation" in reason
def test_force_noop_for_agent_created_dangerous(self):
"""With agent-created dangerous mapped to 'allow', force becomes a
no-op the allow branch returns first. Force still works for any
trust level that maps to block (community/trusted)."""
def test_force_overrides_dangerous_for_agent_created(self):
f = [Finding("x", "critical", "c", "f", 1, "m", "d")]
allowed, reason = should_allow_install(
self._result("agent-created", "dangerous", f), force=True
)
assert allowed is True
assert "agent-created" in reason
assert "Force-installed" in reason
# ---------------------------------------------------------------------------

View file

@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple
logger = logging.getLogger(__name__)
# Import security scanner — agent-created skills get the same scrutiny as
# community hub installs.
# Import security scanner — external hub installs always get scanned;
# agent-created skills only get scanned when skills.guard_agent_created is on.
try:
from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
_GUARD_AVAILABLE = True
@ -53,10 +53,31 @@ except ImportError:
_GUARD_AVAILABLE = False
def _guard_agent_created_enabled() -> bool:
"""Read skills.guard_agent_created from config (default False).
Off by default because the agent can already execute the same code
paths via terminal() with no gate, so the scan adds friction without
meaningful security. Users who want belt-and-suspenders can turn it
on via `hermes config set skills.guard_agent_created true`.
"""
try:
from hermes_cli.config import load_config
cfg = load_config()
return bool(cfg.get("skills", {}).get("guard_agent_created", False))
except Exception:
return False
def _security_scan_skill(skill_dir: Path) -> Optional[str]:
"""Scan a skill directory after write. Returns error string if blocked, else None."""
"""Scan a skill directory after write. Returns error string if blocked, else None.
No-op when skills.guard_agent_created is disabled (the default).
"""
if not _GUARD_AVAILABLE:
return None
if not _guard_agent_created_enabled():
return None
try:
result = scan_skill(skill_dir, source="agent-created")
allowed, reason = should_allow_install(result)
@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]:
return f"Security scan blocked this skill ({reason}):\n{report}"
if allowed is None:
# "ask" verdict — for agent-created skills this means dangerous
# findings were detected. Block the skill and include the report.
# findings were detected. Surface as an error so the agent can
# retry with the flagged content removed.
report = format_scan_report(result)
logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
return f"Security scan blocked this skill ({reason}):\n{report}"

View file

@ -43,11 +43,11 @@ INSTALL_POLICY = {
"builtin": ("allow", "allow", "allow"),
"trusted": ("allow", "allow", "block"),
"community": ("allow", "block", "block"),
# Agent-created skills run in the same process as the agent that
# wrote them — the agent could already execute the same code via
# terminal(), so a dangerous-pattern gate on skill_manage adds
# friction without meaningful security. Allow all verdicts.
"agent-created": ("allow", "allow", "allow"),
# Agent-created: "ask" on dangerous surfaces as an error to the agent,
# which can retry without the flagged content. This gate only runs when
# skills.guard_agent_created is enabled (off by default) — see
# tools/skill_manager_tool.py::_guard_agent_created_enabled.
"agent-created": ("allow", "allow", "ask"),
}
VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}