feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)

Replaces the blanket 'always allow' change from the previous commit with
an opt-in config flag so users who want belt-and-suspenders security can
still get the keyword scan on skill_manage output.

## Default behavior (flag off)
skill_manage(action='create'|'edit'|'patch') no longer runs the keyword
scanner. The agent can write skills that mention risky keywords in prose
(documenting what reviewers should watch for, describing cache-bust
semantics in a PR-review skill, referencing AGENTS.md, etc.) without
getting blocked.

Rationale: the agent can already execute the same code paths via
terminal() with no gate, so the scan adds friction without meaningful
security against a compromised or malicious agent.

## Opt-in behavior (flag on)
Set skills.guard_agent_created: true in config.yaml to get the original
behavior back. Scanner runs on every skill_manage write; dangerous
verdicts surface as a tool error the agent can react to (retry without
the flagged content).

## External hub installs unaffected
Skills from trusted/community sources (hermes skills install) always get scanned
regardless of this flag. The gate is specifically for skill_manage,
which only agents call.

## Changes
- hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG
- tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag;
  _security_scan_skill() short-circuits to None when the flag is off
- tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] =
  ('allow', 'allow', 'ask') so the scan remains strict when it does run
- tests/tools/test_skills_guard.py: restore original ask/force tests
- tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class
  covering both flag states + config error handling

## Validation
- tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass
- E2E: flagged-keyword skill creates with default config, blocks with flag on
This commit is contained in:
Teknium 2026-04-23 06:20:19 -07:00 committed by Teknium
parent e3c0084140
commit ce089169d5
5 changed files with 134 additions and 22 deletions

View file

@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple
logger = logging.getLogger(__name__)
# Import security scanner — agent-created skills get the same scrutiny as
# community hub installs.
# Import security scanner — external hub installs always get scanned;
# agent-created skills only get scanned when skills.guard_agent_created is on.
try:
from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
_GUARD_AVAILABLE = True
@ -53,10 +53,31 @@ except ImportError:
_GUARD_AVAILABLE = False
def _guard_agent_created_enabled() -> bool:
"""Read skills.guard_agent_created from config (default False).
Off by default because the agent can already execute the same code
paths via terminal() with no gate, so the scan adds friction without
meaningful security. Users who want belt-and-suspenders can turn it
on via `hermes config set skills.guard_agent_created true`.
"""
try:
from hermes_cli.config import load_config
cfg = load_config()
return bool(cfg.get("skills", {}).get("guard_agent_created", False))
except Exception:
return False
def _security_scan_skill(skill_dir: Path) -> Optional[str]:
"""Scan a skill directory after write. Returns error string if blocked, else None."""
"""Scan a skill directory after write. Returns error string if blocked, else None.
No-op when skills.guard_agent_created is disabled (the default).
"""
if not _GUARD_AVAILABLE:
return None
if not _guard_agent_created_enabled():
return None
try:
result = scan_skill(skill_dir, source="agent-created")
allowed, reason = should_allow_install(result)
@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]:
return f"Security scan blocked this skill ({reason}):\n{report}"
if allowed is None:
# "ask" verdict — for agent-created skills this means dangerous
# findings were detected. Block the skill and include the report.
# findings were detected. Surface as an error so the agent can
# retry with the flagged content removed.
report = format_scan_report(result)
logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
return f"Security scan blocked this skill ({reason}):\n{report}"

View file

@ -43,11 +43,11 @@ INSTALL_POLICY = {
"builtin": ("allow", "allow", "allow"),
"trusted": ("allow", "allow", "block"),
"community": ("allow", "block", "block"),
# Agent-created skills run in the same process as the agent that
# wrote them — the agent could already execute the same code via
# terminal(), so a dangerous-pattern gate on skill_manage adds
# friction without meaningful security. Allow all verdicts.
"agent-created": ("allow", "allow", "allow"),
# Agent-created: "ask" on dangerous surfaces as an error to the agent,
# which can retry without the flagged content. This gate only runs when
# skills.guard_agent_created is enabled (off by default) — see
# tools/skill_manager_tool.py::_guard_agent_created_enabled.
"agent-created": ("allow", "allow", "ask"),
}
VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}