mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat(skills-guard): gate agent-created scanner on config.skills.guard_agent_created (default off)
Replaces the blanket 'always allow' change from the previous commit with
an opt-in config flag so users who want belt-and-suspenders security can
still get the keyword scan on skill_manage output.
## Default behavior (flag off)
skill_manage(action='create'|'edit'|'patch') no longer runs the keyword
scanner. The agent can write skills that mention risky keywords in prose
(documenting what reviewers should watch for, describing cache-bust
semantics in a PR-review skill, referencing AGENTS.md, etc.) without
getting blocked.
Rationale: the agent can already execute the same code paths via
terminal() with no gate, so the scan adds friction without meaningful
security against a compromised or malicious agent.
## Opt-in behavior (flag on)
Set skills.guard_agent_created: true in config.yaml to get the original
behavior back. Scanner runs on every skill_manage write; dangerous
verdicts surface as a tool error the agent can react to (retry without
the flagged content).
## External hub installs unaffected
Trusted/community sources (hermes skills install) always get scanned
regardless of this flag. The gate is specifically for skill_manage,
which only agents call.
## Changes
- hermes_cli/config.py: add skills.guard_agent_created: False to DEFAULT_CONFIG
- tools/skill_manager_tool.py: _guard_agent_created_enabled() reads the flag;
_security_scan_skill() short-circuits to None when the flag is off
- tools/skills_guard.py: restore INSTALL_POLICY['agent-created'] =
('allow', 'allow', 'ask') so the scan remains strict when it does run
- tests/tools/test_skills_guard.py: restore original ask/force tests
- tests/tools/test_skill_manager_tool.py: new TestSecurityScanGate class
covering both flag states + config error handling
## Validation
- tests/tools/test_skills_guard.py + test_skill_manager_tool.py: 115/115 pass
- E2E: flagged-keyword skill creates with default config, blocks with flag on
This commit is contained in:
parent
e3c0084140
commit
ce089169d5
5 changed files with 134 additions and 22 deletions
|
|
@ -760,6 +760,17 @@ DEFAULT_CONFIG = {
|
||||||
"inline_shell": False,
|
"inline_shell": False,
|
||||||
# Timeout (seconds) for each !`cmd` snippet when inline_shell is on.
|
# Timeout (seconds) for each !`cmd` snippet when inline_shell is on.
|
||||||
"inline_shell_timeout": 10,
|
"inline_shell_timeout": 10,
|
||||||
|
# Run the keyword/pattern security scanner on skills the agent
|
||||||
|
# writes via skill_manage (create/edit/patch). Off by default
|
||||||
|
# because the agent can already execute the same code paths via
|
||||||
|
# terminal() with no gate, so the scan adds friction (blocks
|
||||||
|
# skills that mention risky keywords in prose) without meaningful
|
||||||
|
# security. Turn on if you want the belt-and-suspenders — a
|
||||||
|
# dangerous verdict will then surface as a tool error to the
|
||||||
|
# agent, which can retry with the flagged content removed.
|
||||||
|
# External hub installs (trusted/community sources) are always
|
||||||
|
# scanned regardless of this setting.
|
||||||
|
"guard_agent_created": False,
|
||||||
},
|
},
|
||||||
|
|
||||||
# Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth.
|
# Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth.
|
||||||
|
|
|
||||||
|
|
@ -484,3 +484,85 @@ class TestSkillManageDispatcher:
|
||||||
raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
|
raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT)
|
||||||
result = json.loads(raw)
|
result = json.loads(raw)
|
||||||
assert result["success"] is True
|
assert result["success"] is True
|
||||||
|
|
||||||
|
|
||||||
|
class TestSecurityScanGate:
|
||||||
|
"""_security_scan_skill is gated by skills.guard_agent_created config flag."""
|
||||||
|
|
||||||
|
def test_scan_noop_when_flag_off(self, tmp_path):
|
||||||
|
"""Default config (flag off) short-circuits before running scan_skill."""
|
||||||
|
from tools.skill_manager_tool import _security_scan_skill
|
||||||
|
|
||||||
|
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=False), \
|
||||||
|
patch("tools.skill_manager_tool.scan_skill") as mock_scan:
|
||||||
|
result = _security_scan_skill(tmp_path)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
mock_scan.assert_not_called() # scan never ran
|
||||||
|
|
||||||
|
def test_scan_runs_when_flag_on(self, tmp_path):
|
||||||
|
"""When flag is on, scan_skill is invoked and its verdict is honored."""
|
||||||
|
from tools.skill_manager_tool import _security_scan_skill
|
||||||
|
from tools.skills_guard import ScanResult
|
||||||
|
|
||||||
|
# Fake a safe scan result — caller should return None (allow)
|
||||||
|
fake_result = ScanResult(
|
||||||
|
skill_name="test",
|
||||||
|
source="agent-created",
|
||||||
|
trust_level="agent-created",
|
||||||
|
verdict="safe",
|
||||||
|
findings=[],
|
||||||
|
summary="ok",
|
||||||
|
)
|
||||||
|
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
|
||||||
|
patch("tools.skill_manager_tool.scan_skill", return_value=fake_result) as mock_scan:
|
||||||
|
result = _security_scan_skill(tmp_path)
|
||||||
|
|
||||||
|
assert result is None
|
||||||
|
mock_scan.assert_called_once()
|
||||||
|
|
||||||
|
def test_scan_blocks_dangerous_when_flag_on(self, tmp_path):
|
||||||
|
"""Dangerous verdict + flag on → returns an error string for the agent."""
|
||||||
|
from tools.skill_manager_tool import _security_scan_skill
|
||||||
|
from tools.skills_guard import ScanResult, Finding
|
||||||
|
|
||||||
|
finding = Finding(
|
||||||
|
pattern_id="test", severity="critical", category="exfiltration",
|
||||||
|
file="SKILL.md", line=1, match="curl $TOKEN", description="test",
|
||||||
|
)
|
||||||
|
fake_result = ScanResult(
|
||||||
|
skill_name="test",
|
||||||
|
source="agent-created",
|
||||||
|
trust_level="agent-created",
|
||||||
|
verdict="dangerous",
|
||||||
|
findings=[finding],
|
||||||
|
summary="dangerous",
|
||||||
|
)
|
||||||
|
with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \
|
||||||
|
patch("tools.skill_manager_tool.scan_skill", return_value=fake_result):
|
||||||
|
result = _security_scan_skill(tmp_path)
|
||||||
|
|
||||||
|
assert result is not None
|
||||||
|
assert "Security scan blocked" in result
|
||||||
|
|
||||||
|
def test_guard_flag_reads_config_default_false(self):
|
||||||
|
"""_guard_agent_created_enabled returns False when config doesn't set it."""
|
||||||
|
from tools.skill_manager_tool import _guard_agent_created_enabled
|
||||||
|
|
||||||
|
with patch("hermes_cli.config.load_config", return_value={"skills": {}}):
|
||||||
|
assert _guard_agent_created_enabled() is False
|
||||||
|
|
||||||
|
def test_guard_flag_reads_config_when_set(self):
|
||||||
|
"""_guard_agent_created_enabled returns True when user explicitly enables."""
|
||||||
|
from tools.skill_manager_tool import _guard_agent_created_enabled
|
||||||
|
|
||||||
|
with patch("hermes_cli.config.load_config",
|
||||||
|
return_value={"skills": {"guard_agent_created": True}}):
|
||||||
|
assert _guard_agent_created_enabled() is True
|
||||||
|
|
||||||
|
def test_guard_flag_handles_config_error(self):
|
||||||
|
"""If load_config raises, _guard_agent_created_enabled defaults to False (fail-safe off)."""
|
||||||
|
from tools.skill_manager_tool import _guard_agent_created_enabled
|
||||||
|
|
||||||
|
with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")):
|
||||||
|
assert _guard_agent_created_enabled() is False
|
||||||
|
|
|
||||||
|
|
@ -174,27 +174,24 @@ class TestShouldAllowInstall:
|
||||||
assert allowed is True
|
assert allowed is True
|
||||||
assert "agent-created" in reason
|
assert "agent-created" in reason
|
||||||
|
|
||||||
def test_dangerous_agent_created_allowed(self):
|
def test_dangerous_agent_created_asks(self):
|
||||||
"""Agent-created skills bypass verdict gating — agent can already
|
"""Agent-created skills with dangerous verdict return None (ask for confirmation)
|
||||||
execute the same code via terminal(), so skill_manage allows all
|
when the scan runs. The caller (_security_scan_skill) surfaces this as an error
|
||||||
verdicts. This prevents friction when the agent writes skills that
|
to the agent, who can retry without the flagged content.
|
||||||
mention risky keywords in prose (e.g. describing cache-busting or
|
|
||||||
persistence semantics in a PR-review skill)."""
|
This gate only runs when skills.guard_agent_created is enabled (off by default)."""
|
||||||
f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")]
|
f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")]
|
||||||
allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f))
|
allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f))
|
||||||
assert allowed is True
|
assert allowed is None
|
||||||
assert "agent-created" in reason
|
assert "Requires confirmation" in reason
|
||||||
|
|
||||||
def test_force_noop_for_agent_created_dangerous(self):
|
def test_force_overrides_dangerous_for_agent_created(self):
|
||||||
"""With agent-created dangerous mapped to 'allow', force becomes a
|
|
||||||
no-op — the allow branch returns first. Force still works for any
|
|
||||||
trust level that maps to block (community/trusted)."""
|
|
||||||
f = [Finding("x", "critical", "c", "f", 1, "m", "d")]
|
f = [Finding("x", "critical", "c", "f", 1, "m", "d")]
|
||||||
allowed, reason = should_allow_install(
|
allowed, reason = should_allow_install(
|
||||||
self._result("agent-created", "dangerous", f), force=True
|
self._result("agent-created", "dangerous", f), force=True
|
||||||
)
|
)
|
||||||
assert allowed is True
|
assert allowed is True
|
||||||
assert "agent-created" in reason
|
assert "Force-installed" in reason
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Import security scanner — agent-created skills get the same scrutiny as
|
# Import security scanner — external hub installs always get scanned;
|
||||||
# community hub installs.
|
# agent-created skills only get scanned when skills.guard_agent_created is on.
|
||||||
try:
|
try:
|
||||||
from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
|
from tools.skills_guard import scan_skill, should_allow_install, format_scan_report
|
||||||
_GUARD_AVAILABLE = True
|
_GUARD_AVAILABLE = True
|
||||||
|
|
@ -53,10 +53,31 @@ except ImportError:
|
||||||
_GUARD_AVAILABLE = False
|
_GUARD_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
def _guard_agent_created_enabled() -> bool:
|
||||||
|
"""Read skills.guard_agent_created from config (default False).
|
||||||
|
|
||||||
|
Off by default because the agent can already execute the same code
|
||||||
|
paths via terminal() with no gate, so the scan adds friction without
|
||||||
|
meaningful security. Users who want belt-and-suspenders can turn it
|
||||||
|
on via `hermes config set skills.guard_agent_created true`.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
cfg = load_config()
|
||||||
|
return bool(cfg.get("skills", {}).get("guard_agent_created", False))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _security_scan_skill(skill_dir: Path) -> Optional[str]:
|
def _security_scan_skill(skill_dir: Path) -> Optional[str]:
|
||||||
"""Scan a skill directory after write. Returns error string if blocked, else None."""
|
"""Scan a skill directory after write. Returns error string if blocked, else None.
|
||||||
|
|
||||||
|
No-op when skills.guard_agent_created is disabled (the default).
|
||||||
|
"""
|
||||||
if not _GUARD_AVAILABLE:
|
if not _GUARD_AVAILABLE:
|
||||||
return None
|
return None
|
||||||
|
if not _guard_agent_created_enabled():
|
||||||
|
return None
|
||||||
try:
|
try:
|
||||||
result = scan_skill(skill_dir, source="agent-created")
|
result = scan_skill(skill_dir, source="agent-created")
|
||||||
allowed, reason = should_allow_install(result)
|
allowed, reason = should_allow_install(result)
|
||||||
|
|
@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]:
|
||||||
return f"Security scan blocked this skill ({reason}):\n{report}"
|
return f"Security scan blocked this skill ({reason}):\n{report}"
|
||||||
if allowed is None:
|
if allowed is None:
|
||||||
# "ask" verdict — for agent-created skills this means dangerous
|
# "ask" verdict — for agent-created skills this means dangerous
|
||||||
# findings were detected. Block the skill and include the report.
|
# findings were detected. Surface as an error so the agent can
|
||||||
|
# retry with the flagged content removed.
|
||||||
report = format_scan_report(result)
|
report = format_scan_report(result)
|
||||||
logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
|
logger.warning("Agent-created skill blocked (dangerous findings): %s", reason)
|
||||||
return f"Security scan blocked this skill ({reason}):\n{report}"
|
return f"Security scan blocked this skill ({reason}):\n{report}"
|
||||||
|
|
|
||||||
|
|
@ -43,11 +43,11 @@ INSTALL_POLICY = {
|
||||||
"builtin": ("allow", "allow", "allow"),
|
"builtin": ("allow", "allow", "allow"),
|
||||||
"trusted": ("allow", "allow", "block"),
|
"trusted": ("allow", "allow", "block"),
|
||||||
"community": ("allow", "block", "block"),
|
"community": ("allow", "block", "block"),
|
||||||
# Agent-created skills run in the same process as the agent that
|
# Agent-created: "ask" on dangerous surfaces as an error to the agent,
|
||||||
# wrote them — the agent could already execute the same code via
|
# which can retry without the flagged content. This gate only runs when
|
||||||
# terminal(), so a dangerous-pattern gate on skill_manage adds
|
# skills.guard_agent_created is enabled (off by default) — see
|
||||||
# friction without meaningful security. Allow all verdicts.
|
# tools/skill_manager_tool.py::_guard_agent_created_enabled.
|
||||||
"agent-created": ("allow", "allow", "allow"),
|
"agent-created": ("allow", "allow", "ask"),
|
||||||
}
|
}
|
||||||
|
|
||||||
VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}
|
VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue