diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 6d4c49fd4..fd19adae9 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -760,6 +760,17 @@ DEFAULT_CONFIG = { "inline_shell": False, # Timeout (seconds) for each !`cmd` snippet when inline_shell is on. "inline_shell_timeout": 10, + # Run the keyword/pattern security scanner on skills the agent + # writes via skill_manage (create/edit/patch). Off by default + # because the agent can already execute the same code paths via + # terminal() with no gate, so the scan adds friction (blocks + # skills that mention risky keywords in prose) without meaningful + # security. Turn on if you want the belt-and-suspenders — a + # dangerous verdict will then surface as a tool error to the + # agent, which can retry with the flagged content removed. + # External hub installs (trusted/community sources) are always + # scanned regardless of this setting. + "guard_agent_created": False, }, # Honcho AI-native memory -- reads ~/.honcho/config.json as single source of truth. diff --git a/tests/tools/test_skill_manager_tool.py b/tests/tools/test_skill_manager_tool.py index dd0ae17f8..9918a826c 100644 --- a/tests/tools/test_skill_manager_tool.py +++ b/tests/tools/test_skill_manager_tool.py @@ -484,3 +484,85 @@ class TestSkillManageDispatcher: raw = skill_manage(action="create", name="test-skill", content=VALID_SKILL_CONTENT) result = json.loads(raw) assert result["success"] is True + + +class TestSecurityScanGate: + """_security_scan_skill is gated by skills.guard_agent_created config flag.""" + + def test_scan_noop_when_flag_off(self, tmp_path): + """Default config (flag off) short-circuits before running scan_skill.""" + from tools.skill_manager_tool import _security_scan_skill + + with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=False), \ + patch("tools.skill_manager_tool.scan_skill") as mock_scan: + result = _security_scan_skill(tmp_path) + + assert result is None + mock_scan.assert_not_called() # scan never ran + + def test_scan_runs_when_flag_on(self, tmp_path): + """When flag is on, scan_skill is invoked and its verdict is honored.""" + from tools.skill_manager_tool import _security_scan_skill + from tools.skills_guard import ScanResult + + # Fake a safe scan result — caller should return None (allow) + fake_result = ScanResult( + skill_name="test", + source="agent-created", + trust_level="agent-created", + verdict="safe", + findings=[], + summary="ok", + ) + with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \ + patch("tools.skill_manager_tool.scan_skill", return_value=fake_result) as mock_scan: + result = _security_scan_skill(tmp_path) + + assert result is None + mock_scan.assert_called_once() + + def test_scan_blocks_dangerous_when_flag_on(self, tmp_path): + """Dangerous verdict + flag on → returns an error string for the agent.""" + from tools.skill_manager_tool import _security_scan_skill + from tools.skills_guard import ScanResult, Finding + + finding = Finding( + pattern_id="test", severity="critical", category="exfiltration", + file="SKILL.md", line=1, match="curl $TOKEN", description="test", + ) + fake_result = ScanResult( + skill_name="test", + source="agent-created", + trust_level="agent-created", + verdict="dangerous", + findings=[finding], + summary="dangerous", + ) + with patch("tools.skill_manager_tool._guard_agent_created_enabled", return_value=True), \ + patch("tools.skill_manager_tool.scan_skill", return_value=fake_result): + result = _security_scan_skill(tmp_path) + + assert result is not None + assert "Security scan blocked" in result + + def test_guard_flag_reads_config_default_false(self): + """_guard_agent_created_enabled returns False when config doesn't set it.""" + from tools.skill_manager_tool import _guard_agent_created_enabled + + with patch("hermes_cli.config.load_config", return_value={"skills": {}}): + assert _guard_agent_created_enabled() is False + + def test_guard_flag_reads_config_when_set(self): + """_guard_agent_created_enabled returns True when user explicitly enables.""" + from tools.skill_manager_tool import _guard_agent_created_enabled + + with patch("hermes_cli.config.load_config", + return_value={"skills": {"guard_agent_created": True}}): + assert _guard_agent_created_enabled() is True + + def test_guard_flag_handles_config_error(self): + """If load_config raises, _guard_agent_created_enabled defaults to False (fail-safe off).""" + from tools.skill_manager_tool import _guard_agent_created_enabled + + with patch("hermes_cli.config.load_config", side_effect=RuntimeError("boom")): + assert _guard_agent_created_enabled() is False diff --git a/tests/tools/test_skills_guard.py b/tests/tools/test_skills_guard.py index 12c527ca7..ccc55da20 100644 --- a/tests/tools/test_skills_guard.py +++ b/tests/tools/test_skills_guard.py @@ -174,27 +174,24 @@ class TestShouldAllowInstall: assert allowed is True assert "agent-created" in reason - def test_dangerous_agent_created_allowed(self): - """Agent-created skills bypass verdict gating — agent can already - execute the same code via terminal(), so skill_manage allows all - verdicts. This prevents friction when the agent writes skills that - mention risky keywords in prose (e.g. describing cache-busting or - persistence semantics in a PR-review skill).""" + def test_dangerous_agent_created_asks(self): + """Agent-created skills with dangerous verdict return None (ask for confirmation) + when the scan runs. The caller (_security_scan_skill) surfaces this as an error + to the agent, who can retry without the flagged content. + + This gate only runs when skills.guard_agent_created is enabled (off by default).""" f = [Finding("env_exfil_curl", "critical", "exfiltration", "SKILL.md", 1, "curl $TOKEN", "exfiltration")] allowed, reason = should_allow_install(self._result("agent-created", "dangerous", f)) - assert allowed is True - assert "agent-created" in reason + assert allowed is None + assert "Requires confirmation" in reason - def test_force_noop_for_agent_created_dangerous(self): - """With agent-created dangerous mapped to 'allow', force becomes a - no-op — the allow branch returns first. Force still works for any - trust level that maps to block (community/trusted).""" + def test_force_overrides_dangerous_for_agent_created(self): f = [Finding("x", "critical", "c", "f", 1, "m", "d")] allowed, reason = should_allow_install( self._result("agent-created", "dangerous", f), force=True ) assert allowed is True - assert "agent-created" in reason + assert "Force-installed" in reason # --------------------------------------------------------------------------- diff --git a/tools/skill_manager_tool.py b/tools/skill_manager_tool.py index 493b434c5..c28f421a7 100644 --- a/tools/skill_manager_tool.py +++ b/tools/skill_manager_tool.py @@ -44,8 +44,8 @@ from typing import Dict, Any, Optional, Tuple logger = logging.getLogger(__name__) -# Import security scanner — agent-created skills get the same scrutiny as -# community hub installs. +# Import security scanner — external hub installs always get scanned; +# agent-created skills only get scanned when skills.guard_agent_created is on. try: from tools.skills_guard import scan_skill, should_allow_install, format_scan_report _GUARD_AVAILABLE = True @@ -53,10 +53,31 @@ except ImportError: _GUARD_AVAILABLE = False +def _guard_agent_created_enabled() -> bool: + """Read skills.guard_agent_created from config (default False). + + Off by default because the agent can already execute the same code + paths via terminal() with no gate, so the scan adds friction without + meaningful security. Users who want belt-and-suspenders can turn it + on via `hermes config set skills.guard_agent_created true`. + """ + try: + from hermes_cli.config import load_config + cfg = load_config() + return bool(cfg.get("skills", {}).get("guard_agent_created", False)) + except Exception: + return False + + def _security_scan_skill(skill_dir: Path) -> Optional[str]: - """Scan a skill directory after write. Returns error string if blocked, else None.""" + """Scan a skill directory after write. Returns error string if blocked, else None. + + No-op when skills.guard_agent_created is disabled (the default). + """ if not _GUARD_AVAILABLE: return None + if not _guard_agent_created_enabled(): + return None try: result = scan_skill(skill_dir, source="agent-created") allowed, reason = should_allow_install(result) @@ -65,7 +86,8 @@ def _security_scan_skill(skill_dir: Path) -> Optional[str]: return f"Security scan blocked this skill ({reason}):\n{report}" if allowed is None: # "ask" verdict — for agent-created skills this means dangerous - # findings were detected. Block the skill and include the report. + # findings were detected. Surface as an error so the agent can + # retry with the flagged content removed. report = format_scan_report(result) logger.warning("Agent-created skill blocked (dangerous findings): %s", reason) return f"Security scan blocked this skill ({reason}):\n{report}" diff --git a/tools/skills_guard.py b/tools/skills_guard.py index fadbb8173..ffb965b52 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -43,11 +43,11 @@ INSTALL_POLICY = { "builtin": ("allow", "allow", "allow"), "trusted": ("allow", "allow", "block"), "community": ("allow", "block", "block"), - # Agent-created skills run in the same process as the agent that - # wrote them — the agent could already execute the same code via - # terminal(), so a dangerous-pattern gate on skill_manage adds - # friction without meaningful security. Allow all verdicts. - "agent-created": ("allow", "allow", "allow"), + # Agent-created: "ask" on dangerous surfaces as an error to the agent, + # which can retry without the flagged content. This gate only runs when + # skills.guard_agent_created is enabled (off by default) — see + # tools/skill_manager_tool.py::_guard_agent_created_enabled. + "agent-created": ("allow", "allow", "ask"), } VERDICT_INDEX = {"safe": 0, "caution": 1, "dangerous": 2}