Harden agent attack surface: scan writes to memory, skills, cron, and context files

The security scanner (skills_guard.py) was only wired into the hub install path. All other write paths to persistent state — skills created by the agent, memory entries, cron prompts, and context files — bypassed it entirely. This closes those gaps: - file_operations: deny-list blocks writes to ~/.ssh, ~/.aws, ~/.hermes/.env, etc. - code_execution_tool: filter secret env vars from sandbox child process - skill_manager_tool: wire scan_skill() into create/edit/patch/write_file with rollback - skills_guard: add "agent-created" trust level (same policy as community) - memory_tool: scan content for injection/exfil before system prompt injection - prompt_builder: scan AGENTS.md, .cursorrules, SOUL.md for prompt injection - cronjob_tools: scan cron prompts for critical threats before scheduling Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-25 00:51:20 +00:00 · 2026-02-25 23:43:15 -05:00 · 2026-02-25 23:43:15 -05:00 · 95b6bd5df6
commit 95b6bd5df6
parent 0310170869
7 changed files with 278 additions and 8 deletions
--- a/tools/memory_tool.py
+++ b/tools/memory_tool.py
@ -24,17 +24,66 @@ Design:
 """

 import json
+import logging
 import os
+import re
 import tempfile
 from pathlib import Path
 from typing import Dict, Any, List, Optional

+logger = logging.getLogger(__name__)
+
 # Where memory files live
 MEMORY_DIR = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "memories"

 ENTRY_DELIMITER = "\n§\n"


+# ---------------------------------------------------------------------------
+# Memory content scanning — lightweight check for injection/exfiltration
+# in content that gets injected into the system prompt.
+# ---------------------------------------------------------------------------
+
+_MEMORY_THREAT_PATTERNS = [
+    # Prompt injection
+    (r'ignore\s+(previous|all|above|prior)\s+instructions', "prompt_injection"),
+    (r'you\s+are\s+now\s+', "role_hijack"),
+    (r'do\s+not\s+tell\s+the\s+user', "deception_hide"),
+    (r'system\s+prompt\s+override', "sys_prompt_override"),
+    (r'disregard\s+(your|all|any)\s+(instructions|rules|guidelines)', "disregard_rules"),
+    (r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)', "bypass_restrictions"),
+    # Exfiltration via curl/wget with secrets
+    (r'curl\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_curl"),
+    (r'wget\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_wget"),
+    (r'cat\s+[^\n]*(\.env|credentials|\.netrc|\.pgpass|\.npmrc|\.pypirc)', "read_secrets"),
+    # Persistence via shell rc
+    (r'authorized_keys', "ssh_backdoor"),
+    (r'\$HOME/\.ssh|\~/\.ssh', "ssh_access"),
+    (r'\$HOME/\.hermes/\.env|\~/\.hermes/\.env', "hermes_env"),
+]
+
+# Subset of invisible chars for injection detection
+_INVISIBLE_CHARS = {
+    '\u200b', '\u200c', '\u200d', '\u2060', '\ufeff',
+    '\u202a', '\u202b', '\u202c', '\u202d', '\u202e',
+}
+
+
+def _scan_memory_content(content: str) -> Optional[str]:
+    """Scan memory content for injection/exfil patterns. Returns error string if blocked."""
+    # Check invisible unicode
+    for char in _INVISIBLE_CHARS:
+        if char in content:
+            return f"Blocked: content contains invisible unicode character U+{ord(char):04X} (possible injection)."
+
+    # Check threat patterns
+    for pattern, pid in _MEMORY_THREAT_PATTERNS:
+        if re.search(pattern, content, re.IGNORECASE):
+            return f"Blocked: content matches threat pattern '{pid}'. Memory entries are injected into the system prompt and must not contain injection or exfiltration payloads."
+
+    return None
+
+
 class MemoryStore:
    """
    Bounded curated memory with file persistence. One instance per AIAgent.
@ -108,6 +157,11 @@ class MemoryStore:
        if not content:
            return {"success": False, "error": "Content cannot be empty."}

+        # Scan for injection/exfiltration before accepting
+        scan_error = _scan_memory_content(content)
+        if scan_error:
+            return {"success": False, "error": scan_error}
+
        entries = self._entries_for(target)
        limit = self._char_limit(target)

@ -147,6 +201,11 @@ class MemoryStore:
        if not new_content:
            return {"success": False, "error": "new_content cannot be empty. Use 'remove' to delete entries."}

+        # Scan replacement content for injection/exfiltration
+        scan_error = _scan_memory_content(new_content)
+        if scan_error:
+            return {"success": False, "error": scan_error}
+
        entries = self._entries_for(target)
        matches = [(i, e) for i, e in enumerate(entries) if old_text in e]