mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(security): expand memory content scanning patterns to parity with skills guard (#9151)
Expand _MEMORY_THREAT_PATTERNS from 13 to 24 regex patterns and align _INVISIBLE_CHARS with skills_guard.py (10 → 17 characters).

Key changes:
- Add multi-word bypass prevention (?:\w+\s+)* to injection patterns
- Add missing injection patterns: role_pretend, leak_system_prompt, remove_filters, fake_update, translate_execute, html_comment_injection, hidden_div
- Add exfiltration patterns: send_to_url, context_exfil
- Add persistence patterns: agent_config_mod, hermes_config_mod (both require modification-verb prefix to avoid false positives on mere mentions of config filenames)
- Add hardcoded secret detection pattern
- Add role_hijack precision fix: require article after "now" to avoid blocking "you are now ready/connected/set up" etc.
- Expand invisible unicode set with directional isolates (U+2066-2069) and invisible math operators (U+2062-2064)

Test coverage expanded from ~8 to ~30 scan tests including dedicated false-positive regression tests for all precision-sensitive patterns.

Known limitations (deferred to follow-up PRs):
- prompt_builder.py and cronjob_tools.py still use older pattern sets
- No semantic/LLM-based scanning (regex-only approach)
- No cross-entry or cross-store analysis
This commit is contained in:
parent
7ebebfbb8d
commit
00bd24e27c
2 changed files with 230 additions and 17 deletions
|
|
@@ -65,28 +65,72 @@ ENTRY_DELIMITER = "\n§\n"
|
|||
# in content that gets injected into the system prompt.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Threat patterns for memory content scanning.
|
||||
# These patterns are aligned with skills_guard.py THREAT_PATTERNS but
|
||||
# simplified to (regex, pattern_id) tuples — memory entries are short-form
|
||||
# text, not multi-file skill bundles, so structural/extraction checks are
|
||||
# not needed here.
|
||||
#
|
||||
# Multi-word bypass: patterns use (?:\w+\s+)* between key tokens to prevent
|
||||
# attackers from inserting filler words (e.g. "ignore all prior instructions"
|
||||
# instead of "ignore all instructions"). This mirrors the fix applied to
|
||||
# skills_guard.py in commit 4ea29978.
|
||||
_MEMORY_THREAT_PATTERNS = [
|
||||
# Prompt injection
|
||||
(r'ignore\s+(previous|all|above|prior)\s+instructions', "prompt_injection"),
|
||||
(r'you\s+are\s+now\s+', "role_hijack"),
|
||||
(r'do\s+not\s+tell\s+the\s+user', "deception_hide"),
|
||||
# ── Prompt injection ──
|
||||
(r'ignore\s+(?:\w+\s+)*(previous|all|above|prior)\s+(?:\w+\s+)*instructions', "prompt_injection"),
|
||||
(r'you\s+are\s+(?:\w+\s+)*now\s+(?:a|an|the)\s+', "role_hijack"),
|
||||
(r'do\s+not\s+(?:\w+\s+)*tell\s+(?:\w+\s+)*the\s+user', "deception_hide"),
|
||||
(r'system\s+prompt\s+override', "sys_prompt_override"),
|
||||
(r'disregard\s+(your|all|any)\s+(instructions|rules|guidelines)', "disregard_rules"),
|
||||
(r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)', "bypass_restrictions"),
|
||||
# Exfiltration via curl/wget with secrets
|
||||
(r'disregard\s+(?:\w+\s+)*(your|all|any)\s+(?:\w+\s+)*(instructions|rules|guidelines)', "disregard_rules"),
|
||||
(r'act\s+as\s+(if|though)\s+(?:\w+\s+)*you\s+(?:\w+\s+)*(have\s+no|don\'t\s+have)\s+(?:\w+\s+)*(restrictions|limits|rules)', "bypass_restrictions"),
|
||||
(r'pretend\s+(?:\w+\s+)*(you\s+are|to\s+be)\s+', "role_pretend"),
|
||||
(r'output\s+(?:\w+\s+)*(system|initial)\s+prompt', "leak_system_prompt"),
|
||||
(r'(respond|answer|reply)\s+without\s+(?:\w+\s+)*(restrictions|limitations|filters|safety)', "remove_filters"),
|
||||
(r'you\s+have\s+been\s+(?:\w+\s+)*(updated|upgraded|patched)\s+to', "fake_update"),
|
||||
(r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)', "translate_execute"),
|
||||
(r'<!--[^>]*(?:ignore|override|system|secret|hidden)[^>]*-->', "html_comment_injection"),
|
||||
(r'<\s*div\s+style\s*=\s*["\'][\s\S]*?display\s*:\s*none', "hidden_div"),
|
||||
|
||||
# ── Exfiltration via curl/wget/fetch with secrets ──
|
||||
(r'curl\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_curl"),
|
||||
(r'wget\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_wget"),
|
||||
(r'cat\s+[^\n]*(\.env|credentials|\.netrc|\.pgpass|\.npmrc|\.pypirc)', "read_secrets"),
|
||||
# Persistence via shell rc
|
||||
(r'(send|post|upload|transmit)\s+.*\s+(to|at)\s+https?://', "send_to_url"),
|
||||
(r'(include|output|print|share)\s+(?:\w+\s+)*(conversation|chat\s+history|previous\s+messages|full\s+context|entire\s+context)', "context_exfil"),
|
||||
|
||||
# ── Persistence / SSH backdoor ──
|
||||
(r'authorized_keys', "ssh_backdoor"),
|
||||
(r'\$HOME/\.ssh|\~/\.ssh', "ssh_access"),
|
||||
(r'\$HOME/\.hermes/\.env|\~/\.hermes/\.env', "hermes_env"),
|
||||
(r'(update|modify|edit|write|change|append|add\s+to)\s+.*(?:AGENTS\.md|CLAUDE\.md|\.cursorrules|\.clinerules)', "agent_config_mod"),
|
||||
(r'(update|modify|edit|write|change|append|add\s+to)\s+.*\.hermes/(config\.yaml|SOUL\.md)', "hermes_config_mod"),
|
||||
|
||||
# ── Hardcoded secrets ──
|
||||
(r'(?:api[_-]?key|token|secret|password)\s*[=:]\s*["\'][A-Za-z0-9+/=_-]{20,}', "hardcoded_secret"),
|
||||
]
|
||||
|
||||
# Subset of invisible chars for injection detection
|
||||
# Invisible unicode characters for injection detection.
|
||||
# Full set aligned with skills_guard.py INVISIBLE_CHARS — includes
|
||||
# directional isolates (U+2066-U+2069) and invisible math operators
|
||||
# (U+2062-U+2064) that were previously missing.
|
||||
_INVISIBLE_CHARS = {
|
||||
'\u200b', '\u200c', '\u200d', '\u2060', '\ufeff',
|
||||
'\u202a', '\u202b', '\u202c', '\u202d', '\u202e',
|
||||
'\u200b', # zero-width space
|
||||
'\u200c', # zero-width non-joiner
|
||||
'\u200d', # zero-width joiner
|
||||
'\u2060', # word joiner
|
||||
'\u2062', # invisible times
|
||||
'\u2063', # invisible separator
|
||||
'\u2064', # invisible plus
|
||||
'\ufeff', # zero-width no-break space (BOM)
|
||||
'\u202a', # left-to-right embedding
|
||||
'\u202b', # right-to-left embedding
|
||||
'\u202c', # pop directional formatting
|
||||
'\u202d', # left-to-right override
|
||||
'\u202e', # right-to-left override
|
||||
'\u2066', # left-to-right isolate
|
||||
'\u2067', # right-to-left isolate
|
||||
'\u2068', # first strong isolate
|
||||
'\u2069', # pop directional isolate
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue