mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
Three independent security-scanner hardenings, re-homed onto the current
shared threat-pattern architecture (tools/threat_patterns.py):
- approval.py: add bash/sh/zsh/ksh heredoc to DANGEROUS_PATTERNS. The
existing heredoc pattern only covered python/perl/ruby/node, so
`bash <<'EOF' ... EOF` ran arbitrary shell — including exfil pipelines
whose inner commands don't individually match a pattern — with no prompt.
- threat_patterns.py: apply unicodedata.normalize("NFKC", ...) before
pattern matching so full-width / compatibility homographs (e.g.
`ｃａｔ ~/.hermes/.env`, typed with full-width letters) are folded to ASCII and no longer bypass the
keyword scanners. Invisible-char detection still runs on the raw content
first (NFKC can strip those codepoints).
- code_execution_tool.py: add CREDS/BEARER/APIKEY to _SECRET_SUBSTRINGS so
vars like HERMES_LLM_CREDS, API_BEARER, MY_APIKEY are scrubbed from the
sandbox env. PASS was intentionally dropped from the original proposal —
it false-positives on BYPASS_CACHE / COMPASS_DIR / PASSENGER_HOST while
PASSWORD/PASSWD already cover the credential cases.
The original PR also proposed a 'synonym' injection pattern block
(overlook/forget/set aside/bypass/discard + developer-mode); dropped here
because it false-positives on ordinary AGENTS.md/SOUL.md prose ("don't
forget to follow the rules", "run in developer mode"), exactly the
bossy-English class threat_patterns.py is documented to avoid.
Salvaged from #9028.
Co-authored-by: Hermes Agent <agent@nousresearch.com>
268 lines
13 KiB
Python
268 lines
13 KiB
Python
"""Shared threat-pattern library for context window security scanning.

This module is the single source of truth for prompt-injection / promptware /
exfiltration patterns used across the context-assembly scanners
(``agent/prompt_builder.py``, ``tools/memory_tool.py``) and the tool-result
delimiter system in ``agent/tool_dispatch_helpers.py``.

Pattern philosophy
------------------
Patterns are organized by ATTACK CLASS, not by source file.  Each pattern
is a ``(regex, pattern_id, scope)`` tuple, where ``scope`` controls which
scanners use it:

- ``"all"`` — applied everywhere (classic prompt injection, exfiltration)
- ``"context"`` — applied to context files + memory + tool results
  (promptware / C2 / behavioral hijack; broader detection)
- ``"strict"`` — applied to memory writes + skill installs only
  (aggressive checks acceptable for user-curated content but too noisy
  for tool results)

The split exists because tool results contain web pages, GitHub issues,
and MCP responses — content the user did not author — and we want broad
detection there, but blocking is reserved for paths where the user can
intervene (memory writes, skill installs).

Pattern anchoring
-----------------
New patterns anchor on **C2-specific vocabulary or unambiguous attack
behavior**, NOT on bossy English.  Phrases like "you are obligated to"
or "you must" alone are too common in legitimate instruction-writing
(see AGENTS.md, CLAUDE.md, etc.) to flag.  See the pattern comments for
the rationale on borderline cases.

Multi-word bypass
-----------------
Patterns use ``(?:\\w+\\s+)*`` between key tokens to prevent attackers
from inserting filler words (e.g. "ignore all prior instructions" instead
of "ignore all instructions").  This mirrors the fix applied to
``skills_guard.py`` in commit 4ea29978.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import unicodedata
|
||
from typing import List, Optional, Tuple
|
||
|
||
# Each entry: (regex, pattern_id, scope)
|
||
# scope ∈ {"all", "context", "strict"}
|
||
_PATTERNS: List[Tuple[str, str, str]] = [
|
||
# ── Classic prompt injection (applies everywhere) ────────────────
|
||
(r'ignore\s+(?:\w+\s+)*(previous|all|above|prior)\s+(?:\w+\s+)*instructions', "prompt_injection", "all"),
|
||
(r'system\s+prompt\s+override', "sys_prompt_override", "all"),
|
||
(r'disregard\s+(?:\w+\s+)*(your|all|any)\s+(?:\w+\s+)*(instructions|rules|guidelines)', "disregard_rules", "all"),
|
||
(r'act\s+as\s+(if|though)\s+(?:\w+\s+)*you\s+(?:\w+\s+)*(have\s+no|don\'t\s+have)\s+(?:\w+\s+)*(restrictions|limits|rules)', "bypass_restrictions", "all"),
|
||
(r'<!--[^>]*(?:ignore|override|system|secret|hidden)[^>]*-->', "html_comment_injection", "all"),
|
||
(r'<\s*div\s+style\s*=\s*["\'][\s\S]*?display\s*:\s*none', "hidden_div", "all"),
|
||
(r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)', "translate_execute", "all"),
|
||
(r'do\s+not\s+(?:\w+\s+)*tell\s+(?:\w+\s+)*the\s+user', "deception_hide", "all"),
|
||
|
||
# ── Role-play / identity hijack (context + strict; common attack
|
||
# surface in scraped web content and poisoned context files) ──
|
||
(r'you\s+are\s+(?:\w+\s+)*now\s+(?:a|an|the)\s+', "role_hijack", "context"),
|
||
(r'pretend\s+(?:\w+\s+)*(you\s+are|to\s+be)\s+', "role_pretend", "context"),
|
||
(r'output\s+(?:\w+\s+)*(system|initial)\s+prompt', "leak_system_prompt", "context"),
|
||
(r'(respond|answer|reply)\s+without\s+(?:\w+\s+)*(restrictions|limitations|filters|safety)', "remove_filters", "context"),
|
||
(r'you\s+have\s+been\s+(?:\w+\s+)*(updated|upgraded|patched)\s+to', "fake_update", "context"),
|
||
# "name yourself X" is a Brainworm-specific tell — identity override
|
||
# via spec instead of jailbreak. Anchored on the verb pair so it
|
||
# doesn't match "name your variables" etc.
|
||
(r'\bname\s+yourself\s+\w+', "identity_override", "context"),
|
||
|
||
# ── C2 / Brainworm-style promptware (context scope) ──────────────
|
||
# These anchor on C2-specific vocabulary. "register as a node" appears
|
||
# in legitimate distributed-systems docs, but in combination with the
|
||
# other patterns the signal is strong; we WARN, not block, so a security
|
||
# researcher reading the Brainworm post in a webpage doesn't break their
|
||
# session.
|
||
(r'register\s+(as\s+)?a?\s*node', "c2_node_registration", "context"),
|
||
(r'(heartbeat|beacon|check[\s\-]?in)\s+(to|with)\s+', "c2_heartbeat", "context"),
|
||
(r'pull\s+(down\s+)?(?:new\s+)?task(?:ing|s)?\b', "c2_task_pull", "context"),
|
||
(r'connect\s+to\s+the\s+network\b', "c2_network_connect", "context"),
|
||
# Verb-anchored "you must register/connect/report/beacon" — the verbs
|
||
# are C2-specific so this avoids the broader "you must X" false positive.
|
||
(r'you\s+must\s+(?:\w+\s+){0,3}(register|connect|report|beacon)\b', "forced_action", "context"),
|
||
# Anti-forensic instructions ("never write to disk", "one-liners only")
|
||
# — extremely unusual in legitimate content; near-zero false positive.
|
||
(r'only\s+use\s+one[\s\-]?liners?\b', "anti_forensic_oneliner", "context"),
|
||
(r'never\s+(?:\w+\s+)*(?:create|write)\s+(?:\w+\s+)*(?:script|file)\s+(?:\w+\s+)*disk', "anti_forensic_disk", "context"),
|
||
# Environment-variable unsetting targeting known agent runtimes —
|
||
# this is pure attack behavior (Brainworm sub-session bypass).
|
||
(r'unset\s+\w*(?:CLAUDE|CODEX|HERMES|AGENT|OPENAI|ANTHROPIC)\w*', "env_var_unset_agent", "context"),
|
||
|
||
# ── Known C2 / red-team framework names (near-zero false positive
|
||
# outside security research; warn-only by default) ─────────────
|
||
# NOTE: do not add common English words here. Every token must be a
|
||
# distinctive offensive-security tool brand, otherwise legitimate
|
||
# AGENTS.md / SOUL.md content false-positives and the whole file is
|
||
# blocked. "praxis" was removed for exactly this reason — it's a common
|
||
# word and a legitimate agent name (Greek for practice/action), not a
|
||
# C2-specific tell like the brands below.
|
||
(r'\b(?:cobalt\s*strike|sliver|havoc|mythic|metasploit|brainworm)\b', "known_c2_framework", "context"),
|
||
(r'\bc2\s+(?:server|channel|infrastructure|beacon)\b', "c2_explicit", "context"),
|
||
(r'\bcommand\s+and\s+control\b', "c2_explicit_long", "context"),
|
||
|
||
# ── Exfiltration via curl/wget/cat with secrets (applies everywhere) ──
|
||
(r'curl\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_curl", "all"),
|
||
(r'wget\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_wget", "all"),
|
||
(r'cat\s+[^\n]*(\.env|credentials|\.netrc|\.pgpass|\.npmrc|\.pypirc)', "read_secrets", "all"),
|
||
(r'(send|post|upload|transmit)\s+.*\s+(to|at)\s+https?://', "send_to_url", "strict"),
|
||
(r'(include|output|print|share)\s+(?:\w+\s+)*(conversation|chat\s+history|previous\s+messages|full\s+context|entire\s+context)', "context_exfil", "strict"),
|
||
|
||
# ── Persistence / SSH backdoor (strict scope — memory + skills) ──
|
||
(r'authorized_keys', "ssh_backdoor", "strict"),
|
||
(r'\$HOME/\.ssh|\~/\.ssh', "ssh_access", "strict"),
|
||
(r'\$HOME/\.hermes/\.env|\~/\.hermes/\.env', "hermes_env", "strict"),
|
||
(r'(update|modify|edit|write|change|append|add\s+to)\s+.*(?:AGENTS\.md|CLAUDE\.md|\.cursorrules|\.clinerules)', "agent_config_mod", "strict"),
|
||
(r'(update|modify|edit|write|change|append|add\s+to)\s+.*\.hermes/(config\.yaml|SOUL\.md)', "hermes_config_mod", "strict"),
|
||
|
||
# ── Hardcoded secrets ────────────────────────────────────────────
|
||
(r'(?:api[_-]?key|token|secret|password)\s*[=:]\s*["\'][A-Za-z0-9+/=_-]{20,}', "hardcoded_secret", "strict"),
|
||
]
|
||
|
||
# Invisible / bidirectional unicode characters used in injection attacks.
|
||
# Aligned with skills_guard.py INVISIBLE_CHARS — directional isolates
|
||
# (U+2066-U+2069) and invisible math operators (U+2062-U+2064) are real
|
||
# attack tools.
|
||
INVISIBLE_CHARS = frozenset({
|
||
'\u200b', # zero-width space
|
||
'\u200c', # zero-width non-joiner
|
||
'\u200d', # zero-width joiner
|
||
'\u2060', # word joiner
|
||
'\u2062', # invisible times
|
||
'\u2063', # invisible separator
|
||
'\u2064', # invisible plus
|
||
'\ufeff', # zero-width no-break space (BOM)
|
||
'\u202a', # left-to-right embedding
|
||
'\u202b', # right-to-left embedding
|
||
'\u202c', # pop directional formatting
|
||
'\u202d', # left-to-right override
|
||
'\u202e', # right-to-left override
|
||
'\u2066', # left-to-right isolate
|
||
'\u2067', # right-to-left isolate
|
||
'\u2068', # first strong isolate
|
||
'\u2069', # pop directional isolate
|
||
})
|
||
|
||
|
||
# Compiled pattern sets, indexed by scope. Compiled once at import time;
|
||
# scan_for_threats() looks them up.
|
||
_COMPILED: dict[str, List[Tuple[re.Pattern, str]]] = {}
|
||
|
||
|
||
def _compile() -> None:
|
||
"""Compile pattern sets for each scope (all / context / strict).
|
||
|
||
A pattern with scope="all" lands in every set. A pattern with
|
||
scope="context" lands in context + strict (context implies the
|
||
strict scanners want it too). Scope="strict" lands in strict only.
|
||
"""
|
||
global _COMPILED
|
||
if _COMPILED:
|
||
return
|
||
|
||
all_patterns: List[Tuple[re.Pattern, str]] = []
|
||
context_patterns: List[Tuple[re.Pattern, str]] = []
|
||
strict_patterns: List[Tuple[re.Pattern, str]] = []
|
||
|
||
for pattern, pid, scope in _PATTERNS:
|
||
compiled = re.compile(pattern, re.IGNORECASE)
|
||
entry = (compiled, pid)
|
||
if scope == "all":
|
||
all_patterns.append(entry)
|
||
context_patterns.append(entry)
|
||
strict_patterns.append(entry)
|
||
elif scope == "context":
|
||
context_patterns.append(entry)
|
||
strict_patterns.append(entry)
|
||
elif scope == "strict":
|
||
strict_patterns.append(entry)
|
||
else:
|
||
raise ValueError(f"threat_patterns: unknown scope {scope!r} for pattern {pid!r}")
|
||
|
||
_COMPILED = {
|
||
"all": all_patterns,
|
||
"context": context_patterns,
|
||
"strict": strict_patterns,
|
||
}
|
||
|
||
|
||
# Build the per-scope compiled pattern sets once, when this module is imported.
_compile()
|
||
|
||
|
||
def scan_for_threats(content: str, scope: str = "context") -> List[str]:
    """Return a list of matched pattern IDs in ``content`` at the given scope.

    ``scope`` selects which pattern set to apply:

    - ``"all"`` (narrow): classic injection + exfil only — minimal false
      positives, suitable for any text.
    - ``"context"`` (default): adds promptware / C2 / role-play patterns —
      suitable for context files, memory entries, and tool results.
    - ``"strict"`` (broad): adds persistence / SSH backdoor / exfil-URL
      patterns — appropriate for user-mediated writes (memory tool,
      skills install) where false positives can be resolved interactively.

    Invisible unicode characters are reported as well, as
    ``"invisible_unicode_U+XXXX"``, so the caller can surface the offending
    codepoint in a log line.

    Args:
        content: Text to scan.  Empty or falsy content yields ``[]``
            without consulting ``scope``.
        scope: One of ``"all"``, ``"context"`` or ``"strict"``.

    Returns:
        Invisible-character findings first, in ascending codepoint order,
        followed by the IDs of matching patterns in the order they are
        stored for the scope.

    Raises:
        ValueError: If ``scope`` is not a known scope (non-empty content only).
    """
    if not content:
        return []

    # Invisible unicode: a single set intersection over the RAW content
    # rather than one ``in`` lookup per codepoint.  This runs before NFKC
    # normalisation so detection never depends on how normalisation treats
    # these characters.
    # The hits are sorted because iteration order of a set of str is not
    # stable across processes (string hashing is randomised); without the
    # sort, the findings order -- and therefore the "first" finding that
    # blocking callers report -- could differ from run to run.
    findings: List[str] = [
        f"invisible_unicode_U+{ord(ch):04X}"
        for ch in sorted(set(content) & INVISIBLE_CHARS)
    ]

    # Normalise to NFKC so full-width / compatibility variants (for example
    # full-width ``ｃａｔ`` -> ``cat``, ``Ａ`` -> ``A``) are folded to their
    # ASCII counterparts before the regex engine sees them.  This stops
    # homograph substitution from bypassing keyword checks such as a
    # full-width ``ｃａｔ ~/.hermes/.env``.
    # NOTE: this does NOT defend against cross-script confusables (Cyrillic
    # ``а`` U+0430), which NFKC leaves untouched; that needs a TR#39
    # confusable database.
    normalised = unicodedata.normalize("NFKC", content)

    patterns = _COMPILED.get(scope)
    if patterns is None:
        raise ValueError(f"scan_for_threats: unknown scope {scope!r}")

    findings.extend(pid for compiled, pid in patterns if compiled.search(normalised))
    return findings
|
||
|
||
|
||
def first_threat_message(content: str, scope: str = "strict") -> Optional[str]:
    """Describe the first threat found in ``content``, or return ``None``.

    Thin wrapper over ``scan_for_threats`` for callers that block on the
    first hit (the original notes name memory tool writes and skills
    install) and only need a yes/no answer plus a message.

    Args:
        content: Text to scan.
        scope: Pattern scope forwarded to ``scan_for_threats``.

    Returns:
        ``None`` when nothing matched; otherwise a "Blocked: ..." message
        built from the first finding.  Invisible-unicode findings name the
        codepoint; all other findings name the pattern ID.
    """
    hits = scan_for_threats(content, scope=scope)
    if not hits:
        return None

    first_id = hits[0]
    unicode_prefix = "invisible_unicode_"
    if not first_id.startswith(unicode_prefix):
        return (
            f"Blocked: content matches threat pattern '{first_id}'. "
            "Content is injected into the system prompt and must not contain "
            "injection or exfiltration payloads."
        )

    # What is left after dropping the prefix is the "U+XXXX" codepoint label.
    codepoint = first_id.replace(unicode_prefix, "")
    return f"Blocked: content contains invisible unicode character {codepoint} (possible injection)."
|
||
|
||
|
||
# Names exported by ``from <this module> import *``.
__all__ = ["INVISIBLE_CHARS", "scan_for_threats", "first_threat_message"]
|