mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(redact): non-reusable sentinel for prefix secrets in file reads (#35519)
When security.redact_secrets is on (default), read_file/search_files/cat applied redact_sensitive_text(code_file=True) to file content, which still ran prefix masking. An API key in config.yaml (ghp_..., sk-..., xai-..., etc.) came back as a head/tail mask like `ghp_S1...Pn2T` — a plausible-looking truncated key. When an agent read that and wrote it back to config, the masked value replaced the real credential, silently breaking auth (401). Production evidence: a config.yaml found containing the exact 13-char masked GitHub PAT. The two community PRs (#35529, #35534) fixed the corruption by NOT redacting prefixes for config reads — but that exposes the user's real keys to the agent context, model, and logs (a security regression). This takes the safer route: keep redacting, but for file content emit a NON-REUSABLE sentinel. - New `_mask_token_nonreusable`: prefix secrets -> `«redacted:ghp_…»` (vendor label preserved for debuggability; zero secret bytes; angle-bracket/ellipsis wrapper is syntactically invalid as a token so it can't be mistaken for or written back as a usable key). - New `redact_sensitive_text(file_read=True)` routes prefix matches through it (implies code_file=True). Default/log/display mode is UNCHANGED — `_mask_token` still keeps head/tail (fine for logs, never written back). - Wired the 3 file_tools.py call sites (read_file / search_files / cat) to file_read=True. Fixes both the corruption AND avoids the secret-exposure of the un-redact approach. 6 new tests (sentinel shape, no-leak, not-a-plausible-key, default mode unchanged, file_read implies code_file, sk- prefix); 88 redact tests pass; mutation-verified (reverting to the old mask fails the sentinel/leak tests). Co-authored-by: liuhao1024 <sunsky.lau@gmail.com> Co-authored-by: adammatski1972 <289282750+adammatski1972@users.noreply.github.com> Closes #35519. Supersedes #35529, #35534.
This commit is contained in:
parent
19cbbe304a
commit
de928bccde
3 changed files with 112 additions and 5 deletions
|
|
@ -402,7 +402,40 @@ def _redact_form_body(text: str) -> str:
|
|||
return _redact_query_string(text.strip())
|
||||
|
||||
|
||||
def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = False) -> str:
|
||||
def _mask_token_nonreusable(token: str) -> str:
|
||||
"""Redact a prefix-matched credential to a NON-REUSABLE sentinel.
|
||||
|
||||
Unlike :func:`_mask_token` (which keeps head/tail chars — fine for logs
|
||||
that are never fed back into a config), this emits a marker that:
|
||||
|
||||
* cannot be mistaken for a usable-but-truncated key, so an agent that
|
||||
reads it from a config file and writes it back does NOT corrupt the
|
||||
stored credential into a dead 13-char string (issue #35519); and
|
||||
* still does not leak the secret material (no head/tail chars).
|
||||
|
||||
The vendor prefix label is preserved for debuggability so the agent can
|
||||
still tell *which* credential is present (e.g. a GitHub PAT vs an OpenAI
|
||||
key) without seeing any of its bytes.
|
||||
"""
|
||||
if not token:
|
||||
return "«redacted-secret»"
|
||||
# Preserve only the recognizable vendor prefix label (e.g. "ghp_", "sk-"),
|
||||
# never any of the random secret body.
|
||||
label = ""
|
||||
for sub in _PREFIX_SUBSTRINGS:
|
||||
if token.startswith(sub):
|
||||
label = sub
|
||||
break
|
||||
return f"«redacted:{label}…»" if label else "«redacted-secret»"
|
||||
|
||||
|
||||
def redact_sensitive_text(
|
||||
text: str,
|
||||
*,
|
||||
force: bool = False,
|
||||
code_file: bool = False,
|
||||
file_read: bool = False,
|
||||
) -> str:
|
||||
"""Apply all redaction patterns to a block of text.
|
||||
|
||||
Safe to call on any string -- non-matching text passes through unchanged.
|
||||
|
|
@ -415,6 +448,17 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
|
|||
constants, "apiKey": "test" fixtures). Prefix patterns, auth headers,
|
||||
private keys, DB connstrings, JWTs, and URL secrets are still redacted.
|
||||
|
||||
Set file_read=True for file *content* returned to the agent (read_file /
|
||||
search_files / cat). Secrets are STILL redacted — they are never exposed —
|
||||
but prefix-matched credentials are replaced with a non-reusable sentinel
|
||||
(``«redacted:ghp_…»``) instead of a head/tail-preserving mask
|
||||
(``ghp_S1...Pn2T``). The old mask looked like a real-but-truncated key, so
|
||||
an agent reading it from config.yaml and writing it back silently corrupted
|
||||
the stored credential into a dead 13-char value → 401 (issue #35519). The
|
||||
sentinel is syntactically invalid as a token, so it can't be mistaken for a
|
||||
usable key or written back as one. Implies code_file=True (config/data
|
||||
files shouldn't trigger the source-code ENV/JSON false-positive paths).
|
||||
|
||||
Performance: each regex pattern is gated behind a cheap substring
|
||||
pre-check (e.g. ``"=" in text`` for ENV assignments, ``"://" in text``
|
||||
for URLs, ``"eyJ" in text`` for JWTs). On a typical hermes log line
|
||||
|
|
@ -433,9 +477,15 @@ def redact_sensitive_text(text: str, *, force: bool = False, code_file: bool = F
|
|||
if not (force or _REDACT_ENABLED):
|
||||
return text
|
||||
|
||||
# file_read content shouldn't hit the source-code ENV/JSON false-positive
|
||||
# paths either (it's config/data, not log lines).
|
||||
if file_read:
|
||||
code_file = True
|
||||
|
||||
# Known prefixes (sk-, ghp_, etc.) — gate on substring presence
|
||||
if _has_known_prefix_substring(text):
|
||||
text = _PREFIX_RE.sub(lambda m: _mask_token(m.group(1)), text)
|
||||
_prefix_sub = _mask_token_nonreusable if file_read else _mask_token
|
||||
text = _PREFIX_RE.sub(lambda m: _prefix_sub(m.group(1)), text)
|
||||
|
||||
# ENV assignments: OPENAI_API_KEY=*** (skip for code files — false positives)
|
||||
if not code_file:
|
||||
|
|
|
|||
|
|
@ -753,3 +753,60 @@ class TestTerminalOutputRedaction:
|
|||
out = "CUSTOM_TOKEN=zzzopaque1234567890abcdef"
|
||||
red = redact_terminal_output(out, "printenv")
|
||||
assert "zzzopaque1234567890abcdef" in red
|
||||
|
||||
|
||||
class TestFileReadNonReusableRedaction:
|
||||
"""#35519: prefix-matched credentials in FILE CONTENT (read_file /
|
||||
search_files / cat) must be redacted to a NON-REUSABLE sentinel — not a
|
||||
head/tail mask that looks like a real-but-truncated key and gets written
|
||||
back to config (corrupting the credential -> 401)."""
|
||||
|
||||
GHP = "ghp_S1abcdefghijklmnopqrstuvwxyz0Pn2T" # realistic GitHub PAT shape
|
||||
SK = "sk-proj-abcdefghijklmnopqrstuvwxyz0123456789"
|
||||
|
||||
def test_file_read_uses_nonreusable_sentinel(self):
|
||||
out = redact_sensitive_text(f"token: {self.GHP}", force=True, file_read=True)
|
||||
# The sentinel marker is present and obviously a redaction...
|
||||
assert "«redacted:ghp_…»" in out, out
|
||||
# ...and the head/tail-preserving mask shape is NOT produced.
|
||||
assert "..." not in out
|
||||
# The agent can still tell which vendor credential is present.
|
||||
assert "ghp_" in out
|
||||
|
||||
def test_file_read_does_not_leak_secret_body(self):
|
||||
"""Crucial: file_read must NOT expose the real key (no un-redact)."""
|
||||
out = redact_sensitive_text(f"token: {self.GHP}", force=True, file_read=True)
|
||||
# No run of the secret body survives.
|
||||
assert "S1abcdefghij" not in out
|
||||
assert self.GHP not in out
|
||||
assert "Pn2T" not in out # not even the tail (the old mask kept it)
|
||||
|
||||
def test_file_read_sentinel_is_not_a_plausible_key(self):
|
||||
"""The sentinel can't be mistaken for / written back as a usable key:
|
||||
the old mask was a 13-char `ghp_S1...Pn2T` that broke GitHub auth when
|
||||
an agent re-saved it. The sentinel is syntactically invalid as a token
|
||||
(contains « » … and ':'), so it can't round-trip into a dead key."""
|
||||
out = redact_sensitive_text(f"GITHUB_PERSONAL_ACCESS_TOKEN: {self.GHP}",
|
||||
force=True, file_read=True)
|
||||
masked = out.split(": ", 1)[1].strip()
|
||||
# Not a bare token: contains the sentinel delimiters.
|
||||
assert masked.startswith("«") and masked.endswith("»")
|
||||
assert "…" in masked
|
||||
|
||||
def test_default_mode_unchanged_keeps_headtail_mask(self):
|
||||
"""Regression guard: NON-file_read (logs/display) keeps the existing
|
||||
head/tail mask shape — only file content gets the sentinel."""
|
||||
out = redact_sensitive_text(f"token: {self.GHP}", force=True)
|
||||
assert "«redacted" not in out # no sentinel in log mode
|
||||
assert "ghp_" in out and "..." in out # head/tail mask preserved
|
||||
|
||||
def test_file_read_implies_code_file_no_env_falsepos(self):
|
||||
"""file_read should skip the source-code ENV/JSON false-positive paths
|
||||
(it's config/data). A bare ``MAX_TOKENS=8000`` must pass through."""
|
||||
out = redact_sensitive_text("MAX_TOKENS=8000", force=True, file_read=True)
|
||||
assert out == "MAX_TOKENS=8000"
|
||||
|
||||
def test_sk_prefix_also_sentinelized(self):
|
||||
out = redact_sensitive_text(f"key: {self.SK}", force=True, file_read=True)
|
||||
assert "«redacted:sk-…»" in out
|
||||
assert self.SK not in out
|
||||
|
|
|
|||
|
|
@ -1021,7 +1021,7 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
|
|||
"file_size": result_dict["file_size"],
|
||||
}, ensure_ascii=False)
|
||||
if result_dict["content"]:
|
||||
result_dict["content"] = redact_sensitive_text(result_dict["content"], code_file=True)
|
||||
result_dict["content"] = redact_sensitive_text(result_dict["content"], file_read=True)
|
||||
return json.dumps(result_dict, ensure_ascii=False)
|
||||
|
||||
# ── Binary file guard ─────────────────────────────────────────
|
||||
|
|
@ -1137,7 +1137,7 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
|
|||
|
||||
# ── Redact secrets (after guard check to skip oversized content) ──
|
||||
if result.content:
|
||||
result.content = redact_sensitive_text(result.content, code_file=True)
|
||||
result.content = redact_sensitive_text(result.content, file_read=True)
|
||||
result_dict["content"] = result.content
|
||||
|
||||
# Large-file hint: if the file is big and the caller didn't ask
|
||||
|
|
@ -1697,7 +1697,7 @@ def search_tool(pattern: str, target: str = "content", path: str = ".",
|
|||
if hasattr(result, 'matches'):
|
||||
for m in result.matches:
|
||||
if hasattr(m, 'content') and m.content:
|
||||
m.content = redact_sensitive_text(m.content, code_file=True)
|
||||
m.content = redact_sensitive_text(m.content, file_read=True)
|
||||
result_dict = result.to_dict(densify=True)
|
||||
|
||||
if count >= 3:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue