fix(security): redact credentials before persistence in session capture

Two-layer redaction at the persistence boundary so credentials never reach state.db, session_*.json, or compression: 1. agent/chat_completion_helpers.py :: build_assistant_message - Redact assistant content before the message dict is constructed (catches PATs / API keys the model inlines into natural language) - Redact tool_call.function.arguments at the same site (catches secrets inlined into tool args, e.g. terminal command=curl -H 'Authorization: ...') Tool execution uses the raw API response object, not this dict, so redacting the persisted shape is safe. 2. run_agent.py :: _save_session_log - Add _redact_message_content() static helper that handles both string content and OpenAI/Anthropic multimodal list-of-parts (image parts pass through untouched, only text/content fields are redacted) - Apply to every message + the cached system prompt before writing session_*.json Both layers respect HERMES_REDACT_SECRETS via redact_sensitive_text — no-op when disabled. Tests (TestSaveSessionLogRedactsSecrets, 4 cases): - api key in tool content - api key in user message - api key in system prompt - multimodal list-of-parts (image part preserved, text redacted) Tests use an autouse fixture to force _REDACT_ENABLED=True because the hermetic conftest defaults the env var to false. Salvaged from PR #24758 by @vgocoder (build_assistant_message + session_log) + PR #19855 by @liuhao1024 (multimodal list helper, system_prompt redaction). Kept only the redaction concern from #19855; its unrelated whatsapp npm timeout + PATCH_SCHEMA changes are out of scope and dropped. Refs #19798 (PAT leak via assistant inline mention), #19845 (session capture credential leak). Co-authored-by: liuhao1024 <liuhao03@bilibili.com> Co-authored-by: teknium1 <127238744+teknium1@users.noreply.github.com>
2026-07-14 14:12:44 +00:00 · 2026-05-24 17:57:50 -07:00 · 2026-05-24 17:57:50 -07:00 · dcc163ee28
commit dcc163ee28
parent 243ebc7a61
3 changed files with 133 additions and 1 deletions
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -581,6 +581,17 @@ def build_assistant_message(agent, assistant_message, finish_reason: str) -> dic
    if isinstance(_san_content, str) and _san_content:
        _san_content = agent._strip_think_blocks(_san_content).strip()

+    # Defence-in-depth: redact credentials (PATs, API keys, Bearer tokens)
+    # from assistant content BEFORE the message enters conversation history.
+    # If the model accidentally inlines a secret in its natural-language
+    # response, catch it here at the persistence boundary so it never
+    # reaches state.db, session_*.json, gateway delivery, or compression.
+    # Respects HERMES_REDACT_SECRETS via redact_sensitive_text — no-op
+    # when disabled. (#19798)
+    if isinstance(_san_content, str) and _san_content:
+        from agent.redact import redact_sensitive_text
+        _san_content = redact_sensitive_text(_san_content)
+
    msg = {
        "role": "assistant",
        "content": _san_content,
@ -702,6 +713,18 @@ def build_assistant_message(agent, assistant_message, finish_reason: str) -> dic
                    "arguments": tool_call.function.arguments
                },
            }
+            # Defence-in-depth: redact credentials from tool call arguments
+            # before they enter conversation history. Tool execution uses the
+            # raw API response object, not this dict, so redacting the
+            # persisted shape is safe and only affects storage. Catches the
+            # case where a model accidentally inlines a secret into a tool
+            # call (e.g. `terminal(command="curl -H 'Authorization: Bearer
+            # sk-...'")`). (#19798)
+            if isinstance(tc_dict["function"]["arguments"], str):
+                from agent.redact import redact_sensitive_text
+                tc_dict["function"]["arguments"] = redact_sensitive_text(
+                    tc_dict["function"]["arguments"]
+                )
            # Preserve extra_content (e.g. Gemini thought_signature) so it
            # is sent back on subsequent API calls.  Without this, Gemini 3
            # thinking models reject the request with a 400 error.
--- a/run_agent.py
+++ b/run_agent.py
@ -124,6 +124,7 @@ from agent.memory_manager import StreamingContextScrubber, build_memory_context_
 from agent.think_scrubber import StreamingThinkScrubber
 from agent.retry_utils import jittered_backoff
 from agent.error_classifier import classify_api_error, FailoverReason
+from agent.redact import redact_sensitive_text
 from agent.prompt_builder import (
    DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS,
    MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE,
@ -1546,6 +1547,36 @@ class AIAgent:
        content = re.sub(r'(</think>)\n+', r'\1\n', content)
        return content.strip()

+    @staticmethod
+    def _redact_message_content(content):
+        """Apply secret redaction to message content (str or list-of-parts).
+
+        Handles both plain-string content and the OpenAI/Anthropic multimodal
+        shape where ``content`` is a list of ``{"type": "text", "text": ...}``
+        / ``{"type": "image_url", ...}`` / ``{"type": "input_text", "content": ...}``
+        parts. Image / binary parts are left untouched; only text fields are
+        passed through ``redact_sensitive_text``.
+
+        Respects ``HERMES_REDACT_SECRETS`` via ``redact_sensitive_text`` —
+        when disabled the helper is effectively a no-op.
+        """
+        if content is None:
+            return content
+        if isinstance(content, str):
+            return redact_sensitive_text(content)
+        if isinstance(content, list):
+            redacted = []
+            for part in content:
+                if isinstance(part, dict):
+                    part = dict(part)
+                    if isinstance(part.get("text"), str):
+                        part["text"] = redact_sensitive_text(part["text"])
+                    if isinstance(part.get("content"), str):
+                        part["content"] = redact_sensitive_text(part["content"])
+                redacted.append(part)
+            return redacted
+        return content
+
    def _save_session_log(self, messages: List[Dict[str, Any]] = None):
        """Optional per-session JSON snapshot writer.

@ -1581,6 +1612,14 @@ class AIAgent:
                if msg.get("role") == "assistant" and msg.get("content"):
                    msg = dict(msg)
                    msg["content"] = self._clean_session_content(msg["content"])
+                # Defence-in-depth: redact credentials from every message
+                # content before persistence. Catches PATs / API keys / Bearer
+                # tokens that may have leaked into assistant responses, tool
+                # output, or user paste. Respects HERMES_REDACT_SECRETS via
+                # redact_sensitive_text — no-op when disabled. (#19798, #19845)
+                if "content" in msg:
+                    msg = dict(msg)
+                    msg["content"] = self._redact_message_content(msg.get("content"))
                cleaned.append(msg)

            # Guard: never overwrite a larger session log with fewer messages.
@ -1606,7 +1645,7 @@ class AIAgent:
                "platform": self.platform,
                "session_start": self.session_start.isoformat(),
                "last_updated": datetime.now().isoformat(),
-                "system_prompt": self._cached_system_prompt or "",
+                "system_prompt": redact_sensitive_text(self._cached_system_prompt or ""),
                "tools": self.tools or [],
                "message_count": len(cleaned),
                "messages": cleaned,
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -600,6 +600,76 @@ class TestSessionJsonSnapshotOptIn:
        assert hasattr(agent, "logs_dir")


+class TestSaveSessionLogRedactsSecrets:
+    """Regression: session_*.json must not contain plaintext credentials (#19798, #19845)."""
+
+    @pytest.fixture(autouse=True)
+    def _ensure_redaction_enabled(self, monkeypatch):
+        """Force redaction on regardless of host HERMES_REDACT_SECRETS state.
+        The hermetic conftest blanks the env var; the module-level
+        ``_REDACT_ENABLED`` constant is captured at import time, so we
+        flip it directly for the duration of these tests."""
+        monkeypatch.delenv("HERMES_REDACT_SECRETS", raising=False)
+        monkeypatch.setattr("agent.redact._REDACT_ENABLED", True)
+
+    def test_redacts_api_key_in_tool_content(self, agent, tmp_path):
+        agent._session_json_enabled = True
+        agent.logs_dir = tmp_path
+        messages = [
+            {"role": "user", "content": "Hello"},
+            {
+                "role": "tool",
+                "content": "Response: Authorization: Bearer sk-proj-abc123def456ghi789jkl012mno",
+            },
+        ]
+        agent._save_session_log(messages)
+
+        snapshot = (tmp_path / f"session_{agent.session_id}.json").read_text(encoding="utf-8")
+        assert "sk-proj-abc123def456ghi789jkl012mno" not in snapshot
+
+    def test_redacts_api_key_in_user_message(self, agent, tmp_path):
+        agent._session_json_enabled = True
+        agent.logs_dir = tmp_path
+        messages = [
+            {"role": "user", "content": "My key is sk-ant-api03-abc123def456ghi789jkl012mno please use it"},
+        ]
+        agent._save_session_log(messages)
+
+        snapshot = (tmp_path / f"session_{agent.session_id}.json").read_text(encoding="utf-8")
+        assert "sk-ant-api03-abc123def456ghi789jkl012mno" not in snapshot
+
+    def test_redacts_system_prompt_credentials(self, agent, tmp_path):
+        agent._session_json_enabled = True
+        agent.logs_dir = tmp_path
+        agent._cached_system_prompt = "Use key sk-proj-realkey1234567890123456 for API calls"
+        agent._save_session_log([{"role": "user", "content": "test"}])
+
+        snapshot = (tmp_path / f"session_{agent.session_id}.json").read_text(encoding="utf-8")
+        assert "sk-proj-realkey1234567890123456" not in snapshot
+
+    def test_redacts_list_type_multimodal_content(self, agent, tmp_path):
+        """OpenAI/Anthropic multimodal shape: content = list of {type, text|image_url} parts."""
+        agent._session_json_enabled = True
+        agent.logs_dir = tmp_path
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Key: gsk_abc123def456ghi789jkl012mno"},
+                    {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}},
+                ],
+            },
+        ]
+        agent._save_session_log(messages)
+
+        snapshot_text = (tmp_path / f"session_{agent.session_id}.json").read_text(encoding="utf-8")
+        snapshot = json.loads(snapshot_text)
+        parts = snapshot["messages"][0]["content"]
+        assert "gsk_abc123def456ghi789jkl012mno" not in parts[0]["text"]
+        # Image part preserved untouched
+        assert parts[1]["image_url"]["url"].startswith("data:image")
+
+
 class TestGetMessagesUpToLastAssistant:
    def test_empty_list(self, agent):
        assert agent._get_messages_up_to_last_assistant([]) == []