fix(anthropic): strip output-only SDK fields from replayed content blocks

HTTP 400 "messages.N.content.M.text.parsed_output: Extra inputs are not permitted" on the native Anthropic transport. Anthropic SDK 0.87.0 response blocks carry output-only attributes the Messages *input* schema forbids: text blocks get `parsed_output` and `citations=None`, tool_use blocks get `caller`. normalize_response captured blocks verbatim via _to_plain_data and replayed them as request input on the next turn, so the forbidden fields leaked back -> 400. Like the earlier thinking-block bug, one poisoned turn wedges every subsequent request in the session (even the diagnostic turn), recoverable only by switching models or deleting the session. This is a defect in the anthropic_content_blocks channel added for the interleaved-thinking fix: it preserved block ORDER correctly but copied every SDK attribute, including output-only ones. Fix — whitelist input-permitted fields per block type at all three leak points: - agent/transports/anthropic.py normalize_response: sanitize at CAPTURE so the poison never persists to state.db (defence-in-depth). - agent/anthropic_adapter.py _sanitize_replay_block (new): whitelist used on the ordered-blocks replay path; also recovers already-poisoned stored sessions. - agent/anthropic_adapter.py _convert_content_part_to_anthropic: a stored `text` part is rebuilt from whitelisted fields instead of dict(part) verbatim (this was the exact content.N.text.parsed_output failure locus). Whitelist not blacklist, so future SDK output-only fields can't reintroduce it. Block order and thinking-block signatures are preserved (the reason the channel exists). Adds tests/agent/test_anthropic_output_field_leak.py; full adapter suite green (163 tests). Existing poisoned state.db rows scrubbed out-of-band.
2026-07-29 18:46:59 +00:00 · 2026-05-30 23:13:32 -04:00 · 2026-05-30 23:13:32 -04:00 · 529bb1c3d5
commit 529bb1c3d5
parent aaccaada28
3 changed files with 182 additions and 17 deletions
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -1571,6 +1571,15 @@ def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:

    if ptype == "input_text":
        block: Dict[str, Any] = {"type": "text", "text": part.get("text", "")}
+    elif ptype == "text":
+        # A stored Anthropic text block. Rebuild from whitelisted fields only —
+        # SDK response text blocks carry output-only siblings (parsed_output,
+        # citations=None) that the Messages INPUT schema rejects with HTTP 400
+        # "Extra inputs are not permitted". Do NOT dict(part) it verbatim.
+        block = {"type": "text", "text": part.get("text", "")}
+        cits = part.get("citations")
+        if isinstance(cits, list) and cits:
+            block["citations"] = cits
    elif ptype in {"image_url", "input_image"}:
        image_value = part.get("image_url", {})
        url = image_value.get("url", "") if isinstance(image_value, dict) else str(image_value or "")
@ -1685,6 +1694,58 @@ def _content_parts_to_anthropic_blocks(parts: Any) -> List[Dict[str, Any]]:
    return out


+def _sanitize_replay_block(b: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """Return an input-valid copy of a stored Anthropic content block.
+
+    The SDK response objects carry output-only attributes that the Messages
+    *input* schema forbids ("Extra inputs are not permitted"): text blocks get
+    ``parsed_output``/``citations`` (when null), tool_use blocks get ``caller``,
+    etc. Replaying such a block verbatim as request input → HTTP 400.
+
+    Whitelist per type (NOT a blacklist) so future SDK output-only fields can't
+    reintroduce the bug.
+
+    Nested payloads (``citations``, ``cache_control``, ``input``, ``source``)
+    are deep-copied, so the returned block never aliases the stored message
+    and later passes may mutate the request payload without touching history.
+
+    Args:
+        b: A stored content block. Non-dict values are tolerated and dropped.
+
+    Returns:
+        A new dict holding only whitelisted fields, or ``None`` when the block
+        must be dropped: ``b`` is not a dict, a ``redacted_thinking`` block has
+        no ``data``, an ``image`` block has no dict ``source``, or the block
+        type is not one of text / thinking / redacted_thinking / tool_use /
+        image.
+    """
+    if not isinstance(b, dict):
+        return None
+    btype = b.get("type")
+    if btype == "text":
+        # `or ""` also covers an explicit null, which .get()'s default misses.
+        out: Dict[str, Any] = {"type": "text", "text": b.get("text") or ""}
+        # citations is input-valid ONLY when it's a non-empty list; the SDK
+        # emits citations=None on responses, which the input schema rejects.
+        cits = b.get("citations")
+        if isinstance(cits, list) and cits:
+            out["citations"] = copy.deepcopy(cits)
+        if isinstance(b.get("cache_control"), dict):
+            out["cache_control"] = copy.deepcopy(b["cache_control"])
+        return out
+    if btype == "thinking":
+        out = {"type": "thinking", "thinking": b.get("thinking") or ""}
+        # The signature is copied untouched; it is only omitted when absent.
+        if b.get("signature"):
+            out["signature"] = b["signature"]
+        return out
+    if btype == "redacted_thinking":
+        # Only valid with its data payload; drop if missing.
+        return {"type": "redacted_thinking", "data": b["data"]} if b.get("data") else None
+    if btype == "tool_use":
+        # An explicit null input would be sent as-is by .get()'s default
+        # handling; fall back to an empty object in that case too.
+        tool_input = b.get("input")
+        out = {
+            "type": "tool_use",
+            "id": _sanitize_tool_id(b.get("id", "")),
+            "name": b.get("name", ""),
+            "input": copy.deepcopy(tool_input) if tool_input is not None else {},
+        }
+        if isinstance(b.get("cache_control"), dict):
+            out["cache_control"] = copy.deepcopy(b["cache_control"])
+        return out
+    if btype == "image":
+        src = b.get("source")
+        return {"type": "image", "source": copy.deepcopy(src)} if isinstance(src, dict) else None
+    # Unknown/unsupported block type on the input path — drop rather than risk
+    # another "Extra inputs are not permitted".
+    return None
+
+
 def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
    """Convert an assistant message to Anthropic content blocks.

@ -1694,24 +1755,20 @@ def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
    content = m.get("content", "")
    # Anthropic interleaved-thinking fast path: when this turn carries a
    # verbatim, order-preserving block list (set by normalize_response only
-    # for turns that interleave SIGNED thinking with tool_use), replay it
-    # unchanged. Reconstructing from the parallel reasoning_details +
-    # tool_calls fields front-loads thinking and reorders signed blocks,
-    # which Anthropic rejects with HTTP 400 ("thinking ... blocks in the
-    # latest assistant message cannot be modified"). Block order — and thus
-    # each thinking block's signature — must survive verbatim. tool_use IDs
-    # are sanitized to match the tool_result IDs produced elsewhere; the
-    # downstream mcp_ prefixing pass handles tool names on these blocks.
+    # for turns that interleave SIGNED thinking with tool_use), replay it.
+    # Each block is run through _sanitize_replay_block to strip output-only
+    # SDK fields (parsed_output, caller, citations=None, …) that the Messages
+    # INPUT schema forbids — replaying them verbatim caused HTTP 400 "Extra
+    # inputs are not permitted" (text.parsed_output). Block ORDER is preserved
+    # (the reason this channel exists); only forbidden sibling fields are
+    # dropped, leaving thinking signatures and tool_use id/name/input intact.
    ordered_blocks = m.get("anthropic_content_blocks")
    if isinstance(ordered_blocks, list) and ordered_blocks:
        replayed: List[Dict[str, Any]] = []
        for b in ordered_blocks:
-            if not isinstance(b, dict):
-                continue
-            blk = copy.deepcopy(b)
-            if blk.get("type") == "tool_use" and "id" in blk:
-                blk["id"] = _sanitize_tool_id(blk.get("id", ""))
-            replayed.append(blk)
+            clean = _sanitize_replay_block(b)
+            if clean is not None:
+                replayed.append(clean)
        if replayed:
            return {"role": "assistant", "content": replayed}

--- a/agent/transports/anthropic.py
+++ b/agent/transports/anthropic.py
@ -84,7 +84,7 @@ class AnthropicTransport(ProviderTransport):
        to OpenAI finish_reason, and collects reasoning_details in provider_data.
        """
        import json
-        from agent.anthropic_adapter import _to_plain_data
+        from agent.anthropic_adapter import _to_plain_data, _sanitize_replay_block
        from agent.transports.types import ToolCall

        strip_tool_prefix = kwargs.get("strip_tool_prefix", False)
@ -108,14 +108,26 @@ class AnthropicTransport(ProviderTransport):

        for block in response.content:
            block_dict = _to_plain_data(block)
+            clean_block = None
            if isinstance(block_dict, dict):
-                ordered_blocks.append(block_dict)
+                # Sanitize at capture so output-only SDK fields (parsed_output,
+                # caller, citations=None, …) never persist to state.db and leak
+                # back as request input on replay → HTTP 400 "Extra inputs are
+                # not permitted". Defence-in-depth with the replay-side sanitize.
+                clean_block = _sanitize_replay_block(block_dict)
+                if clean_block is not None:
+                    ordered_blocks.append(clean_block)
            if block.type == "text":
                text_parts.append(block.text)
            elif block.type in ("thinking", "redacted_thinking"):
                if block.type == "thinking":
                    reasoning_parts.append(block.thinking)
-                if isinstance(block_dict, dict):
+                # Use the sanitized block (clean_block) for reasoning_details too,
+                # since _extract_preserved_thinking_blocks replays these on the
+                # non-ordered path. Falls back to raw only if sanitize dropped it.
+                if isinstance(clean_block, dict):
+                    reasoning_details.append(clean_block)
+                elif isinstance(block_dict, dict):
                    reasoning_details.append(block_dict)
            elif block.type == "tool_use":
                name = block.name
--- a/tests/agent/test_anthropic_output_field_leak.py
+++ b/tests/agent/test_anthropic_output_field_leak.py
@ -0,0 +1,96 @@
+"""Regression: output-only SDK fields must not leak into Anthropic request input.
+
+Reproduces HTTP 400 `messages.N.content.M.text.parsed_output: Extra inputs are
+not permitted`. Anthropic SDK response blocks carry output-only attributes
+(text blocks: `parsed_output`, `citations=None`; tool_use blocks: `caller`)
+that the Messages *input* schema forbids. normalize_response captured blocks
+verbatim via _to_plain_data and replayed them as input → 400.
+
+Fix: whitelist input-permitted fields per block type at three points —
+normalize_response capture, _sanitize_replay_block (ordered-blocks replay), and
+_convert_content_part_to_anthropic (content-list replay).
+"""
+import os
+import sys
+
+# Put the checkout that contains this test on sys.path (this file lives in
+# tests/agent/, so the repository root is three dirname() hops up) rather than
+# a hard-coded per-user install directory, so the suite exercises the code
+# that sits next to it on any machine or CI runner.
+sys.path.insert(
+    0,
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+)
+
+import pytest
+from agent.anthropic_adapter import (
+    _sanitize_replay_block,
+    _convert_content_part_to_anthropic,
+    _convert_assistant_message,
+)
+
+FORBIDDEN = {"parsed_output", "caller"}
+
+
+def _assert_clean(block):
+    """Assert *block* is a dict with no FORBIDDEN key and no null/empty citations."""
+    assert isinstance(block, dict)
+    for key in FORBIDDEN:
+        assert key not in block, f"forbidden field {key!r} survived: {block}"
+    if "citations" not in block:
+        return
+    cits = block["citations"]
+    assert isinstance(cits, list) and cits, \
+        "citations must be a non-empty list if present (None/[] is input-invalid)"
+
+
+class TestSanitizeReplayBlock:
+    """Direct checks of ``_sanitize_replay_block`` for each block type used here."""
+
+    def test_text_block_strips_parsed_output_and_null_citations(self):
+        cleaned = _sanitize_replay_block(
+            {"type": "text", "text": "hi", "parsed_output": None, "citations": None}
+        )
+        _assert_clean(cleaned)
+        assert cleaned == {"type": "text", "text": "hi"}
+
+    def test_tool_use_strips_caller(self):
+        cleaned = _sanitize_replay_block({
+            "type": "tool_use",
+            "id": "toolu_1",
+            "name": "read_file",
+            "input": {"path": "a"},
+            "caller": {"type": "agent"},
+        })
+        _assert_clean(cleaned)
+        assert cleaned["name"] == "read_file"
+        assert cleaned["input"] == {"path": "a"}
+
+    def test_thinking_preserves_signature(self):
+        expected = {"type": "thinking", "thinking": "x", "signature": "sig-AAA"}
+        cleaned = _sanitize_replay_block(
+            {"type": "thinking", "thinking": "x", "signature": "sig-AAA"}
+        )
+        assert cleaned == expected
+
+    def test_text_keeps_real_citations(self):
+        citations = [{"type": "char_location", "cited_text": "q"}]
+        cleaned = _sanitize_replay_block(
+            {"type": "text", "text": "t", "citations": citations}
+        )
+        assert cleaned["citations"] == citations
+
+    def test_unknown_type_dropped(self):
+        dropped = _sanitize_replay_block({"type": "server_tool_use", "foo": 1})
+        assert dropped is None
+
+
+class TestContentPartConversion:
+    """Checks the stored-``text``-part path of ``_convert_content_part_to_anthropic``."""
+
+    def test_stored_text_block_with_parsed_output_cleaned(self):
+        # Same shape as the reported content.N.text.parsed_output failure.
+        stored_part = {
+            "type": "text",
+            "text": "hello",
+            "parsed_output": None,
+            "citations": None,
+        }
+        _assert_clean(_convert_content_part_to_anthropic(stored_part))
+
+
+class TestAssistantReplay:
+    """Replay of ``anthropic_content_blocks`` through ``_convert_assistant_message``."""
+
+    def test_interleaved_blocks_replayed_clean_and_ordered(self):
+        thinking = {"type": "thinking", "thinking": "plan", "signature": "s1"}
+        text = {"type": "text", "text": "doing it", "parsed_output": None, "citations": None}
+        tool_use = {
+            "type": "tool_use",
+            "id": "toolu_1",
+            "name": "read_file",
+            "input": {"path": "a"},
+            "caller": {"type": "agent"},
+        }
+        message = {
+            "role": "assistant",
+            "anthropic_content_blocks": [thinking, text, tool_use],
+        }
+
+        blocks = _convert_assistant_message(message)["content"]
+
+        # Block order must be unchanged.
+        kinds = [blk["type"] for blk in blocks]
+        assert kinds == ["thinking", "text", "tool_use"]
+        # No output-only field may remain on any block.
+        for blk in blocks:
+            _assert_clean(blk)
+        # The thinking signature and the tool name are still present.
+        assert blocks[0]["signature"] == "s1"
+        assert blocks[2]["name"] == "read_file"
+
+
+if __name__ == "__main__":
+    # Direct execution: hand this file to pytest and exit with its return code.
+    sys.exit(pytest.main([__file__, "-v"]))