From 529bb1c3d516f7580af39d6095f0a7d97f7e9ad5 Mon Sep 17 00:00:00 2001
From: RaumfahrerSpiffy <56406949+RaumfahrerSpiffy@users.noreply.github.com>
Date: Sat, 30 May 2026 23:13:32 -0400
Subject: [PATCH] fix(anthropic): strip output-only SDK fields from replayed
 content blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HTTP 400 "messages.N.content.M.text.parsed_output: Extra inputs are not
permitted" on the native Anthropic transport.

Anthropic SDK 0.87.0 response blocks carry output-only attributes the
Messages *input* schema forbids: text blocks get `parsed_output` and
`citations=None`, tool_use blocks get `caller`. normalize_response
captured blocks verbatim via _to_plain_data and replayed them as request
input on the next turn, so the forbidden fields leaked back -> 400. Like
the earlier thinking-block bug, one poisoned turn wedges every subsequent
request in the session (even the diagnostic turn), recoverable only by
switching models or deleting the session.

This is a defect in the anthropic_content_blocks channel added for the
interleaved-thinking fix: it preserved block ORDER correctly but copied
every SDK attribute, including output-only ones.

Fix — whitelist input-permitted fields per block type at all three leak
points:
- agent/transports/anthropic.py normalize_response: sanitize at CAPTURE
  so the poison never persists to state.db (defence-in-depth).
- agent/anthropic_adapter.py _sanitize_replay_block (new): whitelist used
  on the ordered-blocks replay path; also recovers already-poisoned
  stored sessions.
- agent/anthropic_adapter.py _convert_content_part_to_anthropic: a stored
  `text` part is rebuilt from whitelisted fields instead of dict(part)
  verbatim (this was the exact content.M.text.parsed_output failure
  locus).

Whitelist not blacklist, so future SDK output-only fields can't
reintroduce it. Block order and thinking-block signatures are preserved
(the reason the channel exists).
Adds tests/agent/test_anthropic_output_field_leak.py; full adapter suite green (163 tests). Existing poisoned state.db rows scrubbed out-of-band. --- agent/anthropic_adapter.py | 85 +++++++++++++--- agent/transports/anthropic.py | 18 +++- .../agent/test_anthropic_output_field_leak.py | 96 +++++++++++++++++++ 3 files changed, 182 insertions(+), 17 deletions(-) create mode 100644 tests/agent/test_anthropic_output_field_leak.py diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index 25ee73580f7..a95fd2f10f4 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -1571,6 +1571,15 @@ def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]: if ptype == "input_text": block: Dict[str, Any] = {"type": "text", "text": part.get("text", "")} + elif ptype == "text": + # A stored Anthropic text block. Rebuild from whitelisted fields only — + # SDK response text blocks carry output-only siblings (parsed_output, + # citations=None) that the Messages INPUT schema rejects with HTTP 400 + # "Extra inputs are not permitted". Do NOT dict(part) it verbatim. + block = {"type": "text", "text": part.get("text", "")} + cits = part.get("citations") + if isinstance(cits, list) and cits: + block["citations"] = cits elif ptype in {"image_url", "input_image"}: image_value = part.get("image_url", {}) url = image_value.get("url", "") if isinstance(image_value, dict) else str(image_value or "") @@ -1685,6 +1694,58 @@ def _content_parts_to_anthropic_blocks(parts: Any) -> List[Dict[str, Any]]: return out +def _sanitize_replay_block(b: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Strip output-only fields from a stored Anthropic content block so it is + valid as REQUEST input on replay. + + The SDK response objects carry output-only attributes that the Messages + *input* schema forbids ("Extra inputs are not permitted"): text blocks get + ``parsed_output``/``citations`` (when null), tool_use blocks get ``caller``, + etc. 
``normalize_response`` captured blocks verbatim via ``_to_plain_data``, + so these leak back as input on the next turn → HTTP 400. + + Whitelist per type (NOT a blacklist) so future SDK output-only fields can't + reintroduce the bug. Returns a clean block, or None to drop it. + """ + if not isinstance(b, dict): + return None + btype = b.get("type") + if btype == "text": + out: Dict[str, Any] = {"type": "text", "text": b.get("text", "")} + # citations is input-valid ONLY when it's a non-empty list; the SDK + # emits citations=None on responses, which the input schema rejects. + cits = b.get("citations") + if isinstance(cits, list) and cits: + out["citations"] = cits + if isinstance(b.get("cache_control"), dict): + out["cache_control"] = b["cache_control"] + return out + if btype == "thinking": + out = {"type": "thinking", "thinking": b.get("thinking", "")} + if b.get("signature"): + out["signature"] = b["signature"] + return out + if btype == "redacted_thinking": + # Only valid with its data payload; drop if missing. + return {"type": "redacted_thinking", "data": b["data"]} if b.get("data") else None + if btype == "tool_use": + out = { + "type": "tool_use", + "id": _sanitize_tool_id(b.get("id", "")), + "name": b.get("name", ""), + "input": b.get("input", {}), + } + if isinstance(b.get("cache_control"), dict): + out["cache_control"] = b["cache_control"] + return out + if btype == "image": + src = b.get("source") + return {"type": "image", "source": src} if isinstance(src, dict) else None + # Unknown/unsupported block type on the input path — drop rather than risk + # another "Extra inputs are not permitted". + return None + + def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]: """Convert an assistant message to Anthropic content blocks. 
@@ -1694,24 +1755,20 @@ def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]: content = m.get("content", "") # Anthropic interleaved-thinking fast path: when this turn carries a # verbatim, order-preserving block list (set by normalize_response only - # for turns that interleave SIGNED thinking with tool_use), replay it - # unchanged. Reconstructing from the parallel reasoning_details + - # tool_calls fields front-loads thinking and reorders signed blocks, - # which Anthropic rejects with HTTP 400 ("thinking ... blocks in the - # latest assistant message cannot be modified"). Block order — and thus - # each thinking block's signature — must survive verbatim. tool_use IDs - # are sanitized to match the tool_result IDs produced elsewhere; the - # downstream mcp_ prefixing pass handles tool names on these blocks. + # for turns that interleave SIGNED thinking with tool_use), replay it. + # Each block is run through _sanitize_replay_block to strip output-only + # SDK fields (parsed_output, caller, citations=None, …) that the Messages + # INPUT schema forbids — replaying them verbatim caused HTTP 400 "Extra + # inputs are not permitted" (text.parsed_output). Block ORDER is preserved + # (the reason this channel exists); only forbidden sibling fields are + # dropped, leaving thinking signatures and tool_use id/name/input intact. 
ordered_blocks = m.get("anthropic_content_blocks") if isinstance(ordered_blocks, list) and ordered_blocks: replayed: List[Dict[str, Any]] = [] for b in ordered_blocks: - if not isinstance(b, dict): - continue - blk = copy.deepcopy(b) - if blk.get("type") == "tool_use" and "id" in blk: - blk["id"] = _sanitize_tool_id(blk.get("id", "")) - replayed.append(blk) + clean = _sanitize_replay_block(b) + if clean is not None: + replayed.append(clean) if replayed: return {"role": "assistant", "content": replayed} diff --git a/agent/transports/anthropic.py b/agent/transports/anthropic.py index e7c99751c7c..3a209f2d753 100644 --- a/agent/transports/anthropic.py +++ b/agent/transports/anthropic.py @@ -84,7 +84,7 @@ class AnthropicTransport(ProviderTransport): to OpenAI finish_reason, and collects reasoning_details in provider_data. """ import json - from agent.anthropic_adapter import _to_plain_data + from agent.anthropic_adapter import _to_plain_data, _sanitize_replay_block from agent.transports.types import ToolCall strip_tool_prefix = kwargs.get("strip_tool_prefix", False) @@ -108,14 +108,26 @@ class AnthropicTransport(ProviderTransport): for block in response.content: block_dict = _to_plain_data(block) + clean_block = None if isinstance(block_dict, dict): - ordered_blocks.append(block_dict) + # Sanitize at capture so output-only SDK fields (parsed_output, + # caller, citations=None, …) never persist to state.db and leak + # back as request input on replay → HTTP 400 "Extra inputs are + # not permitted". Defence-in-depth with the replay-side sanitize. 
+ clean_block = _sanitize_replay_block(block_dict) + if clean_block is not None: + ordered_blocks.append(clean_block) if block.type == "text": text_parts.append(block.text) elif block.type in ("thinking", "redacted_thinking"): if block.type == "thinking": reasoning_parts.append(block.thinking) - if isinstance(block_dict, dict): + # Use the sanitized block (clean_block) for reasoning_details too, + # since _extract_preserved_thinking_blocks replays these on the + # non-ordered path. Falls back to raw only if sanitize dropped it. + if isinstance(clean_block, dict): + reasoning_details.append(clean_block) + elif isinstance(block_dict, dict): reasoning_details.append(block_dict) elif block.type == "tool_use": name = block.name diff --git a/tests/agent/test_anthropic_output_field_leak.py b/tests/agent/test_anthropic_output_field_leak.py new file mode 100644 index 00000000000..a691f34ec0b --- /dev/null +++ b/tests/agent/test_anthropic_output_field_leak.py @@ -0,0 +1,96 @@ +"""Regression: output-only SDK fields must not leak into Anthropic request input. + +Reproduces HTTP 400 `messages.N.content.M.text.parsed_output: Extra inputs are +not permitted`. Anthropic SDK response blocks carry output-only attributes +(text blocks: `parsed_output`, `citations=None`; tool_use blocks: `caller`) +that the Messages *input* schema forbids. normalize_response captured blocks +verbatim via _to_plain_data and replayed them as input → 400. + +Fix: whitelist input-permitted fields per block type at three points — +normalize_response capture, _sanitize_replay_block (ordered-blocks replay), and +_convert_content_part_to_anthropic (content-list replay). 
+""" +import sys, os +sys.path.insert(0, os.path.expanduser("~/.hermes/hermes-agent")) + +import pytest +from agent.anthropic_adapter import ( + _sanitize_replay_block, + _convert_content_part_to_anthropic, + _convert_assistant_message, +) + +FORBIDDEN = {"parsed_output", "caller"} + + +def _assert_clean(block): + """No forbidden output-only key, and no null citations, anywhere.""" + assert isinstance(block, dict) + for k in FORBIDDEN: + assert k not in block, f"forbidden field {k!r} survived: {block}" + if "citations" in block: + assert isinstance(block["citations"], list) and block["citations"], \ + "citations must be a non-empty list if present (None/[] is input-invalid)" + + +class TestSanitizeReplayBlock: + def test_text_block_strips_parsed_output_and_null_citations(self): + poisoned = {"type": "text", "text": "hi", "parsed_output": None, "citations": None} + out = _sanitize_replay_block(poisoned) + _assert_clean(out) + assert out == {"type": "text", "text": "hi"} + + def test_tool_use_strips_caller(self): + poisoned = {"type": "tool_use", "id": "toolu_1", "name": "read_file", + "input": {"path": "a"}, "caller": {"type": "agent"}} + out = _sanitize_replay_block(poisoned) + _assert_clean(out) + assert out["name"] == "read_file" and out["input"] == {"path": "a"} + + def test_thinking_preserves_signature(self): + b = {"type": "thinking", "thinking": "x", "signature": "sig-AAA"} + out = _sanitize_replay_block(b) + assert out == {"type": "thinking", "thinking": "x", "signature": "sig-AAA"} + + def test_text_keeps_real_citations(self): + real = [{"type": "char_location", "cited_text": "q"}] + out = _sanitize_replay_block({"type": "text", "text": "t", "citations": real}) + assert out["citations"] == real + + def test_unknown_type_dropped(self): + assert _sanitize_replay_block({"type": "server_tool_use", "foo": 1}) is None + + +class TestContentPartConversion: + def test_stored_text_block_with_parsed_output_cleaned(self): + # The exact content.N.text.parsed_output 
failure shape. + part = {"type": "text", "text": "hello", "parsed_output": None, "citations": None} + out = _convert_content_part_to_anthropic(part) + _assert_clean(out) + + +class TestAssistantReplay: + def test_interleaved_blocks_replayed_clean_and_ordered(self): + m = { + "role": "assistant", + "anthropic_content_blocks": [ + {"type": "thinking", "thinking": "plan", "signature": "s1"}, + {"type": "text", "text": "doing it", "parsed_output": None, "citations": None}, + {"type": "tool_use", "id": "toolu_1", "name": "read_file", + "input": {"path": "a"}, "caller": {"type": "agent"}}, + ], + } + out = _convert_assistant_message(m) + blocks = out["content"] + # order preserved + assert [b["type"] for b in blocks] == ["thinking", "text", "tool_use"] + # every block clean + for b in blocks: + _assert_clean(b) + # signature + tool fields intact + assert blocks[0]["signature"] == "s1" + assert blocks[2]["name"] == "read_file" + + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__, "-v"]))