fix(anthropic): strip output-only SDK fields from replayed content blocks

HTTP 400 "messages.N.content.M.text.parsed_output: Extra inputs are not
permitted" on the native Anthropic transport. Anthropic SDK 0.87.0 response
blocks carry output-only attributes the Messages *input* schema forbids: text
blocks get `parsed_output` and `citations=None`, tool_use blocks get `caller`.
normalize_response captured blocks verbatim via _to_plain_data and replayed
them as request input on the next turn, so the forbidden fields leaked back ->
400. Like the earlier thinking-block bug, one poisoned turn wedges every
subsequent request in the session (even the diagnostic turn), recoverable only
by switching models or deleting the session.

This is a defect in the anthropic_content_blocks channel added for the
interleaved-thinking fix: it preserved block ORDER correctly but copied every
SDK attribute, including output-only ones.

Fix — whitelist input-permitted fields per block type at all three leak points:
- agent/transports/anthropic.py normalize_response: sanitize at CAPTURE so the
  poison never persists to state.db (defence-in-depth).
- agent/anthropic_adapter.py _sanitize_replay_block (new): whitelist used on the
  ordered-blocks replay path; also recovers already-poisoned stored sessions.
- agent/anthropic_adapter.py _convert_content_part_to_anthropic: a stored
  `text` part is rebuilt from whitelisted fields instead of dict(part) verbatim
  (this was the exact content.N.text.parsed_output failure locus).

Whitelist not blacklist, so future SDK output-only fields can't reintroduce it.
Block order and thinking-block signatures are preserved (the reason the channel
exists). Adds tests/agent/test_anthropic_output_field_leak.py; full adapter
suite green (163 tests). Existing poisoned state.db rows scrubbed out-of-band.
This commit is contained in:
RaumfahrerSpiffy 2026-05-30 23:13:32 -04:00 committed by Teknium
parent aaccaada28
commit 529bb1c3d5
3 changed files with 182 additions and 17 deletions

View file

@ -1571,6 +1571,15 @@ def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:
if ptype == "input_text":
block: Dict[str, Any] = {"type": "text", "text": part.get("text", "")}
elif ptype == "text":
# A stored Anthropic text block. Rebuild from whitelisted fields only —
# SDK response text blocks carry output-only siblings (parsed_output,
# citations=None) that the Messages INPUT schema rejects with HTTP 400
# "Extra inputs are not permitted". Do NOT dict(part) it verbatim.
block = {"type": "text", "text": part.get("text", "")}
cits = part.get("citations")
if isinstance(cits, list) and cits:
block["citations"] = cits
elif ptype in {"image_url", "input_image"}:
image_value = part.get("image_url", {})
url = image_value.get("url", "") if isinstance(image_value, dict) else str(image_value or "")
@ -1685,6 +1694,58 @@ def _content_parts_to_anthropic_blocks(parts: Any) -> List[Dict[str, Any]]:
return out
def _sanitize_replay_block(b: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Strip output-only fields from a stored Anthropic content block so it is
valid as REQUEST input on replay.
The SDK response objects carry output-only attributes that the Messages
*input* schema forbids ("Extra inputs are not permitted"): text blocks get
``parsed_output``/``citations`` (when null), tool_use blocks get ``caller``,
etc. ``normalize_response`` captured blocks verbatim via ``_to_plain_data``,
so these leak back as input on the next turn HTTP 400.
Whitelist per type (NOT a blacklist) so future SDK output-only fields can't
reintroduce the bug. Returns a clean block, or None to drop it.
"""
if not isinstance(b, dict):
return None
btype = b.get("type")
if btype == "text":
out: Dict[str, Any] = {"type": "text", "text": b.get("text", "")}
# citations is input-valid ONLY when it's a non-empty list; the SDK
# emits citations=None on responses, which the input schema rejects.
cits = b.get("citations")
if isinstance(cits, list) and cits:
out["citations"] = cits
if isinstance(b.get("cache_control"), dict):
out["cache_control"] = b["cache_control"]
return out
if btype == "thinking":
out = {"type": "thinking", "thinking": b.get("thinking", "")}
if b.get("signature"):
out["signature"] = b["signature"]
return out
if btype == "redacted_thinking":
# Only valid with its data payload; drop if missing.
return {"type": "redacted_thinking", "data": b["data"]} if b.get("data") else None
if btype == "tool_use":
out = {
"type": "tool_use",
"id": _sanitize_tool_id(b.get("id", "")),
"name": b.get("name", ""),
"input": b.get("input", {}),
}
if isinstance(b.get("cache_control"), dict):
out["cache_control"] = b["cache_control"]
return out
if btype == "image":
src = b.get("source")
return {"type": "image", "source": src} if isinstance(src, dict) else None
# Unknown/unsupported block type on the input path — drop rather than risk
# another "Extra inputs are not permitted".
return None
def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
"""Convert an assistant message to Anthropic content blocks.
@ -1694,24 +1755,20 @@ def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
content = m.get("content", "")
# Anthropic interleaved-thinking fast path: when this turn carries a
# verbatim, order-preserving block list (set by normalize_response only
# for turns that interleave SIGNED thinking with tool_use), replay it
# unchanged. Reconstructing from the parallel reasoning_details +
# tool_calls fields front-loads thinking and reorders signed blocks,
# which Anthropic rejects with HTTP 400 ("thinking ... blocks in the
# latest assistant message cannot be modified"). Block order — and thus
# each thinking block's signature — must survive verbatim. tool_use IDs
# are sanitized to match the tool_result IDs produced elsewhere; the
# downstream mcp_ prefixing pass handles tool names on these blocks.
# for turns that interleave SIGNED thinking with tool_use), replay it.
# Each block is run through _sanitize_replay_block to strip output-only
# SDK fields (parsed_output, caller, citations=None, …) that the Messages
# INPUT schema forbids — replaying them verbatim caused HTTP 400 "Extra
# inputs are not permitted" (text.parsed_output). Block ORDER is preserved
# (the reason this channel exists); only forbidden sibling fields are
# dropped, leaving thinking signatures and tool_use id/name/input intact.
ordered_blocks = m.get("anthropic_content_blocks")
if isinstance(ordered_blocks, list) and ordered_blocks:
replayed: List[Dict[str, Any]] = []
for b in ordered_blocks:
if not isinstance(b, dict):
continue
blk = copy.deepcopy(b)
if blk.get("type") == "tool_use" and "id" in blk:
blk["id"] = _sanitize_tool_id(blk.get("id", ""))
replayed.append(blk)
clean = _sanitize_replay_block(b)
if clean is not None:
replayed.append(clean)
if replayed:
return {"role": "assistant", "content": replayed}

View file

@ -84,7 +84,7 @@ class AnthropicTransport(ProviderTransport):
to OpenAI finish_reason, and collects reasoning_details in provider_data.
"""
import json
from agent.anthropic_adapter import _to_plain_data
from agent.anthropic_adapter import _to_plain_data, _sanitize_replay_block
from agent.transports.types import ToolCall
strip_tool_prefix = kwargs.get("strip_tool_prefix", False)
@ -108,14 +108,26 @@ class AnthropicTransport(ProviderTransport):
for block in response.content:
block_dict = _to_plain_data(block)
clean_block = None
if isinstance(block_dict, dict):
ordered_blocks.append(block_dict)
# Sanitize at capture so output-only SDK fields (parsed_output,
# caller, citations=None, …) never persist to state.db and leak
# back as request input on replay → HTTP 400 "Extra inputs are
# not permitted". Defence-in-depth with the replay-side sanitize.
clean_block = _sanitize_replay_block(block_dict)
if clean_block is not None:
ordered_blocks.append(clean_block)
if block.type == "text":
text_parts.append(block.text)
elif block.type in ("thinking", "redacted_thinking"):
if block.type == "thinking":
reasoning_parts.append(block.thinking)
if isinstance(block_dict, dict):
# Use the sanitized block (clean_block) for reasoning_details too,
# since _extract_preserved_thinking_blocks replays these on the
# non-ordered path. Falls back to raw only if sanitize dropped it.
if isinstance(clean_block, dict):
reasoning_details.append(clean_block)
elif isinstance(block_dict, dict):
reasoning_details.append(block_dict)
elif block.type == "tool_use":
name = block.name

View file

@ -0,0 +1,96 @@
"""Regression: output-only SDK fields must not leak into Anthropic request input.
Reproduces HTTP 400 `messages.N.content.M.text.parsed_output: Extra inputs are
not permitted`. Anthropic SDK response blocks carry output-only attributes
(text blocks: `parsed_output`, `citations=None`; tool_use blocks: `caller`)
that the Messages *input* schema forbids. normalize_response captured blocks
verbatim via _to_plain_data and replayed them as input → HTTP 400.
Fix: whitelist input-permitted fields per block type at three points:
normalize_response capture, _sanitize_replay_block (ordered-blocks replay), and
_convert_content_part_to_anthropic (content-list replay).
"""
import sys, os
sys.path.insert(0, os.path.expanduser("~/.hermes/hermes-agent"))
import pytest
from agent.anthropic_adapter import (
_sanitize_replay_block,
_convert_content_part_to_anthropic,
_convert_assistant_message,
)
FORBIDDEN = {"parsed_output", "caller"}
def _assert_clean(block):
"""No forbidden output-only key, and no null citations, anywhere."""
assert isinstance(block, dict)
for k in FORBIDDEN:
assert k not in block, f"forbidden field {k!r} survived: {block}"
if "citations" in block:
assert isinstance(block["citations"], list) and block["citations"], \
"citations must be a non-empty list if present (None/[] is input-invalid)"
class TestSanitizeReplayBlock:
    """Per-type whitelist behaviour of _sanitize_replay_block."""

    def test_text_block_strips_parsed_output_and_null_citations(self):
        # Shape of a text block as captured from an SDK response.
        raw = dict(type="text", text="hi", parsed_output=None, citations=None)
        cleaned = _sanitize_replay_block(raw)
        _assert_clean(cleaned)
        assert cleaned == {"type": "text", "text": "hi"}

    def test_tool_use_strips_caller(self):
        raw = {
            "type": "tool_use",
            "id": "toolu_1",
            "name": "read_file",
            "input": {"path": "a"},
            "caller": {"type": "agent"},
        }
        cleaned = _sanitize_replay_block(raw)
        _assert_clean(cleaned)
        assert cleaned["name"] == "read_file"
        assert cleaned["input"] == {"path": "a"}

    def test_thinking_preserves_signature(self):
        # A signed thinking block has nothing to strip and must round-trip.
        signed = {"type": "thinking", "thinking": "x", "signature": "sig-AAA"}
        expected = {"type": "thinking", "thinking": "x", "signature": "sig-AAA"}
        assert _sanitize_replay_block(signed) == expected

    def test_text_keeps_real_citations(self):
        # A non-empty citations list is kept, unlike citations=None.
        real = [{"type": "char_location", "cited_text": "q"}]
        cleaned = _sanitize_replay_block(
            {"type": "text", "text": "t", "citations": real}
        )
        assert cleaned["citations"] == real

    def test_unknown_type_dropped(self):
        unknown = {"type": "server_tool_use", "foo": 1}
        assert _sanitize_replay_block(unknown) is None
class TestContentPartConversion:
    """Content-list replay path: _convert_content_part_to_anthropic."""

    def test_stored_text_block_with_parsed_output_cleaned(self):
        # The exact content.N.text.parsed_output failure shape.
        stored_part = dict(
            type="text", text="hello", parsed_output=None, citations=None
        )
        converted = _convert_content_part_to_anthropic(stored_part)
        _assert_clean(converted)
class TestAssistantReplay:
    """Ordered-blocks replay path: _convert_assistant_message."""

    def test_interleaved_blocks_replayed_clean_and_ordered(self):
        thinking = {"type": "thinking", "thinking": "plan", "signature": "s1"}
        text = {
            "type": "text",
            "text": "doing it",
            "parsed_output": None,
            "citations": None,
        }
        tool_use = {
            "type": "tool_use",
            "id": "toolu_1",
            "name": "read_file",
            "input": {"path": "a"},
            "caller": {"type": "agent"},
        }
        message = {
            "role": "assistant",
            "anthropic_content_blocks": [thinking, text, tool_use],
        }

        replayed = _convert_assistant_message(message)["content"]

        # order preserved
        kinds = [blk["type"] for blk in replayed]
        assert kinds == ["thinking", "text", "tool_use"]
        # every block clean
        for blk in replayed:
            _assert_clean(blk)
        # signature + tool fields intact
        assert replayed[0]["signature"] == "s1"
        assert replayed[2]["name"] == "read_file"
if __name__ == "__main__":
    # Direct execution: run this file's tests verbosely and exit with
    # pytest's return code.
    exit_code = pytest.main([__file__, "-v"])
    raise SystemExit(exit_code)