mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
fix(anthropic): strip output-only SDK fields from replayed content blocks
HTTP 400 "messages.N.content.M.text.parsed_output: Extra inputs are not permitted" on the native Anthropic transport.

Anthropic SDK 0.87.0 response blocks carry output-only attributes the Messages *input* schema forbids: text blocks get `parsed_output` and `citations=None`, tool_use blocks get `caller`. normalize_response captured blocks verbatim via _to_plain_data and replayed them as request input on the next turn, so the forbidden fields leaked back -> 400. Like the earlier thinking-block bug, one poisoned turn wedges every subsequent request in the session (even the diagnostic turn), recoverable only by switching models or deleting the session.

This is a defect in the anthropic_content_blocks channel added for the interleaved-thinking fix: it preserved block ORDER correctly but copied every SDK attribute, including output-only ones.

Fix — whitelist input-permitted fields per block type at all three leak points:
- agent/transports/anthropic.py normalize_response: sanitize at CAPTURE so the poison never persists to state.db (defence-in-depth).
- agent/anthropic_adapter.py _sanitize_replay_block (new): whitelist used on the ordered-blocks replay path; also recovers already-poisoned stored sessions.
- agent/anthropic_adapter.py _convert_content_part_to_anthropic: a stored `text` part is rebuilt from whitelisted fields instead of dict(part) verbatim (this was the exact content.N.text.parsed_output failure locus).

Whitelist not blacklist, so future SDK output-only fields can't reintroduce it. Block order and thinking-block signatures are preserved (the reason the channel exists).

Adds tests/agent/test_anthropic_output_field_leak.py; full adapter suite green (163 tests). Existing poisoned state.db rows scrubbed out-of-band.
This commit is contained in:
parent
aaccaada28
commit
529bb1c3d5
3 changed files with 182 additions and 17 deletions
|
|
@ -1571,6 +1571,15 @@ def _convert_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:
|
|||
|
||||
if ptype == "input_text":
|
||||
block: Dict[str, Any] = {"type": "text", "text": part.get("text", "")}
|
||||
elif ptype == "text":
|
||||
# A stored Anthropic text block. Rebuild from whitelisted fields only —
|
||||
# SDK response text blocks carry output-only siblings (parsed_output,
|
||||
# citations=None) that the Messages INPUT schema rejects with HTTP 400
|
||||
# "Extra inputs are not permitted". Do NOT dict(part) it verbatim.
|
||||
block = {"type": "text", "text": part.get("text", "")}
|
||||
cits = part.get("citations")
|
||||
if isinstance(cits, list) and cits:
|
||||
block["citations"] = cits
|
||||
elif ptype in {"image_url", "input_image"}:
|
||||
image_value = part.get("image_url", {})
|
||||
url = image_value.get("url", "") if isinstance(image_value, dict) else str(image_value or "")
|
||||
|
|
@ -1685,6 +1694,58 @@ def _content_parts_to_anthropic_blocks(parts: Any) -> List[Dict[str, Any]]:
|
|||
return out
|
||||
|
||||
|
||||
def _sanitize_replay_block(b: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
"""Strip output-only fields from a stored Anthropic content block so it is
|
||||
valid as REQUEST input on replay.
|
||||
|
||||
The SDK response objects carry output-only attributes that the Messages
|
||||
*input* schema forbids ("Extra inputs are not permitted"): text blocks get
|
||||
``parsed_output``/``citations`` (when null), tool_use blocks get ``caller``,
|
||||
etc. ``normalize_response`` captured blocks verbatim via ``_to_plain_data``,
|
||||
so these leak back as input on the next turn → HTTP 400.
|
||||
|
||||
Whitelist per type (NOT a blacklist) so future SDK output-only fields can't
|
||||
reintroduce the bug. Returns a clean block, or None to drop it.
|
||||
"""
|
||||
if not isinstance(b, dict):
|
||||
return None
|
||||
btype = b.get("type")
|
||||
if btype == "text":
|
||||
out: Dict[str, Any] = {"type": "text", "text": b.get("text", "")}
|
||||
# citations is input-valid ONLY when it's a non-empty list; the SDK
|
||||
# emits citations=None on responses, which the input schema rejects.
|
||||
cits = b.get("citations")
|
||||
if isinstance(cits, list) and cits:
|
||||
out["citations"] = cits
|
||||
if isinstance(b.get("cache_control"), dict):
|
||||
out["cache_control"] = b["cache_control"]
|
||||
return out
|
||||
if btype == "thinking":
|
||||
out = {"type": "thinking", "thinking": b.get("thinking", "")}
|
||||
if b.get("signature"):
|
||||
out["signature"] = b["signature"]
|
||||
return out
|
||||
if btype == "redacted_thinking":
|
||||
# Only valid with its data payload; drop if missing.
|
||||
return {"type": "redacted_thinking", "data": b["data"]} if b.get("data") else None
|
||||
if btype == "tool_use":
|
||||
out = {
|
||||
"type": "tool_use",
|
||||
"id": _sanitize_tool_id(b.get("id", "")),
|
||||
"name": b.get("name", ""),
|
||||
"input": b.get("input", {}),
|
||||
}
|
||||
if isinstance(b.get("cache_control"), dict):
|
||||
out["cache_control"] = b["cache_control"]
|
||||
return out
|
||||
if btype == "image":
|
||||
src = b.get("source")
|
||||
return {"type": "image", "source": src} if isinstance(src, dict) else None
|
||||
# Unknown/unsupported block type on the input path — drop rather than risk
|
||||
# another "Extra inputs are not permitted".
|
||||
return None
|
||||
|
||||
|
||||
def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Convert an assistant message to Anthropic content blocks.
|
||||
|
||||
|
|
@ -1694,24 +1755,20 @@ def _convert_assistant_message(m: Dict[str, Any]) -> Dict[str, Any]:
|
|||
content = m.get("content", "")
|
||||
# Anthropic interleaved-thinking fast path: when this turn carries a
|
||||
# verbatim, order-preserving block list (set by normalize_response only
|
||||
# for turns that interleave SIGNED thinking with tool_use), replay it
|
||||
# unchanged. Reconstructing from the parallel reasoning_details +
|
||||
# tool_calls fields front-loads thinking and reorders signed blocks,
|
||||
# which Anthropic rejects with HTTP 400 ("thinking ... blocks in the
|
||||
# latest assistant message cannot be modified"). Block order — and thus
|
||||
# each thinking block's signature — must survive verbatim. tool_use IDs
|
||||
# are sanitized to match the tool_result IDs produced elsewhere; the
|
||||
# downstream mcp_ prefixing pass handles tool names on these blocks.
|
||||
# for turns that interleave SIGNED thinking with tool_use), replay it.
|
||||
# Each block is run through _sanitize_replay_block to strip output-only
|
||||
# SDK fields (parsed_output, caller, citations=None, …) that the Messages
|
||||
# INPUT schema forbids — replaying them verbatim caused HTTP 400 "Extra
|
||||
# inputs are not permitted" (text.parsed_output). Block ORDER is preserved
|
||||
# (the reason this channel exists); only forbidden sibling fields are
|
||||
# dropped, leaving thinking signatures and tool_use id/name/input intact.
|
||||
ordered_blocks = m.get("anthropic_content_blocks")
|
||||
if isinstance(ordered_blocks, list) and ordered_blocks:
|
||||
replayed: List[Dict[str, Any]] = []
|
||||
for b in ordered_blocks:
|
||||
if not isinstance(b, dict):
|
||||
continue
|
||||
blk = copy.deepcopy(b)
|
||||
if blk.get("type") == "tool_use" and "id" in blk:
|
||||
blk["id"] = _sanitize_tool_id(blk.get("id", ""))
|
||||
replayed.append(blk)
|
||||
clean = _sanitize_replay_block(b)
|
||||
if clean is not None:
|
||||
replayed.append(clean)
|
||||
if replayed:
|
||||
return {"role": "assistant", "content": replayed}
|
||||
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ class AnthropicTransport(ProviderTransport):
|
|||
to OpenAI finish_reason, and collects reasoning_details in provider_data.
|
||||
"""
|
||||
import json
|
||||
from agent.anthropic_adapter import _to_plain_data
|
||||
from agent.anthropic_adapter import _to_plain_data, _sanitize_replay_block
|
||||
from agent.transports.types import ToolCall
|
||||
|
||||
strip_tool_prefix = kwargs.get("strip_tool_prefix", False)
|
||||
|
|
@ -108,14 +108,26 @@ class AnthropicTransport(ProviderTransport):
|
|||
|
||||
for block in response.content:
|
||||
block_dict = _to_plain_data(block)
|
||||
clean_block = None
|
||||
if isinstance(block_dict, dict):
|
||||
ordered_blocks.append(block_dict)
|
||||
# Sanitize at capture so output-only SDK fields (parsed_output,
|
||||
# caller, citations=None, …) never persist to state.db and leak
|
||||
# back as request input on replay → HTTP 400 "Extra inputs are
|
||||
# not permitted". Defence-in-depth with the replay-side sanitize.
|
||||
clean_block = _sanitize_replay_block(block_dict)
|
||||
if clean_block is not None:
|
||||
ordered_blocks.append(clean_block)
|
||||
if block.type == "text":
|
||||
text_parts.append(block.text)
|
||||
elif block.type in ("thinking", "redacted_thinking"):
|
||||
if block.type == "thinking":
|
||||
reasoning_parts.append(block.thinking)
|
||||
if isinstance(block_dict, dict):
|
||||
# Use the sanitized block (clean_block) for reasoning_details too,
|
||||
# since _extract_preserved_thinking_blocks replays these on the
|
||||
# non-ordered path. Falls back to raw only if sanitize dropped it.
|
||||
if isinstance(clean_block, dict):
|
||||
reasoning_details.append(clean_block)
|
||||
elif isinstance(block_dict, dict):
|
||||
reasoning_details.append(block_dict)
|
||||
elif block.type == "tool_use":
|
||||
name = block.name
|
||||
|
|
|
|||
96
tests/agent/test_anthropic_output_field_leak.py
Normal file
96
tests/agent/test_anthropic_output_field_leak.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
"""Regression: output-only SDK fields must not leak into Anthropic request input.
|
||||
|
||||
Reproduces HTTP 400 `messages.N.content.M.text.parsed_output: Extra inputs are
|
||||
not permitted`. Anthropic SDK response blocks carry output-only attributes
|
||||
(text blocks: `parsed_output`, `citations=None`; tool_use blocks: `caller`)
|
||||
that the Messages *input* schema forbids. normalize_response captured blocks
|
||||
verbatim via _to_plain_data and replayed them as input → 400.
|
||||
|
||||
Fix: whitelist input-permitted fields per block type at three points —
|
||||
normalize_response capture, _sanitize_replay_block (ordered-blocks replay), and
|
||||
_convert_content_part_to_anthropic (content-list replay).
|
||||
"""
|
||||
import sys, os
|
||||
sys.path.insert(0, os.path.expanduser("~/.hermes/hermes-agent"))
|
||||
|
||||
import pytest
|
||||
from agent.anthropic_adapter import (
|
||||
_sanitize_replay_block,
|
||||
_convert_content_part_to_anthropic,
|
||||
_convert_assistant_message,
|
||||
)
|
||||
|
||||
FORBIDDEN = {"parsed_output", "caller"}
|
||||
|
||||
|
||||
def _assert_clean(block):
|
||||
"""No forbidden output-only key, and no null citations, anywhere."""
|
||||
assert isinstance(block, dict)
|
||||
for k in FORBIDDEN:
|
||||
assert k not in block, f"forbidden field {k!r} survived: {block}"
|
||||
if "citations" in block:
|
||||
assert isinstance(block["citations"], list) and block["citations"], \
|
||||
"citations must be a non-empty list if present (None/[] is input-invalid)"
|
||||
|
||||
|
||||
class TestSanitizeReplayBlock:
    """Unit checks for the per-type whitelist in _sanitize_replay_block."""

    def test_text_block_strips_parsed_output_and_null_citations(self):
        # Both output-only siblings are present but null on the stored block.
        stored = {
            "type": "text",
            "text": "hi",
            "parsed_output": None,
            "citations": None,
        }
        cleaned = _sanitize_replay_block(stored)
        _assert_clean(cleaned)
        assert cleaned == {"type": "text", "text": "hi"}

    def test_tool_use_strips_caller(self):
        stored = {
            "type": "tool_use",
            "id": "toolu_1",
            "name": "read_file",
            "input": {"path": "a"},
            "caller": {"type": "agent"},
        }
        cleaned = _sanitize_replay_block(stored)
        _assert_clean(cleaned)
        # The fields the tool call needs must survive the whitelist.
        assert cleaned["name"] == "read_file"
        assert cleaned["input"] == {"path": "a"}

    def test_thinking_preserves_signature(self):
        expected = {"type": "thinking", "thinking": "x", "signature": "sig-AAA"}
        cleaned = _sanitize_replay_block(dict(expected))
        assert cleaned == expected

    def test_text_keeps_real_citations(self):
        # A non-empty citations list must be carried through.
        citations = [{"type": "char_location", "cited_text": "q"}]
        stored = {"type": "text", "text": "t", "citations": citations}
        cleaned = _sanitize_replay_block(stored)
        assert cleaned["citations"] == citations

    def test_unknown_type_dropped(self):
        unknown = {"type": "server_tool_use", "foo": 1}
        assert _sanitize_replay_block(unknown) is None
|
||||
|
||||
|
||||
class TestContentPartConversion:
    """Checks the content-list replay path (_convert_content_part_to_anthropic)."""

    def test_stored_text_block_with_parsed_output_cleaned(self):
        # The exact content.N.text.parsed_output failure shape.
        stored_part = {
            "type": "text",
            "text": "hello",
            "parsed_output": None,
            "citations": None,
        }
        converted = _convert_content_part_to_anthropic(stored_part)
        _assert_clean(converted)
|
||||
|
||||
|
||||
class TestAssistantReplay:
    """Checks the ordered-blocks replay path of _convert_assistant_message."""

    def test_interleaved_blocks_replayed_clean_and_ordered(self):
        thinking_block = {"type": "thinking", "thinking": "plan", "signature": "s1"}
        text_block = {
            "type": "text",
            "text": "doing it",
            "parsed_output": None,
            "citations": None,
        }
        tool_block = {
            "type": "tool_use",
            "id": "toolu_1",
            "name": "read_file",
            "input": {"path": "a"},
            "caller": {"type": "agent"},
        }
        message = {
            "role": "assistant",
            "anthropic_content_blocks": [thinking_block, text_block, tool_block],
        }

        replayed = _convert_assistant_message(message)["content"]

        # order preserved
        kinds = [blk["type"] for blk in replayed]
        assert kinds == ["thinking", "text", "tool_use"]

        # every block clean
        for blk in replayed:
            _assert_clean(blk)

        # signature + tool fields intact
        assert replayed[0]["signature"] == "s1"
        assert replayed[2]["name"] == "read_file"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this file directly; pytest's return value becomes the
    # process exit status.
    exit_status = pytest.main([__file__, "-v"])
    raise SystemExit(exit_status)
|
||||
Loading…
Add table
Add a link
Reference in a new issue