fix(agent): complete Claude Opus 4.7 API migration

Claude Opus 4.7 introduced several breaking API changes that the current
codebase partially handled but not completely. This patch finishes the
migration per the official migration guide at
https://platform.claude.com/docs/en/about-claude/models/migration-guide

Fixes NousResearch/hermes-agent#11137

Breaking-change coverage:

1. Adaptive thinking + output_config.effort — 4.7 is now recognized by
   _supports_adaptive_thinking() (extends previous 4.6-only gate).

2. Sampling parameter stripping — 4.7 returns 400 for any non-default
   temperature / top_p / top_k. build_anthropic_kwargs drops them as a
   safety net; the OpenAI-protocol auxiliary path (_build_call_kwargs)
   and _AnthropicCompletionsAdapter.create() both skip setting
   temperature for 4.7+ models. This keeps flush_memories and the
   structured-JSON aux paths, which hardcode temperature, from
   returning 400 when the aux model is flipped to 4.7.

3. thinking.display = "summarized" — 4.7 defaults display to "omitted",
   which silently hides reasoning text from Hermes's CLI activity feed
   during long tool runs. Restoring "summarized" preserves 4.6 UX.

4. Effort level mapping — xhigh now maps to xhigh (was xhigh→max, which
   silently over-efforted every coding/agentic request). max is now a
   distinct ceiling per Anthropic's 5-level effort model.

5. New stop_reason values — refusal and model_context_window_exceeded
   were silently collapsed to "stop" (end_turn) by the adapter's
   stop_reason_map. Now mapped to "content_filter" and "length"
   respectively, matching upstream finish-reason handling already in
   bedrock_adapter.

6. Model catalogs — claude-opus-4-7 added to the Anthropic provider
   list; anthropic/claude-opus-4.7 added at the top of the OpenRouter
   fallback catalog, taking over the "recommended" tag from 4.6;
   claude-opus-4-7 and claude-opus-4.7 added to model_metadata
   DEFAULT_CONTEXT_LENGTHS (1M, matching 4.6 per migration guide).

7. Prefill docstrings — run_agent.AIAgent and BatchRunner now document
   that Anthropic Sonnet/Opus 4.6+ reject a trailing assistant-role
   prefill (400).

8. Tests — new and updated tests in test_anthropic_adapter covering the
   display default, xhigh preservation, max on 4.7, refusal /
   context-overflow stop_reason mapping, and the sampling-param
   predicate. test_model_metadata now expects 1M context for 4.7 keys.

Tested on macOS 15.5 (darwin). 119 tests pass in
tests/agent/test_anthropic_adapter.py, 1320 pass in tests/agent/.
This commit is contained in:
trevthefoolish 2026-04-16 12:35:43 -05:00 committed by kshitij
parent 1ccd063786
commit 0517ac3e93
8 changed files with 155 additions and 19 deletions

View file

@ -28,19 +28,37 @@ except ImportError:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
THINKING_BUDGET = {"xhigh": 32000, "high": 16000, "medium": 8000, "low": 4000} THINKING_BUDGET = {"xhigh": 32000, "high": 16000, "medium": 8000, "low": 4000}
# Hermes effort → Anthropic adaptive-thinking effort (output_config.effort).
# Anthropic exposes 5 levels on 4.7+: low, medium, high, xhigh, max.
# We preserve xhigh as xhigh (the recommended default for coding/agentic on
# 4.7) and expose max as a distinct ceiling. "minimal" is a legacy alias that
# maps to low. See:
# https://platform.claude.com/docs/en/about-claude/models/migration-guide
ADAPTIVE_EFFORT_MAP = { ADAPTIVE_EFFORT_MAP = {
"xhigh": "max", "max": "max",
"high": "high", "xhigh": "xhigh",
"medium": "medium", "high": "high",
"low": "low", "medium": "medium",
"low": "low",
"minimal": "low", "minimal": "low",
} }
# Models that use adaptive thinking. On 4.6 manual extended thinking is
# deprecated in favor of adaptive; on 4.7 manual thinking is removed
# entirely and temperature/top_p/top_k are dropped as well.
_ADAPTIVE_THINKING_SUBSTRINGS = ("4-6", "4.6", "4-7", "4.7")
# Models where temperature/top_p/top_k return 400 if set to non-default values.
# This is the Opus 4.7 contract; future 4.x+ models are expected to follow it.
_NO_SAMPLING_PARAMS_SUBSTRINGS = ("4-7", "4.7")
# ── Max output token limits per Anthropic model ─────────────────────── # ── Max output token limits per Anthropic model ───────────────────────
# Source: Anthropic docs + Cline model catalog. Anthropic's API requires # Source: Anthropic docs + Cline model catalog. Anthropic's API requires
# max_tokens as a mandatory field. Previously we hardcoded 16384, which # max_tokens as a mandatory field. Previously we hardcoded 16384, which
# starves thinking-enabled models (thinking tokens count toward the limit). # starves thinking-enabled models (thinking tokens count toward the limit).
_ANTHROPIC_OUTPUT_LIMITS = { _ANTHROPIC_OUTPUT_LIMITS = {
# Claude 4.7
"claude-opus-4-7": 128_000,
# Claude 4.6 # Claude 4.6
"claude-opus-4-6": 128_000, "claude-opus-4-6": 128_000,
"claude-sonnet-4-6": 64_000, "claude-sonnet-4-6": 64_000,
@ -91,11 +109,26 @@ def _get_anthropic_max_output(model: str) -> int:
def _supports_adaptive_thinking(model: str) -> bool: def _supports_adaptive_thinking(model: str) -> bool:
"""Return True for Claude 4.6 models that support adaptive thinking.""" """Return True for Claude 4.6+ models that support adaptive thinking."""
return any(v in model for v in ("4-6", "4.6")) return any(v in model for v in _ADAPTIVE_THINKING_SUBSTRINGS)
# Beta headers for enhanced features (sent with ALL auth types) def _forbids_sampling_params(model: str) -> bool:
"""Return True for models that 400 on any non-default temperature/top_p/top_k.
Opus 4.7 explicitly rejects sampling parameters; later Claude releases are
expected to follow suit. Callers should omit these fields entirely rather
than relying on a passed value happening to match the API default.
"""
return any(v in model for v in _NO_SAMPLING_PARAMS_SUBSTRINGS)
# Beta headers for enhanced features (sent with ALL auth types).
# As of Opus 4.7 (2026-04-16), both of these are GA on Claude 4.6+ — the
# beta headers are still accepted (harmless no-op) but not required. Kept
# here so older Claude (4.5, 4.1) + third-party Anthropic-compat endpoints
# that still gate on the headers continue to get the enhanced features.
# Migration guide: remove these if you no longer support ≤4.5 models.
_COMMON_BETAS = [ _COMMON_BETAS = [
"interleaved-thinking-2025-05-14", "interleaved-thinking-2025-05-14",
"fine-grained-tool-streaming-2025-05-14", "fine-grained-tool-streaming-2025-05-14",
@ -1341,18 +1374,26 @@ def build_anthropic_kwargs(
kwargs["tool_choice"] = {"type": "tool", "name": tool_choice} kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}
# Map reasoning_config to Anthropic's thinking parameter. # Map reasoning_config to Anthropic's thinking parameter.
# Claude 4.6 models use adaptive thinking + output_config.effort. # Claude 4.6+ models use adaptive thinking + output_config.effort.
# Older models use manual thinking with budget_tokens. # Older models use manual thinking with budget_tokens.
# MiniMax Anthropic-compat endpoints support thinking (manual mode only, # MiniMax Anthropic-compat endpoints support thinking (manual mode only,
# not adaptive). Haiku does NOT support extended thinking — skip entirely. # not adaptive). Haiku does NOT support extended thinking — skip entirely.
#
# On 4.7+ the `thinking.display` field defaults to "omitted", which
# silently hides reasoning text that Hermes surfaces in its CLI. We
# request "summarized" so the reasoning blocks stay populated — matching
# 4.6 behavior and preserving the activity-feed UX during long tool runs.
if reasoning_config and isinstance(reasoning_config, dict): if reasoning_config and isinstance(reasoning_config, dict):
if reasoning_config.get("enabled") is not False and "haiku" not in model.lower(): if reasoning_config.get("enabled") is not False and "haiku" not in model.lower():
effort = str(reasoning_config.get("effort", "medium")).lower() effort = str(reasoning_config.get("effort", "medium")).lower()
budget = THINKING_BUDGET.get(effort, 8000) budget = THINKING_BUDGET.get(effort, 8000)
if _supports_adaptive_thinking(model): if _supports_adaptive_thinking(model):
kwargs["thinking"] = {"type": "adaptive"} kwargs["thinking"] = {
"type": "adaptive",
"display": "summarized",
}
kwargs["output_config"] = { kwargs["output_config"] = {
"effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium") "effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium"),
} }
else: else:
kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget} kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}
@ -1360,6 +1401,15 @@ def build_anthropic_kwargs(
kwargs["temperature"] = 1 kwargs["temperature"] = 1
kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096) kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)
# ── Strip sampling params on 4.7+ ─────────────────────────────────
# Opus 4.7 rejects any non-default temperature/top_p/top_k with a 400.
# Callers (auxiliary_client, flush_memories, etc.) may set these for
# older models; drop them here as a safety net so upstream 4.6 → 4.7
# migrations don't require coordinated edits everywhere.
if _forbids_sampling_params(model):
for _sampling_key in ("temperature", "top_p", "top_k"):
kwargs.pop(_sampling_key, None)
# ── Fast mode (Opus 4.6 only) ──────────────────────────────────── # ── Fast mode (Opus 4.6 only) ────────────────────────────────────
# Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x # Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
# output speed. Only for native Anthropic endpoints — third-party # output speed. Only for native Anthropic endpoints — third-party
@ -1417,12 +1467,20 @@ def normalize_anthropic_response(
) )
) )
# Map Anthropic stop_reason to OpenAI finish_reason # Map Anthropic stop_reason to OpenAI finish_reason.
# Newer stop reasons added in Claude 4.5+ / 4.7:
# - refusal: the model declined to answer (cyber safeguards, CSAM, etc.)
# - model_context_window_exceeded: hit context limit (not max_tokens)
# Both need distinct handling upstream — a refusal should surface to the
# user with a clear message, and a context-window overflow should trigger
# compression/truncation rather than be treated as normal end-of-turn.
stop_reason_map = { stop_reason_map = {
"end_turn": "stop", "end_turn": "stop",
"tool_use": "tool_calls", "tool_use": "tool_calls",
"max_tokens": "length", "max_tokens": "length",
"stop_sequence": "stop", "stop_sequence": "stop",
"refusal": "content_filter",
"model_context_window_exceeded": "length",
} }
finish_reason = stop_reason_map.get(response.stop_reason, "stop") finish_reason = stop_reason_map.get(response.stop_reason, "stop")

View file

@ -518,8 +518,13 @@ class _AnthropicCompletionsAdapter:
tool_choice=normalized_tool_choice, tool_choice=normalized_tool_choice,
is_oauth=self._is_oauth, is_oauth=self._is_oauth,
) )
# Opus 4.7+ rejects any non-default temperature/top_p/top_k; only set
# temperature for models that still accept it. build_anthropic_kwargs
# additionally strips these keys as a safety net — keep both layers.
if temperature is not None: if temperature is not None:
anthropic_kwargs["temperature"] = temperature from agent.anthropic_adapter import _forbids_sampling_params
if not _forbids_sampling_params(model):
anthropic_kwargs["temperature"] = temperature
response = self._client.messages.create(**anthropic_kwargs) response = self._client.messages.create(**anthropic_kwargs)
assistant_message, finish_reason = normalize_anthropic_response(response) assistant_message, finish_reason = normalize_anthropic_response(response)
@ -2288,6 +2293,15 @@ def _build_call_kwargs(
"timeout": timeout, "timeout": timeout,
} }
# Opus 4.7+ rejects any non-default temperature/top_p/top_k — silently
# drop here so auxiliary callers that hardcode temperature (e.g. 0.3 on
# flush_memories, 0 on structured-JSON extraction) don't 400 the moment
# the aux model is flipped to 4.7.
if temperature is not None:
from agent.anthropic_adapter import _forbids_sampling_params
if _forbids_sampling_params(model):
temperature = None
if temperature is not None: if temperature is not None:
kwargs["temperature"] = temperature kwargs["temperature"] = temperature

View file

@ -102,6 +102,8 @@ DEFAULT_CONTEXT_LENGTHS = {
# fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a # fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
# substring of "anthropic/claude-sonnet-4.6"). # substring of "anthropic/claude-sonnet-4.6").
# OpenRouter-prefixed models resolve via OpenRouter live API or models.dev. # OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
"claude-opus-4-7": 1000000,
"claude-opus-4.7": 1000000,
"claude-opus-4-6": 1000000, "claude-opus-4-6": 1000000,
"claude-sonnet-4-6": 1000000, "claude-sonnet-4-6": 1000000,
"claude-opus-4.6": 1000000, "claude-opus-4.6": 1000000,

View file

@ -561,7 +561,10 @@ class BatchRunner:
provider_sort (str): Sort providers by price/throughput/latency (optional) provider_sort (str): Sort providers by price/throughput/latency (optional)
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking) reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming) prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming).
NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a trailing assistant-role prefill
(400 error). For those models use output_config.format or structured-output
schemas instead. Safe here for user-role priming and for older Claude / non-Claude models.
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set) max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
""" """
self.dataset_file = Path(dataset_file) self.dataset_file = Path(dataset_file)

View file

@ -26,7 +26,8 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
# Fallback OpenRouter snapshot used when the live catalog is unavailable. # Fallback OpenRouter snapshot used when the live catalog is unavailable.
# (model_id, display description shown in menus) # (model_id, display description shown in menus)
OPENROUTER_MODELS: list[tuple[str, str]] = [ OPENROUTER_MODELS: list[tuple[str, str]] = [
("anthropic/claude-opus-4.6", "recommended"), ("anthropic/claude-opus-4.7", "recommended"),
("anthropic/claude-opus-4.6", ""),
("anthropic/claude-sonnet-4.6", ""), ("anthropic/claude-sonnet-4.6", ""),
("qwen/qwen3.6-plus", ""), ("qwen/qwen3.6-plus", ""),
("anthropic/claude-sonnet-4.5", ""), ("anthropic/claude-sonnet-4.5", ""),
@ -181,6 +182,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
"MiniMax-M2", "MiniMax-M2",
], ],
"anthropic": [ "anthropic": [
"claude-opus-4-7",
"claude-opus-4-6", "claude-opus-4-6",
"claude-sonnet-4-6", "claude-sonnet-4-6",
"claude-opus-4-5-20251101", "claude-opus-4-5-20251101",

View file

@ -641,6 +641,9 @@ class AIAgent:
prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context. prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
Useful for injecting a few-shot example or priming the model's response style. Useful for injecting a few-shot example or priming the model's response style.
Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}] Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a conversation that ends on an
assistant-role message (400 error). For those models use structured outputs or
output_config.format instead of a trailing-assistant prefill.
platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp"). platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp").
Used to inject platform-specific formatting hints into the system prompt. Used to inject platform-specific formatting hints into the system prompt.
skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules

View file

@ -951,13 +951,19 @@ class TestBuildAnthropicKwargs:
max_tokens=4096, max_tokens=4096,
reasoning_config={"enabled": True, "effort": "high"}, reasoning_config={"enabled": True, "effort": "high"},
) )
assert kwargs["thinking"] == {"type": "adaptive"} # Adaptive thinking + display="summarized" keeps reasoning text
# populated in the response stream (Opus 4.7 default is "omitted").
assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
assert kwargs["output_config"] == {"effort": "high"} assert kwargs["output_config"] == {"effort": "high"}
assert "budget_tokens" not in kwargs["thinking"] assert "budget_tokens" not in kwargs["thinking"]
assert "temperature" not in kwargs assert "temperature" not in kwargs
assert kwargs["max_tokens"] == 4096 assert kwargs["max_tokens"] == 4096
def test_reasoning_config_maps_xhigh_to_max_effort_for_4_6_models(self): def test_reasoning_config_maps_xhigh_to_xhigh_effort_for_4_6_models(self):
# Opus 4.7 added "xhigh" as a distinct effort level (the recommended
# default for coding/agentic work). Earlier mapping aliased xhigh→max,
# which silently over-efforted every request. 2026-04-16 migration
# guide: xhigh and max are distinct levels.
kwargs = build_anthropic_kwargs( kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-6", model="claude-sonnet-4-6",
messages=[{"role": "user", "content": "think harder"}], messages=[{"role": "user", "content": "think harder"}],
@ -965,9 +971,40 @@ class TestBuildAnthropicKwargs:
max_tokens=4096, max_tokens=4096,
reasoning_config={"enabled": True, "effort": "xhigh"}, reasoning_config={"enabled": True, "effort": "xhigh"},
) )
assert kwargs["thinking"] == {"type": "adaptive"} assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
assert kwargs["output_config"] == {"effort": "xhigh"}
def test_reasoning_config_maps_max_effort_for_4_7_models(self):
kwargs = build_anthropic_kwargs(
model="claude-opus-4-7",
messages=[{"role": "user", "content": "maximum reasoning please"}],
tools=None,
max_tokens=4096,
reasoning_config={"enabled": True, "effort": "max"},
)
assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
assert kwargs["output_config"] == {"effort": "max"} assert kwargs["output_config"] == {"effort": "max"}
def test_opus_4_7_strips_sampling_params(self):
# Opus 4.7 returns 400 on non-default temperature/top_p/top_k.
# build_anthropic_kwargs must strip them as a safety net even if an
# upstream caller injects them for older-model compatibility.
kwargs = build_anthropic_kwargs(
model="claude-opus-4-7",
messages=[{"role": "user", "content": "hi"}],
tools=None,
max_tokens=1024,
reasoning_config=None,
)
# NOTE: build_anthropic_kwargs does not currently accept sampling
# params through its signature, so they cannot be injected here to
# exercise the strip loop end-to-end. Instead we check the internal
# predicate that gates the stripping.
from agent.anthropic_adapter import _forbids_sampling_params
assert _forbids_sampling_params("claude-opus-4-7") is True
assert _forbids_sampling_params("claude-opus-4-6") is False
assert _forbids_sampling_params("claude-sonnet-4-5") is False
def test_reasoning_disabled(self): def test_reasoning_disabled(self):
kwargs = build_anthropic_kwargs( kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-20250514", model="claude-sonnet-4-20250514",
@ -1248,6 +1285,21 @@ class TestNormalizeResponse:
assert r2 == "tool_calls" assert r2 == "tool_calls"
assert r3 == "length" assert r3 == "length"
def test_stop_reason_refusal_and_context_exceeded(self):
# Claude 4.5+ introduced two new stop_reason values the Messages API
# returns. We map both to OpenAI-style finish_reasons upstream
# handlers already understand, instead of silently collapsing to
# "stop" (old behavior).
block = SimpleNamespace(type="text", text="")
_, refusal_reason = normalize_anthropic_response(
self._make_response([block], "refusal")
)
_, overflow_reason = normalize_anthropic_response(
self._make_response([block], "model_context_window_exceeded")
)
assert refusal_reason == "content_filter"
assert overflow_reason == "length"
def test_no_text_content(self): def test_no_text_content(self):
block = SimpleNamespace( block = SimpleNamespace(
type="tool_use", id="tc_1", name="search", input={"q": "hi"} type="tool_use", id="tc_1", name="search", input={"q": "hi"}

View file

@ -113,8 +113,10 @@ class TestDefaultContextLengths:
for key, value in DEFAULT_CONTEXT_LENGTHS.items(): for key, value in DEFAULT_CONTEXT_LENGTHS.items():
if "claude" not in key: if "claude" not in key:
continue continue
# Claude 4.6 models have 1M context # Claude 4.6+ models (4.6 and 4.7) have 1M context at standard
if "4.6" in key or "4-6" in key: # API pricing (no long-context premium). Older Claude 4.x and
# 3.x models cap at 200k.
if any(tag in key for tag in ("4.6", "4-6", "4.7", "4-7")):
assert value == 1000000, f"{key} should be 1000000" assert value == 1000000, f"{key} should be 1000000"
else: else:
assert value == 200000, f"{key} should be 200000" assert value == 200000, f"{key} should be 200000"