fix(agent): complete Claude Opus 4.7 API migration

Claude Opus 4.7 introduced several breaking API changes that the current
codebase partially handled but not completely. This patch finishes the
migration per the official migration guide at
https://platform.claude.com/docs/en/about-claude/models/migration-guide

Fixes NousResearch/hermes-agent#11137

Breaking-change coverage:

1. Adaptive thinking + output_config.effort — 4.7 is now recognized by
   _supports_adaptive_thinking() (extends previous 4.6-only gate).

2. Sampling parameter stripping — 4.7 returns 400 for any non-default
   temperature / top_p / top_k. build_anthropic_kwargs drops them as a
   safety net; the OpenAI-protocol auxiliary path (_build_call_kwargs)
   and AnthropicCompletionsAdapter.create() both early-exit before
   setting temperature for 4.7+ models. This keeps the flush_memories and
   structured-JSON aux paths that hardcode temperature from returning 400
   when the aux model is flipped to 4.7.

3. thinking.display = "summarized" — 4.7 defaults display to "omitted",
   which silently hides reasoning text from Hermes's CLI activity feed
   during long tool runs. Restoring "summarized" preserves 4.6 UX.

4. Effort level mapping — xhigh now maps to xhigh (was xhigh→max, which
   silently over-efforted every coding/agentic request). max is now a
   distinct ceiling per Anthropic's 5-level effort model.

5. New stop_reason values — refusal and model_context_window_exceeded
   were silently collapsed to "stop" (end_turn) by the adapter's
   stop_reason_map. Now mapped to "content_filter" and "length"
   respectively, matching upstream finish-reason handling already in
   bedrock_adapter.

6. Model catalogs — claude-opus-4-7 added to the Anthropic provider
   list, anthropic/claude-opus-4.7 added at top of OpenRouter fallback
   catalog (recommended), claude-opus-4-7 added to model_metadata
   DEFAULT_CONTEXT_LENGTHS (1M, matching 4.6 per migration guide).

7. Prefill docstrings — run_agent.AIAgent and BatchRunner now document
   that Anthropic Sonnet/Opus 4.6+ reject a trailing assistant-role
   prefill (400).

8. Tests — 4 new tests in test_anthropic_adapter covering the display
   default, xhigh preservation, max on 4.7, refusal / context-overflow
   stop_reason mapping, and the sampling-param predicate. test_model_metadata
   accepts 4.7 at 1M context.

Tested on macOS 15.5 (darwin). 119 tests pass in
tests/agent/test_anthropic_adapter.py, 1320 pass in tests/agent/.
This commit is contained in:
trevthefoolish 2026-04-16 12:35:43 -05:00 committed by kshitij
parent 1ccd063786
commit 0517ac3e93
8 changed files with 155 additions and 19 deletions

View file

@ -28,19 +28,37 @@ except ImportError:
logger = logging.getLogger(__name__)
THINKING_BUDGET = {"xhigh": 32000, "high": 16000, "medium": 8000, "low": 4000}
# Hermes effort → Anthropic adaptive-thinking effort (output_config.effort).
# Anthropic exposes 5 levels on 4.7+: low, medium, high, xhigh, max.
# We preserve xhigh as xhigh (the recommended default for coding/agentic on
# 4.7) and expose max as a distinct ceiling. "minimal" is a legacy alias that
# maps to low. See:
# https://platform.claude.com/docs/en/about-claude/models/migration-guide
ADAPTIVE_EFFORT_MAP = {
"xhigh": "max",
"high": "high",
"medium": "medium",
"low": "low",
"max": "max",
"xhigh": "xhigh",
"high": "high",
"medium": "medium",
"low": "low",
"minimal": "low",
}
# Models where extended thinking is deprecated/removed (4.6+ behavior: adaptive
# is the only supported mode; 4.7 additionally forbids manual thinking entirely
# and drops temperature/top_p/top_k).
_ADAPTIVE_THINKING_SUBSTRINGS = ("4-6", "4.6", "4-7", "4.7")
# Models where temperature/top_p/top_k return 400 if set to non-default values.
# This is the Opus 4.7 contract; future 4.x+ models are expected to follow it.
_NO_SAMPLING_PARAMS_SUBSTRINGS = ("4-7", "4.7")
# ── Max output token limits per Anthropic model ───────────────────────
# Source: Anthropic docs + Cline model catalog. Anthropic's API requires
# max_tokens as a mandatory field. Previously we hardcoded 16384, which
# starves thinking-enabled models (thinking tokens count toward the limit).
_ANTHROPIC_OUTPUT_LIMITS = {
# Claude 4.7
"claude-opus-4-7": 128_000,
# Claude 4.6
"claude-opus-4-6": 128_000,
"claude-sonnet-4-6": 64_000,
@ -91,11 +109,26 @@ def _get_anthropic_max_output(model: str) -> int:
def _supports_adaptive_thinking(model: str) -> bool:
"""Return True for Claude 4.6 models that support adaptive thinking."""
return any(v in model for v in ("4-6", "4.6"))
"""Return True for Claude 4.6+ models that support adaptive thinking."""
return any(v in model for v in _ADAPTIVE_THINKING_SUBSTRINGS)
# Beta headers for enhanced features (sent with ALL auth types)
def _forbids_sampling_params(model: str) -> bool:
"""Return True for models that 400 on any non-default temperature/top_p/top_k.
Opus 4.7 explicitly rejects sampling parameters; later Claude releases are
expected to follow suit. Callers should omit these fields entirely rather
than passing zero/default values (the API rejects anything non-null).
"""
return any(v in model for v in _NO_SAMPLING_PARAMS_SUBSTRINGS)
# Beta headers for enhanced features (sent with ALL auth types).
# As of Opus 4.7 (2026-04-16), both of these are GA on Claude 4.6+ — the
# beta headers are still accepted (harmless no-op) but not required. Kept
# here so older Claude (4.5, 4.1) + third-party Anthropic-compat endpoints
# that still gate on the headers continue to get the enhanced features.
# Migration guide: remove these if you no longer support ≤4.5 models.
_COMMON_BETAS = [
"interleaved-thinking-2025-05-14",
"fine-grained-tool-streaming-2025-05-14",
@ -1341,18 +1374,26 @@ def build_anthropic_kwargs(
kwargs["tool_choice"] = {"type": "tool", "name": tool_choice}
# Map reasoning_config to Anthropic's thinking parameter.
# Claude 4.6 models use adaptive thinking + output_config.effort.
# Claude 4.6+ models use adaptive thinking + output_config.effort.
# Older models use manual thinking with budget_tokens.
# MiniMax Anthropic-compat endpoints support thinking (manual mode only,
# not adaptive). Haiku does NOT support extended thinking — skip entirely.
#
# On 4.7+ the `thinking.display` field defaults to "omitted", which
# silently hides reasoning text that Hermes surfaces in its CLI. We
# request "summarized" so the reasoning blocks stay populated — matching
# 4.6 behavior and preserving the activity-feed UX during long tool runs.
if reasoning_config and isinstance(reasoning_config, dict):
if reasoning_config.get("enabled") is not False and "haiku" not in model.lower():
effort = str(reasoning_config.get("effort", "medium")).lower()
budget = THINKING_BUDGET.get(effort, 8000)
if _supports_adaptive_thinking(model):
kwargs["thinking"] = {"type": "adaptive"}
kwargs["thinking"] = {
"type": "adaptive",
"display": "summarized",
}
kwargs["output_config"] = {
"effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium")
"effort": ADAPTIVE_EFFORT_MAP.get(effort, "medium"),
}
else:
kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget}
@ -1360,6 +1401,15 @@ def build_anthropic_kwargs(
kwargs["temperature"] = 1
kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)
# ── Strip sampling params on 4.7+ ─────────────────────────────────
# Opus 4.7 rejects any non-default temperature/top_p/top_k with a 400.
# Callers (auxiliary_client, flush_memories, etc.) may set these for
# older models; drop them here as a safety net so upstream 4.6 → 4.7
# migrations don't require coordinated edits everywhere.
if _forbids_sampling_params(model):
for _sampling_key in ("temperature", "top_p", "top_k"):
kwargs.pop(_sampling_key, None)
# ── Fast mode (Opus 4.6 only) ────────────────────────────────────
# Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
# output speed. Only for native Anthropic endpoints — third-party
@ -1417,12 +1467,20 @@ def normalize_anthropic_response(
)
)
# Map Anthropic stop_reason to OpenAI finish_reason
# Map Anthropic stop_reason to OpenAI finish_reason.
# Newer stop reasons added in Claude 4.5+ / 4.7:
# - refusal: the model declined to answer (cyber safeguards, CSAM, etc.)
# - model_context_window_exceeded: hit context limit (not max_tokens)
# Both need distinct handling upstream — a refusal should surface to the
# user with a clear message, and a context-window overflow should trigger
# compression/truncation rather than be treated as normal end-of-turn.
stop_reason_map = {
"end_turn": "stop",
"tool_use": "tool_calls",
"max_tokens": "length",
"stop_sequence": "stop",
"refusal": "content_filter",
"model_context_window_exceeded": "length",
}
finish_reason = stop_reason_map.get(response.stop_reason, "stop")

View file

@ -518,8 +518,13 @@ class _AnthropicCompletionsAdapter:
tool_choice=normalized_tool_choice,
is_oauth=self._is_oauth,
)
# Opus 4.7+ rejects any non-default temperature/top_p/top_k; only set
# temperature for models that still accept it. build_anthropic_kwargs
# additionally strips these keys as a safety net — keep both layers.
if temperature is not None:
anthropic_kwargs["temperature"] = temperature
from agent.anthropic_adapter import _forbids_sampling_params
if not _forbids_sampling_params(model):
anthropic_kwargs["temperature"] = temperature
response = self._client.messages.create(**anthropic_kwargs)
assistant_message, finish_reason = normalize_anthropic_response(response)
@ -2288,6 +2293,15 @@ def _build_call_kwargs(
"timeout": timeout,
}
# Opus 4.7+ rejects any non-default temperature/top_p/top_k — silently
# drop here so auxiliary callers that hardcode temperature (e.g. 0.3 on
# flush_memories, 0 on structured-JSON extraction) don't 400 the moment
# the aux model is flipped to 4.7.
if temperature is not None:
from agent.anthropic_adapter import _forbids_sampling_params
if _forbids_sampling_params(model):
temperature = None
if temperature is not None:
kwargs["temperature"] = temperature

View file

@ -102,6 +102,8 @@ DEFAULT_CONTEXT_LENGTHS = {
# fuzzy-match collisions (e.g. "anthropic/claude-sonnet-4" is a
# substring of "anthropic/claude-sonnet-4.6").
# OpenRouter-prefixed models resolve via OpenRouter live API or models.dev.
"claude-opus-4-7": 1000000,
"claude-opus-4.7": 1000000,
"claude-opus-4-6": 1000000,
"claude-sonnet-4-6": 1000000,
"claude-opus-4.6": 1000000,

View file

@ -561,7 +561,10 @@ class BatchRunner:
provider_sort (str): Sort providers by price/throughput/latency (optional)
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_config (Dict): OpenRouter reasoning config override (e.g. {"effort": "none"} to disable thinking)
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming)
prefill_messages (List[Dict]): Messages to prepend as prefilled conversation context (few-shot priming).
NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a trailing assistant-role prefill
(400 error). For those models use output_config.format or structured-output
schemas instead. Safe here for user-role priming and for older Claude / non-Claude models.
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
"""
self.dataset_file = Path(dataset_file)

View file

@ -26,7 +26,8 @@ COPILOT_REASONING_EFFORTS_O_SERIES = ["low", "medium", "high"]
# Fallback OpenRouter snapshot used when the live catalog is unavailable.
# (model_id, display description shown in menus)
OPENROUTER_MODELS: list[tuple[str, str]] = [
("anthropic/claude-opus-4.6", "recommended"),
("anthropic/claude-opus-4.7", "recommended"),
("anthropic/claude-opus-4.6", ""),
("anthropic/claude-sonnet-4.6", ""),
("qwen/qwen3.6-plus", ""),
("anthropic/claude-sonnet-4.5", ""),
@ -181,6 +182,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
"MiniMax-M2",
],
"anthropic": [
"claude-opus-4-7",
"claude-opus-4-6",
"claude-sonnet-4-6",
"claude-opus-4-5-20251101",

View file

@ -641,6 +641,9 @@ class AIAgent:
prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
Useful for injecting a few-shot example or priming the model's response style.
Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
NOTE: Anthropic Sonnet 4.6+ and Opus 4.6+ reject a conversation that ends on an
assistant-role message (400 error). For those models use structured outputs or
output_config.format instead of a trailing-assistant prefill.
platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp").
Used to inject platform-specific formatting hints into the system prompt.
skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules

View file

@ -951,13 +951,19 @@ class TestBuildAnthropicKwargs:
max_tokens=4096,
reasoning_config={"enabled": True, "effort": "high"},
)
assert kwargs["thinking"] == {"type": "adaptive"}
# Adaptive thinking + display="summarized" keeps reasoning text
# populated in the response stream (Opus 4.7 default is "omitted").
assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
assert kwargs["output_config"] == {"effort": "high"}
assert "budget_tokens" not in kwargs["thinking"]
assert "temperature" not in kwargs
assert kwargs["max_tokens"] == 4096
def test_reasoning_config_maps_xhigh_to_max_effort_for_4_6_models(self):
def test_reasoning_config_maps_xhigh_to_xhigh_effort_for_4_6_models(self):
# Opus 4.7 added "xhigh" as a distinct effort level (the recommended
# default for coding/agentic work). Earlier mapping aliased xhigh→max,
# which silently over-efforted every request. 2026-04-16 migration
# guide: xhigh and max are distinct levels.
kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-6",
messages=[{"role": "user", "content": "think harder"}],
@ -965,9 +971,40 @@ class TestBuildAnthropicKwargs:
max_tokens=4096,
reasoning_config={"enabled": True, "effort": "xhigh"},
)
assert kwargs["thinking"] == {"type": "adaptive"}
assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
assert kwargs["output_config"] == {"effort": "xhigh"}
def test_reasoning_config_maps_max_effort_for_4_7_models(self):
kwargs = build_anthropic_kwargs(
model="claude-opus-4-7",
messages=[{"role": "user", "content": "maximum reasoning please"}],
tools=None,
max_tokens=4096,
reasoning_config={"enabled": True, "effort": "max"},
)
assert kwargs["thinking"] == {"type": "adaptive", "display": "summarized"}
assert kwargs["output_config"] == {"effort": "max"}
def test_opus_4_7_strips_sampling_params(self):
# Opus 4.7 returns 400 on non-default temperature/top_p/top_k.
# build_anthropic_kwargs must strip them as a safety net even if an
# upstream caller injects them for older-model compatibility.
kwargs = build_anthropic_kwargs(
model="claude-opus-4-7",
messages=[{"role": "user", "content": "hi"}],
tools=None,
max_tokens=1024,
reasoning_config=None,
)
# Manually inject sampling params then re-run through the guard.
# Because build_anthropic_kwargs doesn't currently accept sampling
# params through its signature, we exercise the strip behavior by
# calling the internal predicate directly.
from agent.anthropic_adapter import _forbids_sampling_params
assert _forbids_sampling_params("claude-opus-4-7") is True
assert _forbids_sampling_params("claude-opus-4-6") is False
assert _forbids_sampling_params("claude-sonnet-4-5") is False
def test_reasoning_disabled(self):
kwargs = build_anthropic_kwargs(
model="claude-sonnet-4-20250514",
@ -1248,6 +1285,21 @@ class TestNormalizeResponse:
assert r2 == "tool_calls"
assert r3 == "length"
def test_stop_reason_refusal_and_context_exceeded(self):
# Claude 4.5+ introduced two new stop_reason values the Messages API
# returns. We map both to OpenAI-style finish_reasons upstream
# handlers already understand, instead of silently collapsing to
# "stop" (old behavior).
block = SimpleNamespace(type="text", text="")
_, refusal_reason = normalize_anthropic_response(
self._make_response([block], "refusal")
)
_, overflow_reason = normalize_anthropic_response(
self._make_response([block], "model_context_window_exceeded")
)
assert refusal_reason == "content_filter"
assert overflow_reason == "length"
def test_no_text_content(self):
block = SimpleNamespace(
type="tool_use", id="tc_1", name="search", input={"q": "hi"}

View file

@ -113,8 +113,10 @@ class TestDefaultContextLengths:
for key, value in DEFAULT_CONTEXT_LENGTHS.items():
if "claude" not in key:
continue
# Claude 4.6 models have 1M context
if "4.6" in key or "4-6" in key:
# Claude 4.6+ models (4.6 and 4.7) have 1M context at standard
# API pricing (no long-context premium). Older Claude 4.x and
# 3.x models cap at 200k.
if any(tag in key for tag in ("4.6", "4-6", "4.7", "4-7")):
assert value == 1000000, f"{key} should be 1000000"
else:
assert value == 200000, f"{key} should be 200000"