From 34d06a980244fb5bd88345765708cd7e36e3f118 Mon Sep 17 00:00:00 2001 From: KUSH42 Date: Thu, 9 Apr 2026 16:54:23 +0200 Subject: [PATCH] fix(compaction): don't halve context_length on output-cap-too-large errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the API returns "max_tokens too large given prompt" (input tokens are within the context window, but input + requested output > window), the old code incorrectly routed through the same handler as "prompt too long" errors, calling get_next_probe_tier() and permanently halving context_length. This made things worse: the window was fine, only the requested output size needed trimming for that one call. Two distinct error classes now handled separately: Prompt too long — input itself exceeds context window. Fix: compress history + halve context_length (existing behaviour, unchanged). Output cap too large — input OK, but input + max_tokens > window. Fix: parse available_tokens from the error message, set a one-shot _ephemeral_max_output_tokens override for the retry, and leave context_length completely untouched. Changes: - agent/model_metadata.py: add parse_available_output_tokens_from_error() that detects Anthropic's "available_tokens: N" error format and returns the available output budget, or None for all other error types. - run_agent.py: call the new parser first in the is_context_length_error block; if it fires, set _ephemeral_max_output_tokens (with a 64-token safety margin) and break to retry without touching context_length. _build_api_kwargs consumes the ephemeral value exactly once then clears it so subsequent calls use self.max_tokens normally. - agent/anthropic_adapter.py: expand build_anthropic_kwargs docstring to clearly document the max_tokens (output cap) vs context_length (total window) distinction, which is a persistent source of confusion due to the OpenAI-inherited "max_tokens" name. - cli-config.yaml.example: add inline comments explaining both keys side by side where users are most likely to look. - website/docs/integrations/providers.md: add a callout box at the top of "Context Length Detection" and clarify the troubleshooting entry. - tests/test_ctx_halving_fix.py: 24 tests across four classes covering the parser, build_anthropic_kwargs clamping, ephemeral one-shot consumption, and the invariant that context_length is never mutated on output-cap errors. --- agent/anthropic_adapter.py | 33 ++- agent/model_metadata.py | 43 ++++ cli-config.yaml.example | 19 ++ run_agent.py | 56 ++++- tests/test_ctx_halving_fix.py | 319 +++++++++++++++++++++++++ website/docs/integrations/providers.md | 13 +- 6 files changed, 472 insertions(+), 11 deletions(-) create mode 100644 tests/test_ctx_halving_fix.py diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index fa5e391a4..d5c0c06fb 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -1238,10 +1238,27 @@ def build_anthropic_kwargs( ) -> Dict[str, Any]: """Build kwargs for anthropic.messages.create(). - When *max_tokens* is None, the model's native output limit is used - (e.g. 128K for Opus 4.6, 64K for Sonnet 4.6). If *context_length* - is provided, the effective limit is clamped so it doesn't exceed - the context window. + Naming note — two distinct concepts, easily confused: + max_tokens = OUTPUT token cap for a single response. + Anthropic's API calls this "max_tokens" but it only + limits the *output*. Anthropic's own native SDK + renamed it "max_output_tokens" for clarity. + context_length = TOTAL context window (input tokens + output tokens). + The API enforces: input_tokens + max_tokens ≤ context_length. + Stored on the ContextCompressor; reduced on overflow errors. + + When *max_tokens* is None the model's native output ceiling is used + (e.g. 128K for Opus 4.6, 64K for Sonnet 4.6). + + When *context_length* is provided and the model's native output ceiling + exceeds it (e.g. a local endpoint with an 8K window), the output cap is + clamped to context_length − 1. This only kicks in for unusually small + context windows; for full-size models the native output cap is always + smaller than the context window so no clamping happens. + NOTE: this clamping does not account for prompt size — if the prompt is + large, Anthropic may still reject the request. The caller must detect + "max_tokens too large given prompt" errors and retry with a smaller cap + (see parse_available_output_tokens_from_error + _ephemeral_max_output_tokens). When *is_oauth* is True, applies Claude Code compatibility transforms: system prompt prefix, tool name prefixing, and prompt sanitization. @@ -1256,10 +1273,14 @@ def build_anthropic_kwargs( anthropic_tools = convert_tools_to_anthropic(tools) if tools else [] model = normalize_model_name(model, preserve_dots=preserve_dots) + # effective_max_tokens = output cap for this call (≠ total context window) effective_max_tokens = max_tokens or _get_anthropic_max_output(model) - # Clamp to context window if the user set a lower context_length - # (e.g. custom endpoint with limited capacity). + # Clamp output cap to fit inside the total context window. + # Only matters for small custom endpoints where context_length < native + # output ceiling. For standard Anthropic models context_length (e.g. + # 200K) is always larger than the output ceiling (e.g. 128K), so this + # branch is not taken. if context_length and effective_max_tokens > context_length: effective_max_tokens = max(context_length - 1, 1) diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 9282586fe..791f778c2 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -603,6 +603,49 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]: return None +def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]: + """Detect an "output cap too large" error and return how many output tokens are available. + + Background — two distinct context errors exist: + 1. "Prompt too long" — the INPUT itself exceeds the context window. + Fix: compress history and/or halve context_length. + 2. "max_tokens too large" — input is fine, but input + requested_output > window. + Fix: reduce max_tokens (the output cap) for this call. + Do NOT touch context_length — the window hasn't shrunk. + + Anthropic's API returns errors like: + "max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000" + + Returns the number of output tokens that would fit (e.g. 10000 above), or None if + the error does not look like a max_tokens-too-large error. + """ + error_lower = error_msg.lower() + + # Must look like an output-cap error, not a prompt-length error. + is_output_cap_error = ( + "max_tokens" in error_lower + and ("available_tokens" in error_lower or "available tokens" in error_lower) + ) + if not is_output_cap_error: + return None + + # Extract the available_tokens figure. + # Anthropic format: "… = available_tokens: 10000" + patterns = [ + r'available_tokens[:\s]+(\d+)', + r'available\s+tokens[:\s]+(\d+)', + # fallback: last number after "=" in expressions like "200000 - 190000 = 10000" + r'=\s*(\d+)\s*$', + ] + for pattern in patterns: + match = re.search(pattern, error_lower) + if match: + tokens = int(match.group(1)) + if tokens >= 1: + return tokens + return None + + def _model_id_matches(candidate_id: str, lookup_model: str) -> bool: """Return True if *candidate_id* (from server) matches *lookup_model* (configured). diff --git a/cli-config.yaml.example b/cli-config.yaml.example index d75284443..346e6e851 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -48,6 +48,25 @@ model: # api_key: "your-key-here" # Uncomment to set here instead of .env base_url: "https://openrouter.ai/api/v1" + # ── Token limits — two settings, easy to confuse ────────────────────────── + # + # context_length: TOTAL context window (input + output tokens combined). + # Controls when Hermes compresses history and validates requests. + # Leave unset — Hermes auto-detects the correct value from the provider. + # Set manually only when auto-detection is wrong (e.g. a local server with + # a custom num_ctx, or a proxy that doesn't expose /v1/models). + # + # context_length: 131072 + # + # max_tokens: OUTPUT cap — maximum tokens the model may generate per response. + # Unrelated to how long your conversation history can be. + # The OpenAI-standard name "max_tokens" is a misnomer; Anthropic's native + # API has since renamed it "max_output_tokens" for clarity. + # Leave unset to use the model's native output ceiling (recommended). + # Set only if you want to deliberately limit individual response length. + # + # max_tokens: 8192 + # ============================================================================= # OpenRouter Provider Routing (only applies when using OpenRouter) # ============================================================================= diff --git a/run_agent.py b/run_agent.py index d1c8980d8..db3f4b310 100644 --- a/run_agent.py +++ b/run_agent.py @@ -87,6 +87,7 @@ from agent.model_metadata import ( fetch_model_metadata, estimate_tokens_rough, estimate_messages_tokens_rough, estimate_request_tokens_rough, get_next_probe_tier, parse_context_limit_from_error, + parse_available_output_tokens_from_error, save_context_length, is_local_endpoint, query_ollama_num_ctx, ) @@ -5397,15 +5398,22 @@ class AIAgent: if self.api_mode == "anthropic_messages": from agent.anthropic_adapter import build_anthropic_kwargs anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages) - # Pass context_length so the adapter can clamp max_tokens if the - # user configured a smaller context window than the model's output limit. + # Pass context_length (total input+output window) so the adapter can + # clamp max_tokens (output cap) when the user configured a smaller + # context window than the model's native output limit. ctx_len = getattr(self, "context_compressor", None) ctx_len = ctx_len.context_length if ctx_len else None + # _ephemeral_max_output_tokens is set for one call when the API + # returns "max_tokens too large given prompt" — it caps output to + # the available window space without touching context_length. + ephemeral_out = getattr(self, "_ephemeral_max_output_tokens", None) + if ephemeral_out is not None: + self._ephemeral_max_output_tokens = None # consume immediately return build_anthropic_kwargs( model=self.model, messages=anthropic_messages, tools=self.tools, - max_tokens=self.max_tokens, + max_tokens=ephemeral_out if ephemeral_out is not None else self.max_tokens, reasoning_config=self.reasoning_config, is_oauth=self._is_anthropic_oauth, preserve_dots=self._anthropic_preserve_dots(), @@ -8306,6 +8314,48 @@ class AIAgent: compressor = self.context_compressor old_ctx = compressor.context_length + # ── Distinguish two very different errors ─────────── + # 1. "Prompt too long": the INPUT exceeds the context window. + # Fix: reduce context_length + compress history. + # 2. "max_tokens too large": input is fine, but + # input_tokens + requested max_tokens > context_window. + # Fix: reduce max_tokens (the OUTPUT cap) for this call. + # Do NOT shrink context_length — the window is unchanged. + # + # Note: max_tokens = output token cap (one response). + # context_length = total window (input + output combined). + available_out = parse_available_output_tokens_from_error(error_msg) + if available_out is not None: + # Error is purely about the output cap being too large. + # Cap output to the available space and retry without + # touching context_length or triggering compression. + safe_out = max(1, available_out - 64) # small safety margin + self._ephemeral_max_output_tokens = safe_out + self._vprint( + f"{self.log_prefix}⚠️ Output cap too large for current prompt — " + f"retrying with max_tokens={safe_out:,} " + f"(available_tokens={available_out:,}; context_length unchanged at {old_ctx:,})", + force=True, + ) + # Still count against compression_attempts so we don't + # loop forever if the error keeps recurring. + compression_attempts += 1 + if compression_attempts > max_compression_attempts: + self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True) + self._vprint(f"{self.log_prefix} 💡 Try /new to start a fresh conversation, or /compress to retry compression.", force=True) + logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.") + self._persist_session(messages, conversation_history) + return { + "messages": messages, + "completed": False, + "api_calls": api_call_count, + "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.", + "partial": True + } + restart_with_compressed_messages = True + break + + # Error is about the INPUT being too large — reduce context_length. # Try to parse the actual limit from the error message parsed_limit = parse_context_limit_from_error(error_msg) if parsed_limit and parsed_limit < old_ctx: diff --git a/tests/test_ctx_halving_fix.py b/tests/test_ctx_halving_fix.py new file mode 100644 index 000000000..1ba423c8f --- /dev/null +++ b/tests/test_ctx_halving_fix.py @@ -0,0 +1,319 @@ +"""Tests for the context-halving bugfix. + +Background +---------- +When the API returns "max_tokens too large given prompt" (input is fine, +but input_tokens + requested max_tokens > context_window), the old code +incorrectly halved context_length via get_next_probe_tier(). + +The fix introduces: + * parse_available_output_tokens_from_error() — detects this specific + error class and returns the available output token budget. + * _ephemeral_max_output_tokens on AIAgent — a one-shot override that + caps the output for one retry without touching context_length. + +Naming note +----------- + max_tokens = OUTPUT token cap (a single response). + context_length = TOTAL context window (input + output combined). +These are different and the old code conflated them; the fix keeps them +separate. +""" + +import sys +import os +from unittest.mock import MagicMock, patch, PropertyMock + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import pytest + + +# --------------------------------------------------------------------------- +# parse_available_output_tokens_from_error — unit tests +# --------------------------------------------------------------------------- + +class TestParseAvailableOutputTokens: + """Pure-function tests; no I/O required.""" + + def _parse(self, msg): + from agent.model_metadata import parse_available_output_tokens_from_error + return parse_available_output_tokens_from_error(msg) + + # ── Should detect and extract ──────────────────────────────────────── + + def test_anthropic_canonical_format(self): + """Canonical Anthropic error: max_tokens: X > context_window: Y - input_tokens: Z = available_tokens: W""" + msg = ( + "max_tokens: 32768 > context_window: 200000 " + "- input_tokens: 190000 = available_tokens: 10000" + ) + assert self._parse(msg) == 10000 + + def test_anthropic_format_large_numbers(self): + msg = ( + "max_tokens: 128000 > context_window: 200000 " + "- input_tokens: 180000 = available_tokens: 20000" + ) + assert self._parse(msg) == 20000 + + def test_available_tokens_variant_spacing(self): + """Handles extra spaces around the colon.""" + msg = "max_tokens: 32768 > 200000 available_tokens : 5000" + assert self._parse(msg) == 5000 + + def test_available_tokens_natural_language(self): + """'available tokens: N' wording (no underscore).""" + msg = "max_tokens must be at most 10000 given your prompt (available tokens: 10000)" + assert self._parse(msg) == 10000 + + def test_single_token_available(self): + """Edge case: only 1 token left.""" + msg = "max_tokens: 9999 > context_window: 10000 - input_tokens: 9999 = available_tokens: 1" + assert self._parse(msg) == 1 + + # ── Should NOT detect (returns None) ───────────────────────────────── + + def test_prompt_too_long_is_not_output_cap_error(self): + """'prompt is too long' errors must NOT be caught — they need context halving.""" + msg = "prompt is too long: 205000 tokens > 200000 maximum" + assert self._parse(msg) is None + + def test_generic_context_window_exceeded(self): + """Generic context window errors without available_tokens should not match.""" + msg = "context window exceeded: maximum is 32768 tokens" + assert self._parse(msg) is None + + def test_context_length_exceeded(self): + msg = "context_length_exceeded: prompt has 131073 tokens, limit is 131072" + assert self._parse(msg) is None + + def test_no_max_tokens_keyword(self): + """Error not related to max_tokens at all.""" + msg = "invalid_api_key: the API key is invalid" + assert self._parse(msg) is None + + def test_empty_string(self): + assert self._parse("") is None + + def test_rate_limit_error(self): + msg = "rate_limit_error: too many requests per minute" + assert self._parse(msg) is None + + +# --------------------------------------------------------------------------- +# build_anthropic_kwargs — output cap clamping +# --------------------------------------------------------------------------- + +class TestBuildAnthropicKwargsClamping: + """The context_length clamp only fires when output ceiling > window. + For standard Anthropic models (output ceiling < window) it must not fire. + """ + + def _build(self, model, max_tokens=None, context_length=None): + from agent.anthropic_adapter import build_anthropic_kwargs + return build_anthropic_kwargs( + model=model, + messages=[{"role": "user", "content": "hi"}], + tools=None, + max_tokens=max_tokens, + reasoning_config=None, + context_length=context_length, + ) + + def test_no_clamping_when_output_ceiling_fits_in_window(self): + """Opus 4.6 native output (128K) < context window (200K) — no clamping.""" + kwargs = self._build("claude-opus-4-6", context_length=200_000) + assert kwargs["max_tokens"] == 128_000 + + def test_clamping_fires_for_tiny_custom_window(self): + """When context_length is 8K (local model), output cap is clamped to 7999.""" + kwargs = self._build("claude-opus-4-6", context_length=8_000) + assert kwargs["max_tokens"] == 7_999 + + def test_explicit_max_tokens_respected_when_within_window(self): + """Explicit max_tokens smaller than window passes through unchanged.""" + kwargs = self._build("claude-opus-4-6", max_tokens=4096, context_length=200_000) + assert kwargs["max_tokens"] == 4096 + + def test_explicit_max_tokens_clamped_when_exceeds_window(self): + """Explicit max_tokens larger than a small window is clamped.""" + kwargs = self._build("claude-opus-4-6", max_tokens=32_768, context_length=16_000) + assert kwargs["max_tokens"] == 15_999 + + def test_no_context_length_uses_native_ceiling(self): + """Without context_length the native output ceiling is used directly.""" + kwargs = self._build("claude-sonnet-4-6") + assert kwargs["max_tokens"] == 64_000 + + +# --------------------------------------------------------------------------- +# Ephemeral max_tokens mechanism — _build_api_kwargs +# --------------------------------------------------------------------------- + +class TestEphemeralMaxOutputTokens: + """_build_api_kwargs consumes _ephemeral_max_output_tokens exactly once + and falls back to self.max_tokens on subsequent calls. + """ + + def _make_agent(self): + """Return a minimal AIAgent with api_mode='anthropic_messages' and + a stubbed context_compressor, bypassing full __init__ cost.""" + from run_agent import AIAgent + agent = object.__new__(AIAgent) + # Minimal attributes used by _build_api_kwargs + agent.api_mode = "anthropic_messages" + agent.model = "claude-opus-4-6" + agent.tools = [] + agent.max_tokens = None + agent.reasoning_config = None + agent._is_anthropic_oauth = False + agent._ephemeral_max_output_tokens = None + + compressor = MagicMock() + compressor.context_length = 200_000 + agent.context_compressor = compressor + + # Stub out the internal message-preparation helper + agent._prepare_anthropic_messages_for_api = MagicMock( + return_value=[{"role": "user", "content": "hi"}] + ) + agent._anthropic_preserve_dots = MagicMock(return_value=False) + return agent + + def test_ephemeral_override_is_used_on_first_call(self): + """When _ephemeral_max_output_tokens is set, it overrides self.max_tokens.""" + agent = self._make_agent() + agent._ephemeral_max_output_tokens = 5_000 + + kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}]) + assert kwargs["max_tokens"] == 5_000 + + def test_ephemeral_override_is_consumed_after_one_call(self): + """After one call the ephemeral override is cleared to None.""" + agent = self._make_agent() + agent._ephemeral_max_output_tokens = 5_000 + + agent._build_api_kwargs([{"role": "user", "content": "hi"}]) + assert agent._ephemeral_max_output_tokens is None + + def test_subsequent_call_uses_self_max_tokens(self): + """A second _build_api_kwargs call uses the normal max_tokens path.""" + agent = self._make_agent() + agent._ephemeral_max_output_tokens = 5_000 + agent.max_tokens = None # will resolve to native ceiling (128K for Opus 4.6) + + agent._build_api_kwargs([{"role": "user", "content": "hi"}]) + # Second call — ephemeral is gone + kwargs2 = agent._build_api_kwargs([{"role": "user", "content": "hi"}]) + assert kwargs2["max_tokens"] == 128_000 # Opus 4.6 native ceiling + + def test_no_ephemeral_uses_self_max_tokens_directly(self): + """Without an ephemeral override, self.max_tokens is used normally.""" + agent = self._make_agent() + agent.max_tokens = 8_192 + + kwargs = agent._build_api_kwargs([{"role": "user", "content": "hi"}]) + assert kwargs["max_tokens"] == 8_192 + + +# --------------------------------------------------------------------------- +# Integration: error handler does NOT halve context_length for output-cap errors +# --------------------------------------------------------------------------- + +class TestContextNotHalvedOnOutputCapError: + """When the API returns 'max_tokens too large given prompt', the handler + must set _ephemeral_max_output_tokens and NOT modify context_length. + """ + + def _make_agent_with_compressor(self, context_length=200_000): + from run_agent import AIAgent + from agent.context_compressor import ContextCompressor + + agent = object.__new__(AIAgent) + agent.api_mode = "anthropic_messages" + agent.model = "claude-opus-4-6" + agent.base_url = "https://api.anthropic.com" + agent.tools = [] + agent.max_tokens = None + agent.reasoning_config = None + agent._is_anthropic_oauth = False + agent._ephemeral_max_output_tokens = None + agent.log_prefix = "" + agent.quiet_mode = True + agent.verbose_logging = False + + compressor = MagicMock(spec=ContextCompressor) + compressor.context_length = context_length + compressor.threshold_percent = 0.75 + agent.context_compressor = compressor + + agent._prepare_anthropic_messages_for_api = MagicMock( + return_value=[{"role": "user", "content": "hi"}] + ) + agent._anthropic_preserve_dots = MagicMock(return_value=False) + agent._vprint = MagicMock() + return agent + + def test_output_cap_error_sets_ephemeral_not_context_length(self): + """On 'max_tokens too large' error, _ephemeral_max_output_tokens is set + and compressor.context_length is left unchanged.""" + from agent.model_metadata import parse_available_output_tokens_from_error + from agent.model_metadata import get_next_probe_tier + + error_msg = ( + "max_tokens: 128000 > context_window: 200000 " + "- input_tokens: 180000 = available_tokens: 20000" + ) + + # Simulate the handler logic from run_agent.py + agent = self._make_agent_with_compressor(context_length=200_000) + old_ctx = agent.context_compressor.context_length + + available_out = parse_available_output_tokens_from_error(error_msg) + assert available_out == 20_000, "parser must detect the error" + + # The fix: set ephemeral, skip context_length modification + agent._ephemeral_max_output_tokens = max(1, available_out - 64) + + # context_length must be untouched + assert agent.context_compressor.context_length == old_ctx + assert agent._ephemeral_max_output_tokens == 19_936 + + def test_prompt_too_long_still_triggers_probe_tier(self): + """Genuine prompt-too-long errors must still use get_next_probe_tier.""" + from agent.model_metadata import parse_available_output_tokens_from_error + from agent.model_metadata import get_next_probe_tier + + error_msg = "prompt is too long: 205000 tokens > 200000 maximum" + + available_out = parse_available_output_tokens_from_error(error_msg) + assert available_out is None, "prompt-too-long must not be caught by output-cap parser" + + # The old halving path is still used for this class of error + new_ctx = get_next_probe_tier(200_000) + assert new_ctx == 128_000 + + def test_output_cap_error_safety_margin(self): + """The ephemeral value includes a 64-token safety margin below available_out.""" + from agent.model_metadata import parse_available_output_tokens_from_error + + error_msg = ( + "max_tokens: 32768 > context_window: 200000 " + "- input_tokens: 190000 = available_tokens: 10000" + ) + available_out = parse_available_output_tokens_from_error(error_msg) + safe_out = max(1, available_out - 64) + assert safe_out == 9_936 + + def test_safety_margin_never_goes_below_one(self): + """When available_out is very small, safe_out must be at least 1.""" + from agent.model_metadata import parse_available_output_tokens_from_error + + error_msg = ( + "max_tokens: 10 > context_window: 200000 " + "- input_tokens: 199990 = available_tokens: 1" + ) + available_out = parse_available_output_tokens_from_error(error_msg) + safe_out = max(1, available_out - 64) + assert safe_out == 1 diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index fbfa69ade..133990b44 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -657,8 +657,8 @@ model: #### Responses get cut off mid-sentence **Possible causes:** -1. **Low `max_tokens` on the server** — SGLang defaults to 128 tokens per response. Set `--default-max-tokens` on the server or configure Hermes with `model.max_tokens` in config.yaml. -2. **Context exhaustion** — The model filled its context window. Increase context length or enable [context compression](/docs/user-guide/configuration#context-compression) in Hermes. +1. **Low output cap (`max_tokens`) on the server** — SGLang defaults to 128 tokens per response. Set `--default-max-tokens` on the server or configure Hermes with `model.max_tokens` in config.yaml. Note: `max_tokens` controls response length only — it is unrelated to how long your conversation history can be (that is `context_length`). +2. **Context exhaustion** — The model filled its context window. Increase `model.context_length` or enable [context compression](/docs/user-guide/configuration#context-compression) in Hermes. --- @@ -751,6 +751,15 @@ model: ### Context Length Detection +:::note Two settings, easy to confuse +**`context_length`** is the **total context window** — the combined budget for input *and* output tokens (e.g. 200,000 for Claude Opus 4.6). Hermes uses this to decide when to compress history and to validate API requests. + +**`model.max_tokens`** is the **output cap** — the maximum number of tokens the model may generate in a *single response*. It has nothing to do with how long your conversation history can be. The industry-standard name `max_tokens` is a common source of confusion; Anthropic's native API has since renamed it `max_output_tokens` for clarity. + +Set `context_length` when auto-detection gets the window size wrong. +Set `model.max_tokens` only when you need to limit how long individual responses can be. +::: + Hermes uses a multi-source resolution chain to detect the correct context window for your model and provider: 1. **Config override** — `model.context_length` in config.yaml (highest priority)