fix(model_metadata): consult DEFAULT_CONTEXT_LENGTHS before 256K fallback on custom endpoints

Problem: get_model_context_length() had an early return at the end of the
custom-endpoint probe branch (step 3) that returned DEFAULT_FALLBACK_CONTEXT
(256K) without ever consulting the hardcoded DEFAULT_CONTEXT_LENGTHS catalog
(step 8). Models served through a custom/proxied gateway (e.g. corporate
Anthropic proxy) that didn't expose Ollama or local-server endpoints would
hit this path and get capped at 256K, even when the model name clearly
matched a known entry in the catalog (e.g. claude-opus-4-8 → 1M).

Changes:
- agent/model_metadata.py: Before returning DEFAULT_FALLBACK_CONTEXT at the
  end of the custom-endpoint branch, consult DEFAULT_CONTEXT_LENGTHS using
  the same longest-key-first fuzzy matching as step 8. Only fall through
  to 256K if no catalog entry matches.
- tests/agent/test_model_metadata.py: Updated existing test and added new
  test covering the custom-endpoint → catalog fallback behavior.

Fixes #38865
This commit is contained in:
islam666 2026-06-04 16:19:24 +00:00 committed by Teknium
parent f1d3afb151
commit 2e61de0638
2 changed files with 89 additions and 3 deletions

View file

@ -1684,6 +1684,26 @@ def get_model_context_length(
"in config.yaml to override.",
model, base_url, f"{DEFAULT_FALLBACK_CONTEXT:,}",
)
# 3b. Before falling back to the hard 256K default, consult the
# hardcoded catalog as a last resort. A proxied/custom Anthropic
# gateway (e.g. corporate proxy) fails the Ollama/local probes
# above, but the model name may still match an entry in
# DEFAULT_CONTEXT_LENGTHS (e.g. "claude-opus-4-8" → 1M).
# Without this, the early return here short-circuits the catalog
# lookup at step 8 and silently caps context at 256K.
model_lower = model.lower()
for default_model, length in sorted(
DEFAULT_CONTEXT_LENGTHS.items(),
key=lambda x: len(x[0]),
reverse=True,
):
if default_model in model_lower:
logger.info(
"Using hardcoded context length %s for model %r "
"(custom endpoint, catalog match on %r)",
f"{length:,}", model, default_model,
)
return length
return DEFAULT_FALLBACK_CONTEXT
# 4. Anthropic /v1/models API (only for regular API keys, not OAuth)

View file

@ -18,6 +18,7 @@ from unittest.mock import patch, MagicMock
from agent.model_metadata import (
CONTEXT_PROBE_TIERS,
DEFAULT_CONTEXT_LENGTHS,
DEFAULT_FALLBACK_CONTEXT,
_strip_provider_prefix,
estimate_tokens_rough,
estimate_messages_tokens_rough,
@ -773,17 +774,24 @@ class TestGetModelContextLength:
@patch("agent.model_metadata.fetch_model_metadata")
@patch("agent.model_metadata.fetch_endpoint_model_metadata")
def test_custom_endpoint_without_metadata_skips_name_based_default(self, mock_endpoint_fetch, mock_fetch):
def test_custom_endpoint_without_metadata_falls_back_to_catalog(self, mock_endpoint_fetch, mock_fetch):
"""Custom endpoint with no metadata should fall back to the hardcoded
catalog (not 256K) when the model name matches a known entry.
Previously this returned CONTEXT_PROBE_TIERS[0] (256K) because the
custom-endpoint branch short-circuited before the catalog lookup.
See #38865.
"""
mock_fetch.return_value = {}
mock_endpoint_fetch.return_value = {}
# GLM-5-TEE matches the "glm" entry in DEFAULT_CONTEXT_LENGTHS
result = get_model_context_length(
"zai-org/GLM-5-TEE",
base_url="https://llm.chutes.ai/v1",
api_key="test-key",
)
assert result == CONTEXT_PROBE_TIERS[0]
assert result == 202752 # "glm" entry in DEFAULT_CONTEXT_LENGTHS
@patch("agent.model_metadata.fetch_model_metadata")
@patch("agent.model_metadata.fetch_endpoint_model_metadata")
@ -858,6 +866,64 @@ class TestGetModelContextLength:
assert result == 200000
@patch("agent.model_metadata.fetch_model_metadata")
def test_custom_endpoint_falls_back_to_hardcoded_catalog(self, mock_fetch):
"""Custom/proxied endpoint that fails all probes should still resolve
via DEFAULT_CONTEXT_LENGTHS instead of returning 256K.
Regression test for #38865: a corporate Anthropic proxy (custom
base_url) caused the custom-endpoint branch to short-circuit before
the catalog lookup, capping context at 256K even for models like
claude-opus-4-8 that are in the hardcoded catalog with 1M.
"""
mock_fetch.return_value = {}
# Patch all the probe functions that the custom-endpoint branch calls
# so they all fail (return None/empty), simulating a proxy that
# doesn't expose Ollama or local-server endpoints.
with (
patch(
"agent.model_metadata._resolve_endpoint_context_length",
return_value=None,
),
patch(
"agent.model_metadata._query_ollama_api_show",
return_value=None,
),
patch(
"agent.model_metadata._query_local_context_length",
return_value=None,
),
patch(
"agent.model_metadata.is_local_endpoint",
return_value=False,
),
):
# A known model behind a custom proxy should resolve to its
# catalog value (1M), NOT the 256K fallback.
ctx = get_model_context_length(
"claude-opus-4-8",
base_url="https://my-gateway.example.com/v1/claude",
)
assert ctx == 1000000, f"Expected 1000000, got {ctx}"
# Another known model
ctx2 = get_model_context_length(
"claude-sonnet-4-6",
base_url="https://my-gateway.example.com/v1/claude",
)
assert ctx2 == 1000000, f"Expected 1000000, got {ctx2}"
# An unknown model on a custom endpoint should still fall back
# to 256K (no catalog match).
ctx3 = get_model_context_length(
"totally-unknown-model",
base_url="https://my-gateway.example.com/v1/claude",
)
assert ctx3 == DEFAULT_FALLBACK_CONTEXT, (
f"Expected {DEFAULT_FALLBACK_CONTEXT}, got {ctx3}"
)
# =========================================================================
# Bedrock context resolution — must run BEFORE custom-endpoint probe