From 91eef6255e39e0be7b0730aabf6ad2ea49eefe77 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Tue, 12 May 2026 01:05:25 +0530 Subject: [PATCH] fix: correct context-length resolution for kimi-k2.6 on Ollama Cloud and Kimi Coding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kimi-k2.6 (which supports 262K context) was incorrectly resolved as 32K, tripping the 64K minimum-context guard and preventing use of the model on Ollama Cloud and Kimi Coding / Moonshot providers. Three fixes in the context-length resolution chain: 1. Ollama Cloud native /api/show query: new _query_ollama_api_show() queries the Ollama native API for authoritative GGUF model_info context_length. For hosted Ollama, it prefers model_info over num_ctx since users can't set their own num_ctx on Cloud. Added at step 2b (custom endpoints whose base URL is not a known provider) and at step 5e (any provider with a base_url) in get_model_context_length(), before the models.dev fallback. 2. models.dev :cloud/-cloud suffix fallback: lookup_models_dev_context() now also tries appending :cloud and -cloud suffixes when the bare model name doesn't match. models.dev stores 'kimi-k2.6:cloud' but users and the live API use bare 'kimi-k2.6'. 3. Kimi-family 32K guard: in the OpenRouter metadata step, reject exactly 32768 for Kimi-named models (names starting with 'kimi' or containing 'moonshot') and fall through to hardcoded defaults ('kimi': 262144). OpenRouter reports 32768 for moonshotai/kimi-k2.6 but the model actually supports 262K. The filter is narrow — only 32768, only Kimi-family — and becomes dead code when OpenRouter updates its metadata. 
--- --- agent/model_metadata.py | 130 +++++++++++++++++++++++++++++++++++++--- agent/models_dev.py | 22 +++++++ 2 files changed, 145 insertions(+), 7 deletions(-) diff --git a/agent/model_metadata.py b/agent/model_metadata.py index e19ef1cbdb1..0e735c0ba8e 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -1006,6 +1006,79 @@ def query_ollama_num_ctx(model: str, base_url: str, api_key: str = "") -> Option return None +def _query_ollama_api_show(model: str, base_url: str, api_key: str = "") -> Optional[int]: + """Query an Ollama server's native ``/api/show`` for context length. + + Provider-agnostic: works against ANY Ollama-compatible server regardless + of hostname — local Ollama, Ollama Cloud (``ollama.com``), custom Ollama + hosting behind a reverse proxy, etc. For non-Ollama servers the POST + returns 404/405 quickly; the function handles errors gracefully. + + For hosted servers the GGUF ``model_info.*.context_length`` is the + authoritative source: the user can't set their own ``num_ctx``, and the + OpenAI-compat ``/v1/models`` endpoint correctly omits ``context_length`` + per the OpenAI schema. + + Resolution order for hosted Ollama: + 1. ``model_info.*.context_length`` — GGUF training max (authoritative) + 2. ``parameters`` → ``num_ctx`` — server-side Modelfile override + The order is flipped vs ``query_ollama_num_ctx()`` because local users + control ``num_ctx`` themselves; hosted users can't. + """ + import httpx + + server_url = base_url.rstrip("/") + if server_url.endswith("/v1"): + server_url = server_url[:-3] + + headers = _auth_headers(api_key) + + try: + with httpx.Client(timeout=5.0, headers=headers) as client: + resp = client.post(f"{server_url}/api/show", json={"name": model}) + if resp.status_code != 200: + return None + data = resp.json() + + # Hosted Ollama: GGUF model_info is the real max — prefer it over + # num_ctx which the Cloud operator may have capped arbitrarily. 
+ model_info = data.get("model_info", {}) + for key, value in model_info.items(): + if "context_length" in key and isinstance(value, (int, float)): + ctx = int(value) + if ctx >= 1024: + return ctx + + # Fall back to num_ctx from Modelfile parameters (rare on Cloud) + params = data.get("parameters", "") + if "num_ctx" in params: + for line in params.split("\n"): + if "num_ctx" in line: + parts = line.strip().split() + if len(parts) >= 2: + try: + ctx = int(parts[-1]) + if ctx >= 1024: + return ctx + except ValueError: + pass + except Exception: + pass + return None + + +def _model_name_suggests_kimi(model: str) -> bool: + """Return True if the model name looks like a Kimi-family model. + + Catches ``kimi-k2.6``, ``kimi-k2.5``, ``kimi-k2-thinking``, + ``moonshotai/Kimi-K2.6``, and similar variants. Used as a guard + against stale OpenRouter metadata that underreports these models + as 32K context when they actually support 262K+. + """ + lower = model.lower() + return lower.startswith("kimi") or "moonshot" in lower + + def _query_local_context_length(model: str, base_url: str, api_key: str = "") -> Optional[int]: """Query a local server for the model's context length.""" import httpx @@ -1307,12 +1380,17 @@ def get_model_context_length( 2. Active endpoint metadata (/models for explicit custom endpoints) 3. Local server query (for local endpoints) 4. Anthropic /v1/models API (API-key users only, not OAuth) - 5. OpenRouter live API metadata - 6. Nous suffix-match via OpenRouter cache - 7. models.dev registry lookup (provider-aware) - 8. Thin hardcoded defaults (broad family patterns) - 9. Default fallback (256K) - """ + 5. Provider-aware lookups (before generic OpenRouter cache): + a. Copilot live /models API + b. Nous suffix-match via OpenRouter cache + c. Codex OAuth /models probe + d. GMI /models endpoint + e. Ollama native /api/show probe (any base_url, provider-agnostic) + f. models.dev registry lookup (with :cloud/-cloud suffix fallback) + 6. 
OpenRouter live API metadata (Kimi-family 32k guard) + 7. Hardcoded defaults (broad family patterns, longest-key-first) + 8. Local server query (last resort) + 9. Default fallback (256K)""" # 0. Explicit config override — user knows best if config_context_length is not None and isinstance(config_context_length, int) and config_context_length > 0: return config_context_length @@ -1392,6 +1470,13 @@ def get_model_context_length( if context_length is not None: return context_length if not _is_known_provider_base_url(base_url): + # 2b. Ollama native /api/show — any URL might be an Ollama server + # (local, cloud, or custom hosting). Non-Ollama servers return + # 404/405 quickly. Fall through on failure. + ctx = _query_ollama_api_show(model, base_url, api_key=api_key) + if ctx is not None: + save_context_length(model, base_url, ctx) + return ctx # 3. Try querying local server directly if is_local_endpoint(base_url): local_ctx = _query_local_context_length(model, base_url, api_key=api_key) @@ -1461,6 +1546,20 @@ def get_model_context_length( ctx = _resolve_endpoint_context_length(model, base_url, api_key=api_key) if ctx is not None: return ctx + # 5e. Ollama native /api/show probe — runs for ANY provider with a + # base_url, not just ollama-cloud. Ollama-compatible servers expose + # this endpoint regardless of hostname (local Ollama, Ollama Cloud, + # custom Ollama hosting). The OpenAI-compat /v1/models endpoint + # correctly omits context_length per the OpenAI schema, but /api/show + # returns the authoritative GGUF model_info.context_length. + # For non-Ollama servers (OpenAI, Anthropic, etc.), the POST returns + # 404/405 quickly. Results are cached, so the hit is per-model+URL, + # once per hour. 
+ if base_url: + ctx = _query_ollama_api_show(model, base_url, api_key=api_key) + if ctx is not None: + save_context_length(model, base_url, ctx) + return ctx if effective_provider: from agent.models_dev import lookup_models_dev_context ctx = lookup_models_dev_context(effective_provider, model) @@ -1470,7 +1569,24 @@ def get_model_context_length( # 6. OpenRouter live API metadata (provider-unaware fallback) metadata = fetch_model_metadata() if model in metadata: - return metadata[model].get("context_length", DEFAULT_FALLBACK_CONTEXT) + or_ctx = metadata[model].get("context_length", DEFAULT_FALLBACK_CONTEXT) + # Guard against stale OpenRouter metadata for Kimi-family models. + # OpenRouter reports 32768 for moonshotai/kimi-k2.6, but the model + # actually supports 262144 (models.dev + official Kimi docs agree). + # Providers that host their own Kimi endpoints (Ollama Cloud, Kimi + # Coding, Moonshot) would otherwise trip the 64k minimum-context + # guard and reject a perfectly capable model. + # The filter is narrow: only reject exactly 32768 for Kimi-named + # models. If OpenRouter ever updates its data, the stale path + # becomes dead code with no impact. + if or_ctx == 32768 and _model_name_suggests_kimi(model): + logger.info( + "Rejecting OpenRouter metadata context=%s for %r " + "(Kimi-family underreport); falling through to hardcoded defaults", + or_ctx, model, + ) + else: + return or_ctx # 8. Hardcoded defaults (fuzzy match — longest key first for specificity) # Only check `default_model in model` (is the key a substring of the input). diff --git a/agent/models_dev.py b/agent/models_dev.py index fbb3153829b..0398571559e 100644 --- a/agent/models_dev.py +++ b/agent/models_dev.py @@ -347,6 +347,28 @@ def lookup_models_dev_context(provider: str, model: str) -> Optional[int]: if ctx: return ctx + # Suffix-aware fallback: some providers (e.g. ollama-cloud) store + # model IDs with :cloud / -cloud suffixes in models.dev while the + # live API returns bare names. 
Without this, kimi-k2.6 misses the + # kimi-k2.6:cloud entry and falls through to stale OpenRouter metadata + # reporting 32768 — tripping the 64k minimum-context guard. + # The suffix-stripping in fetch_ollama_cloud_models() handles the + # model-picker UX; this handles the context-length lookup path. + for suffix in (":cloud", "-cloud"): + suffixed_key = model + suffix + entry = models.get(suffixed_key) + if entry: + ctx = _extract_context(entry) + if ctx: + return ctx + # Also try case-insensitive + suffixed_lower = model_lower + suffix + for mid, mdata in models.items(): + if mid.lower() == suffixed_lower: + ctx = _extract_context(mdata) + if ctx: + return ctx + return None