Use nous portal as model metadata authority (#24502)

* nous portal metadata resolver * minor fixes
2026-05-18 04:41:56 +00:00 · 2026-05-12 14:59:31 -04:00 · 2026-05-12 14:59:31 -04:00 · 2863e9484a
commit 2863e9484a
parent c594a23047
2 changed files with 306 additions and 22 deletions
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -10,7 +10,7 @@ import os
 import re
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urlparse

 import requests
@ -1330,21 +1330,40 @@ def _resolve_codex_oauth_context_length(
    return None


-def _resolve_nous_context_length(model: str) -> Optional[int]:
-    """Resolve Nous Portal model context length via OpenRouter metadata.
+def _resolve_nous_context_length(
+    model: str,
+    base_url: str = "",
+    api_key: str = "",
+) -> Tuple[Optional[int], str]:
+    """Resolve Nous Portal model context length.

-    Nous model IDs are bare (e.g. 'claude-opus-4-6') while OpenRouter uses
-    prefixed IDs (e.g. 'anthropic/claude-opus-4.6'). Try suffix matching
-    with version normalization (dot↔dash).
+    Tries the live Nous inference endpoint first (authoritative), then falls
+    back to OpenRouter metadata with suffix/version matching.
+
+    Nous model IDs are bare after prefix-stripping (e.g. 'qwen3.6-plus',
+    'claude-opus-4-6') while OpenRouter uses prefixed IDs (e.g.
+    'qwen/qwen3.6-plus', 'anthropic/claude-opus-4.6').  Version
+    normalization (dot↔dash) is applied to handle name drifts.
+
+    Returns ``(context_length, source)`` where ``source`` is one of:
+      - ``"portal"``    — live /v1/models response (authoritative)
+      - ``"openrouter"`` — OpenRouter cache fallback (non-authoritative;
+        callers must NOT persist this to the on-disk cache or a single
+        portal blip will freeze the wrong value in forever)
+      - ``""``           — could not resolve
    """
-    metadata = fetch_model_metadata()  # OpenRouter cache
+    # Portal first — the Nous /models endpoint is authoritative for what our
+    # infrastructure enforces and may differ from OR (e.g. OR reports 1M for
+    # qwen3.6-plus; the portal correctly says 262144).  Fall back to the OR
+    # catalog only if the portal doesn't list the model.
+    if base_url:
+        portal_ctx = _resolve_endpoint_context_length(model, base_url, api_key=api_key)
+        if portal_ctx is not None:
+            return portal_ctx, "portal"
+
+    metadata = fetch_model_metadata()

    def _safe_ctx(or_id: str, entry: dict) -> Optional[int]:
-        """Return context length, but reject stale 32k values for Kimi models.
-
-        Apply the same guard used for the generic OpenRouter path (step 6 in 
-        resolve_context_length) so the Nous portal path does not short-circuit it.
-        """
        ctx = entry.get("context_length")
        if ctx is None:
            return None
@ -1357,19 +1376,20 @@ def _resolve_nous_context_length(model: str) -> Optional[int]:
            return None
        return ctx

-    # Exact match first
    if model in metadata:
-        return _safe_ctx(model, metadata[model])
+        ctx = _safe_ctx(model, metadata[model])
+        if ctx is not None:
+            return ctx, "openrouter"

    normalized = _normalize_model_version(model).lower()

    for or_id, entry in metadata.items():
        bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
        if bare.lower() == model.lower() or _normalize_model_version(bare).lower() == normalized:
-            return _safe_ctx(or_id, entry)
+            ctx = _safe_ctx(or_id, entry)
+            if ctx is not None:
+                return ctx, "openrouter"

-    # Partial prefix match for cases like gemini-3-flash → gemini-3-flash-preview
-    # Require match to be at a word boundary (followed by -, :, or end of string)
    model_lower = model.lower()
    for or_id, entry in metadata.items():
        bare = or_id.split("/", 1)[1] if "/" in or_id else or_id
@ -1377,9 +1397,11 @@ def _resolve_nous_context_length(model: str) -> Optional[int]:
            if candidate.startswith(query) and (
                len(candidate) == len(query) or candidate[len(query)] in "-:."
            ):
-                return _safe_ctx(or_id, entry)
+                ctx = _safe_ctx(or_id, entry)
+                if ctx is not None:
+                    return ctx, "openrouter"

-    return None
+    return None, ""


 def get_model_context_length(
@ -1394,14 +1416,18 @@ def get_model_context_length(

    Resolution order:
    0. Explicit config override (model.context_length or custom_providers per-model)
-    1. Persistent cache (previously discovered via probing)
+    1. Persistent cache (previously discovered via probing).  Nous URLs
+       bypass the cache here so step 5b can always reconcile against
+       the authoritative portal /v1/models response.
    1b. AWS Bedrock static table (must precede custom-endpoint probe)
    2. Active endpoint metadata (/models for explicit custom endpoints)
    3. Local server query (for local endpoints)
    4. Anthropic /v1/models API (API-key users only, not OAuth)
    5. Provider-aware lookups (before generic OpenRouter cache):
       a. Copilot live /models API
-       b. Nous suffix-match via OpenRouter cache
+       b. Nous: live /v1/models probe first (authoritative), then OR
+          cache fallback with suffix/version normalisation.  Only
+          portal-derived values are persisted to disk.
       c. Codex OAuth /models probe
       d. GMI /models endpoint
       e. Ollama native /api/show probe (any base_url, provider-agnostic)
@ -1464,6 +1490,20 @@ def get_model_context_length(
                    model, base_url, f"{cached:,}",
                )
                _invalidate_cached_context_length(model, base_url)
+            # Nous Portal: the portal /v1/models endpoint is authoritative.
+            # Bypass the persistent cache so step 5b can always reconcile
+            # against it — this corrects pre-fix entries seeded from the
+            # OR catalog (the same OR underreport class that the Kimi/Qwen
+            # DEFAULT_CONTEXT_LENGTHS overrides exist to mitigate) without
+            # touching the on-disk file when the portal is unreachable.
+            # The in-memory 300s endpoint metadata cache makes the per-call
+            # cost amortise to ~0 within a process.
+            elif _infer_provider_from_url(base_url) == "nous":
+                logger.debug(
+                    "Bypassing persistent cache for %s@%s (Nous portal authoritative)",
+                    model, base_url,
+                )
+                # Fall through; step 5b reconciles and overwrites if portal responds.
            else:
                return cached

@ -1555,8 +1595,18 @@ def get_model_context_length(
            pass  # Fall through to models.dev

    if effective_provider == "nous":
-        ctx = _resolve_nous_context_length(model)
+        ctx, source = _resolve_nous_context_length(
+            model, base_url=base_url or "", api_key=api_key or ""
+        )
        if ctx:
+            # Persist ONLY portal-derived values.  Caching an OR-fallback
+            # value here would freeze in a wrong number on the first portal
+            # blip / auth glitch and step-1 would short-circuit it forever.
+            # OR's catalog is community-maintained and is precisely why the
+            # Kimi/Qwen DEFAULT_CONTEXT_LENGTHS overrides exist — we don't
+            # want it leaking into the persistent cache for Nous URLs.
+            if base_url and source == "portal":
+                save_context_length(model, base_url, ctx)
            return ctx
    if effective_provider == "openai-codex":
        # Codex OAuth enforces lower context limits than the direct OpenAI
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -473,6 +473,240 @@ class TestCodexOAuthContextLength:
        assert ctx == 1_000_000, "Non-codex 1M cache entries must be respected"


+# =========================================================================
+# Nous Portal context-window resolution (provider="nous")
+# =========================================================================
+
+class TestNousPortalContextResolution:
+    """Nous Portal /v1/models is authoritative for what Nous infra enforces
+    and may diverge from the OpenRouter catalog.
+
+    Invariants this class pins down:
+      1. Portal value wins over the OR fallback.
+      2. Portal-derived values are persisted to disk.
+      3. OR-fallback values are NEVER persisted — otherwise a single portal
+         blip would freeze the wrong value in via step-1 cache short-circuit.
+      4. Pre-fix persistent-cache entries (seeded from the OR catalog) are
+         bypassed at step 1 and overwritten once the portal responds.
+      5. Pre-fix persistent-cache entries SURVIVE on disk when the portal
+         is unreachable — no opportunistic invalidation that loses the only
+         value we have.
+    """
+
+    def setup_method(self):
+        import agent.model_metadata as mm
+        mm._endpoint_model_metadata_cache.clear()
+        mm._endpoint_model_metadata_cache_time.clear()
+
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_portal_value_wins_over_openrouter_catalog(
+        self, mock_or, mock_portal, tmp_path, monkeypatch
+    ):
+        """The motivating case: OR catalog says 1M for qwen3.6-plus, but
+        the Nous portal correctly enforces 262144.  Portal must win."""
+        import agent.model_metadata as mm
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        mock_portal.return_value = {
+            "qwen3.6-plus": {"context_length": 262_144},
+        }
+        mock_or.return_value = {
+            "qwen/qwen3.6-plus": {"context_length": 1_000_000},
+        }
+
+        ctx = mm.get_model_context_length(
+            model="qwen3.6-plus",
+            base_url="https://inference-api.nousresearch.com/v1",
+            api_key="fake-token",
+            provider="nous",
+        )
+        assert ctx == 262_144, (
+            f"Portal must override OR catalog; got {ctx} (OR leak?)"
+        )
+
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_portal_value_is_persisted_to_disk(
+        self, mock_or, mock_portal, tmp_path, monkeypatch
+    ):
+        """Portal-derived value should land in the persistent cache so
+        cross-process callers (e.g. child agents) see the same value."""
+        import agent.model_metadata as mm
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        mock_portal.return_value = {
+            "qwen3.6-plus": {"context_length": 262_144},
+        }
+        mock_or.return_value = {}
+
+        base_url = "https://inference-api.nousresearch.com/v1"
+        ctx = mm.get_model_context_length(
+            model="qwen3.6-plus",
+            base_url=base_url,
+            api_key="fake",
+            provider="nous",
+        )
+        assert ctx == 262_144
+        persisted = yaml.safe_load(cache_file.read_text()).get("context_lengths", {})
+        assert persisted.get(f"qwen3.6-plus@{base_url}") == 262_144, (
+            "Portal-derived value should be persisted to disk"
+        )
+
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_openrouter_fallback_is_not_persisted(
+        self, mock_or, mock_portal, tmp_path, monkeypatch
+    ):
+        """When the portal can't resolve a model (network blip, auth glitch,
+        model not yet listed) we fall back to the OR catalog so the agent
+        keeps working — but we must NOT write the OR value to disk.  Once
+        cached on disk, step-1 short-circuits forever and the user is stuck
+        with the wrong number until they manually clear the cache."""
+        import agent.model_metadata as mm
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        mock_portal.return_value = {}  # portal unreachable / model unknown
+        mock_or.return_value = {
+            "qwen/qwen3.6-plus": {"context_length": 1_000_000},
+        }
+
+        base_url = "https://inference-api.nousresearch.com/v1"
+        ctx = mm.get_model_context_length(
+            model="qwen3.6-plus",
+            base_url=base_url,
+            api_key="fake",
+            provider="nous",
+        )
+        assert ctx == 1_000_000, "OR fallback should still serve the request"
+        assert not cache_file.exists() or not yaml.safe_load(
+            cache_file.read_text()
+        ).get("context_lengths", {}), (
+            "OR-fallback values must NOT be persisted — a single portal blip "
+            "would otherwise freeze the wrong value in via step-1 cache hit"
+        )
+
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_stale_cache_is_bypassed_and_overwritten_by_portal(
+        self, mock_or, mock_portal, tmp_path, monkeypatch
+    ):
+        """Users upgrading from pre-fix builds have ``qwen3.6-plus@…nous… =
+        1000000`` (OR-derived) sitting in their cache file.  Step 1 must
+        NOT short-circuit on that entry — step 5b reconciles against the
+        portal and overwrites the persistent value with 262144."""
+        import agent.model_metadata as mm
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        base_url = "https://inference-api.nousresearch.com/v1"
+        stale_key = f"qwen3.6-plus@{base_url}"
+        other_key = "other-model@https://api.openai.com/v1"
+        cache_file.write_text(yaml.dump({"context_lengths": {
+            stale_key: 1_000_000,     # pre-fix OR-derived value
+            other_key: 128_000,       # unrelated, must survive
+        }}))
+
+        mock_portal.return_value = {
+            "qwen3.6-plus": {"context_length": 262_144},
+        }
+        mock_or.return_value = {}
+
+        ctx = mm.get_model_context_length(
+            model="qwen3.6-plus",
+            base_url=base_url,
+            api_key="fake",
+            provider="nous",
+        )
+        assert ctx == 262_144, (
+            f"Stale OR-derived cache entry should not have leaked through; got {ctx}"
+        )
+
+        remaining = yaml.safe_load(cache_file.read_text()).get("context_lengths", {})
+        assert remaining.get(stale_key) == 262_144, (
+            "Portal value should have overwritten the stale entry on disk"
+        )
+        assert remaining.get(other_key) == 128_000, (
+            "Unrelated cache entries must not be touched"
+        )
+
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_stale_cache_survives_when_portal_unreachable(
+        self, mock_or, mock_portal, tmp_path, monkeypatch
+    ):
+        """When the portal is unreachable AND we have a (potentially stale)
+        on-disk cache entry, the entry must survive untouched — we don't
+        want a transient outage to delete the only value we have.  The
+        request itself still gets served via OR fallback for this call."""
+        import agent.model_metadata as mm
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        base_url = "https://inference-api.nousresearch.com/v1"
+        existing_key = f"qwen3.6-plus@{base_url}"
+        cache_file.write_text(yaml.dump({"context_lengths": {
+            existing_key: 1_000_000,
+        }}))
+
+        mock_portal.return_value = {}  # portal unreachable
+        mock_or.return_value = {
+            "qwen/qwen3.6-plus": {"context_length": 1_000_000},
+        }
+
+        mm.get_model_context_length(
+            model="qwen3.6-plus",
+            base_url=base_url,
+            api_key="fake",
+            provider="nous",
+        )
+
+        remaining = yaml.safe_load(cache_file.read_text()).get("context_lengths", {})
+        assert remaining.get(existing_key) == 1_000_000, (
+            "Persistent cache entry must survive a transient portal outage"
+        )
+
+    @patch("agent.model_metadata.fetch_endpoint_model_metadata")
+    @patch("agent.model_metadata.fetch_model_metadata")
+    def test_bypass_keyed_on_url_not_provider_string(
+        self, mock_or, mock_portal, tmp_path, monkeypatch
+    ):
+        """Some call sites pass ``provider=""`` or ``provider="openrouter"``
+        when the user is really on Nous Portal (e.g. cred-pool fallback).
+        The Nous-URL bypass must trigger off the URL host, not the provider
+        string, so the portal-first resolver still runs in that case."""
+        import agent.model_metadata as mm
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        base_url = "https://inference-api.nousresearch.com/v1"
+        cache_file.write_text(yaml.dump({"context_lengths": {
+            f"qwen3.6-plus@{base_url}": 1_000_000,  # stale
+        }}))
+
+        mock_portal.return_value = {
+            "qwen3.6-plus": {"context_length": 262_144},
+        }
+        mock_or.return_value = {}
+
+        for provider_arg in ("", "openrouter", "custom"):
+            mm._endpoint_model_metadata_cache.clear()
+            mm._endpoint_model_metadata_cache_time.clear()
+            ctx = mm.get_model_context_length(
+                model="qwen3.6-plus",
+                base_url=base_url,
+                api_key="fake",
+                provider=provider_arg,
+            )
+            assert ctx == 262_144, (
+                f"URL-based Nous detection must fire for provider={provider_arg!r}; "
+                f"got {ctx}"
+            )
+
+
 # =========================================================================
 # get_model_context_length — resolution order
 # =========================================================================