fix(copilot): wire live /models max_prompt_tokens into context-window resolver

The Copilot provider resolved context windows via models.dev static data,
which does not include account-specific models (e.g. claude-opus-4.6-1m
with 1M context). This adds the live Copilot /models API as a higher-
priority source for copilot/copilot-acp/github-copilot providers.

New helper get_copilot_model_context() in hermes_cli/models.py extracts
capabilities.limits.max_prompt_tokens from the cached catalog. Results
are cached in-process for 1 hour.

In agent/model_metadata.py, step 5a queries the live API before falling
through to models.dev (step 5b). This ensures account-specific models
get correct context windows while standard models still have a fallback.

Part 1 of #7731.
Refs: #7272
This commit is contained in:
NiuNiu Xia 2026-04-20 05:20:55 +00:00 committed by Teknium
parent d7ad07d6fe
commit 76329196c1
3 changed files with 192 additions and 0 deletions

View file

@ -1308,6 +1308,19 @@ def get_model_context_length(
if inferred:
effective_provider = inferred
# 5a. Copilot live /models API — max_prompt_tokens from the user's account.
# This catches account-specific models (e.g. claude-opus-4.6-1m) that
# don't exist in models.dev. For models that ARE in models.dev, this
# returns the provider-enforced limit which is what users can actually use.
if effective_provider in ("copilot", "copilot-acp", "github-copilot"):
try:
from hermes_cli.models import get_copilot_model_context
ctx = get_copilot_model_context(model, api_key=api_key)
if ctx:
return ctx
except Exception:
pass # Fall through to models.dev
if effective_provider == "nous":
ctx = _resolve_nous_context_length(model)
if ctx:

View file

@ -1902,6 +1902,51 @@ def fetch_github_model_catalog(
return None
# ─── Copilot catalog context-window helpers ─────────────────────────────────

# Module-level cache mapping model id -> max_prompt_tokens. A cache time of
# 0.0 means "never populated".
_copilot_context_cache: dict[str, int] = {}
_copilot_context_cache_time: float = 0.0
_COPILOT_CONTEXT_CACHE_TTL = 3600  # 1 hour


def get_copilot_model_context(model_id: str, api_key: Optional[str] = None) -> Optional[int]:
    """Look up max_prompt_tokens for a Copilot model from the live /models API.

    Results are cached in-process for 1 hour to avoid repeated API calls.
    A successful fetch is cached even when it yields no usable limits, so the
    TTL is honored regardless of how many entries the catalog produced.

    Args:
        model_id: Copilot model identifier (e.g. "gpt-4.1").
        api_key: Optional key forwarded to the catalog fetch.

    Returns:
        The provider-enforced prompt-token limit, or None if the model is
        unknown or the catalog is unavailable.
    """
    global _copilot_context_cache, _copilot_context_cache_time

    # Serve from cache while fresh. Key the freshness check on the timestamp,
    # not on the dict's truthiness: a successful fetch that produced an empty
    # mapping must still suppress refetching until the TTL expires.
    if _copilot_context_cache_time > 0.0 and (
        time.time() - _copilot_context_cache_time < _COPILOT_CONTEXT_CACHE_TTL
    ):
        # Fresh cache: a miss simply means the model isn't in the catalog.
        return _copilot_context_cache.get(model_id)

    # Cache is stale or never populated — fetch and rebuild it.
    catalog = fetch_github_model_catalog(api_key=api_key)
    if not catalog:
        # Leave the timestamp untouched so the next call retries the fetch.
        return None
    cache: dict[str, int] = {}
    for item in catalog:
        mid = str(item.get("id") or "").strip()
        if not mid:
            continue
        caps = item.get("capabilities") or {}
        limits = caps.get("limits") or {}
        max_prompt = limits.get("max_prompt_tokens")
        # Only keep positive integer limits; 0/None/absent means "unknown".
        if isinstance(max_prompt, int) and max_prompt > 0:
            cache[mid] = max_prompt
    _copilot_context_cache = cache
    _copilot_context_cache_time = time.time()
    return cache.get(model_id)
def _is_github_models_base_url(base_url: Optional[str]) -> bool:
normalized = (base_url or "").strip().rstrip("/").lower()
return (

View file

@ -0,0 +1,134 @@
"""Tests for Copilot live /models context-window resolution."""
from __future__ import annotations
import time
from unittest.mock import patch
import pytest
from hermes_cli.models import get_copilot_model_context
# Sample catalog items mimicking the Copilot /models API response
_SAMPLE_CATALOG = [
{
"id": "claude-opus-4.6-1m",
"capabilities": {
"type": "chat",
"limits": {"max_prompt_tokens": 1000000, "max_output_tokens": 64000},
},
},
{
"id": "gpt-4.1",
"capabilities": {
"type": "chat",
"limits": {"max_prompt_tokens": 128000, "max_output_tokens": 32768},
},
},
{
"id": "claude-sonnet-4",
"capabilities": {
"type": "chat",
"limits": {"max_prompt_tokens": 200000, "max_output_tokens": 64000},
},
},
{
"id": "model-without-limits",
"capabilities": {"type": "chat"},
},
{
"id": "model-zero-limit",
"capabilities": {
"type": "chat",
"limits": {"max_prompt_tokens": 0},
},
},
]
@pytest.fixture(autouse=True)
def _clear_cache():
    """Reset the module-level Copilot context cache around each test."""
    import hermes_cli.models as mod

    def _reset() -> None:
        # Both the mapping and the timestamp must be cleared so the next
        # lookup treats the cache as never populated.
        mod._copilot_context_cache = {}
        mod._copilot_context_cache_time = 0.0

    _reset()
    yield
    _reset()
class TestGetCopilotModelContext:
    """Tests for get_copilot_model_context()."""

    # Single patch target shared by every test in this class.
    _TARGET = "hermes_cli.models.fetch_github_model_catalog"

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_returns_max_prompt_tokens(self, _fetch):
        """Known models resolve to their max_prompt_tokens value."""
        assert get_copilot_model_context("claude-opus-4.6-1m") == 1_000_000
        assert get_copilot_model_context("gpt-4.1") == 128_000

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_returns_none_for_unknown_model(self, _fetch):
        """A model absent from the catalog yields None."""
        assert get_copilot_model_context("nonexistent-model") is None

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_skips_models_without_limits(self, _fetch):
        """Entries lacking capabilities.limits are not resolvable."""
        assert get_copilot_model_context("model-without-limits") is None

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_skips_zero_limit(self, _fetch):
        """A zero max_prompt_tokens is treated as unknown."""
        assert get_copilot_model_context("model-zero-limit") is None

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_caches_results(self, fetch_mock):
        """Repeated lookups within the TTL hit the API only once."""
        get_copilot_model_context("gpt-4.1")
        get_copilot_model_context("claude-sonnet-4")
        assert fetch_mock.call_count == 1

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_cache_expires(self, fetch_mock):
        """A cache older than the TTL triggers a fresh fetch."""
        import hermes_cli.models as mod

        get_copilot_model_context("gpt-4.1")
        assert fetch_mock.call_count == 1
        # Push the cache timestamp two hours into the past to expire it.
        mod._copilot_context_cache_time = time.time() - 7200
        get_copilot_model_context("gpt-4.1")
        assert fetch_mock.call_count == 2

    @patch(_TARGET, return_value=None)
    def test_returns_none_when_catalog_unavailable(self, _fetch):
        """A failed catalog fetch degrades to None rather than raising."""
        assert get_copilot_model_context("gpt-4.1") is None

    @patch(_TARGET, return_value=[])
    def test_returns_none_for_empty_catalog(self, _fetch):
        """An empty catalog resolves nothing."""
        assert get_copilot_model_context("gpt-4.1") is None
class TestModelMetadataCopilotIntegration:
    """get_model_context_length() consults the Copilot live API for copilot providers."""

    _TARGET = "hermes_cli.models.fetch_github_model_catalog"

    @staticmethod
    def _resolve(model, provider):
        """Lazily import the resolver (as the original tests did) and call it."""
        from agent.model_metadata import get_model_context_length

        return get_model_context_length(model, provider=provider)

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_copilot_provider_uses_live_api(self, _fetch):
        """Account-specific model resolves via the live catalog."""
        assert self._resolve("claude-opus-4.6-1m", "copilot") == 1_000_000

    @patch(_TARGET, return_value=_SAMPLE_CATALOG)
    def test_copilot_acp_provider_uses_live_api(self, _fetch):
        """The copilot-acp provider alias also hits the live catalog."""
        assert self._resolve("claude-sonnet-4", "copilot-acp") == 200_000

    @patch(_TARGET, return_value=None)
    def test_falls_through_when_catalog_unavailable(self, _fetch):
        """With no catalog, resolution falls back (models.dev/defaults) without raising."""
        window = self._resolve("gpt-4.1", "copilot")
        assert isinstance(window, int)
        assert window > 0