feat(zai): add GLM-5.2 with verified 1M context window

GLM-5.2 ships with a 1M (1,048,576) token context window. Without this entry, Hermes falls through to the generic 'glm' key (202,752 tokens), under-reporting the context bar and prematurely compressing conversations. The 1M limit was verified empirically via needle-in-a-haystack retrieval at 789,240 prompt tokens on api.z.ai/api/coding/paas/v4 — zero errors, zero truncation, correct retrieval at every tested size (25K through 789K). Changes: - agent/model_metadata.py: add 'glm-5.2': 1_048_576 before 'glm' fallback - hermes_cli/models.py: add glm-5.2 to zai curated models - hermes_cli/setup.py: add glm-5.2 to setup wizard zai list - hermes_cli/auth.py: add glm-5.2 to coding plan endpoint probes - plugins/model-providers/zai/__init__.py: add glm-5.2 to fallback_models - tests/agent/test_model_metadata.py: context resolution + vendor-prefix tests
2026-06-15 09:21:36 +00:00 · 2026-06-13 10:08:29 -05:00 · 2026-06-13 10:08:29 -05:00 · bff78a34dc
commit bff78a34dc
parent 4e6d05c6a5
6 changed files with 37 additions and 4 deletions
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -261,7 +261,13 @@ DEFAULT_CONTEXT_LENGTHS = {
    # https://platform.minimax.io/docs/api-reference/text-chat-openai
    "minimax-m3": 1000000,
    "minimax": 204800,
-    # GLM
+    # GLM — GLM-5.2 ships with a 1M context window (verified empirically:
+    # needle-in-a-haystack retrieval at 789K prompt tokens succeeded with
+    # zero errors on api.z.ai/api/coding/paas/v4).  Older GLM models
+    # (5, 5.1, 5-turbo) are ~202K.  Longest-key-first substring matching
+    # ensures "glm-5.2" resolves to 1M while older variants still hit the
+    # generic 202K fallback.
+    "glm-5.2": 1_048_576,
    "glm": 202752,
    # xAI Grok — xAI /v1/models does not return context_length metadata,
    # so these hardcoded fallbacks prevent Hermes from probing-down to
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@ -616,8 +616,8 @@ ZAI_ENDPOINTS = [
    # (id, base_url, probe_models, label)
    ("global",        "https://api.z.ai/api/paas/v4",        ["glm-5"],   "Global"),
    ("cn",            "https://open.bigmodel.cn/api/paas/v4", ["glm-5"],   "China"),
-    ("coding-global", "https://api.z.ai/api/coding/paas/v4",  ["glm-5.1", "glm-5v-turbo", "glm-4.7"], "Global (Coding Plan)"),
-    ("coding-cn",     "https://open.bigmodel.cn/api/coding/paas/v4", ["glm-5.1", "glm-5v-turbo", "glm-4.7"], "China (Coding Plan)"),
+    ("coding-global", "https://api.z.ai/api/coding/paas/v4",  ["glm-5.2", "glm-5.1", "glm-5v-turbo", "glm-4.7"], "Global (Coding Plan)"),
+    ("coding-cn",     "https://open.bigmodel.cn/api/coding/paas/v4", ["glm-5.2", "glm-5.1", "glm-5v-turbo", "glm-4.7"], "China (Coding Plan)"),
 ]


--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@ -257,6 +257,7 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "gemini-3.5-flash",
    ],
    "zai": [
+        "glm-5.2",
        "glm-5.1",
        "glm-5",
        "glm-5v-turbo",
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@ -93,7 +93,7 @@ _DEFAULT_PROVIDER_MODELS = {
        "gemini-3.1-pro-preview", "gemini-3-pro-preview",
        "gemini-3-flash-preview", "gemini-3.1-flash-lite-preview",
    ],
-    "zai": ["glm-5.1", "glm-5", "glm-4.7", "glm-4.5", "glm-4.5-flash"],
+    "zai": ["glm-5.2", "glm-5.1", "glm-5", "glm-4.7", "glm-4.5", "glm-4.5-flash"],
    "kimi-coding": ["kimi-k2.6", "kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"],
    "kimi-coding-cn": ["kimi-k2.6", "kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"],
    "stepfun": ["step-3.5-flash", "step-3.5-flash-2603"],
--- a/plugins/model-providers/zai/init.py
+++ b/plugins/model-providers/zai/init.py
@ -11,6 +11,7 @@ zai = ProviderProfile(
    description="Z.AI / GLM — Zhipu AI models",
    signup_url="https://z.ai/",
    fallback_models=(
+        "glm-5.2",
        "glm-5",
        "glm-4-9b",
    ),
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -220,6 +220,31 @@ class TestDefaultContextLengths:
                    f"{model_id}: expected {expected_ctx}, got {actual}"
                )

+    def test_glm_52_context_1m(self):
+        """GLM-5.2 must resolve to 1M, not the generic GLM fallback of 202K.
+
+        Context window was verified empirically via needle-in-a-haystack
+        retrieval at 789K prompt tokens on api.z.ai/api/coding/paas/v4
+        (2026-06-13).
+        """
+        from agent.model_metadata import get_model_context_length
+        from unittest.mock import patch as mock_patch
+
+        assert DEFAULT_CONTEXT_LENGTHS["glm-5.2"] == 1_048_576
+        assert DEFAULT_CONTEXT_LENGTHS["glm"] == 202752
+
+        with mock_patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
+             mock_patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
+             mock_patch("agent.model_metadata.get_cached_context_length", return_value=None):
+            # GLM-5.2 (1M) must NOT fall through to the generic 202K entry
+            assert get_model_context_length("glm-5.2") == 1_048_576
+            # Vendor-prefixed forms (zai provider, zhipu alias)
+            assert get_model_context_length("zai/glm-5.2") == 1_048_576
+            assert get_model_context_length("zhipu/glm-5.2") == 1_048_576
+            # Older GLM variants still resolve to the generic 202K fallback
+            assert get_model_context_length("glm-5") == 202752
+            assert get_model_context_length("glm-5.1") == 202752
+
    def test_openrouter_live_metadata_beats_hardcoded_catchall(self):
        """OpenRouter-routed slugs resolve via the live OR catalog before the
        hardcoded family catch-all.