fix(agent): prefer Ollama Modelfile num_ctx over GGUF training max

_query_local_context_length was checking model_info.context_length
(the GGUF training max) before num_ctx (the Modelfile runtime override),
the inverse of query_ollama_num_ctx's order. The two helpers therefore disagreed on
the same model:

  hermes-brain:qwen3-14b-ctx32k     # Modelfile: num_ctx 32768
  underlying qwen3:14b GGUF         # qwen3.context_length: 40960

query_ollama_num_ctx correctly returned 32768 (the value Ollama will
actually allocate KV cache for). _query_local_context_length returned
40960, which let ContextCompressor grow conversations past 32768 before
triggering compression — at which point Ollama silently truncated the
prefix, corrupting context.

Swap the order so num_ctx is checked first, matching query_ollama_num_ctx.
Adds a regression test that seeds both values and asserts num_ctx wins.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
ismell0992-afk 2026-04-13 11:41:45 +02:00 committed by Teknium
parent 39b83f3443
commit 3e99964789
2 changed files with 49 additions and 6 deletions

View file

@ -775,12 +775,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
resp = client.post(f"{server_url}/api/show", json={"name": model})
if resp.status_code == 200:
data = resp.json()
# Check model_info for context length
model_info = data.get("model_info", {})
for key, value in model_info.items():
if "context_length" in key and isinstance(value, (int, float)):
return int(value)
# Check parameters string for num_ctx
# Prefer explicit num_ctx from Modelfile parameters: this is
# the *runtime* context Ollama will actually allocate KV cache
# for. The GGUF model_info.context_length is the training max,
# which can be larger than num_ctx — using it here would let
# Hermes grow conversations past the runtime limit and Ollama
# would silently truncate. Matches query_ollama_num_ctx().
params = data.get("parameters", "")
if "num_ctx" in params:
for line in params.split("\n"):
@ -791,6 +791,11 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
return int(parts[-1])
except ValueError:
pass
# Fall back to GGUF model_info context_length (training max)
model_info = data.get("model_info", {})
for key, value in model_info.items():
if "context_length" in key and isinstance(value, (int, float)):
return int(value)
# LM Studio native API: /api/v1/models returns max_context_length.
# This is more reliable than the OpenAI-compat /v1/models which

View file

@ -70,6 +70,44 @@ class TestQueryLocalContextLengthOllama:
assert result == 32768
def test_ollama_num_ctx_wins_over_model_info(self):
    """When both num_ctx (Modelfile) and model_info (GGUF) are present,
    num_ctx wins because it's the *runtime* context Ollama actually
    allocates KV cache for. The GGUF model_info.context_length is the
    training max — using it would let Hermes grow conversations past
    the runtime limit and Ollama would silently truncate.

    Concrete example: hermes-brain:qwen3-14b-ctx32k is a Modelfile
    derived from qwen3:14b with `num_ctx 32768`, but the underlying
    GGUF reports `qwen3.context_length: 40960` (training max). If
    Hermes used 40960 it would let the conversation grow past 32768
    before compressing, and Ollama would truncate the prefix.
    """
    from agent.model_metadata import _query_local_context_length

    # /api/show response seeds BOTH values: the GGUF training max in
    # model_info AND the Modelfile runtime override in parameters —
    # the test asserts which one the helper prefers.
    show_resp = self._make_resp(200, {
        "model_info": {"qwen3.context_length": 40960},
        "parameters": "num_ctx 32768\ntemperature 0.6\n",
    })
    # 404 so any fallback endpoint cannot supply a competing value.
    models_resp = self._make_resp(404, {})
    client_mock = MagicMock()
    # Make the mock usable as a context manager (httpx.Client is used
    # via `with` in the code under test).
    client_mock.__enter__ = lambda s: client_mock
    client_mock.__exit__ = MagicMock(return_value=False)
    client_mock.post.return_value = show_resp
    client_mock.get.return_value = models_resp
    # Force the Ollama code path regardless of what detect_local_server_type
    # would infer from the (unreachable) base URL.
    with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"), \
            patch("httpx.Client", return_value=client_mock):
        result = _query_local_context_length(
            "hermes-brain:qwen3-14b-ctx32k", "http://100.77.243.5:11434/v1"
        )
    assert result == 32768, (
        f"Expected num_ctx (32768) to win over model_info (40960), got {result}. "
        "If Hermes uses the GGUF training max, conversations will silently truncate."
    )
def test_ollama_show_404_falls_through(self):
"""When /api/show returns 404, falls through to /v1/models/{model}."""
from agent.model_metadata import _query_local_context_length