diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 03f70b3fe4..97ac0b8b8d 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -775,12 +775,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
         resp = client.post(f"{server_url}/api/show", json={"name": model})
         if resp.status_code == 200:
             data = resp.json()
-            # Check model_info for context length
-            model_info = data.get("model_info", {})
-            for key, value in model_info.items():
-                if "context_length" in key and isinstance(value, (int, float)):
-                    return int(value)
-            # Check parameters string for num_ctx
+            # Prefer explicit num_ctx from Modelfile parameters: this is
+            # the *runtime* context Ollama will actually allocate KV cache
+            # for. The GGUF model_info.context_length is the training max,
+            # which can be larger than num_ctx — using it here would let
+            # Hermes grow conversations past the runtime limit and Ollama
+            # would silently truncate. Matches query_ollama_num_ctx().
             params = data.get("parameters", "")
             if "num_ctx" in params:
                 for line in params.split("\n"):
@@ -791,6 +791,11 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
                             return int(parts[-1])
                         except ValueError:
                             pass
+            # Fall back to GGUF model_info context_length (training max)
+            model_info = data.get("model_info", {})
+            for key, value in model_info.items():
+                if "context_length" in key and isinstance(value, (int, float)):
+                    return int(value)
 
     # LM Studio native API: /api/v1/models returns max_context_length.
     # This is more reliable than the OpenAI-compat /v1/models which
diff --git a/tests/agent/test_model_metadata_local_ctx.py b/tests/agent/test_model_metadata_local_ctx.py
index e5ad0dc58c..6852a82cc9 100644
--- a/tests/agent/test_model_metadata_local_ctx.py
+++ b/tests/agent/test_model_metadata_local_ctx.py
@@ -70,6 +70,44 @@ class TestQueryLocalContextLengthOllama:
 
         assert result == 32768
 
+    def test_ollama_num_ctx_wins_over_model_info(self):
+        """When both num_ctx (Modelfile) and model_info (GGUF) are present,
+        num_ctx wins because it's the *runtime* context Ollama actually
+        allocates KV cache for. The GGUF model_info.context_length is the
+        training max — using it would let Hermes grow conversations past
+        the runtime limit and Ollama would silently truncate.
+
+        Concrete example: hermes-brain:qwen3-14b-ctx32k is a Modelfile
+        derived from qwen3:14b with `num_ctx 32768`, but the underlying
+        GGUF reports `qwen3.context_length: 40960` (training max). If
+        Hermes used 40960 it would let the conversation grow past 32768
+        before compressing, and Ollama would truncate the prefix.
+        """
+        from agent.model_metadata import _query_local_context_length
+
+        show_resp = self._make_resp(200, {
+            "model_info": {"qwen3.context_length": 40960},
+            "parameters": "num_ctx 32768\ntemperature 0.6\n",
+        })
+        models_resp = self._make_resp(404, {})
+
+        client_mock = MagicMock()
+        client_mock.__enter__ = lambda s: client_mock
+        client_mock.__exit__ = MagicMock(return_value=False)
+        client_mock.post.return_value = show_resp
+        client_mock.get.return_value = models_resp
+
+        with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"), \
+             patch("httpx.Client", return_value=client_mock):
+            result = _query_local_context_length(
+                "hermes-brain:qwen3-14b-ctx32k", "http://100.77.243.5:11434/v1"
+            )
+
+        assert result == 32768, (
+            f"Expected num_ctx (32768) to win over model_info (40960), got {result}. "
+            "If Hermes uses the GGUF training max, conversations will silently truncate."
+        )
+
     def test_ollama_show_404_falls_through(self):
         """When /api/show returns 404, falls through to /v1/models/{model}."""
         from agent.model_metadata import _query_local_context_length
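For readers skimming the hunks, the new lookup order as a minimal standalone sketch. This is illustrative only, not part of the patch: pick_context_length is a hypothetical helper, and the payload shape simply mirrors the mocked /api/show response used in the new test.

# Illustrative sketch, not part of the patch. Mirrors the order the new
# hunks implement inside _query_local_context_length: num_ctx first,
# GGUF model_info second.
from typing import Optional

def pick_context_length(data: dict) -> Optional[int]:
    # 1) Prefer the Modelfile's num_ctx: the runtime context the server
    #    actually allocates KV cache for.
    params = data.get("parameters", "")
    if "num_ctx" in params:
        for line in params.split("\n"):
            if "num_ctx" in line:
                parts = line.split()
                try:
                    return int(parts[-1])
                except ValueError:
                    pass
    # 2) Fall back to the GGUF model_info context_length (training max).
    for key, value in data.get("model_info", {}).items():
        if "context_length" in key and isinstance(value, (int, float)):
            return int(value)
    return None

# With the payload from the new test, num_ctx wins:
# pick_context_length({"model_info": {"qwen3.context_length": 40960},
#                      "parameters": "num_ctx 32768\ntemperature 0.6\n"})
# -> 32768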