diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 03f70b3fe4..97ac0b8b8d 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -775,12 +775,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
         resp = client.post(f"{server_url}/api/show", json={"name": model})
         if resp.status_code == 200:
             data = resp.json()
-            # Check model_info for context length
-            model_info = data.get("model_info", {})
-            for key, value in model_info.items():
-                if "context_length" in key and isinstance(value, (int, float)):
-                    return int(value)
-            # Check parameters string for num_ctx
+            # Prefer explicit num_ctx from Modelfile parameters: this is
+            # the *runtime* context Ollama will actually allocate KV cache
+            # for. The GGUF model_info.context_length is the training max,
+            # which can be larger than num_ctx — using it here would let
+            # Hermes grow conversations past the runtime limit and Ollama
+            # would silently truncate. Matches query_ollama_num_ctx().
             params = data.get("parameters", "")
             if "num_ctx" in params:
                 for line in params.split("\n"):
@@ -791,6 +791,11 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
                             return int(parts[-1])
                         except ValueError:
                             pass
+            # Fall back to GGUF model_info context_length (training max)
+            model_info = data.get("model_info", {})
+            for key, value in model_info.items():
+                if "context_length" in key and isinstance(value, (int, float)):
+                    return int(value)
 
     # LM Studio native API: /api/v1/models returns max_context_length.
     # This is more reliable than the OpenAI-compat /v1/models which
diff --git a/tests/agent/test_model_metadata_local_ctx.py b/tests/agent/test_model_metadata_local_ctx.py
index e5ad0dc58c..6852a82cc9 100644
--- a/tests/agent/test_model_metadata_local_ctx.py
+++ b/tests/agent/test_model_metadata_local_ctx.py
@@ -70,6 +70,44 @@ class TestQueryLocalContextLengthOllama:
 
         assert result == 32768
 
+    def test_ollama_num_ctx_wins_over_model_info(self):
+        """When both num_ctx (Modelfile) and model_info (GGUF) are present,
+        num_ctx wins because it's the *runtime* context Ollama actually
+        allocates KV cache for. The GGUF model_info.context_length is the
+        training max — using it would let Hermes grow conversations past
+        the runtime limit and Ollama would silently truncate.
+
+        Concrete example: hermes-brain:qwen3-14b-ctx32k is a Modelfile
+        derived from qwen3:14b with `num_ctx 32768`, but the underlying
+        GGUF reports `qwen3.context_length: 40960` (training max). If
+        Hermes used 40960 it would let the conversation grow past 32768
+        before compressing, and Ollama would truncate the prefix.
+        """
+        from agent.model_metadata import _query_local_context_length
+
+        show_resp = self._make_resp(200, {
+            "model_info": {"qwen3.context_length": 40960},
+            "parameters": "num_ctx 32768\ntemperature 0.6\n",
+        })
+        models_resp = self._make_resp(404, {})
+
+        client_mock = MagicMock()
+        client_mock.__enter__ = lambda s: client_mock
+        client_mock.__exit__ = MagicMock(return_value=False)
+        client_mock.post.return_value = show_resp
+        client_mock.get.return_value = models_resp
+
+        with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"), \
+             patch("httpx.Client", return_value=client_mock):
+            result = _query_local_context_length(
+                "hermes-brain:qwen3-14b-ctx32k", "http://100.77.243.5:11434/v1"
+            )
+
+        assert result == 32768, (
+            f"Expected num_ctx (32768) to win over model_info (40960), got {result}. "
+            "If Hermes uses the GGUF training max, conversations will silently truncate."
+        )
+
     def test_ollama_show_404_falls_through(self):
         """When /api/show returns 404, falls through to /v1/models/{model}."""
         from agent.model_metadata import _query_local_context_length
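For readers skimming the hunks, the new lookup order as a minimal standalone sketch. This is illustrative only, not part of the patch: pick_context_length is a hypothetical helper, and the payload shape simply mirrors the mocked /api/show response used in the new test.

# Illustrative sketch, not part of the patch. Mirrors the order the new
# hunks implement inside _query_local_context_length: num_ctx first,
# GGUF model_info second.
from typing import Optional

def pick_context_length(data: dict) -> Optional[int]:
    # 1) Prefer the Modelfile's num_ctx: the runtime context the server
    #    actually allocates KV cache for.
    params = data.get("parameters", "")
    if "num_ctx" in params:
        for line in params.split("\n"):
            if "num_ctx" in line:
                parts = line.split()
                try:
                    return int(parts[-1])
                except ValueError:
                    pass
    # 2) Fall back to the GGUF model_info context_length (training max).
    for key, value in data.get("model_info", {}).items():
        if "context_length" in key and isinstance(value, (int, float)):
            return int(value)
    return None

# With the payload from the new test, num_ctx wins:
# pick_context_length({"model_info": {"qwen3.context_length": 40960},
#                      "parameters": "num_ctx 32768\ntemperature 0.6\n"})
# -> 32768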