mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(agent): prefer Ollama Modelfile num_ctx over GGUF training max
_query_local_context_length was checking model_info.context_length (the GGUF training max) before num_ctx (the Modelfile runtime override), the inverse of query_ollama_num_ctx. The two helpers therefore disagreed on the same model: hermes-brain:qwen3-14b-ctx32k # Modelfile: num_ctx 32768 underlying qwen3:14b GGUF # qwen3.context_length: 40960 query_ollama_num_ctx correctly returned 32768 (the value Ollama will actually allocate KV cache for). _query_local_context_length returned 40960, which let ContextCompressor grow conversations past 32768 before triggering compression — at which point Ollama silently truncated the prefix, corrupting context. Swap the order so num_ctx is checked first, matching query_ollama_num_ctx. Adds a regression test that seeds both values and asserts num_ctx wins. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
39b83f3443
commit
3e99964789
2 changed files with 49 additions and 6 deletions
|
|
@ -775,12 +775,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
|||
resp = client.post(f"{server_url}/api/show", json={"name": model})
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# Check model_info for context length
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
# Check parameters string for num_ctx
|
||||
# Prefer explicit num_ctx from Modelfile parameters: this is
|
||||
# the *runtime* context Ollama will actually allocate KV cache
|
||||
# for. The GGUF model_info.context_length is the training max,
|
||||
# which can be larger than num_ctx — using it here would let
|
||||
# Hermes grow conversations past the runtime limit and Ollama
|
||||
# would silently truncate. Matches query_ollama_num_ctx().
|
||||
params = data.get("parameters", "")
|
||||
if "num_ctx" in params:
|
||||
for line in params.split("\n"):
|
||||
|
|
@ -791,6 +791,11 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
|||
return int(parts[-1])
|
||||
except ValueError:
|
||||
pass
|
||||
# Fall back to GGUF model_info context_length (training max)
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
|
||||
# LM Studio native API: /api/v1/models returns max_context_length.
|
||||
# This is more reliable than the OpenAI-compat /v1/models which
|
||||
|
|
|
|||
|
|
@ -70,6 +70,44 @@ class TestQueryLocalContextLengthOllama:
|
|||
|
||||
assert result == 32768
|
||||
|
||||
def test_ollama_num_ctx_wins_over_model_info(self):
    """Modelfile ``num_ctx`` must take priority over GGUF metadata.

    ``model_info.context_length`` reports the model's *training* maximum,
    while ``num_ctx`` in the parameters string is the runtime window that
    Ollama actually allocates KV cache for. If the helper preferred the
    training max (40960 here) over num_ctx (32768), Hermes would let a
    conversation grow past the runtime limit and Ollama would silently
    truncate the prefix. This seeds both values in one /api/show payload
    and asserts num_ctx wins, matching query_ollama_num_ctx()'s precedence.

    Concrete example: hermes-brain:qwen3-14b-ctx32k is a Modelfile derived
    from qwen3:14b with `num_ctx 32768`, while the underlying GGUF reports
    `qwen3.context_length: 40960` (training max).
    """
    from agent.model_metadata import _query_local_context_length

    # One /api/show payload carrying BOTH the GGUF training max and the
    # Modelfile runtime override — the precedence conflict under test.
    show_payload = {
        "model_info": {"qwen3.context_length": 40960},
        "parameters": "num_ctx 32768\ntemperature 0.6\n",
    }
    api_show = self._make_resp(200, show_payload)
    list_models = self._make_resp(404, {})

    # Context-manager-capable httpx.Client stand-in: POST serves /api/show,
    # GET serves the (irrelevant here) /v1/models fallback.
    fake_client = MagicMock()
    fake_client.__enter__ = lambda s: fake_client
    fake_client.__exit__ = MagicMock(return_value=False)
    fake_client.post.return_value = api_show
    fake_client.get.return_value = list_models

    with patch("agent.model_metadata.detect_local_server_type", return_value="ollama"):
        with patch("httpx.Client", return_value=fake_client):
            result = _query_local_context_length(
                "hermes-brain:qwen3-14b-ctx32k", "http://100.77.243.5:11434/v1"
            )

    assert result == 32768, (
        f"Expected num_ctx (32768) to win over model_info (40960), got {result}. "
        "If Hermes uses the GGUF training max, conversations will silently truncate."
    )
|
||||
|
||||
def test_ollama_show_404_falls_through(self):
|
||||
"""When /api/show returns 404, falls through to /v1/models/{model}."""
|
||||
from agent.model_metadata import _query_local_context_length
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue