mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(agent): prefer Ollama Modelfile num_ctx over GGUF training max
_query_local_context_length was checking model_info.context_length (the GGUF training max) before num_ctx (the Modelfile runtime override), the inverse of query_ollama_num_ctx. The two helpers therefore disagreed on the same model: hermes-brain:qwen3-14b-ctx32k # Modelfile: num_ctx 32768 underlying qwen3:14b GGUF # qwen3.context_length: 40960 query_ollama_num_ctx correctly returned 32768 (the value Ollama will actually allocate KV cache for). _query_local_context_length returned 40960, which let ContextCompressor grow conversations past 32768 before triggering compression — at which point Ollama silently truncated the prefix, corrupting context. Swap the order so num_ctx is checked first, matching query_ollama_num_ctx. Adds a parametrized test that seeds both values and asserts num_ctx wins. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
39b83f3443
commit
3e99964789
2 changed files with 49 additions and 6 deletions
|
|
@ -775,12 +775,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
|||
resp = client.post(f"{server_url}/api/show", json={"name": model})
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# Check model_info for context length
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
# Check parameters string for num_ctx
|
||||
# Prefer explicit num_ctx from Modelfile parameters: this is
|
||||
# the *runtime* context Ollama will actually allocate KV cache
|
||||
# for. The GGUF model_info.context_length is the training max,
|
||||
# which can be larger than num_ctx — using it here would let
|
||||
# Hermes grow conversations past the runtime limit and Ollama
|
||||
# would silently truncate. Matches query_ollama_num_ctx().
|
||||
params = data.get("parameters", "")
|
||||
if "num_ctx" in params:
|
||||
for line in params.split("\n"):
|
||||
|
|
@ -791,6 +791,11 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
|||
return int(parts[-1])
|
||||
except ValueError:
|
||||
pass
|
||||
# Fall back to GGUF model_info context_length (training max)
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
|
||||
# LM Studio native API: /api/v1/models returns max_context_length.
|
||||
# This is more reliable than the OpenAI-compat /v1/models which
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue