mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(agent): fail fast on small Ollama runtime context
This commit is contained in:
parent
e77f1ed5f7
commit
f6f25b9449
2 changed files with 97 additions and 1 deletions
|
|
@ -46,6 +46,7 @@ from agent.message_sanitization import (
|
|||
_strip_non_ascii,
|
||||
)
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
estimate_messages_tokens_rough,
|
||||
estimate_request_tokens_rough,
|
||||
get_next_probe_tier,
|
||||
|
|
@ -73,6 +74,50 @@ from utils import base_url_host_matches, env_var_enabled
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ollama_context_limit_error(agent: Any, request_tokens: int) -> Optional[str]:
|
||||
"""Return a user-facing error when Ollama is loaded with too little context."""
|
||||
if not getattr(agent, "tools", None):
|
||||
return None
|
||||
|
||||
runtime_ctx = getattr(agent, "_ollama_num_ctx", None)
|
||||
if not isinstance(runtime_ctx, int) or runtime_ctx <= 0:
|
||||
return None
|
||||
if runtime_ctx >= MINIMUM_CONTEXT_LENGTH:
|
||||
return None
|
||||
|
||||
model = getattr(agent, "model", "") or "the selected model"
|
||||
base_url = getattr(agent, "base_url", "") or "unknown base URL"
|
||||
provider = getattr(agent, "provider", "") or "unknown"
|
||||
tool_count = len(getattr(agent, "tools", None) or [])
|
||||
|
||||
logger.warning(
|
||||
"Ollama runtime context too small for Hermes tool use: "
|
||||
"model=%s provider=%s base_url=%s runtime_context=%d "
|
||||
"minimum_context=%d estimated_request_tokens=%d tool_count=%d "
|
||||
"session=%s",
|
||||
model,
|
||||
provider,
|
||||
base_url,
|
||||
runtime_ctx,
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
request_tokens,
|
||||
tool_count,
|
||||
getattr(agent, "session_id", None) or "none",
|
||||
)
|
||||
|
||||
return (
|
||||
f"Ollama loaded `{model}` with only {runtime_ctx:,} tokens of runtime "
|
||||
f"context, but Hermes needs at least {MINIMUM_CONTEXT_LENGTH:,} tokens "
|
||||
"for reliable tool use.\n\n"
|
||||
"Increase the Ollama context for this model and restart/reload the "
|
||||
"model before trying again. A known-good starting point is 65,536 "
|
||||
"tokens. In Hermes config, set `model.ollama_num_ctx: 65536` "
|
||||
"(and `model.context_length: 65536` if you also override the displayed "
|
||||
"model context). If you manage the model through an Ollama Modelfile, "
|
||||
"set `PARAMETER num_ctx 65536` there instead."
|
||||
)
|
||||
|
||||
|
||||
def _ra():
|
||||
"""Lazy reference to ``run_agent`` so callers can patch
|
||||
``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` /
|
||||
|
|
@ -527,6 +572,7 @@ def run_conversation(
|
|||
api_call_count = 0
|
||||
final_response = None
|
||||
interrupted = False
|
||||
failed = False
|
||||
codex_ack_continuations = 0
|
||||
length_continue_retries = 0
|
||||
truncated_tool_call_retries = 0
|
||||
|
|
@ -883,6 +929,26 @@ def run_conversation(
|
|||
# Calculate approximate request size for logging
|
||||
total_chars = sum(len(str(msg)) for msg in api_messages)
|
||||
approx_tokens = estimate_messages_tokens_rough(api_messages)
|
||||
approx_request_tokens = estimate_request_tokens_rough(
|
||||
api_messages, tools=agent.tools or None
|
||||
)
|
||||
|
||||
_runtime_context_error = _ollama_context_limit_error(
|
||||
agent, approx_request_tokens
|
||||
)
|
||||
if _runtime_context_error:
|
||||
final_response = _runtime_context_error
|
||||
failed = True
|
||||
_turn_exit_reason = "ollama_runtime_context_too_small"
|
||||
messages.append({"role": "assistant", "content": final_response})
|
||||
agent._emit_status("❌ Ollama runtime context is too small for Hermes tool use")
|
||||
api_call_count -= 1
|
||||
agent._api_call_count = api_call_count
|
||||
try:
|
||||
agent.iteration_budget.refund()
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
|
||||
# Thinking spinner for quiet mode (animated during API call)
|
||||
thinking_spinner = None
|
||||
|
|
@ -3848,7 +3914,11 @@ def run_conversation(
|
|||
)
|
||||
|
||||
# Determine if conversation completed successfully
|
||||
completed = final_response is not None and api_call_count < agent.max_iterations
|
||||
completed = (
|
||||
final_response is not None
|
||||
and api_call_count < agent.max_iterations
|
||||
and not failed
|
||||
)
|
||||
|
||||
# Save trajectory if enabled. ``user_message`` may be a multimodal
|
||||
# list of parts; the trajectory format wants a plain string.
|
||||
|
|
@ -3998,6 +4068,7 @@ def run_conversation(
|
|||
"api_calls": api_call_count,
|
||||
"completed": completed,
|
||||
"turn_exit_reason": _turn_exit_reason,
|
||||
"failed": failed,
|
||||
"partial": False, # True only when stopped due to invalid tool calls
|
||||
"interrupted": interrupted,
|
||||
"response_previewed": getattr(agent, "_response_was_previewed", False),
|
||||
|
|
|
|||
|
|
@ -2636,6 +2636,31 @@ class TestRunConversation:
|
|||
assert result["final_response"] == "Final answer"
|
||||
assert result["completed"] is True
|
||||
|
||||
def test_ollama_small_runtime_context_fails_before_api_call(self, agent, caplog):
|
||||
self._setup_agent(agent)
|
||||
agent.model = "qwen3.5:9b"
|
||||
agent.provider = "custom"
|
||||
agent.base_url = "http://host.docker.internal:11434/v1"
|
||||
agent._ollama_num_ctx = 4096
|
||||
|
||||
with (
|
||||
patch.object(agent, "_persist_session"),
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
caplog.at_level(logging.WARNING, logger="agent.conversation_loop"),
|
||||
):
|
||||
result = agent.run_conversation("Call ps -aux")
|
||||
|
||||
assert result["failed"] is True
|
||||
assert result["completed"] is False
|
||||
assert result["api_calls"] == 0
|
||||
assert result["turn_exit_reason"] == "ollama_runtime_context_too_small"
|
||||
assert "Ollama loaded `qwen3.5:9b` with only 4,096 tokens" in result["final_response"]
|
||||
assert "model.ollama_num_ctx: 65536" in result["final_response"]
|
||||
assert not agent.client.chat.completions.create.called
|
||||
assert "Ollama runtime context too small for Hermes tool use" in caplog.text
|
||||
assert "runtime_context=4096" in caplog.text
|
||||
|
||||
def test_tool_calls_then_stop(self, agent):
|
||||
self._setup_agent(agent)
|
||||
tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue