From f6f25b9449dbf05e1aa59759e2c1faa0d46e681b Mon Sep 17 00:00:00 2001
From: helix4u <4317663+helix4u@users.noreply.github.com>
Date: Thu, 21 May 2026 14:49:02 -0600
Subject: [PATCH] fix(agent): fail fast on small Ollama runtime context

---
 agent/conversation_loop.py        | 73 ++++++++++++++++++++++++++++++-
 tests/run_agent/test_run_agent.py | 25 +++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index caac0d3e8f2..d3cad0c35fd 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -46,6 +46,7 @@ from agent.message_sanitization import (
     _strip_non_ascii,
 )
 from agent.model_metadata import (
+    MINIMUM_CONTEXT_LENGTH,
     estimate_messages_tokens_rough,
     estimate_request_tokens_rough,
     get_next_probe_tier,
@@ -73,6 +74,50 @@ from utils import base_url_host_matches, env_var_enabled
 logger = logging.getLogger(__name__)
 
 
+def _ollama_context_limit_error(agent: Any, request_tokens: int) -> Optional[str]:
+    """Return a user-facing error when Ollama is loaded with too little context."""
+    if not getattr(agent, "tools", None):
+        return None
+
+    runtime_ctx = getattr(agent, "_ollama_num_ctx", None)
+    if not isinstance(runtime_ctx, int) or runtime_ctx <= 0:
+        return None
+    if runtime_ctx >= MINIMUM_CONTEXT_LENGTH:
+        return None
+
+    model = getattr(agent, "model", "") or "the selected model"
+    base_url = getattr(agent, "base_url", "") or "unknown base URL"
+    provider = getattr(agent, "provider", "") or "unknown"
+    tool_count = len(getattr(agent, "tools", None) or [])
+
+    logger.warning(
+        "Ollama runtime context too small for Hermes tool use: "
+        "model=%s provider=%s base_url=%s runtime_context=%d "
+        "minimum_context=%d estimated_request_tokens=%d tool_count=%d "
+        "session=%s",
+        model,
+        provider,
+        base_url,
+        runtime_ctx,
+        MINIMUM_CONTEXT_LENGTH,
+        request_tokens,
+        tool_count,
+        getattr(agent, "session_id", None) or "none",
+    )
+
+    return (
+        f"Ollama loaded `{model}` with only {runtime_ctx:,} tokens of runtime "
+        f"context, but Hermes needs at least {MINIMUM_CONTEXT_LENGTH:,} tokens "
+        "for reliable tool use.\n\n"
+        "Increase the Ollama context for this model and restart/reload the "
+        "model before trying again. A known-good starting point is 65,536 "
+        "tokens. In Hermes config, set `model.ollama_num_ctx: 65536` "
+        "(and `model.context_length: 65536` if you also override the displayed "
+        "model context). If you manage the model through an Ollama Modelfile, "
+        "set `PARAMETER num_ctx 65536` there instead."
+    )
+
+
 def _ra():
     """Lazy reference to ``run_agent`` so callers can patch
     ``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` /
@@ -527,6 +572,7 @@ def run_conversation(
     api_call_count = 0
     final_response = None
     interrupted = False
+    failed = False
     codex_ack_continuations = 0
     length_continue_retries = 0
     truncated_tool_call_retries = 0
@@ -883,6 +929,26 @@ def run_conversation(
         # Calculate approximate request size for logging
         total_chars = sum(len(str(msg)) for msg in api_messages)
         approx_tokens = estimate_messages_tokens_rough(api_messages)
+        approx_request_tokens = estimate_request_tokens_rough(
+            api_messages, tools=agent.tools or None
+        )
+
+        _runtime_context_error = _ollama_context_limit_error(
+            agent, approx_request_tokens
+        )
+        if _runtime_context_error:
+            final_response = _runtime_context_error
+            failed = True
+            _turn_exit_reason = "ollama_runtime_context_too_small"
+            messages.append({"role": "assistant", "content": final_response})
+            agent._emit_status("❌ Ollama runtime context is too small for Hermes tool use")
+            api_call_count -= 1
+            agent._api_call_count = api_call_count
+            try:
+                agent.iteration_budget.refund()
+            except Exception:
+                pass
+            break
 
         # Thinking spinner for quiet mode (animated during API call)
         thinking_spinner = None
@@ -3848,7 +3914,11 @@ def run_conversation(
         )
 
     # Determine if conversation completed successfully
-    completed = final_response is not None and api_call_count < agent.max_iterations
+    completed = (
+        final_response is not None
+        and api_call_count < agent.max_iterations
+        and not failed
+    )
 
     # Save trajectory if enabled. ``user_message`` may be a multimodal
     # list of parts; the trajectory format wants a plain string.
@@ -3998,6 +4068,7 @@ def run_conversation(
         "api_calls": api_call_count,
         "completed": completed,
         "turn_exit_reason": _turn_exit_reason,
+        "failed": failed,
         "partial": False,  # True only when stopped due to invalid tool calls
         "interrupted": interrupted,
         "response_previewed": getattr(agent, "_response_was_previewed", False),
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 821228075c3..3d0dcedddd0 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -2636,6 +2636,31 @@ class TestRunConversation:
         assert result["final_response"] == "Final answer"
         assert result["completed"] is True
 
+    def test_ollama_small_runtime_context_fails_before_api_call(self, agent, caplog):
+        self._setup_agent(agent)
+        agent.model = "qwen3.5:9b"
+        agent.provider = "custom"
+        agent.base_url = "http://host.docker.internal:11434/v1"
+        agent._ollama_num_ctx = 4096
+
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+            caplog.at_level(logging.WARNING, logger="agent.conversation_loop"),
+        ):
+            result = agent.run_conversation("Call ps -aux")
+
+        assert result["failed"] is True
+        assert result["completed"] is False
+        assert result["api_calls"] == 0
+        assert result["turn_exit_reason"] == "ollama_runtime_context_too_small"
+        assert "Ollama loaded `qwen3.5:9b` with only 4,096 tokens" in result["final_response"]
+        assert "model.ollama_num_ctx: 65536" in result["final_response"]
+        assert not agent.client.chat.completions.create.called
+        assert "Ollama runtime context too small for Hermes tool use" in caplog.text
+        assert "runtime_context=4096" in caplog.text
+
     def test_tool_calls_then_stop(self, agent):
         self._setup_agent(agent)
         tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")