From f6f25b9449dbf05e1aa59759e2c1faa0d46e681b Mon Sep 17 00:00:00 2001
From: helix4u <4317663+helix4u@users.noreply.github.com>
Date: Thu, 21 May 2026 14:49:02 -0600
Subject: [PATCH] fix(agent): fail fast on small Ollama runtime context

---
 agent/conversation_loop.py        | 73 ++++++++++++++++++++++++++++++-
 tests/run_agent/test_run_agent.py | 25 +++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index caac0d3e8f2..d3cad0c35fd 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -46,6 +46,7 @@ from agent.message_sanitization import (
     _strip_non_ascii,
 )
 from agent.model_metadata import (
+    MINIMUM_CONTEXT_LENGTH,
     estimate_messages_tokens_rough,
     estimate_request_tokens_rough,
     get_next_probe_tier,
@@ -73,6 +74,50 @@ from utils import base_url_host_matches, env_var_enabled
 logger = logging.getLogger(__name__)
 
 
+def _ollama_context_limit_error(agent: Any, request_tokens: int) -> Optional[str]:
+    """Return a user-facing error when Ollama is loaded with too little context."""
+    if not getattr(agent, "tools", None):
+        return None
+
+    runtime_ctx = getattr(agent, "_ollama_num_ctx", None)
+    if not isinstance(runtime_ctx, int) or runtime_ctx <= 0:
+        return None
+    if runtime_ctx >= MINIMUM_CONTEXT_LENGTH:
+        return None
+
+    model = getattr(agent, "model", "") or "the selected model"
+    base_url = getattr(agent, "base_url", "") or "unknown base URL"
+    provider = getattr(agent, "provider", "") or "unknown"
+    tool_count = len(getattr(agent, "tools", None) or [])
+
+    logger.warning(
+        "Ollama runtime context too small for Hermes tool use: "
+        "model=%s provider=%s base_url=%s runtime_context=%d "
+        "minimum_context=%d estimated_request_tokens=%d tool_count=%d "
+        "session=%s",
+        model,
+        provider,
+        base_url,
+        runtime_ctx,
+        MINIMUM_CONTEXT_LENGTH,
+        request_tokens,
+        tool_count,
+        getattr(agent, "session_id", None) or "none",
+    )
+
+    return (
+        f"Ollama loaded `{model}` with only {runtime_ctx:,} tokens of runtime "
+        f"context, but Hermes needs at least {MINIMUM_CONTEXT_LENGTH:,} tokens "
+        "for reliable tool use.\n\n"
+        "Increase the Ollama context for this model and restart/reload the "
+        "model before trying again. A known-good starting point is 65,536 "
+        "tokens. In Hermes config, set `model.ollama_num_ctx: 65536` "
+        "(and `model.context_length: 65536` if you also override the displayed "
+        "model context). If you manage the model through an Ollama Modelfile, "
+        "set `PARAMETER num_ctx 65536` there instead."
+    )
+
+
 def _ra():
     """Lazy reference to ``run_agent`` so callers can patch
     ``run_agent.handle_function_call`` / ``run_agent._set_interrupt`` /
@@ -527,6 +572,7 @@ def run_conversation(
     api_call_count = 0
     final_response = None
     interrupted = False
+    failed = False
     codex_ack_continuations = 0
     length_continue_retries = 0
     truncated_tool_call_retries = 0
@@ -883,6 +929,26 @@ def run_conversation(
         # Calculate approximate request size for logging
         total_chars = sum(len(str(msg)) for msg in api_messages)
         approx_tokens = estimate_messages_tokens_rough(api_messages)
+        approx_request_tokens = estimate_request_tokens_rough(
+            api_messages, tools=agent.tools or None
+        )
+
+        _runtime_context_error = _ollama_context_limit_error(
+            agent, approx_request_tokens
+        )
+        if _runtime_context_error:
+            final_response = _runtime_context_error
+            failed = True
+            _turn_exit_reason = "ollama_runtime_context_too_small"
+            messages.append({"role": "assistant", "content": final_response})
+            agent._emit_status("❌ Ollama runtime context is too small for Hermes tool use")
+            api_call_count -= 1
+            agent._api_call_count = api_call_count
+            try:
+                agent.iteration_budget.refund()
+            except Exception:
+                pass
+            break
         
         # Thinking spinner for quiet mode (animated during API call)
         thinking_spinner = None
@@ -3848,7 +3914,11 @@ def run_conversation(
                 )
 
     # Determine if conversation completed successfully
-    completed = final_response is not None and api_call_count < agent.max_iterations
+    completed = (
+        final_response is not None
+        and api_call_count < agent.max_iterations
+        and not failed
+    )
 
     # Save trajectory if enabled.  ``user_message`` may be a multimodal
     # list of parts; the trajectory format wants a plain string.
@@ -3998,6 +4068,7 @@ def run_conversation(
         "api_calls": api_call_count,
         "completed": completed,
         "turn_exit_reason": _turn_exit_reason,
+        "failed": failed,
         "partial": False,  # True only when stopped due to invalid tool calls
         "interrupted": interrupted,
         "response_previewed": getattr(agent, "_response_was_previewed", False),
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 821228075c3..3d0dcedddd0 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -2636,6 +2636,31 @@ class TestRunConversation:
         assert result["final_response"] == "Final answer"
         assert result["completed"] is True
 
+    def test_ollama_small_runtime_context_fails_before_api_call(self, agent, caplog):
+        self._setup_agent(agent)
+        agent.model = "qwen3.5:9b"
+        agent.provider = "custom"
+        agent.base_url = "http://host.docker.internal:11434/v1"
+        agent._ollama_num_ctx = 4096
+
+        with (
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+            caplog.at_level(logging.WARNING, logger="agent.conversation_loop"),
+        ):
+            result = agent.run_conversation("Call ps -aux")
+
+        assert result["failed"] is True
+        assert result["completed"] is False
+        assert result["api_calls"] == 0
+        assert result["turn_exit_reason"] == "ollama_runtime_context_too_small"
+        assert "Ollama loaded `qwen3.5:9b` with only 4,096 tokens" in result["final_response"]
+        assert "model.ollama_num_ctx: 65536" in result["final_response"]
+        assert not agent.client.chat.completions.create.called
+        assert "Ollama runtime context too small for Hermes tool use" in caplog.text
+        assert "runtime_context=4096" in caplog.text
+
     def test_tool_calls_then_stop(self, agent):
         self._setup_agent(agent)
         tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")