diff --git a/cli.py b/cli.py index dcd97139809..63f7e31096f 100644 --- a/cli.py +++ b/cli.py @@ -4958,20 +4958,22 @@ class HermesCLI: if os.environ.get("HERMES_DEFER_AGENT_STARTUP") != "1": self._show_tool_availability_warnings() - # Warn about very low context lengths (common with local servers) - if ctx_len and ctx_len <= 8192: + # Warn about low context lengths (common with local servers). Keep + # this tied to the runtime guard so guidance cannot drift again. + from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + if ctx_len and ctx_len < MINIMUM_CONTEXT_LENGTH: self._console_print() self._console_print( f"[yellow]⚠️ Context length is only {ctx_len:,} tokens — " f"this is likely too low for agent use with tools.[/]" ) self._console_print( - "[dim] Hermes needs 16k–32k minimum. Tool schemas + system prompt alone use ~4k–8k.[/]" + f"[dim] Hermes needs at least {MINIMUM_CONTEXT_LENGTH:,} tokens. Tool schemas + system prompt use a large fixed prefix.[/]" ) base_url = getattr(self, "base_url", "") or "" if "11434" in base_url or "ollama" in base_url.lower(): self._console_print( - "[dim] Ollama fix: OLLAMA_CONTEXT_LENGTH=32768 ollama serve[/]" + f"[dim] Ollama fix: OLLAMA_CONTEXT_LENGTH={MINIMUM_CONTEXT_LENGTH} ollama serve[/]" ) elif "1234" in base_url: self._console_print( diff --git a/tests/cli/test_cli_context_warning.py b/tests/cli/test_cli_context_warning.py index bf0c5aac43a..3a2b404bda1 100644 --- a/tests/cli/test_cli_context_warning.py +++ b/tests/cli/test_cli_context_warning.py @@ -6,6 +6,8 @@ from unittest.mock import MagicMock, patch import pytest +from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + @pytest.fixture def _isolate(tmp_path, monkeypatch): @@ -44,17 +46,18 @@ def cli_obj(_isolate): class TestLowContextWarning: """Tests that the CLI warns about low context lengths.""" - def test_no_warning_for_normal_context(self, cli_obj): - """No warning when context is 32k+.""" + def test_warning_for_below_minimum_context(self, cli_obj): + 
"""Warning shown when context is below Hermes' minimum.""" cli_obj.agent.context_compressor.context_length = 32768 with patch("cli.get_tool_definitions", return_value=[]), \ patch("cli.build_welcome_banner"): cli_obj.show_banner() - # Check that no yellow warning was printed calls = [str(c) for c in cli_obj.console.print.call_args_list] warning_calls = [c for c in calls if "too low" in c] - assert len(warning_calls) == 0 + assert len(warning_calls) == 1 + minimum_calls = [c for c in calls if f"{MINIMUM_CONTEXT_LENGTH:,}" in c] + assert minimum_calls def test_warning_for_low_context(self, cli_obj): """Warning shown when context is 4096 (Ollama default).""" @@ -80,19 +83,19 @@ class TestLowContextWarning: assert len(warning_calls) == 1 def test_no_warning_at_boundary(self, cli_obj): - """No warning at exactly 8192 — 8192 is borderline but included in warning.""" - cli_obj.agent.context_compressor.context_length = 8192 + """No warning at exactly Hermes' minimum context length.""" + cli_obj.agent.context_compressor.context_length = MINIMUM_CONTEXT_LENGTH with patch("cli.get_tool_definitions", return_value=[]), \ patch("cli.build_welcome_banner"): cli_obj.show_banner() calls = [str(c) for c in cli_obj.console.print.call_args_list] warning_calls = [c for c in calls if "too low" in c] - assert len(warning_calls) == 1 # 8192 is still warned about + assert len(warning_calls) == 0 def test_no_warning_above_boundary(self, cli_obj): - """No warning at 16384.""" - cli_obj.agent.context_compressor.context_length = 16384 + """No warning above Hermes' minimum context length.""" + cli_obj.agent.context_compressor.context_length = MINIMUM_CONTEXT_LENGTH + 1 with patch("cli.get_tool_definitions", return_value=[]), \ patch("cli.build_welcome_banner"): cli_obj.show_banner() @@ -112,6 +115,7 @@ class TestLowContextWarning: calls = [str(c) for c in cli_obj.console.print.call_args_list] ollama_hints = [c for c in calls if "OLLAMA_CONTEXT_LENGTH" in c] assert len(ollama_hints) == 1 + 
assert str(MINIMUM_CONTEXT_LENGTH) in ollama_hints[0] def test_lm_studio_specific_hint(self, cli_obj): """LM Studio-specific fix shown when port 1234 detected.""" diff --git a/website/docs/guides/local-llm-on-mac.md b/website/docs/guides/local-llm-on-mac.md index 975ba6b12e1..9ac7bd9b97e 100644 --- a/website/docs/guides/local-llm-on-mac.md +++ b/website/docs/guides/local-llm-on-mac.md @@ -110,9 +110,9 @@ The `--cache-type-k q4_0 --cache-type-v q4_0` flags are the most important optim | q8_0 | ~8 GB | | **q4_0** | **~4 GB** | -On an 8 GB Mac, use `q4_0` KV cache and reduce context to `-c 32768` (32K). On 16 GB, you can comfortably do 128K context. On 32 GB+, you can run larger models or multiple parallel slots. +On an 8 GB Mac, use `q4_0` KV cache and choose a smaller model that can still fit Hermes' 64K minimum context. On 16 GB, you can comfortably do 128K context. On 32 GB+, you can run larger models or multiple parallel slots. -If you're still running out of memory, reduce context size first (`-c`), then try a smaller quantization (Q3_K_M instead of Q4_K_M). +If you're still running out of memory, reduce context only while staying at or above Hermes' 64K minimum; otherwise switch to a smaller model or smaller quantization (Q3_K_M instead of Q4_K_M). ### Test it diff --git a/website/docs/guides/local-ollama-setup.md b/website/docs/guides/local-ollama-setup.md index 9e2fab5e5de..188fbc99273 100644 --- a/website/docs/guides/local-ollama-setup.md +++ b/website/docs/guides/local-ollama-setup.md @@ -156,19 +156,19 @@ Switch models on the fly inside a session: ### Increase Ollama's Context Window -By default, Ollama uses a 2048-token context. For agentic work (tool calls, long conversations), you need more: +By default, Ollama uses a 2048-token context. 
Hermes requires at least 64,000 tokens for agentic work with tools: ```bash # Create a Modelfile that extends context cat > /tmp/Modelfile << 'EOF' FROM gemma4:31b -PARAMETER num_ctx 16384 +PARAMETER num_ctx 64000 EOF -ollama create gemma4-16k -f /tmp/Modelfile +ollama create gemma4-64k -f /tmp/Modelfile ``` -Then update your Hermes config to use `gemma4-16k` as the model name. +Then update your Hermes config to use `gemma4-64k` as the model name. ### Keep the Model Loaded @@ -311,7 +311,7 @@ Your only cost is electricity — roughly $0.01–0.05 per session depending on ## What's Better with Cloud Models - **Very complex multi-step reasoning** — 70B+ or cloud models like Claude Opus are noticeably better -- **Long context windows** — cloud models offer 100K–1M tokens; local models are typically 8K–32K +- **Long context windows** — cloud models offer 100K–1M tokens; local runtimes often default below Hermes' 64K minimum unless you configure them - **Speed on large responses** — cloud inference is faster than CPU-only local for long generations The sweet spot: use local for everyday tasks, set up a cloud fallback for the hard stuff. diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index 94880933656..9c51961c257 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -691,7 +691,7 @@ model: default: qwen2.5-coder:32b provider: custom base_url: http://localhost:11434/v1 - context_length: 32768 # See warning below + context_length: 64000 # See warning below ``` :::caution Ollama defaults to very low context lengths @@ -703,22 +703,22 @@ Ollama does **not** use your model's full context window by default. Depending o | 24–48 GB | 32,768 tokens | | 48+ GB | 256,000 tokens | -For agent use with tools, **you need at least 16k–32k context**. At 4k, the system prompt + tool schemas alone can fill the window, leaving no room for conversation. 
+Hermes Agent requires at least **64,000 tokens** of context for agent use with tools. Smaller windows are rejected at startup because the system prompt, tool schemas, and working conversation state need enough room for reliable multi-step workflows. **How to increase it** (pick one): ```bash # Option 1: Set server-wide via environment variable (recommended) -OLLAMA_CONTEXT_LENGTH=32768 ollama serve +OLLAMA_CONTEXT_LENGTH=64000 ollama serve # Option 2: For systemd-managed Ollama sudo systemctl edit ollama.service -# Add: Environment="OLLAMA_CONTEXT_LENGTH=32768" +# Add: Environment="OLLAMA_CONTEXT_LENGTH=64000" # Then: sudo systemctl daemon-reload && sudo systemctl restart ollama # Option 3: Bake it into a custom model (persistent per-model) -echo -e "FROM qwen2.5-coder:32b\nPARAMETER num_ctx 32768" > Modelfile -ollama create qwen2.5-coder-32k -f Modelfile +echo -e "FROM qwen2.5-coder:32b\nPARAMETER num_ctx 64000" > Modelfile +ollama create qwen2.5-coder-64k -f Modelfile ``` **You cannot set context length through the OpenAI-compatible API** (`/v1/chat/completions`). It must be configured server-side or via a Modelfile. This is the #1 source of confusion when integrating Ollama with tools like Hermes. @@ -820,13 +820,13 @@ If responses seem truncated, add `max_tokens` to your requests or set `--default cmake -B build && cmake --build build --config Release ./build/bin/llama-server \ --jinja -fa \ - -c 32768 \ + -c 64000 \ -ngl 99 \ -m models/qwen2.5-coder-32b-instruct-Q4_K_M.gguf \ --port 8080 --host 0.0.0.0 ``` -**Context length (`-c`):** Recent builds default to `0` which reads the model's training context from the GGUF metadata. For models with 128k+ training context, this can OOM trying to allocate the full KV cache. Set `-c` explicitly to what you need (32k–64k is a good range for agent use). If using parallel slots (`-np`), the total context is divided among slots — with `-c 32768 -np 4`, each slot only gets 8k. 
+**Context length (`-c`):** Recent builds default to `0` which reads the model's training context from the GGUF metadata. For models with 128k+ training context, this can OOM trying to allocate the full KV cache. Set `-c` explicitly to at least 64,000 tokens for Hermes. If using parallel slots (`-np`), the total context is divided among slots — with `-c 64000 -np 4`, each slot only gets 16k, which is below Hermes' minimum per active session. Then configure Hermes to point at it: @@ -862,7 +862,7 @@ Start the server from the LM Studio app (Developer tab → Start Server), or use ```bash lms server start # Starts on port 1234 -lms load qwen2.5-coder --context-length 32768 +lms load qwen2.5-coder --context-length 64000 ``` Then configure Hermes: @@ -1044,7 +1044,7 @@ The model outputs something like `{"name": "web_search", "arguments": {...}}` as # vLLM: check --max-model-len in startup args ``` -**Fix:** Set context to at least **32,768 tokens** for agent use. See each server's section above for the specific flag. +**Fix:** Set context to at least **64,000 tokens** for agent use. See each server's section above for the specific flag. 
#### "Context limit: 2048 tokens" at startup @@ -1057,7 +1057,7 @@ model: default: your-model provider: custom base_url: http://localhost:11434/v1 - context_length: 32768 + context_length: 64000 ``` #### Responses get cut off mid-sentence @@ -1198,7 +1198,7 @@ custom_providers: base_url: "http://localhost:11434/v1" models: qwen3.5:27b: - context_length: 32768 + context_length: 64000 deepseek-r1:70b: context_length: 65536 ``` diff --git a/website/docs/reference/faq.md b/website/docs/reference/faq.md index 20ea8b8997b..01bf783f5d4 100644 --- a/website/docs/reference/faq.md +++ b/website/docs/reference/faq.md @@ -82,7 +82,7 @@ hermes model # API base URL: http://localhost:11434/v1 # API key: ollama # Model name: qwen3.5:27b -# Context length: 32768 ← set this to match your server's actual context window +# Context length: 64000 ← must match your server's actual context window (Hermes requires at least 64000) ``` Or configure it directly in `config.yaml`: @@ -99,7 +99,7 @@ Hermes persists the endpoint, provider, and base URL in `config.yaml` so it surv This works with Ollama, vLLM, llama.cpp server, SGLang, LocalAI, and others. See the [Configuration guide](../user-guide/configuration.md) for details. :::tip Ollama users -If you set a custom `num_ctx` in Ollama (e.g., `ollama run --num_ctx 16384`), make sure to set the matching context length in Hermes — Ollama's `/api/show` reports the model's *maximum* context, not the effective `num_ctx` you configured. +If you set a custom `num_ctx` in Ollama (e.g., `PARAMETER num_ctx 64000` in a Modelfile), make sure to set the matching context length in Hermes — Ollama's `/api/show` reports the model's *maximum* context, not the effective `num_ctx` you configured. 
::: :::tip Timeouts with local models @@ -340,7 +340,7 @@ custom_providers: base_url: "http://localhost:11434/v1" models: qwen3.5:27b: - context_length: 32768 + context_length: 64000 ``` See [Context Length Detection](../integrations/providers.md#context-length-detection) for how auto-detection works and all override options.