diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 069a5b65e1..5aa95dc01b 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -20,6 +20,7 @@ from typing import Any, Dict, List, Optional from agent.auxiliary_client import call_llm from agent.context_engine import ContextEngine from agent.model_metadata import ( + MINIMUM_CONTEXT_LENGTH, get_model_context_length, estimate_messages_tokens_rough, ) @@ -87,7 +88,10 @@ class ContextCompressor(ContextEngine): self.api_key = api_key self.provider = provider self.context_length = context_length - self.threshold_tokens = int(context_length * self.threshold_percent) + self.threshold_tokens = max( + int(context_length * self.threshold_percent), + MINIMUM_CONTEXT_LENGTH, + ) def __init__( self, @@ -118,7 +122,14 @@ class ContextCompressor(ContextEngine): config_context_length=config_context_length, provider=provider, ) - self.threshold_tokens = int(self.context_length * threshold_percent) + # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if + # the percentage would suggest a lower value. This prevents premature + # compression on large-context models at 50% while keeping the % sane + # for models right at the minimum. + self.threshold_tokens = max( + int(self.context_length * threshold_percent), + MINIMUM_CONTEXT_LENGTH, + ) self.compression_count = 0 # Derive token budgets: ratio is relative to the threshold, not total context diff --git a/agent/model_metadata.py b/agent/model_metadata.py index f12801777d..3af2e7bafb 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -85,6 +85,11 @@ CONTEXT_PROBE_TIERS = [ # Default context length when no detection method succeeds. DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0] +# Minimum context length required to run Hermes Agent. Models with fewer +# tokens cannot maintain enough working memory for tool-calling workflows. +# Sessions, model switches, and cron jobs should reject models below this. 
+MINIMUM_CONTEXT_LENGTH = 64_000 + # Thin fallback defaults — only broad model family patterns. # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic # all miss. Replaced the previous 80+ entry dict. diff --git a/run_agent.py b/run_agent.py index cc93594d68..226a0ba245 100644 --- a/run_agent.py +++ b/run_agent.py @@ -775,12 +775,14 @@ class AIAgent: self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost) - # Iteration budget pressure: warn the LLM as it approaches max_iterations. - # Warnings are injected into the last tool result JSON (not as separate - # messages) so they don't break message structure or invalidate caching. - self._budget_caution_threshold = 0.7 # 70% — nudge to start wrapping up - self._budget_warning_threshold = 0.9 # 90% — urgent, respond now - self._budget_pressure_enabled = True + # Iteration budget: the LLM is only notified when it actually exhausts + # the iteration budget (api_call_count >= max_iterations). At that + # point we inject ONE message, allow one final API call, and if the + # model doesn't produce a text response, force a user-message asking + # it to summarise. No intermediate pressure warnings — they caused + # models to "give up" prematurely on complex tasks (#7915). + self._budget_exhausted_injected = False + self._budget_grace_call = False # Context pressure warnings: notify the USER (not the LLM) as context # fills up. Purely informational — displayed in CLI output and sent via @@ -1331,6 +1333,19 @@ class AIAgent: ) self.compression_enabled = compression_enabled + # Reject models whose context window is below the minimum required + # for reliable tool-calling workflows (64K tokens). 
+ from agent.model_metadata import MINIMUM_CONTEXT_LENGTH + _ctx = getattr(self.context_compressor, "context_length", 0) + if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH: + raise ValueError( + f"Model {self.model} has a context window of {_ctx:,} tokens, " + f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required " + f"by Hermes Agent. Choose a model with at least " + f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set " + f"model.context_length in config.yaml to override." + ) + # Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand) self._context_engine_tool_names: set = set() if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None: @@ -6985,6 +7000,15 @@ class AIAgent: self._current_tool = function_name self._touch_activity(f"executing tool: {function_name}") + # Set activity callback for long-running tool execution (terminal + # commands, etc.) so the gateway's inactivity monitor doesn't kill + # the agent while a command is running. + try: + from tools.environments.base import set_activity_callback + set_activity_callback(self._touch_activity) + except Exception: + pass + if self.tool_progress_callback: try: preview = _build_tool_preview(function_name, function_args) @@ -7298,25 +7322,11 @@ class AIAgent: def _get_budget_warning(self, api_call_count: int) -> Optional[str]: """Return a budget pressure string, or None if not yet needed. - Two-tier system: - - Caution (70%): nudge to consolidate work - - Warning (90%): urgent, must respond now + Only fires once the iteration budget is fully exhausted. No + intermediate warnings — those caused models to abandon complex + tasks prematurely. 
""" - if not self._budget_pressure_enabled or self.max_iterations <= 0: - return None - progress = api_call_count / self.max_iterations - remaining = self.max_iterations - api_call_count - if progress >= self._budget_warning_threshold: - return ( - f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. " - f"Only {remaining} iteration(s) left. " - "Provide your final response NOW. No more tool calls unless absolutely critical.]" - ) - if progress >= self._budget_caution_threshold: - return ( - f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. " - f"{remaining} iterations left. Start consolidating your work.]" - ) + # Never inject warnings during the normal run return None def _emit_context_pressure(self, compaction_progress: float, compressor) -> None: @@ -7834,7 +7844,7 @@ class AIAgent: except Exception: pass - while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0: + while (api_call_count < self.max_iterations and self.iteration_budget.remaining > 0) or self._budget_grace_call: # Reset per-turn checkpoint dedup so each iteration can take one snapshot self._checkpoint_mgr.new_turn() @@ -7849,7 +7859,13 @@ class AIAgent: api_call_count += 1 self._api_call_count = api_call_count self._touch_activity(f"starting API call #{api_call_count}") - if not self.iteration_budget.consume(): + + # Grace call: the budget is exhausted but we gave the model one + # more chance. Consume the grace flag so the loop exits after + # this iteration regardless of outcome. 
+ if self._budget_grace_call: + self._budget_grace_call = False + elif not self.iteration_budget.consume(): _turn_exit_reason = "budget_exhausted" if not self.quiet_mode: self._safe_print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)") @@ -10034,7 +10050,31 @@ class AIAgent: if final_response is None and ( api_call_count >= self.max_iterations or self.iteration_budget.remaining <= 0 - ): + ) and not self._budget_exhausted_injected: + # Budget exhausted but we haven't tried asking the model to + # summarise yet. Inject a user message and give it one grace + # API call to produce a text response. + self._budget_exhausted_injected = True + self._budget_grace_call = True + _grace_msg = ( + "Your tool budget ran out. Please give me the information " + "or actions you've completed so far." + ) + messages.append({"role": "user", "content": _grace_msg}) + self._emit_status( + f"⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) " + "— asking model to summarise" + ) + if not self.quiet_mode: + self._safe_print( + f"\n⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) " + "— requesting summary..." 
+ ) + + if final_response is None and ( + api_call_count >= self.max_iterations + or self.iteration_budget.remaining <= 0 + ) and not self._budget_grace_call: _turn_exit_reason = f"max_iterations_reached({api_call_count}/{self.max_iterations})" if self.iteration_budget.remaining <= 0 and not self.quiet_mode: print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)") diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 88a23b44cf..f4cf19666f 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -576,11 +576,19 @@ class TestSummaryTargetRatio: assert c.summary_target_ratio == 0.80 def test_default_threshold_is_50_percent(self): - """Default compression threshold should be 50%.""" + """Default compression threshold should be 50%, with a 64K floor.""" with patch("agent.context_compressor.get_model_context_length", return_value=100_000): c = ContextCompressor(model="test", quiet_mode=True) assert c.threshold_percent == 0.50 - assert c.threshold_tokens == 50_000 + # 50% of 100K = 50K, but the floor is 64K + assert c.threshold_tokens == 64_000 + + def test_threshold_floor_does_not_apply_above_128k(self): + """On large-context models the 50% percentage is used directly.""" + with patch("agent.context_compressor.get_model_context_length", return_value=200_000): + c = ContextCompressor(model="test", quiet_mode=True) + # 50% of 200K = 100K, which is above the 64K floor + assert c.threshold_tokens == 100_000 def test_default_protect_last_n_is_20(self): """Default protect_last_n should be 20.""" diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 61137fe90a..d19220e506 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -2742,74 +2742,24 @@ class TestSystemPromptStability: assert "Hermes Agent" in agent._cached_system_prompt class TestBudgetPressure: 
- """Budget pressure warning system (issue #414).""" + """Budget pressure warning system — now only fires at budget exhaustion.""" - def test_no_warning_below_caution(self, agent): + def test_no_intermediate_warnings(self, agent): + """No warnings at 70% or 90% — only at actual exhaustion.""" agent.max_iterations = 60 - assert agent._get_budget_warning(30) is None - - def test_caution_at_70_percent(self, agent): - agent.max_iterations = 60 - msg = agent._get_budget_warning(42) - assert msg is not None - assert "[BUDGET:" in msg - assert "18 iterations left" in msg - - def test_warning_at_90_percent(self, agent): - agent.max_iterations = 60 - msg = agent._get_budget_warning(54) - assert "[BUDGET WARNING:" in msg - assert "Provide your final response NOW" in msg - - def test_last_iteration(self, agent): - agent.max_iterations = 60 - msg = agent._get_budget_warning(59) - assert "1 iteration(s) left" in msg - - def test_disabled(self, agent): - agent.max_iterations = 60 - agent._budget_pressure_enabled = False - assert agent._get_budget_warning(55) is None + assert agent._get_budget_warning(30) is None # 50% + assert agent._get_budget_warning(42) is None # 70% + assert agent._get_budget_warning(54) is None # 90% + assert agent._get_budget_warning(59) is None # last iteration def test_zero_max_iterations(self, agent): agent.max_iterations = 0 assert agent._get_budget_warning(0) is None - def test_injects_into_json_tool_result(self, agent): - """Warning should be injected as _budget_warning field in JSON tool results.""" - import json - agent.max_iterations = 10 - messages = [ - {"role": "tool", "content": json.dumps({"output": "done", "exit_code": 0}), "tool_call_id": "tc1"} - ] - warning = agent._get_budget_warning(9) - assert warning is not None - # Simulate the injection logic - last_content = messages[-1]["content"] - parsed = json.loads(last_content) - parsed["_budget_warning"] = warning - messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False) - result = 
json.loads(messages[-1]["content"]) - assert "_budget_warning" in result - assert "BUDGET WARNING" in result["_budget_warning"] - assert result["output"] == "done" # original content preserved - - def test_appends_to_non_json_tool_result(self, agent): - """Warning should be appended as text for non-JSON tool results.""" - agent.max_iterations = 10 - messages = [ - {"role": "tool", "content": "plain text result", "tool_call_id": "tc1"} - ] - warning = agent._get_budget_warning(9) - # Simulate injection logic for non-JSON - last_content = messages[-1]["content"] - try: - import json - json.loads(last_content) - except (json.JSONDecodeError, TypeError): - messages[-1]["content"] = last_content + f"\n\n{warning}" - assert "plain text result" in messages[-1]["content"] - assert "BUDGET WARNING" in messages[-1]["content"] + def test_grace_call_flags_initialized(self, agent): + """Agent should have budget grace call flags.""" + assert agent._budget_exhausted_injected is False + assert agent._budget_grace_call is False class TestSafeWriter: diff --git a/tools/environments/base.py b/tools/environments/base.py index 1598c22110..19c3bf024e 100644 --- a/tools/environments/base.py +++ b/tools/environments/base.py @@ -23,6 +23,19 @@ from tools.interrupt import is_interrupted logger = logging.getLogger(__name__) +# Thread-local activity callback. The agent sets this before a tool call so +# long-running _wait_for_process loops can report liveness to the gateway. 
+_activity_callback_local = threading.local() + + +def set_activity_callback(cb: Callable[[str], None] | None) -> None: + """Register a callback that _wait_for_process fires periodically.""" + _activity_callback_local.callback = cb + + +def _get_activity_callback() -> Callable[[str], None] | None: + return getattr(_activity_callback_local, "callback", None) + def get_sandbox_dir() -> Path: """Return the host-side root for all sandbox storage (Docker workspaces, @@ -370,6 +383,10 @@ class BaseEnvironment(ABC): """Poll-based wait with interrupt checking and stdout draining. Shared across all backends — not overridden. + + Fires the ``activity_callback`` (if set on this instance) every 10s + while the process is running so the gateway's inactivity timeout + doesn't kill long-running commands. """ output_chunks: list[str] = [] @@ -388,6 +405,8 @@ class BaseEnvironment(ABC): drain_thread = threading.Thread(target=_drain, daemon=True) drain_thread.start() deadline = time.monotonic() + timeout + _last_activity_touch = time.monotonic() + _ACTIVITY_INTERVAL = 10.0 # seconds between activity touches while proc.poll() is None: if is_interrupted(): @@ -408,6 +427,17 @@ class BaseEnvironment(ABC): else timeout_msg.lstrip(), "returncode": 124, } + # Periodic activity touch so the gateway knows we're alive + _now = time.monotonic() + if _now - _last_activity_touch >= _ACTIVITY_INTERVAL: + _last_activity_touch = _now + _cb = _get_activity_callback() + if _cb: + try: + _elapsed = int(_now - (deadline - timeout)) + _cb(f"terminal command running ({_elapsed}s elapsed)") + except Exception: + pass time.sleep(0.2) drain_thread.join(timeout=5) diff --git a/website/docs/getting-started/quickstart.md b/website/docs/getting-started/quickstart.md index bd26f1eebb..9646fbcc9f 100644 --- a/website/docs/getting-started/quickstart.md +++ b/website/docs/getting-started/quickstart.md @@ -64,6 +64,10 @@ hermes setup # Or configure everything at once | **Vercel AI Gateway** | Vercel AI Gateway 
routing | Set `AI_GATEWAY_API_KEY` | | **Custom Endpoint** | VLLM, SGLang, Ollama, or any OpenAI-compatible API | Set base URL + API key | +:::caution Minimum context: 64K tokens +Hermes Agent requires a model with at least **64,000 tokens** of context. Models with smaller windows cannot maintain enough working memory for multi-step tool-calling workflows and will be rejected at startup. Most hosted models (Claude, GPT, Gemini, Qwen, DeepSeek) meet this easily. If you're running a local model, set its context size to at least 64K (e.g. `--ctx-size 65536` for llama.cpp or `OLLAMA_CONTEXT_LENGTH=65536` for Ollama). +::: + :::tip You can switch providers at any time with `hermes model` — no code changes, no lock-in. When configuring a custom endpoint, Hermes will prompt for the context window size and auto-detect it when possible. See [Context Length Detection](../integrations/providers.md#context-length-detection) for details. :::