mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: prevent agent from stopping mid-task — compression floor, budget overhaul, activity tracking
Three root causes of the 'agent stops mid-task' gateway bug:
1. Compression threshold floor (64K tokens minimum)
- The 50% threshold on a 100K-context model fired at 50K tokens,
causing premature compression that made models lose track of
multi-step plans. Now threshold_tokens = max(50% * context, 64K).
- Models with <64K context are rejected at startup with a clear error.
2. Budget warning removal — grace call instead
- Removed the 70%/90% iteration budget warnings entirely. These
injected '[BUDGET WARNING: Provide your final response NOW]' into
tool results, causing models to abandon complex tasks prematurely.
- Now: no warnings during normal execution. When the budget is
actually exhausted (90/90), inject a user message asking the model
to summarise, allow one grace API call, and only then fall back
to _handle_max_iterations.
3. Activity touches during long terminal execution
- _wait_for_process polled every 0.2s but never reported activity.
The gateway's inactivity timeout (default 1800s) would fire during
long-running commands that appeared 'idle.'
- Now: thread-local activity callback fires every 10s during the
poll loop, keeping the gateway's activity tracker alive.
- Agent wires _touch_activity into the callback before each tool call.
Also: docs update noting 64K minimum context requirement.
Closes #7915 (root cause was agent-loop termination, not Weixin delivery limits).
This commit is contained in:
parent
08f35076c9
commit
c8aff74632
7 changed files with 140 additions and 92 deletions
|
|
@ -20,6 +20,7 @@ from typing import Any, Dict, List, Optional
|
|||
from agent.auxiliary_client import call_llm
|
||||
from agent.context_engine import ContextEngine
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
get_model_context_length,
|
||||
estimate_messages_tokens_rough,
|
||||
)
|
||||
|
|
@ -87,7 +88,10 @@ class ContextCompressor(ContextEngine):
|
|||
self.api_key = api_key
|
||||
self.provider = provider
|
||||
self.context_length = context_length
|
||||
self.threshold_tokens = int(context_length * self.threshold_percent)
|
||||
self.threshold_tokens = max(
|
||||
int(context_length * self.threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -118,7 +122,14 @@ class ContextCompressor(ContextEngine):
|
|||
config_context_length=config_context_length,
|
||||
provider=provider,
|
||||
)
|
||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
|
||||
# the percentage would suggest a lower value. This prevents premature
|
||||
# compression on large-context models at 50% while keeping the % sane
|
||||
# for models right at the minimum.
|
||||
self.threshold_tokens = max(
|
||||
int(self.context_length * threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
)
|
||||
self.compression_count = 0
|
||||
|
||||
# Derive token budgets: ratio is relative to the threshold, not total context
|
||||
|
|
|
|||
|
|
@ -85,6 +85,11 @@ CONTEXT_PROBE_TIERS = [
|
|||
# Default context length when no detection method succeeds.
|
||||
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
|
||||
|
||||
# Minimum context length required to run Hermes Agent. Models with fewer
|
||||
# tokens cannot maintain enough working memory for tool-calling workflows.
|
||||
# Sessions, model switches, and cron jobs should reject models below this.
|
||||
MINIMUM_CONTEXT_LENGTH = 64_000
|
||||
|
||||
# Thin fallback defaults — only broad model family patterns.
|
||||
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
||||
# all miss. Replaced the previous 80+ entry dict.
|
||||
|
|
|
|||
94
run_agent.py
94
run_agent.py
|
|
@ -775,12 +775,14 @@ class AIAgent:
|
|||
self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
|
||||
self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost)
|
||||
|
||||
# Iteration budget pressure: warn the LLM as it approaches max_iterations.
|
||||
# Warnings are injected into the last tool result JSON (not as separate
|
||||
# messages) so they don't break message structure or invalidate caching.
|
||||
self._budget_caution_threshold = 0.7 # 70% — nudge to start wrapping up
|
||||
self._budget_warning_threshold = 0.9 # 90% — urgent, respond now
|
||||
self._budget_pressure_enabled = True
|
||||
# Iteration budget: the LLM is only notified when it actually exhausts
|
||||
# the iteration budget (api_call_count >= max_iterations). At that
|
||||
# point we inject ONE message, allow one final API call, and if the
|
||||
# model doesn't produce a text response, force a user-message asking
|
||||
# it to summarise. No intermediate pressure warnings — they caused
|
||||
# models to "give up" prematurely on complex tasks (#7915).
|
||||
self._budget_exhausted_injected = False
|
||||
self._budget_grace_call = False
|
||||
|
||||
# Context pressure warnings: notify the USER (not the LLM) as context
|
||||
# fills up. Purely informational — displayed in CLI output and sent via
|
||||
|
|
@ -1331,6 +1333,19 @@ class AIAgent:
|
|||
)
|
||||
self.compression_enabled = compression_enabled
|
||||
|
||||
# Reject models whose context window is below the minimum required
|
||||
# for reliable tool-calling workflows (64K tokens).
|
||||
from agent.model_metadata import MINIMUM_CONTEXT_LENGTH
|
||||
_ctx = getattr(self.context_compressor, "context_length", 0)
|
||||
if _ctx and _ctx < MINIMUM_CONTEXT_LENGTH:
|
||||
raise ValueError(
|
||||
f"Model {self.model} has a context window of {_ctx:,} tokens, "
|
||||
f"which is below the minimum {MINIMUM_CONTEXT_LENGTH:,} required "
|
||||
f"by Hermes Agent. Choose a model with at least "
|
||||
f"{MINIMUM_CONTEXT_LENGTH // 1000}K context, or set "
|
||||
f"model.context_length in config.yaml to override."
|
||||
)
|
||||
|
||||
# Inject context engine tool schemas (e.g. lcm_grep, lcm_describe, lcm_expand)
|
||||
self._context_engine_tool_names: set = set()
|
||||
if hasattr(self, "context_compressor") and self.context_compressor and self.tools is not None:
|
||||
|
|
@ -6985,6 +7000,15 @@ class AIAgent:
|
|||
self._current_tool = function_name
|
||||
self._touch_activity(f"executing tool: {function_name}")
|
||||
|
||||
# Set activity callback for long-running tool execution (terminal
|
||||
# commands, etc.) so the gateway's inactivity monitor doesn't kill
|
||||
# the agent while a command is running.
|
||||
try:
|
||||
from tools.environments.base import set_activity_callback
|
||||
set_activity_callback(self._touch_activity)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if self.tool_progress_callback:
|
||||
try:
|
||||
preview = _build_tool_preview(function_name, function_args)
|
||||
|
|
@ -7298,25 +7322,11 @@ class AIAgent:
|
|||
def _get_budget_warning(self, api_call_count: int) -> Optional[str]:
|
||||
"""Return a budget pressure string, or None if not yet needed.
|
||||
|
||||
Two-tier system:
|
||||
- Caution (70%): nudge to consolidate work
|
||||
- Warning (90%): urgent, must respond now
|
||||
Only fires once the iteration budget is fully exhausted. No
|
||||
intermediate warnings — those caused models to abandon complex
|
||||
tasks prematurely.
|
||||
"""
|
||||
if not self._budget_pressure_enabled or self.max_iterations <= 0:
|
||||
return None
|
||||
progress = api_call_count / self.max_iterations
|
||||
remaining = self.max_iterations - api_call_count
|
||||
if progress >= self._budget_warning_threshold:
|
||||
return (
|
||||
f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. "
|
||||
f"Only {remaining} iteration(s) left. "
|
||||
"Provide your final response NOW. No more tool calls unless absolutely critical.]"
|
||||
)
|
||||
if progress >= self._budget_caution_threshold:
|
||||
return (
|
||||
f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. "
|
||||
f"{remaining} iterations left. Start consolidating your work.]"
|
||||
)
|
||||
# Never inject warnings during the normal run
|
||||
return None
|
||||
|
||||
def _emit_context_pressure(self, compaction_progress: float, compressor) -> None:
|
||||
|
|
@ -7834,7 +7844,7 @@ class AIAgent:
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
|
||||
while (api_call_count < self.max_iterations and self.iteration_budget.remaining > 0) or self._budget_grace_call:
|
||||
# Reset per-turn checkpoint dedup so each iteration can take one snapshot
|
||||
self._checkpoint_mgr.new_turn()
|
||||
|
||||
|
|
@ -7849,7 +7859,13 @@ class AIAgent:
|
|||
api_call_count += 1
|
||||
self._api_call_count = api_call_count
|
||||
self._touch_activity(f"starting API call #{api_call_count}")
|
||||
if not self.iteration_budget.consume():
|
||||
|
||||
# Grace call: the budget is exhausted but we gave the model one
|
||||
# more chance. Consume the grace flag so the loop exits after
|
||||
# this iteration regardless of outcome.
|
||||
if self._budget_grace_call:
|
||||
self._budget_grace_call = False
|
||||
elif not self.iteration_budget.consume():
|
||||
_turn_exit_reason = "budget_exhausted"
|
||||
if not self.quiet_mode:
|
||||
self._safe_print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)")
|
||||
|
|
@ -10034,7 +10050,31 @@ class AIAgent:
|
|||
if final_response is None and (
|
||||
api_call_count >= self.max_iterations
|
||||
or self.iteration_budget.remaining <= 0
|
||||
):
|
||||
) and not self._budget_exhausted_injected:
|
||||
# Budget exhausted but we haven't tried asking the model to
|
||||
# summarise yet. Inject a user message and give it one grace
|
||||
# API call to produce a text response.
|
||||
self._budget_exhausted_injected = True
|
||||
self._budget_grace_call = True
|
||||
_grace_msg = (
|
||||
"Your tool budget ran out. Please give me the information "
|
||||
"or actions you've completed so far."
|
||||
)
|
||||
messages.append({"role": "user", "content": _grace_msg})
|
||||
self._emit_status(
|
||||
f"⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) "
|
||||
"— asking model to summarise"
|
||||
)
|
||||
if not self.quiet_mode:
|
||||
self._safe_print(
|
||||
f"\n⚠️ Iteration budget exhausted ({api_call_count}/{self.max_iterations}) "
|
||||
"— requesting summary..."
|
||||
)
|
||||
|
||||
if final_response is None and (
|
||||
api_call_count >= self.max_iterations
|
||||
or self.iteration_budget.remaining <= 0
|
||||
) and not self._budget_grace_call:
|
||||
_turn_exit_reason = f"max_iterations_reached({api_call_count}/{self.max_iterations})"
|
||||
if self.iteration_budget.remaining <= 0 and not self.quiet_mode:
|
||||
print(f"\n⚠️ Iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} iterations used)")
|
||||
|
|
|
|||
|
|
@ -576,11 +576,19 @@ class TestSummaryTargetRatio:
|
|||
assert c.summary_target_ratio == 0.80
|
||||
|
||||
def test_default_threshold_is_50_percent(self):
|
||||
"""Default compression threshold should be 50%."""
|
||||
"""Default compression threshold should be 50%, with a 64K floor."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.threshold_percent == 0.50
|
||||
assert c.threshold_tokens == 50_000
|
||||
# 50% of 100K = 50K, but the floor is 64K
|
||||
assert c.threshold_tokens == 64_000
|
||||
|
||||
def test_threshold_floor_does_not_apply_above_128k(self):
    """Above 128K context, 50% of the window already exceeds the 64K
    floor, so the percentage-derived threshold is used unchanged."""
    patched = patch(
        "agent.context_compressor.get_model_context_length",
        return_value=200_000,
    )
    with patched:
        compressor = ContextCompressor(model="test", quiet_mode=True)
    # max(0.5 * 200_000, 64_000) == 100_000 — the floor is inert here.
    assert compressor.threshold_tokens == 100_000
|
||||
|
||||
def test_default_protect_last_n_is_20(self):
|
||||
"""Default protect_last_n should be 20."""
|
||||
|
|
|
|||
|
|
@ -2742,74 +2742,24 @@ class TestSystemPromptStability:
|
|||
assert "Hermes Agent" in agent._cached_system_prompt
|
||||
|
||||
class TestBudgetPressure:
|
||||
"""Budget pressure warning system (issue #414)."""
|
||||
"""Budget pressure warning system — now only fires at budget exhaustion."""
|
||||
|
||||
def test_no_warning_below_caution(self, agent):
|
||||
def test_no_intermediate_warnings(self, agent):
|
||||
"""No warnings at 70% or 90% — only at actual exhaustion."""
|
||||
agent.max_iterations = 60
|
||||
assert agent._get_budget_warning(30) is None
|
||||
|
||||
def test_caution_at_70_percent(self, agent):
|
||||
agent.max_iterations = 60
|
||||
msg = agent._get_budget_warning(42)
|
||||
assert msg is not None
|
||||
assert "[BUDGET:" in msg
|
||||
assert "18 iterations left" in msg
|
||||
|
||||
def test_warning_at_90_percent(self, agent):
|
||||
agent.max_iterations = 60
|
||||
msg = agent._get_budget_warning(54)
|
||||
assert "[BUDGET WARNING:" in msg
|
||||
assert "Provide your final response NOW" in msg
|
||||
|
||||
def test_last_iteration(self, agent):
|
||||
agent.max_iterations = 60
|
||||
msg = agent._get_budget_warning(59)
|
||||
assert "1 iteration(s) left" in msg
|
||||
|
||||
def test_disabled(self, agent):
|
||||
agent.max_iterations = 60
|
||||
agent._budget_pressure_enabled = False
|
||||
assert agent._get_budget_warning(55) is None
|
||||
assert agent._get_budget_warning(30) is None # 50%
|
||||
assert agent._get_budget_warning(42) is None # 70%
|
||||
assert agent._get_budget_warning(54) is None # 90%
|
||||
assert agent._get_budget_warning(59) is None # last iteration
|
||||
|
||||
def test_zero_max_iterations(self, agent):
|
||||
agent.max_iterations = 0
|
||||
assert agent._get_budget_warning(0) is None
|
||||
|
||||
def test_injects_into_json_tool_result(self, agent):
|
||||
"""Warning should be injected as _budget_warning field in JSON tool results."""
|
||||
import json
|
||||
agent.max_iterations = 10
|
||||
messages = [
|
||||
{"role": "tool", "content": json.dumps({"output": "done", "exit_code": 0}), "tool_call_id": "tc1"}
|
||||
]
|
||||
warning = agent._get_budget_warning(9)
|
||||
assert warning is not None
|
||||
# Simulate the injection logic
|
||||
last_content = messages[-1]["content"]
|
||||
parsed = json.loads(last_content)
|
||||
parsed["_budget_warning"] = warning
|
||||
messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
|
||||
result = json.loads(messages[-1]["content"])
|
||||
assert "_budget_warning" in result
|
||||
assert "BUDGET WARNING" in result["_budget_warning"]
|
||||
assert result["output"] == "done" # original content preserved
|
||||
|
||||
def test_appends_to_non_json_tool_result(self, agent):
|
||||
"""Warning should be appended as text for non-JSON tool results."""
|
||||
agent.max_iterations = 10
|
||||
messages = [
|
||||
{"role": "tool", "content": "plain text result", "tool_call_id": "tc1"}
|
||||
]
|
||||
warning = agent._get_budget_warning(9)
|
||||
# Simulate injection logic for non-JSON
|
||||
last_content = messages[-1]["content"]
|
||||
try:
|
||||
import json
|
||||
json.loads(last_content)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
messages[-1]["content"] = last_content + f"\n\n{warning}"
|
||||
assert "plain text result" in messages[-1]["content"]
|
||||
assert "BUDGET WARNING" in messages[-1]["content"]
|
||||
def test_grace_call_flags_initialized(self, agent):
    """Both grace-call bookkeeping flags start out False on a fresh agent."""
    for flag_name in ("_budget_exhausted_injected", "_budget_grace_call"):
        assert getattr(agent, flag_name) is False
|
||||
|
||||
|
||||
class TestSafeWriter:
|
||||
|
|
|
|||
|
|
@ -23,6 +23,19 @@ from tools.interrupt import is_interrupted
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Thread-local activity callback. The agent sets this before a tool call so
|
||||
# long-running _wait_for_process loops can report liveness to the gateway.
|
||||
_activity_callback_local = threading.local()
|
||||
|
||||
|
||||
def set_activity_callback(cb: Callable[[str], None] | None) -> None:
|
||||
"""Register a callback that _wait_for_process fires periodically."""
|
||||
_activity_callback_local.callback = cb
|
||||
|
||||
|
||||
def _get_activity_callback() -> Callable[[str], None] | None:
|
||||
return getattr(_activity_callback_local, "callback", None)
|
||||
|
||||
|
||||
def get_sandbox_dir() -> Path:
|
||||
"""Return the host-side root for all sandbox storage (Docker workspaces,
|
||||
|
|
@ -370,6 +383,10 @@ class BaseEnvironment(ABC):
|
|||
"""Poll-based wait with interrupt checking and stdout draining.
|
||||
|
||||
Shared across all backends — not overridden.
|
||||
|
||||
Fires the ``activity_callback`` (if set on this instance) every 10s
|
||||
while the process is running so the gateway's inactivity timeout
|
||||
doesn't kill long-running commands.
|
||||
"""
|
||||
output_chunks: list[str] = []
|
||||
|
||||
|
|
@ -388,6 +405,8 @@ class BaseEnvironment(ABC):
|
|||
drain_thread = threading.Thread(target=_drain, daemon=True)
|
||||
drain_thread.start()
|
||||
deadline = time.monotonic() + timeout
|
||||
_last_activity_touch = time.monotonic()
|
||||
_ACTIVITY_INTERVAL = 10.0 # seconds between activity touches
|
||||
|
||||
while proc.poll() is None:
|
||||
if is_interrupted():
|
||||
|
|
@ -408,6 +427,17 @@ class BaseEnvironment(ABC):
|
|||
else timeout_msg.lstrip(),
|
||||
"returncode": 124,
|
||||
}
|
||||
# Periodic activity touch so the gateway knows we're alive
|
||||
_now = time.monotonic()
|
||||
if _now - _last_activity_touch >= _ACTIVITY_INTERVAL:
|
||||
_last_activity_touch = _now
|
||||
_cb = _get_activity_callback()
|
||||
if _cb:
|
||||
try:
|
||||
_elapsed = int(_now - (deadline - timeout))
|
||||
_cb(f"terminal command running ({_elapsed}s elapsed)")
|
||||
except Exception:
|
||||
pass
|
||||
time.sleep(0.2)
|
||||
|
||||
drain_thread.join(timeout=5)
|
||||
|
|
|
|||
|
|
@ -64,6 +64,10 @@ hermes setup # Or configure everything at once
|
|||
| **Vercel AI Gateway** | Vercel AI Gateway routing | Set `AI_GATEWAY_API_KEY` |
|
||||
| **Custom Endpoint** | VLLM, SGLang, Ollama, or any OpenAI-compatible API | Set base URL + API key |
|
||||
|
||||
:::caution Minimum context: 64K tokens
|
||||
Hermes Agent requires a model with at least **64,000 tokens** of context. Models with smaller windows cannot maintain enough working memory for multi-step tool-calling workflows and will be rejected at startup. Most hosted models (Claude, GPT, Gemini, Qwen, DeepSeek) meet this easily. If you're running a local model, set its context size to at least 64K (e.g. `--ctx-size 65536` for llama.cpp or `-c 65536` for Ollama).
|
||||
:::
|
||||
|
||||
:::tip
|
||||
You can switch providers at any time with `hermes model` — no code changes, no lock-in. When configuring a custom endpoint, Hermes will prompt for the context window size and auto-detect it when possible. See [Context Length Detection](../integrations/providers.md#context-length-detection) for details.
|
||||
:::
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue