mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(agent): scale tool-output budget to the model context window (#23767)
The tool-result persistence budget was a fixed 100K chars/result and 200K chars/turn regardless of the active model. On a small-context model (e.g. a 65K-token local model switched to mid-session), a single large tool result (the reporter hit a 279K-char search result) or a full 200K-char turn (~50K tokens) could by itself approach or exceed the window, forcing an oversized request that the provider rejects as "Prompt too long".

- budget_config.budget_for_context_window() scales the per-result/per-turn char caps to a fraction of the model window, clamped to the historical 100K/200K defaults (large models unchanged) and floored so small models stay usable.
- resolve_threshold() now caps the per-tool registry value at default_result_size, so tools that register a fixed 100K cap (web/terminal/x_search) don't re-inflate a scaled-down budget. This is a no-op for the default budget (both are 100K).
- tool_executor wires the agent's live context_length (recomputed on model switch) into all four persist/turn-budget call sites. read_file stays inf-pinned (no persist loop).

Verified E2E: a 279K-char result against a 65K model collapses to a ~1.6K preview; a 200K model is byte-identical to today.
This commit is contained in:
parent
44d552ea5a
commit
1965d56219
3 changed files with 172 additions and 3 deletions
|
|
@ -44,9 +44,26 @@ from tools.tool_result_storage import (
|
|||
maybe_persist_tool_result,
|
||||
enforce_turn_budget,
|
||||
)
|
||||
from tools.budget_config import BudgetConfig, DEFAULT_BUDGET, budget_for_context_window
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _budget_for_agent(agent) -> BudgetConfig:
    """Resolve a tool-result BudgetConfig scaled to the agent's context window.

    Large-context models keep the historical 100K/200K char defaults; small
    models (e.g. a 65K-token local model switched into mid-session) get a budget
    proportional to their window so a single large tool result can't push the
    request past the model's limit (#23767).

    Args:
        agent: Object whose ``context_compressor.context_length`` attribute, when
            present and truthy, is taken as the model's context window. Either
            attribute may be missing.

    Returns:
        The budget produced by ``budget_for_context_window`` for that context
        length, or ``DEFAULT_BUDGET`` when the context length is missing, falsy,
        or cannot be resolved.
    """
    try:
        # Both lookups default to None so an agent without a compressor (or a
        # compressor without a context length) falls through to the default.
        ctx = getattr(getattr(agent, "context_compressor", None), "context_length", None)
        return budget_for_context_window(int(ctx)) if ctx else DEFAULT_BUDGET
    except Exception:
        # Best-effort by design: budget resolution must never break tool
        # execution. Record the failure instead of swallowing it silently so an
        # unexpected fallback to the unscaled budget can be diagnosed.
        logger.debug(
            "Could not resolve a context-scaled tool budget; using the default budget",
            exc_info=True,
        )
        return DEFAULT_BUDGET
|
||||
|
||||
# Maximum number of concurrent worker threads for parallel tool execution.
|
||||
# Mirrors the constant in ``run_agent`` for tests/imports that look here.
|
||||
_MAX_TOOL_WORKERS = 8
|
||||
|
|
@ -249,6 +266,10 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
|
|||
tool_calls = assistant_message.tool_calls
|
||||
num_tools = len(tool_calls)
|
||||
|
||||
# Resolve the context-scaled tool-output budget once per turn (cheap, but
|
||||
# avoids rebuilding it per result inside the loop below).
|
||||
_tool_budget = _budget_for_agent(agent)
|
||||
|
||||
# ── Pre-flight: interrupt check ──────────────────────────────────
|
||||
if agent._interrupt_requested:
|
||||
print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
|
||||
|
|
@ -725,6 +746,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
|
|||
tool_name=name,
|
||||
tool_use_id=tc.id,
|
||||
env=get_active_env(effective_task_id),
|
||||
config=_tool_budget,
|
||||
) if not _is_multimodal_tool_result(function_result) else function_result
|
||||
|
||||
subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
|
||||
|
|
@ -756,7 +778,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
|
|||
num_tools = len(parsed_calls)
|
||||
if num_tools > 0:
|
||||
turn_tool_msgs = messages[-num_tools:]
|
||||
enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
|
||||
enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id), config=_tool_budget)
|
||||
|
||||
# ── /steer injection ──────────────────────────────────────────────
|
||||
# Append any pending user steer text to the last tool result so the
|
||||
|
|
@ -769,6 +791,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
|
|||
|
||||
def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
|
||||
"""Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
|
||||
# Resolve the context-scaled tool-output budget once per turn.
|
||||
_tool_budget = _budget_for_agent(agent)
|
||||
for i, tool_call in enumerate(assistant_message.tool_calls, 1):
|
||||
# SAFETY: check interrupt BEFORE starting each tool.
|
||||
# If the user sent "stop" during a previous tool's execution,
|
||||
|
|
@ -1377,6 +1401,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
|
|||
tool_name=function_name,
|
||||
tool_use_id=tool_call.id,
|
||||
env=get_active_env(effective_task_id),
|
||||
config=_tool_budget,
|
||||
) if not _is_multimodal_tool_result(function_result) else function_result
|
||||
|
||||
# Discover subdirectory context files from tool arguments
|
||||
|
|
@ -1425,7 +1450,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
|
|||
# ── Per-turn aggregate budget enforcement ─────────────────────────
|
||||
num_tools_seq = len(assistant_message.tool_calls)
|
||||
if num_tools_seq > 0:
|
||||
enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
|
||||
enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id), config=_tool_budget)
|
||||
|
||||
# ── /steer injection ──────────────────────────────────────────────
|
||||
# See _execute_tool_calls_parallel for the rationale. Same hook,
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ from tools.budget_config import (
|
|||
DEFAULT_TURN_BUDGET_CHARS,
|
||||
PINNED_THRESHOLDS,
|
||||
BudgetConfig,
|
||||
budget_for_context_window,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -174,3 +175,83 @@ class TestResolveThreshold:
|
|||
"""Canonical case: read_file must always return inf."""
|
||||
cfg = BudgetConfig()
|
||||
assert cfg.resolve_threshold("read_file") == float("inf")
|
||||
|
||||
@patch("tools.registry.registry")
def test_registry_value_capped_at_default(self, mock_registry):
    """An oversized registry cap is clamped to the budget's default (#23767).

    web/terminal/x_search register max_result_size_chars=100_000; that fixed
    value must not re-inflate the smaller budget computed for a small model.
    """
    scaled_default = 30_000
    # The registry reports a cap larger than the scaled-down default.
    mock_registry.get_max_result_size.return_value = 100_000

    small_budget = BudgetConfig(default_result_size=scaled_default)
    resolved = small_budget.resolve_threshold("web_search")

    assert resolved == scaled_default
|
||||
|
||||
@patch("tools.registry.registry")
def test_registry_inf_not_capped(self, mock_registry):
    """An infinite registry value (e.g. a future pinned-like tool) passes through uncapped."""
    unlimited = float("inf")
    mock_registry.get_max_result_size.return_value = unlimited

    small_budget = BudgetConfig(default_result_size=30_000)
    resolved = small_budget.resolve_threshold("some_tool")

    assert resolved == unlimited
|
||||
|
||||
@patch("tools.registry.registry")
def test_default_budget_unchanged_for_100k_tool(self, mock_registry):
    """With the default budget a 100K registry tool still resolves to 100K (no behavior change)."""
    registry_cap = 100_000
    mock_registry.get_max_result_size.return_value = registry_cap

    # BudgetConfig() uses default_result_size == 100_000, so the cap is a no-op.
    default_budget = BudgetConfig()
    resolved = default_budget.resolve_threshold("web_search")

    assert resolved == registry_cap
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# budget_for_context_window() — context-aware scaling (#23767)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestBudgetForContextWindow:
    """Context-window-aware scaling of the tool-output budget."""

    def test_none_returns_default(self):
        """An unknown (None) window hands back the shared default budget object."""
        budget = budget_for_context_window(None)
        assert budget is DEFAULT_BUDGET

    def test_zero_or_negative_returns_default(self):
        """Non-positive windows are treated like an unknown window."""
        for bad_window in (0, -5):
            assert budget_for_context_window(bad_window) is DEFAULT_BUDGET

    def test_large_model_unchanged(self):
        """A 200K-token model keeps the historical 100K/200K char defaults."""
        budget = budget_for_context_window(200_000)
        assert budget.default_result_size == DEFAULT_RESULT_SIZE_CHARS
        assert budget.turn_budget == DEFAULT_TURN_BUDGET_CHARS

    def test_very_large_model_still_capped_at_default(self):
        """A 1M-token model never exceeds the historical defaults (cap)."""
        budget = budget_for_context_window(1_000_000)
        assert budget.default_result_size == DEFAULT_RESULT_SIZE_CHARS
        assert budget.turn_budget == DEFAULT_TURN_BUDGET_CHARS

    def test_small_model_scaled_down(self):
        """A 65K-token model gets a budget proportional to its window.

        window_chars = 65_536*4 = 262_144; per_result = 15% = 39_321;
        per_turn = 30% = 78_643. Both below the 100K/200K defaults.
        """
        context_tokens = 65_536
        window_chars = context_tokens * 4

        budget = budget_for_context_window(context_tokens)

        # Strictly smaller than the large-model defaults...
        assert budget.default_result_size < DEFAULT_RESULT_SIZE_CHARS
        assert budget.turn_budget < DEFAULT_TURN_BUDGET_CHARS
        # ...and exactly the documented fractions of the window.
        assert budget.default_result_size == int(window_chars * 0.15)
        assert budget.turn_budget == int(window_chars * 0.30)

    def test_tiny_model_floored(self):
        """A tiny window can't drop below the floor (usable preview survives)."""
        budget = budget_for_context_window(8_000)
        assert budget.default_result_size >= 8_000
        assert budget.turn_budget >= 16_000

    def test_scaled_budget_constrains_oversized_result(self):
        """A 279K-char result against a 65K model exceeds the scaled per-result
        threshold, so it will be persisted/truncated rather than sent whole."""
        oversized_chars = 279_549
        budget = budget_for_context_window(65_536)

        resolved = budget.resolve_threshold("mcp_firecrawl_firecrawl_search")

        assert resolved < oversized_chars
        assert budget.default_result_size < oversized_chars
|
||||
|
|
|
|||
|
|
@ -38,14 +38,77 @@ class BudgetConfig:
|
|||
"""Resolve the persistence threshold for a tool.
|
||||
|
||||
Priority: pinned -> tool_overrides -> registry per-tool -> default.
|
||||
|
||||
The registry per-tool value is capped at ``default_result_size`` so a
|
||||
context-scaled budget (small model) actually constrains tools that
|
||||
register a large fixed ``max_result_size_chars`` (web/terminal/x_search
|
||||
all register 100K). For the default budget this is a no-op because both
|
||||
equal 100K; for a scaled-down budget it prevents a per-tool registry
|
||||
value from re-inflating the cap past the model's window (#23767).
|
||||
"""
|
||||
if tool_name in PINNED_THRESHOLDS:
|
||||
return PINNED_THRESHOLDS[tool_name]
|
||||
if tool_name in self.tool_overrides:
|
||||
return self.tool_overrides[tool_name]
|
||||
from tools.registry import registry
|
||||
return registry.get_max_result_size(tool_name, default=self.default_result_size)
|
||||
registry_value = registry.get_max_result_size(tool_name, default=self.default_result_size)
|
||||
if registry_value == float("inf"):
|
||||
return registry_value
|
||||
return min(registry_value, self.default_result_size)
|
||||
|
||||
|
||||
# Default config -- matches current hardcoded behavior exactly.
|
||||
DEFAULT_BUDGET = BudgetConfig()
|
||||
|
||||
|
||||
# Token<->char conversion used when scaling the budget to a model's context
|
||||
# window. Deliberately conservative: a larger ratio (more chars per token =
|
||||
# a larger char budget) would UNDER-protect small models, so we use the same
|
||||
# rough 4-chars-per-token ratio the estimator uses (agent/model_metadata.py).
|
||||
_CHARS_PER_TOKEN: int = 4
|
||||
|
||||
# Fraction of a model's context window we allow a SINGLE tool result to occupy
|
||||
# before persisting/truncating it, and the fraction the WHOLE turn's tool
|
||||
# output may occupy. Tool output is not the only thing in the window (system
|
||||
# prompt, tool schemas, conversation history, the model's own reply all
|
||||
# compete), so these stay well under 1.0.
|
||||
_PER_RESULT_WINDOW_FRACTION: float = 0.15
|
||||
_PER_TURN_WINDOW_FRACTION: float = 0.30
|
||||
|
||||
# Floor so even a tiny-but-admitted model still gets a usable preview/result
|
||||
# rather than a 0-char budget.
|
||||
_MIN_RESULT_SIZE_CHARS: int = 8_000
|
||||
_MIN_TURN_BUDGET_CHARS: int = 16_000
|
||||
|
||||
|
||||
def budget_for_context_window(context_length: int | None) -> BudgetConfig:
    """Build a BudgetConfig sized for the active model's context window.

    The fixed defaults (100K result / 200K turn chars) suit large (200K+ token)
    models but ignore small ones: on a 65K-token model a single tool result at
    the 100K-char threshold, or a 200K-char turn budget (~50K tokens), can by
    itself approach or exceed the whole window and force an oversized request
    (#23767).

    The window is converted to chars with ``_CHARS_PER_TOKEN``; a single result
    may use ``_PER_RESULT_WINDOW_FRACTION`` of it and a whole turn
    ``_PER_TURN_WINDOW_FRACTION``. Each share is first capped at the historical
    default (so large models are unchanged) and then raised to the floor (so a
    usable preview always survives on tiny models).

    Args:
        context_length: The model's context window in tokens, or ``None`` when
            unknown.

    Returns:
        ``DEFAULT_BUDGET`` itself when ``context_length`` is ``None``, zero or
        negative; otherwise a new ``BudgetConfig`` with the scaled
        ``default_result_size`` and ``turn_budget`` and the default preview size.
    """
    # None, 0 and negative values all mean "no usable window": keep the defaults.
    if not context_length or context_length <= 0:
        return DEFAULT_BUDGET

    window_in_chars = context_length * _CHARS_PER_TOKEN

    # Step 1: proportional share of the window, capped at the historical default.
    capped_result = min(
        int(window_in_chars * _PER_RESULT_WINDOW_FRACTION),
        DEFAULT_RESULT_SIZE_CHARS,
    )
    capped_turn = min(
        int(window_in_chars * _PER_TURN_WINDOW_FRACTION),
        DEFAULT_TURN_BUDGET_CHARS,
    )

    # Step 2: apply the floor last so it wins over the proportional value.
    return BudgetConfig(
        default_result_size=max(_MIN_RESULT_SIZE_CHARS, capped_result),
        turn_budget=max(_MIN_TURN_BUDGET_CHARS, capped_turn),
        preview_size=DEFAULT_PREVIEW_SIZE_CHARS,
    )
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue