fix(agent): scale tool-output budget to the model context window (#23767)

The tool-result persistence budget was a fixed 100K chars/result and 200K
chars/turn regardless of the active model. On a small-context model (e.g. a
65K-token local model switched to mid-session), a single large tool result
(the reporter hit a 279K-char search result) or a full 200K-char turn (~50K
tokens) could by itself approach or exceed the window, forcing an oversized
request that the provider rejects as "Prompt too long".

- budget_config.budget_for_context_window() scales per-result/per-turn char
  caps to a fraction of the model window, clamped to the historical 100K/200K
  defaults (large models unchanged) and floored so small models stay usable.
- resolve_threshold() now caps the per-tool registry value at default_result_size
  so tools that register a fixed 100K cap (web/terminal/x_search) don't re-inflate
  a scaled-down budget. No-op for the default budget (both 100K).
- tool_executor wires the agent's live context_length (recomputed on model
  switch) into all four persist/turn-budget call sites.

read_file stays inf-pinned (no persist loop). Verified E2E: a 279K-char result
against a 65K model collapses to a ~1.6K preview; a 200K model is byte-identical
to today.
This commit is contained in:
kshitijk4poor 2026-06-21 17:29:35 +05:30
parent 44d552ea5a
commit 1965d56219
3 changed files with 172 additions and 3 deletions

View file

@ -44,9 +44,26 @@ from tools.tool_result_storage import (
maybe_persist_tool_result,
enforce_turn_budget,
)
from tools.budget_config import BudgetConfig, DEFAULT_BUDGET, budget_for_context_window
logger = logging.getLogger(__name__)
def _budget_for_agent(agent) -> BudgetConfig:
    """Resolve a tool-result BudgetConfig scaled to the agent's context window.

    Reads ``agent.context_compressor.context_length`` and hands it to
    ``budget_for_context_window`` so a small-context model gets a tool-output
    budget proportional to its window instead of the fixed defaults (#23767).

    Args:
        agent: Object that may expose ``context_compressor.context_length``.
            Either attribute may be missing.

    Returns:
        The context-scaled budget, or ``DEFAULT_BUDGET`` when the context
        length is missing, falsy, or resolving it raises.
    """
    try:
        compressor = getattr(agent, "context_compressor", None)
        ctx = getattr(compressor, "context_length", None)
        if not ctx:
            return DEFAULT_BUDGET
        return budget_for_context_window(int(ctx))
    except Exception:
        # Best-effort by design: budget resolution must never break tool
        # execution. Log the failure so an unexpectedly unscaled budget can be
        # diagnosed instead of vanishing silently.
        logger.debug(
            "Could not resolve a context-scaled tool budget; using the default",
            exc_info=True,
        )
        return DEFAULT_BUDGET
# Maximum number of concurrent worker threads for parallel tool execution.
# Mirrors the constant in ``run_agent`` for tests/imports that look here.
_MAX_TOOL_WORKERS = 8
@ -249,6 +266,10 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
tool_calls = assistant_message.tool_calls
num_tools = len(tool_calls)
# Resolve the context-scaled tool-output budget once per turn (cheap, but
# avoids rebuilding it per result inside the loop below).
_tool_budget = _budget_for_agent(agent)
# ── Pre-flight: interrupt check ──────────────────────────────────
if agent._interrupt_requested:
print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
@ -725,6 +746,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
tool_name=name,
tool_use_id=tc.id,
env=get_active_env(effective_task_id),
config=_tool_budget,
) if not _is_multimodal_tool_result(function_result) else function_result
subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
@ -756,7 +778,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
num_tools = len(parsed_calls)
if num_tools > 0:
turn_tool_msgs = messages[-num_tools:]
enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id), config=_tool_budget)
# ── /steer injection ──────────────────────────────────────────────
# Append any pending user steer text to the last tool result so the
@ -769,6 +791,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
"""Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
# Resolve the context-scaled tool-output budget once per turn.
_tool_budget = _budget_for_agent(agent)
for i, tool_call in enumerate(assistant_message.tool_calls, 1):
# SAFETY: check interrupt BEFORE starting each tool.
# If the user sent "stop" during a previous tool's execution,
@ -1377,6 +1401,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
tool_name=function_name,
tool_use_id=tool_call.id,
env=get_active_env(effective_task_id),
config=_tool_budget,
) if not _is_multimodal_tool_result(function_result) else function_result
# Discover subdirectory context files from tool arguments
@ -1425,7 +1450,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
# ── Per-turn aggregate budget enforcement ─────────────────────────
num_tools_seq = len(assistant_message.tool_calls)
if num_tools_seq > 0:
enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id), config=_tool_budget)
# ── /steer injection ──────────────────────────────────────────────
# See _execute_tool_calls_parallel for the rationale. Same hook,

View file

@ -18,6 +18,7 @@ from tools.budget_config import (
DEFAULT_TURN_BUDGET_CHARS,
PINNED_THRESHOLDS,
BudgetConfig,
budget_for_context_window,
)
@ -174,3 +175,83 @@ class TestResolveThreshold:
"""Canonical case: read_file must always return inf."""
cfg = BudgetConfig()
assert cfg.resolve_threshold("read_file") == float("inf")
@patch("tools.registry.registry")
def test_registry_value_capped_at_default(self, mock_registry):
    """A registry value above the budget's default is capped to it (#23767).

    The mocked registry reports a 100_000-char per-tool limit; with a
    budget whose ``default_result_size`` is 30_000 the resolved threshold
    must be the smaller, scaled value.
    """
    scaled_cap = 30_000
    mock_registry.get_max_result_size.return_value = 100_000
    budget = BudgetConfig(default_result_size=scaled_cap)
    resolved = budget.resolve_threshold("web_search")
    assert resolved == scaled_cap
@patch("tools.registry.registry")
def test_registry_inf_not_capped(self, mock_registry):
    """An infinite registry value passes through without being capped."""
    unbounded = float("inf")
    mock_registry.get_max_result_size.return_value = unbounded
    budget = BudgetConfig(default_result_size=30_000)
    resolved = budget.resolve_threshold("some_tool")
    assert resolved == unbounded
@patch("tools.registry.registry")
def test_default_budget_unchanged_for_100k_tool(self, mock_registry):
    """With the default budget, a 100_000 registry value resolves to 100_000."""
    registry_limit = 100_000
    mock_registry.get_max_result_size.return_value = registry_limit
    # BudgetConfig() with no arguments is the unscaled default budget.
    budget = BudgetConfig()
    resolved = budget.resolve_threshold("web_search")
    assert resolved == registry_limit
# ---------------------------------------------------------------------------
# budget_for_context_window() — context-aware scaling (#23767)
# ---------------------------------------------------------------------------
class TestBudgetForContextWindow:
    """Checks that the tool-output budget tracks the model's context window."""

    def test_none_returns_default(self):
        """An unknown window (None) yields the shared default budget object."""
        budget = budget_for_context_window(None)
        assert budget is DEFAULT_BUDGET

    def test_zero_or_negative_returns_default(self):
        """Non-positive windows fall back to the shared default budget."""
        for bogus_window in (0, -5):
            assert budget_for_context_window(bogus_window) is DEFAULT_BUDGET

    def test_large_model_unchanged(self):
        """A 200K-token window resolves to the default result/turn sizes."""
        budget = budget_for_context_window(200_000)
        observed = (budget.default_result_size, budget.turn_budget)
        assert observed[0] == DEFAULT_RESULT_SIZE_CHARS
        assert observed[1] == DEFAULT_TURN_BUDGET_CHARS

    def test_very_large_model_still_capped_at_default(self):
        """A 1M-token window is still capped at the default sizes."""
        budget = budget_for_context_window(1_000_000)
        observed = (budget.default_result_size, budget.turn_budget)
        assert observed[0] == DEFAULT_RESULT_SIZE_CHARS
        assert observed[1] == DEFAULT_TURN_BUDGET_CHARS

    def test_small_model_scaled_down(self):
        """A 65_536-token window gets a proportionally smaller budget.

        Expected values: 65_536 * 4 = 262_144 window chars; 15% of that is
        39_321 per result and 30% is 78_643 per turn, both below the
        defaults.
        """
        window_tokens = 65_536
        budget = budget_for_context_window(window_tokens)
        per_result = budget.default_result_size
        per_turn = budget.turn_budget
        assert per_result < DEFAULT_RESULT_SIZE_CHARS
        assert per_turn < DEFAULT_TURN_BUDGET_CHARS
        assert per_result == int(65_536 * 4 * 0.15)
        assert per_turn == int(65_536 * 4 * 0.30)

    def test_tiny_model_floored(self):
        """An 8_000-token window never drops below the 8_000/16_000 floors."""
        budget = budget_for_context_window(8_000)
        assert budget.default_result_size >= 8_000
        assert budget.turn_budget >= 16_000

    def test_scaled_budget_constrains_oversized_result(self):
        """A 279_549-char result exceeds the budget scaled for 65_536 tokens.

        Both the resolved per-tool threshold and the budget's default result
        size must sit below the oversized result length.
        """
        oversized_chars = 279_549
        budget = budget_for_context_window(65_536)
        resolved = budget.resolve_threshold("mcp_firecrawl_firecrawl_search")
        assert resolved < oversized_chars
        assert budget.default_result_size < oversized_chars

View file

@ -38,14 +38,77 @@ class BudgetConfig:
"""Resolve the persistence threshold for a tool.
Priority: pinned -> tool_overrides -> registry per-tool -> default.
The registry per-tool value is capped at ``default_result_size`` so a
context-scaled budget (small model) actually constrains tools that
register a large fixed ``max_result_size_chars`` (web/terminal/x_search
all register 100K). For the default budget this is a no-op because both
equal 100K; for a scaled-down budget it prevents a per-tool registry
value from re-inflating the cap past the model's window (#23767).
"""
if tool_name in PINNED_THRESHOLDS:
return PINNED_THRESHOLDS[tool_name]
if tool_name in self.tool_overrides:
return self.tool_overrides[tool_name]
from tools.registry import registry
return registry.get_max_result_size(tool_name, default=self.default_result_size)
registry_value = registry.get_max_result_size(tool_name, default=self.default_result_size)
if registry_value == float("inf"):
return registry_value
return min(registry_value, self.default_result_size)
# Default config -- matches current hardcoded behavior exactly.
DEFAULT_BUDGET = BudgetConfig()

# Token<->char conversion used when scaling the budget to a model's context
# window. The window's token count is multiplied by this ratio to obtain a
# char budget, so overestimating chars-per-token yields a larger char budget
# and would UNDER-protect small models. We therefore use the same rough
# 4-chars-per-token ratio the estimator uses (agent/model_metadata.py).
_CHARS_PER_TOKEN: int = 4

# Fraction of a model's context window we allow a SINGLE tool result to occupy
# before persisting/truncating it, and the fraction the WHOLE turn's tool
# output may occupy. Tool output is not the only thing in the window (system
# prompt, tool schemas, conversation history, and the model's own reply all
# compete for it), so these stay well under 1.0.
_PER_RESULT_WINDOW_FRACTION: float = 0.15
_PER_TURN_WINDOW_FRACTION: float = 0.30

# Floors (in chars) so even a tiny-but-admitted model still gets a usable
# preview/result rather than a 0-char budget.
_MIN_RESULT_SIZE_CHARS: int = 8_000
_MIN_TURN_BUDGET_CHARS: int = 16_000
def budget_for_context_window(context_length: int | None) -> BudgetConfig:
    """Build a BudgetConfig sized to the active model's context window.

    The window (in tokens) is converted to chars with ``_CHARS_PER_TOKEN``.
    The per-result and per-turn budgets are each a fixed fraction of that,
    capped at the default sizes and floored at the minimum sizes (#23767).
    The cap leaves models with a large enough window on the default sizes;
    the floor keeps a usable result on very small windows.

    Args:
        context_length: Model context window in tokens. ``None``, zero, or a
            negative value means "unknown".

    Returns:
        ``DEFAULT_BUDGET`` itself when the window is unknown, otherwise a new
        ``BudgetConfig`` with scaled ``default_result_size`` and
        ``turn_budget`` and the default preview size.
    """
    # Unknown or nonsensical window: keep the shared default budget.
    if not context_length or context_length <= 0:
        return DEFAULT_BUDGET

    window_chars = context_length * _CHARS_PER_TOKEN

    def _clamped_share(fraction, floor, cap):
        # Take a fraction of the window, then bound it to [floor, cap].
        share = int(window_chars * fraction)
        return max(floor, min(share, cap))

    result_cap = _clamped_share(
        _PER_RESULT_WINDOW_FRACTION,
        _MIN_RESULT_SIZE_CHARS,
        DEFAULT_RESULT_SIZE_CHARS,
    )
    turn_cap = _clamped_share(
        _PER_TURN_WINDOW_FRACTION,
        _MIN_TURN_BUDGET_CHARS,
        DEFAULT_TURN_BUDGET_CHARS,
    )
    return BudgetConfig(
        default_result_size=result_cap,
        turn_budget=turn_cap,
        preview_size=DEFAULT_PREVIEW_SIZE_CHARS,
    )