From 1965d562197016e4e3109b483bd0a8761fada640 Mon Sep 17 00:00:00 2001
From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:29:35 +0530
Subject: [PATCH] fix(agent): scale tool-output budget to the model context window (#23767)

The tool-result persistence budget was a fixed 100K chars/result and
200K chars/turn regardless of the active model. On a small-context model
(e.g. a 65K-token local model switched into mid-session) a single large
tool result (reporter: a 279K-char search result) or a full 200K-char
turn (~50K tokens) could by itself approach or exceed the window,
forcing an oversized request that the provider rejects as
"Prompt too long".

- budget_config.budget_for_context_window() scales per-result/per-turn
  char caps to a fraction of the model window, clamped to the historical
  100K/200K defaults (large models unchanged) and floored so small
  models stay usable.
- resolve_threshold() now caps the per-tool registry value at
  default_result_size so tools that register a fixed 100K cap
  (web/terminal/x_search) don't re-inflate a scaled-down budget. No-op
  for the default budget (both 100K).
- tool_executor wires the agent's live context_length (recomputed on
  model switch) into all four persist/turn-budget call sites. read_file
  stays inf-pinned (no persist loop).

Verified E2E: a 279K-char result against a 65K model collapses to a
~1.6K preview; a 200K model is byte-identical to today.
---
 agent/tool_executor.py            | 30 ++++++++++-
 tests/tools/test_budget_config.py | 81 +++++++++++++++++++++++++++++++
 tools/budget_config.py            | 65 ++++++++++++++++++++++++-
 3 files changed, 173 insertions(+), 3 deletions(-)

diff --git a/agent/tool_executor.py b/agent/tool_executor.py
index e7ba79db8b7..b79c29767e8 100644
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@@ -44,9 +44,27 @@ from tools.tool_result_storage import (
     maybe_persist_tool_result,
     enforce_turn_budget,
 )
+from tools.budget_config import BudgetConfig, DEFAULT_BUDGET, budget_for_context_window
 
 logger = logging.getLogger(__name__)
 
+
+def _budget_for_agent(agent) -> BudgetConfig:
+    """Resolve a tool-result BudgetConfig scaled to the agent's context window.
+
+    Large-context models keep the historical 100K/200K char defaults; small
+    models (e.g. a 65K-token local model switched into mid-session) get a budget
+    proportional to their window so a single large tool result can't push the
+    request past the model's limit (#23767). Falls back to the default budget
+    when the context length isn't resolvable; failures are logged at DEBUG.
+    """
+    try:
+        ctx = getattr(getattr(agent, "context_compressor", None), "context_length", None)
+        return budget_for_context_window(int(ctx)) if ctx else DEFAULT_BUDGET
+    except Exception:
+        logger.debug("Could not resolve context-scaled tool budget; using default", exc_info=True)
+        return DEFAULT_BUDGET
+
 # Maximum number of concurrent worker threads for parallel tool execution.
 # Mirrors the constant in ``run_agent`` for tests/imports that look here.
 _MAX_TOOL_WORKERS = 8
@@ -249,6 +267,10 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
     tool_calls = assistant_message.tool_calls
     num_tools = len(tool_calls)
 
+    # Resolve the context-scaled tool-output budget once per turn (cheap, but
+    # avoids rebuilding it per result inside the loop below).
+    _tool_budget = _budget_for_agent(agent)
+
     # ── Pre-flight: interrupt check ──────────────────────────────────
     if agent._interrupt_requested:
         print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
@@ -725,6 +747,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
             tool_name=name,
             tool_use_id=tc.id,
             env=get_active_env(effective_task_id),
+            config=_tool_budget,
         ) if not _is_multimodal_tool_result(function_result) else function_result
 
         subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
@@ -756,7 +779,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
     num_tools = len(parsed_calls)
     if num_tools > 0:
         turn_tool_msgs = messages[-num_tools:]
-        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
+        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id), config=_tool_budget)
 
     # ── /steer injection ──────────────────────────────────────────────
     # Append any pending user steer text to the last tool result so the
@@ -769,6 +792,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
 
 def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
     """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
+    # Resolve the context-scaled tool-output budget once per turn.
+    _tool_budget = _budget_for_agent(agent)
     for i, tool_call in enumerate(assistant_message.tool_calls, 1):
         # SAFETY: check interrupt BEFORE starting each tool.
         # If the user sent "stop" during a previous tool's execution,
@@ -1377,6 +1402,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
             tool_name=function_name,
             tool_use_id=tool_call.id,
             env=get_active_env(effective_task_id),
+            config=_tool_budget,
         ) if not _is_multimodal_tool_result(function_result) else function_result
 
         # Discover subdirectory context files from tool arguments
@@ -1425,7 +1451,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
     # ── Per-turn aggregate budget enforcement ─────────────────────────
     num_tools_seq = len(assistant_message.tool_calls)
     if num_tools_seq > 0:
-        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
+        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id), config=_tool_budget)
 
     # ── /steer injection ──────────────────────────────────────────────
     # See _execute_tool_calls_parallel for the rationale. Same hook,
diff --git a/tests/tools/test_budget_config.py b/tests/tools/test_budget_config.py
index aeacc621903..4c78d3d6c41 100644
--- a/tests/tools/test_budget_config.py
+++ b/tests/tools/test_budget_config.py
@@ -18,6 +18,7 @@ from tools.budget_config import (
     DEFAULT_TURN_BUDGET_CHARS,
     PINNED_THRESHOLDS,
     BudgetConfig,
+    budget_for_context_window,
 )
 
 
@@ -174,3 +175,83 @@ class TestResolveThreshold:
         """Canonical case: read_file must always return inf."""
         cfg = BudgetConfig()
         assert cfg.resolve_threshold("read_file") == float("inf")
+
+    @patch("tools.registry.registry")
+    def test_registry_value_capped_at_default(self, mock_registry):
+        """A scaled-down budget caps an oversized registry value (#23767).
+
+        web/terminal/x_search register max_result_size_chars=100_000; a small
+        model's scaled budget must not be re-inflated by that.
+        """
+        mock_registry.get_max_result_size.return_value = 100_000
+        cfg = BudgetConfig(default_result_size=30_000)
+        assert cfg.resolve_threshold("web_search") == 30_000
+
+    @patch("tools.registry.registry")
+    def test_registry_inf_not_capped(self, mock_registry):
+        """An inf registry value (e.g. a future pinned-like tool) is preserved."""
+        mock_registry.get_max_result_size.return_value = float("inf")
+        cfg = BudgetConfig(default_result_size=30_000)
+        assert cfg.resolve_threshold("some_tool") == float("inf")
+
+    @patch("tools.registry.registry")
+    def test_default_budget_unchanged_for_100k_tool(self, mock_registry):
+        """Default budget keeps 100K registry tools at 100K (no behavior change)."""
+        mock_registry.get_max_result_size.return_value = 100_000
+        cfg = BudgetConfig()  # default_result_size == 100_000
+        assert cfg.resolve_threshold("web_search") == 100_000
+
+
+# ---------------------------------------------------------------------------
+# budget_for_context_window() — context-aware scaling (#23767)
+# ---------------------------------------------------------------------------
+
+
+class TestBudgetForContextWindow:
+    """Scaling the tool-output budget to the active model's context window."""
+
+    def test_none_returns_default(self):
+        assert budget_for_context_window(None) is DEFAULT_BUDGET
+
+    def test_zero_or_negative_returns_default(self):
+        assert budget_for_context_window(0) is DEFAULT_BUDGET
+        assert budget_for_context_window(-5) is DEFAULT_BUDGET
+
+    def test_large_model_unchanged(self):
+        """A 200K-token model keeps the historical 100K/200K char defaults."""
+        cfg = budget_for_context_window(200_000)
+        assert cfg.default_result_size == DEFAULT_RESULT_SIZE_CHARS
+        assert cfg.turn_budget == DEFAULT_TURN_BUDGET_CHARS
+
+    def test_very_large_model_still_capped_at_default(self):
+        """A 1M-token model never exceeds the historical defaults (cap)."""
+        cfg = budget_for_context_window(1_000_000)
+        assert cfg.default_result_size == DEFAULT_RESULT_SIZE_CHARS
+        assert cfg.turn_budget == DEFAULT_TURN_BUDGET_CHARS
+
+    def test_small_model_scaled_down(self):
+        """A 65K-token model gets a budget proportional to its window.
+
+        window_chars = 65_536*4 = 262_144; per_result = 15% = 39_321;
+        per_turn = 30% = 78_643. Both below the 100K/200K defaults.
+        """
+        cfg = budget_for_context_window(65_536)
+        assert cfg.default_result_size < DEFAULT_RESULT_SIZE_CHARS
+        assert cfg.turn_budget < DEFAULT_TURN_BUDGET_CHARS
+        assert cfg.default_result_size == int(65_536 * 4 * 0.15)
+        assert cfg.turn_budget == int(65_536 * 4 * 0.30)
+
+    def test_tiny_model_floored(self):
+        """A tiny window can't drop below the floor (usable preview survives)."""
+        cfg = budget_for_context_window(8_000)
+        assert cfg.default_result_size >= 8_000
+        assert cfg.turn_budget >= 16_000
+
+    def test_scaled_budget_constrains_oversized_result(self):
+        """A 279K-char result against a 65K model exceeds the scaled per-result
+        threshold, so it will be persisted/truncated rather than sent whole."""
+        cfg = budget_for_context_window(65_536)
+        huge_len = 279_549
+        threshold = cfg.resolve_threshold("mcp_firecrawl_firecrawl_search")
+        assert threshold < huge_len
+        assert cfg.default_result_size < huge_len
diff --git a/tools/budget_config.py b/tools/budget_config.py
index 093188d5c75..8e47479446e 100644
--- a/tools/budget_config.py
+++ b/tools/budget_config.py
@@ -38,14 +38,77 @@ class BudgetConfig:
         """Resolve the persistence threshold for a tool.
 
         Priority: pinned -> tool_overrides -> registry per-tool -> default.
+
+        The registry per-tool value is capped at ``default_result_size`` so a
+        context-scaled budget (small model) actually constrains tools that
+        register a large fixed ``max_result_size_chars`` (web/terminal/x_search
+        all register 100K). For the default budget this is a no-op because both
+        equal 100K; for a scaled-down budget it prevents a per-tool registry
+        value from re-inflating the cap past the model's window (#23767).
         """
         if tool_name in PINNED_THRESHOLDS:
             return PINNED_THRESHOLDS[tool_name]
         if tool_name in self.tool_overrides:
             return self.tool_overrides[tool_name]
         from tools.registry import registry
-        return registry.get_max_result_size(tool_name, default=self.default_result_size)
+        registry_value = registry.get_max_result_size(tool_name, default=self.default_result_size)
+        if registry_value == float("inf"):
+            return registry_value
+        return min(registry_value, self.default_result_size)
 
 
 # Default config -- matches current hardcoded behavior exactly.
 DEFAULT_BUDGET = BudgetConfig()
+
+
+# Token<->char conversion used when scaling the budget to a model's context
+# window. A larger chars-per-token ratio would yield a larger char budget and
+# UNDER-protect small models, so we use the same rough 4-chars-per-token
+# ratio the estimator uses (agent/model_metadata.py).
+_CHARS_PER_TOKEN: int = 4
+
+# Fraction of a model's context window we allow a SINGLE tool result to occupy
+# before persisting/truncating it, and the fraction the WHOLE turn's tool
+# output may occupy. Tool output is not the only thing in the window (system
+# prompt, tool schemas, conversation history, the model's own reply all
+# compete), so these stay well under 1.0.
+_PER_RESULT_WINDOW_FRACTION: float = 0.15
+_PER_TURN_WINDOW_FRACTION: float = 0.30
+
+# Floor so even a tiny-but-admitted model still gets a usable preview/result
+# rather than a 0-char budget.
+_MIN_RESULT_SIZE_CHARS: int = 8_000
+_MIN_TURN_BUDGET_CHARS: int = 16_000
+
+
+def budget_for_context_window(context_length: int | None) -> BudgetConfig:
+    """Return a BudgetConfig scaled to the active model's context window.
+
+    The fixed defaults (100K result / 200K turn chars) are correct for large
+    (200K+ token) models but blind to small ones: on a 65K-token model a single
+    tool result persisted at the 100K-char threshold, or a 200K-char turn
+    budget (~50K tokens), can by itself approach or exceed the whole window and
+    force an oversized request (#23767).
+
+    Scaling keeps large models byte-identical to today (the proportional value
+    is clamped to the existing defaults as a CAP) while shrinking the budget for
+    small models proportionally to their window, floored so a usable preview
+    always survives.
+    """
+    if not context_length or context_length <= 0:
+        return DEFAULT_BUDGET
+
+    window_chars = context_length * _CHARS_PER_TOKEN
+    per_result = int(window_chars * _PER_RESULT_WINDOW_FRACTION)
+    per_turn = int(window_chars * _PER_TURN_WINDOW_FRACTION)
+
+    # Clamp: never exceed the historical defaults (so large models are
+    # unchanged), never drop below the floor (so tiny models stay usable).
+    per_result = max(_MIN_RESULT_SIZE_CHARS, min(per_result, DEFAULT_RESULT_SIZE_CHARS))
+    per_turn = max(_MIN_TURN_BUDGET_CHARS, min(per_turn, DEFAULT_TURN_BUDGET_CHARS))
+
+    return BudgetConfig(
+        default_result_size=per_result,
+        turn_budget=per_turn,
+        preview_size=DEFAULT_PREVIEW_SIZE_CHARS,
+    )