From 1965d562197016e4e3109b483bd0a8761fada640 Mon Sep 17 00:00:00 2001
From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com>
Date: Sun, 21 Jun 2026 17:29:35 +0530
Subject: [PATCH] fix(agent): scale tool-output budget to the model context
 window (#23767)

The tool-result persistence budget was a fixed 100K chars/result and 200K
chars/turn regardless of the active model. On a small-context model (e.g. a
65K-token local model switched into mid-session) a single large tool result
(reporter: a 279K-char search result) or a full 200K-char turn (~50K tokens)
could by itself approach or exceed the window, forcing an oversized request
that the provider rejects as "Prompt too long".

- budget_config.budget_for_context_window() scales per-result/per-turn char
  caps to a fraction of the model window, clamped to the historical 100K/200K
  defaults (large models unchanged) and floored so small models stay usable.
- resolve_threshold() now caps the per-tool registry value at default_result_size
  so tools that register a fixed 100K cap (web/terminal/x_search) don't re-inflate
  a scaled-down budget. No-op for the default budget (both 100K).
- tool_executor wires the agent's live context_length (recomputed on model
  switch) into all four persist/turn-budget call sites.

read_file stays inf-pinned (no persist loop). Verified E2E: a 279K-char result
against a 65K model collapses to a ~1.6K preview; a 200K model is byte-identical
to today.
---
 agent/tool_executor.py            | 29 ++++++++++-
 tests/tools/test_budget_config.py | 81 +++++++++++++++++++++++++++++++
 tools/budget_config.py            | 65 ++++++++++++++++++++++++-
 3 files changed, 172 insertions(+), 3 deletions(-)

diff --git a/agent/tool_executor.py b/agent/tool_executor.py
index e7ba79db8b7..b79c29767e8 100644
--- a/agent/tool_executor.py
+++ b/agent/tool_executor.py
@@ -44,9 +44,26 @@ from tools.tool_result_storage import (
     maybe_persist_tool_result,
     enforce_turn_budget,
 )
+from tools.budget_config import BudgetConfig, DEFAULT_BUDGET, budget_for_context_window
 
 logger = logging.getLogger(__name__)
 
+
+def _budget_for_agent(agent) -> BudgetConfig:
+    """Resolve a tool-result BudgetConfig scaled to the agent's context window.
+
+    Large-context models keep the historical 100K/200K char defaults; small
+    models (e.g. a 65K-token local model switched to mid-session) get a budget
+    proportional to their window so a single large tool result can't push the
+    request past the model's limit (#23767). Returns ``DEFAULT_BUDGET`` when
+    ``agent.context_compressor.context_length`` is absent, falsy, or unusable.
+    """
+    try:
+        ctx = getattr(getattr(agent, "context_compressor", None), "context_length", None)
+        return budget_for_context_window(int(ctx)) if ctx else DEFAULT_BUDGET
+    except Exception:  # deliberately broad: any failure falls back to the default budget
+        return DEFAULT_BUDGET
+
 # Maximum number of concurrent worker threads for parallel tool execution.
 # Mirrors the constant in ``run_agent`` for tests/imports that look here.
 _MAX_TOOL_WORKERS = 8
@@ -249,6 +266,10 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
     tool_calls = assistant_message.tool_calls
     num_tools = len(tool_calls)
 
+    # Resolve the context-scaled tool-output budget once per turn (cheap, but
+    # avoids rebuilding it per result inside the loop below).
+    _tool_budget = _budget_for_agent(agent)
+
     # ── Pre-flight: interrupt check ──────────────────────────────────
     if agent._interrupt_requested:
         print(f"{agent.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
@@ -725,6 +746,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
             tool_name=name,
             tool_use_id=tc.id,
             env=get_active_env(effective_task_id),
+            config=_tool_budget,
         ) if not _is_multimodal_tool_result(function_result) else function_result
 
         subdir_hints = agent._subdirectory_hints.check_tool_call(name, args)
@@ -756,7 +778,7 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
     num_tools = len(parsed_calls)
     if num_tools > 0:
         turn_tool_msgs = messages[-num_tools:]
-        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id))
+        enforce_turn_budget(turn_tool_msgs, env=get_active_env(effective_task_id), config=_tool_budget)
 
     # ── /steer injection ──────────────────────────────────────────────
     # Append any pending user steer text to the last tool result so the
@@ -769,6 +791,8 @@ def execute_tool_calls_concurrent(agent, assistant_message, messages: list, effe
 
 def execute_tool_calls_sequential(agent, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
     """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
+    # Resolve the context-scaled tool-output budget once per turn.
+    _tool_budget = _budget_for_agent(agent)
     for i, tool_call in enumerate(assistant_message.tool_calls, 1):
         # SAFETY: check interrupt BEFORE starting each tool.
         # If the user sent "stop" during a previous tool's execution,
@@ -1377,6 +1401,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
             tool_name=function_name,
             tool_use_id=tool_call.id,
             env=get_active_env(effective_task_id),
+            config=_tool_budget,
         ) if not _is_multimodal_tool_result(function_result) else function_result
 
         # Discover subdirectory context files from tool arguments
@@ -1425,7 +1450,7 @@ def execute_tool_calls_sequential(agent, assistant_message, messages: list, effe
     # ── Per-turn aggregate budget enforcement ─────────────────────────
     num_tools_seq = len(assistant_message.tool_calls)
     if num_tools_seq > 0:
-        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id))
+        enforce_turn_budget(messages[-num_tools_seq:], env=get_active_env(effective_task_id), config=_tool_budget)
 
     # ── /steer injection ──────────────────────────────────────────────
     # See _execute_tool_calls_parallel for the rationale. Same hook,
diff --git a/tests/tools/test_budget_config.py b/tests/tools/test_budget_config.py
index aeacc621903..4c78d3d6c41 100644
--- a/tests/tools/test_budget_config.py
+++ b/tests/tools/test_budget_config.py
@@ -18,6 +18,7 @@ from tools.budget_config import (
     DEFAULT_TURN_BUDGET_CHARS,
     PINNED_THRESHOLDS,
     BudgetConfig,
+    budget_for_context_window,
 )
 
 
@@ -174,3 +175,83 @@ class TestResolveThreshold:
         """Canonical case: read_file must always return inf."""
         cfg = BudgetConfig()
         assert cfg.resolve_threshold("read_file") == float("inf")
+
+    @patch("tools.registry.registry")
+    def test_registry_value_capped_at_default(self, mock_registry):
+        """A scaled-down budget caps an oversized registry value (#23767).
+
+        web/terminal/x_search register max_result_size_chars=100_000; a small
+        model's scaled budget must not be re-inflated by that.
+        """
+        mock_registry.get_max_result_size.return_value = 100_000  # registry reports the fixed 100K cap
+        cfg = BudgetConfig(default_result_size=30_000)
+        assert cfg.resolve_threshold("web_search") == 30_000  # min(100_000, 30_000)
+
+    @patch("tools.registry.registry")
+    def test_registry_inf_not_capped(self, mock_registry):
+        """An inf registry value (e.g. a future pinned-like tool) is preserved."""
+        mock_registry.get_max_result_size.return_value = float("inf")
+        cfg = BudgetConfig(default_result_size=30_000)
+        assert cfg.resolve_threshold("some_tool") == float("inf")  # inf is returned before the min() cap
+
+    @patch("tools.registry.registry")
+    def test_default_budget_unchanged_for_100k_tool(self, mock_registry):
+        """Default budget keeps 100K registry tools at 100K (no behavior change)."""
+        mock_registry.get_max_result_size.return_value = 100_000
+        cfg = BudgetConfig()  # default_result_size == 100_000
+        assert cfg.resolve_threshold("web_search") == 100_000  # min(100_000, 100_000): cap is a no-op
+
+
+# ---------------------------------------------------------------------------
+# budget_for_context_window() — context-aware scaling (#23767)
+# ---------------------------------------------------------------------------
+
+
+class TestBudgetForContextWindow:
+    """Scaling the tool-output budget to the active model's context window."""
+
+    def test_none_returns_default(self):
+        assert budget_for_context_window(None) is DEFAULT_BUDGET  # identity: the shared instance, not a copy
+
+    def test_zero_or_negative_returns_default(self):
+        assert budget_for_context_window(0) is DEFAULT_BUDGET  # 0 is falsy
+        assert budget_for_context_window(-5) is DEFAULT_BUDGET  # caught by the <= 0 guard
+
+    def test_large_model_unchanged(self):
+        """A 200K-token model keeps the historical 100K/200K char defaults."""
+        cfg = budget_for_context_window(200_000)  # 800_000 chars: 15% = 120_000, 30% = 240_000
+        assert cfg.default_result_size == DEFAULT_RESULT_SIZE_CHARS
+        assert cfg.turn_budget == DEFAULT_TURN_BUDGET_CHARS
+
+    def test_very_large_model_still_capped_at_default(self):
+        """A 1M-token model never exceeds the historical defaults (cap)."""
+        cfg = budget_for_context_window(1_000_000)
+        assert cfg.default_result_size == DEFAULT_RESULT_SIZE_CHARS
+        assert cfg.turn_budget == DEFAULT_TURN_BUDGET_CHARS
+
+    def test_small_model_scaled_down(self):
+        """A 65K-token model gets a budget proportional to its window.
+
+        window_chars = 65_536*4 = 262_144; per_result = 15% = 39_321;
+        per_turn = 30% = 78_643. Both below the 100K/200K defaults.
+        """
+        cfg = budget_for_context_window(65_536)
+        assert cfg.default_result_size < DEFAULT_RESULT_SIZE_CHARS
+        assert cfg.turn_budget < DEFAULT_TURN_BUDGET_CHARS
+        assert cfg.default_result_size == int(65_536 * 4 * 0.15)  # int() truncates 39_321.6
+        assert cfg.turn_budget == int(65_536 * 4 * 0.30)  # int() truncates 78_643.2
+
+    def test_tiny_model_floored(self):
+        """A tiny window can't drop below the floor (usable preview survives)."""
+        cfg = budget_for_context_window(8_000)  # 32_000 chars
+        assert cfg.default_result_size >= 8_000  # 15% of 32_000 = 4_800 -> raised to the floor
+        assert cfg.turn_budget >= 16_000  # 30% of 32_000 = 9_600 -> raised to the floor
+
+    def test_scaled_budget_constrains_oversized_result(self):
+        """A 279K-char result against a 65K model exceeds the scaled per-result
+        threshold, so it will be persisted/truncated rather than sent whole."""
+        cfg = budget_for_context_window(65_536)
+        huge_len = 279_549
+        # Not patched here: consults the real registry unless the name is pinned/overridden.
+        threshold = cfg.resolve_threshold("mcp_firecrawl_firecrawl_search")
+        assert threshold < huge_len
+        assert cfg.default_result_size < huge_len
diff --git a/tools/budget_config.py b/tools/budget_config.py
index 093188d5c75..8e47479446e 100644
--- a/tools/budget_config.py
+++ b/tools/budget_config.py
@@ -38,14 +38,77 @@ class BudgetConfig:
         """Resolve the persistence threshold for a tool.
 
         Priority: pinned -> tool_overrides -> registry per-tool -> default.
+
+        The registry per-tool value is capped at ``default_result_size`` so a
+        context-scaled budget (small model) actually constrains tools that
+        register a large fixed ``max_result_size_chars`` (web/terminal/x_search
+        all register 100K). For the default budget this is a no-op because both
+        equal 100K; for a scaled-down budget it prevents a per-tool registry
+        value from re-inflating the cap past the model's window (#23767).
         """
         if tool_name in PINNED_THRESHOLDS:
             return PINNED_THRESHOLDS[tool_name]
         if tool_name in self.tool_overrides:
             return self.tool_overrides[tool_name]
         from tools.registry import registry
-        return registry.get_max_result_size(tool_name, default=self.default_result_size)
+        registry_value = registry.get_max_result_size(tool_name, default=self.default_result_size)
+        if registry_value == float("inf"):
+            return registry_value
+        return min(registry_value, self.default_result_size)
 
 
 # Default config -- matches current hardcoded behavior exactly.
 DEFAULT_BUDGET = BudgetConfig()
+
+
+# Token<->char conversion used when scaling the budget to a model's context
+# window. A larger ratio (more chars per token) would yield a larger char
+# budget and UNDER-protect small models, so we use the same rough
+# 4-chars-per-token ratio the estimator uses (agent/model_metadata.py).
+_CHARS_PER_TOKEN: int = 4
+
+# Fraction of a model's context window we allow a SINGLE tool result to occupy
+# before persisting/truncating it, and the fraction the WHOLE turn's tool
+# output may occupy. Tool output is not the only thing in the window (system
+# prompt, tool schemas, conversation history, the model's own reply all
+# compete), so these stay well under 1.0.
+_PER_RESULT_WINDOW_FRACTION: float = 0.15
+_PER_TURN_WINDOW_FRACTION: float = 0.30
+
+# Floors so even a tiny-but-admitted model still gets a usable preview/result
+# rather than a 0-char budget; they win below ~13.3K tokens (8_000 / (4 * 0.15)).
+_MIN_RESULT_SIZE_CHARS: int = 8_000
+_MIN_TURN_BUDGET_CHARS: int = 16_000
+
+
+def budget_for_context_window(context_length: int | None) -> BudgetConfig:
+    """Return a BudgetConfig scaled to the active model's context window.
+
+    The fixed defaults (100K result / 200K turn chars) suit large models, but on
+    a 65K-token model one 100K-char result, or a 200K-char turn (~50K tokens),
+    can alone approach or exceed the window (#23767). Each cap is a fraction of
+    the window in chars, capped at the default and floored for tiny windows.
+
+    ``context_length`` is the window in tokens. ``None``, 0 and negative values
+    return the shared ``DEFAULT_BUDGET`` instance; anything else yields a new
+    ``BudgetConfig`` whose per-result and per-turn caps are scaled and whose
+    preview size stays at ``DEFAULT_PREVIEW_SIZE_CHARS``. Because the scaled
+    values are capped, large models keep the historical 100K/200K caps.
+    """
+    if not context_length or context_length <= 0:  # None, 0, or negative
+        return DEFAULT_BUDGET
+
+    window_chars = context_length * _CHARS_PER_TOKEN
+    per_result = int(window_chars * _PER_RESULT_WINDOW_FRACTION)
+    per_turn = int(window_chars * _PER_TURN_WINDOW_FRACTION)
+
+    # Clamp: never exceed the historical defaults (so large models are
+    # unchanged), never drop below the floor (so tiny models stay usable).
+    per_result = max(_MIN_RESULT_SIZE_CHARS, min(per_result, DEFAULT_RESULT_SIZE_CHARS))
+    per_turn = max(_MIN_TURN_BUDGET_CHARS, min(per_turn, DEFAULT_TURN_BUDGET_CHARS))
+
+    return BudgetConfig(
+        default_result_size=per_result,
+        turn_budget=per_turn,
+        preview_size=DEFAULT_PREVIEW_SIZE_CHARS,
+    )