hermes-agent/tools/budget_config.py
kshitijk4poor 1965d56219 fix(agent): scale tool-output budget to the model context window (#23767)
The tool-result persistence budget was a fixed 100K chars/result and 200K
chars/turn regardless of the active model. On a small-context model (e.g. a
65K-token local model switched into mid-session) a single large tool result
(reporter: a 279K-char search result) or a full 200K-char turn (~50K tokens)
could by itself approach or exceed the window, forcing an oversized request
that the provider rejects as "Prompt too long".

- budget_config.budget_for_context_window() scales per-result/per-turn char
  caps to a fraction of the model window, clamped to the historical 100K/200K
  defaults (large models unchanged) and floored so small models stay usable.
- resolve_threshold() now caps the per-tool registry value at default_result_size
  so tools that register a fixed 100K cap (web/terminal/x_search) don't re-inflate
  a scaled-down budget. No-op for the default budget (both 100K).
- tool_executor wires the agent's live context_length (recomputed on model
  switch) into all four persist/turn-budget call sites.

read_file stays inf-pinned (no persist loop). Verified E2E: a 279K-char result
against a 65K model collapses to a ~1.6K preview; a 200K model is byte-identical
to today.
2026-06-21 17:46:38 +05:30

114 lines
4.9 KiB
Python

"""Configurable budget constants for tool result persistence.
Per-tool resolution: pinned > config overrides > registry > default.
"""
from dataclasses import dataclass, field
from typing import Dict
# Per-tool thresholds that no other source (config override, registry value,
# default) is allowed to replace.
# read_file is pinned to infinity: persisting its output would lead to an
# endless persist -> read -> persist cycle.
PINNED_THRESHOLDS: Dict[str, float] = {"read_file": float("inf")}
# Defaults matching the current hardcoded values in tool_result_storage.py.
# Kept here as the single source of truth; tool_result_storage.py imports these.
# All three values are measured in characters.
# Per-result persistence threshold.
DEFAULT_RESULT_SIZE_CHARS: int = 100_000
# Aggregate budget across all tool results in a single assistant turn.
DEFAULT_TURN_BUDGET_CHARS: int = 200_000
# Size of the inline snippet kept after a result is persisted.
DEFAULT_PREVIEW_SIZE_CHARS: int = 1_500
@dataclass(frozen=True)
class BudgetConfig:
    """Frozen set of size limits for the 3-layer tool result persistence system.

    Fields (all sizes in chars):
        default_result_size: Layer 2 per-result threshold used when a tool
            has no pinned or overridden value; also the ceiling applied to
            finite registry values (see ``resolve_threshold``).
        turn_budget: Layer 3 aggregate budget across all tool results in a
            single assistant turn.
        preview_size: inline snippet size kept after persistence.
        tool_overrides: per-tool thresholds that take precedence over the
            registry and the default.
    """

    default_result_size: int = DEFAULT_RESULT_SIZE_CHARS
    turn_budget: int = DEFAULT_TURN_BUDGET_CHARS
    preview_size: int = DEFAULT_PREVIEW_SIZE_CHARS
    tool_overrides: Dict[str, int] = field(default_factory=dict)

    def resolve_threshold(self, tool_name: str) -> int | float:
        """Return the persistence threshold, in chars, for ``tool_name``.

        Lookup order: pinned -> tool_overrides -> registry per-tool -> default.

        Pinned and overridden values are returned untouched. A value coming
        from the registry is limited to ``default_result_size`` so that a
        context-scaled (small model) budget really does constrain tools that
        register a large fixed ``max_result_size_chars`` (web/terminal/x_search
        all register 100K). With the default budget both numbers are 100K and
        the limit changes nothing; with a scaled-down budget it stops the
        registry value from pushing the cap back up past the model's window
        (#23767). An infinite registry value is passed through uncapped.
        """
        # Explicit tables win outright; the pinned table is consulted first.
        for table in (PINNED_THRESHOLDS, self.tool_overrides):
            if tool_name in table:
                return table[tool_name]

        # Function-scope import: the registry is only needed on this path.
        from tools.registry import registry

        ceiling = self.default_result_size
        registered = registry.get_max_result_size(tool_name, default=ceiling)
        if registered == float("inf"):
            return registered
        return min(registered, ceiling)
# Shared default config built from the DEFAULT_* constants; matches the current
# hardcoded behavior exactly.
DEFAULT_BUDGET = BudgetConfig()
# Token<->char conversion used when scaling the budget to a model's context
# window. A more generous ratio (more chars per token) would produce a larger
# char budget and UNDER-protect small models, so we use the same rough
# 4-chars-per-token ratio the estimator uses (agent/model_metadata.py).
_CHARS_PER_TOKEN: int = 4
# Fraction of a model's context window we allow a SINGLE tool result to occupy
# before persisting/truncating it, and the fraction the WHOLE turn's tool
# output may occupy. Tool output is not the only thing in the window (system
# prompt, tool schemas, conversation history, and the model's own reply all
# compete), so these stay well under 1.0.
_PER_RESULT_WINDOW_FRACTION: float = 0.15
_PER_TURN_WINDOW_FRACTION: float = 0.30
# Floors (in chars) so even a tiny-but-admitted model still gets a usable
# preview/result rather than a 0-char budget.
_MIN_RESULT_SIZE_CHARS: int = 8_000
_MIN_TURN_BUDGET_CHARS: int = 16_000
def budget_for_context_window(context_length: int | None) -> BudgetConfig:
    """Build a BudgetConfig sized for the active model's context window.

    The fixed defaults (100K chars per result, 200K chars per turn) suit
    large (200K+ token) models but ignore small ones: on a 65K-token model a
    single result held at the 100K-char threshold, or a 200K-char turn budget
    (~50K tokens), can approach or exceed the entire window on its own and
    force an oversized request (#23767).

    Each limit is a fixed fraction of the window expressed in chars, then
    clamped from above by the historical default (so large models behave
    exactly as before) and from below by a floor (so a usable preview always
    survives on small models).

    Args:
        context_length: the model's context window in tokens. A falsy or
            non-positive value means "unknown" and yields ``DEFAULT_BUDGET``.

    Returns:
        ``DEFAULT_BUDGET`` when the window is unknown, otherwise a new
        ``BudgetConfig`` with scaled ``default_result_size`` and
        ``turn_budget``, the default preview size, and no tool overrides.
    """
    if not context_length or context_length <= 0:
        return DEFAULT_BUDGET

    window_chars = context_length * _CHARS_PER_TOKEN

    def _share(fraction: float, floor: int, cap: int) -> int:
        # Proportional slice of the window, held between floor and cap.
        return max(floor, min(int(window_chars * fraction), cap))

    return BudgetConfig(
        default_result_size=_share(
            _PER_RESULT_WINDOW_FRACTION,
            _MIN_RESULT_SIZE_CHARS,
            DEFAULT_RESULT_SIZE_CHARS,
        ),
        turn_budget=_share(
            _PER_TURN_WINDOW_FRACTION,
            _MIN_TURN_BUDGET_CHARS,
            DEFAULT_TURN_BUDGET_CHARS,
        ),
        preview_size=DEFAULT_PREVIEW_SIZE_CHARS,
    )