mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
The tool-result persistence budget was a fixed 100K chars/result and 200K chars/turn regardless of the active model. On a small-context model (e.g. a 65K-token local model switched into mid-session) a single large tool result (reporter: a 279K-char search result) or a full 200K-char turn (~50K tokens) could by itself approach or exceed the window, forcing an oversized request that the provider rejects as "Prompt too long". - budget_config.budget_for_context_window() scales per-result/per-turn char caps to a fraction of the model window, clamped to the historical 100K/200K defaults (large models unchanged) and floored so small models stay usable. - resolve_threshold() now caps the per-tool registry value at default_result_size so tools that register a fixed 100K cap (web/terminal/x_search) don't re-inflate a scaled-down budget. No-op for the default budget (both 100K). - tool_executor wires the agent's live context_length (recomputed on model switch) into all four persist/turn-budget call sites. read_file stays inf-pinned (no persist loop). Verified E2E: a 279K-char result against a 65K model collapses to a ~1.6K preview; a 200K model is byte-identical to today.
114 lines
4.9 KiB
Python
114 lines
4.9 KiB
Python
"""Configurable budget constants for tool result persistence.
|
|
|
|
Per-tool resolution: pinned > config overrides > registry > default.
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict
|
|
|
|
# Thresholds that are fixed per tool and may never be overridden by config or
# by the registry. read_file is pinned to infinity so its output is never
# persisted, which prevents an endless persist -> read -> persist loop.
PINNED_THRESHOLDS: Dict[str, float] = {"read_file": float("inf")}

# Default limits, mirroring the values hardcoded in tool_result_storage.py.
# This module is the single source of truth; tool_result_storage.py imports
# these names.
DEFAULT_RESULT_SIZE_CHARS: int = 100_000  # per-result threshold, in chars
DEFAULT_TURN_BUDGET_CHARS: int = 200_000  # per-turn aggregate budget, in chars
DEFAULT_PREVIEW_SIZE_CHARS: int = 1_500  # inline preview size, in chars
|
|
|
|
|
|
@dataclass(frozen=True)
class BudgetConfig:
    """Frozen bundle of size limits for the 3-layer tool result persistence system.

    How the fields map onto the layers:

    * ``default_result_size`` -- Layer 2 (per-result): the threshold, in
      chars, used when nothing more specific applies, and the upper cap for
      registry-supplied thresholds. ``resolve_threshold(tool_name)`` yields
      the effective per-tool value.
    * ``turn_budget`` -- Layer 3 (per-turn): aggregate char budget across all
      tool results in a single assistant turn.
    * ``preview_size`` -- size of the inline snippet kept after persistence.
    * ``tool_overrides`` -- explicit per-tool thresholds; they outrank the
      registry and the default, but not the pinned tools.
    """

    default_result_size: int = DEFAULT_RESULT_SIZE_CHARS
    turn_budget: int = DEFAULT_TURN_BUDGET_CHARS
    preview_size: int = DEFAULT_PREVIEW_SIZE_CHARS
    tool_overrides: Dict[str, int] = field(default_factory=dict)

    def resolve_threshold(self, tool_name: str) -> int | float:
        """Return the persistence threshold, in chars, for ``tool_name``.

        Lookup order: pinned -> tool_overrides -> registry per-tool -> default.

        A value coming from the registry is capped at ``default_result_size``.
        Tools such as web/terminal/x_search register a fixed 100K
        ``max_result_size_chars``; without the cap they would push a
        context-scaled (small-model) budget back up past the model's window
        (#23767). With the default budget the cap changes nothing because
        both sides are 100K. An infinite registry value is passed through
        uncapped.
        """
        # Pinned thresholds win outright; explicit overrides come second.
        for table in (PINNED_THRESHOLDS, self.tool_overrides):
            if tool_name in table:
                return table[tool_name]

        # Imported at call time, as in the original implementation.
        from tools.registry import registry

        ceiling = self.default_result_size
        registered = registry.get_max_result_size(tool_name, default=ceiling)
        # "Unlimited" stays unlimited; anything finite is capped by the budget.
        return registered if registered == float("inf") else min(registered, ceiling)
|
|
|
|
|
|
# Shared budget built purely from the module defaults, so it reproduces the
# historical hardcoded behavior exactly.
DEFAULT_BUDGET: BudgetConfig = BudgetConfig()
|
|
|
|
|
|
# Token <-> char conversion used when scaling the budget to a model's context
# window. Assuming more chars per token would yield a larger char budget and
# UNDER-protect small models, so we stay with the same rough
# 4-chars-per-token ratio the estimator uses (agent/model_metadata.py).
_CHARS_PER_TOKEN: int = 4

# Share of the model's context window that a SINGLE tool result may occupy
# before it is persisted/truncated, and the share that the WHOLE turn's tool
# output may occupy. Tool output competes with the system prompt, tool
# schemas, conversation history and the model's own reply, so both fractions
# stay well under 1.0.
_PER_RESULT_WINDOW_FRACTION: float = 0.15
_PER_TURN_WINDOW_FRACTION: float = 0.30

# Lower bounds, so that even a tiny-but-admitted model still gets a usable
# preview/result instead of a 0-char budget.
_MIN_RESULT_SIZE_CHARS: int = 8_000
_MIN_TURN_BUDGET_CHARS: int = 16_000
|
|
|
|
|
|
def budget_for_context_window(context_length: int | None) -> BudgetConfig:
    """Build a BudgetConfig sized for the active model's context window.

    The fixed defaults (100K chars per result, 200K chars per turn) suit large
    (200K+ token) models but ignore small ones: on a 65K-token model a single
    result persisted at the 100K-char threshold, or a full 200K-char turn
    (~50K tokens), can by itself approach or exceed the whole window and force
    an oversized request (#23767).

    Both caps are therefore derived as a fraction of the window (converted to
    chars) and then clamped:

    * never above the historical defaults, so large models keep exactly the
      limits they have today;
    * never below a fixed floor, so small models still get a usable preview.

    ``context_length`` is the window size in tokens. ``None``, zero, or a
    negative value means there is nothing to scale against, and the shared
    ``DEFAULT_BUDGET`` is returned as-is.
    """
    # Unknown or nonsensical window: fall back to the unscaled defaults.
    if not context_length:
        return DEFAULT_BUDGET
    if context_length <= 0:
        return DEFAULT_BUDGET

    total_chars = context_length * _CHARS_PER_TOKEN

    # One (window fraction, floor, ceiling) rule per cap: result first, turn second.
    scaling_rules = (
        (_PER_RESULT_WINDOW_FRACTION, _MIN_RESULT_SIZE_CHARS, DEFAULT_RESULT_SIZE_CHARS),
        (_PER_TURN_WINDOW_FRACTION, _MIN_TURN_BUDGET_CHARS, DEFAULT_TURN_BUDGET_CHARS),
    )
    # Scale, cap at the ceiling, then lift to the floor.
    result_cap, turn_cap = (
        max(floor, min(int(total_chars * fraction), ceiling))
        for fraction, floor, ceiling in scaling_rules
    )

    return BudgetConfig(
        default_result_size=result_cap,
        turn_budget=turn_cap,
        preview_size=DEFAULT_PREVIEW_SIZE_CHARS,
    )
|