mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-24 10:52:21 +00:00
The tool-result persistence budget was a fixed 100K chars/result and 200K chars/turn regardless of the active model. On a small-context model (e.g. a 65K-token local model switched to mid-session), a single large tool result (reporter: a 279K-char search result) or a full 200K-char turn (~50K tokens) could by itself approach or exceed the window, forcing an oversized request that the provider rejects as "Prompt too long".

- budget_config.budget_for_context_window() scales per-result/per-turn char caps to a fraction of the model window, clamped to the historical 100K/200K defaults (large models unchanged) and floored so small models stay usable.
- resolve_threshold() now caps the per-tool registry value at default_result_size so tools that register a fixed 100K cap (web/terminal/x_search) don't re-inflate a scaled-down budget. No-op for the default budget (both 100K).
- tool_executor wires the agent's live context_length (recomputed on model switch) into all four persist/turn-budget call sites. read_file stays inf-pinned (no persist loop).

Verified E2E: a 279K-char result against a 65K model collapses to a ~1.6K preview; a 200K model is byte-identical to today.
257 lines
9.6 KiB
Python
257 lines
9.6 KiB
Python
"""Unit tests for tools/budget_config.py.
|
||
|
||
Covers default values, resolve_threshold() priority chain
|
||
(pinned > tool_overrides > registry > default), immutability,
|
||
the PINNED_THRESHOLDS escape-hatch for read_file, and context-window
budget scaling via budget_for_context_window().
|
||
"""
|
||
|
||
import dataclasses
|
||
import math
|
||
from unittest.mock import patch
|
||
|
||
import pytest
|
||
|
||
from tools.budget_config import (
|
||
DEFAULT_BUDGET,
|
||
DEFAULT_PREVIEW_SIZE_CHARS,
|
||
DEFAULT_RESULT_SIZE_CHARS,
|
||
DEFAULT_TURN_BUDGET_CHARS,
|
||
PINNED_THRESHOLDS,
|
||
BudgetConfig,
|
||
budget_for_context_window,
|
||
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Module-level constants
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestModuleConstants:
    """Guard the documented module-level defaults against accidental drift."""

    def test_default_result_size(self):
        """The per-result default is 100_000 characters."""
        expected = 100_000
        assert DEFAULT_RESULT_SIZE_CHARS == expected

    def test_default_turn_budget(self):
        """The per-turn default is 200_000 characters."""
        expected = 200_000
        assert DEFAULT_TURN_BUDGET_CHARS == expected

    def test_default_preview_size(self):
        """The preview default is 1_500 characters."""
        expected = 1_500
        assert DEFAULT_PREVIEW_SIZE_CHARS == expected
|
||
|
||
|
||
class TestPinnedThresholds:
    """PINNED_THRESHOLDS holds tools whose thresholds must never be overridden."""

    def test_read_file_is_inf(self):
        """read_file is pinned to positive infinity."""
        pinned_value = PINNED_THRESHOLDS["read_file"]
        assert pinned_value == float("inf")
        assert math.isinf(pinned_value)

    def test_pinned_is_not_empty(self):
        """At least one tool is pinned."""
        assert len(PINNED_THRESHOLDS) > 0
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# BudgetConfig defaults
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestBudgetConfigDefaults:
    """A no-argument BudgetConfig() mirrors the module-level defaults."""

    def test_default_result_size(self):
        """default_result_size falls back to DEFAULT_RESULT_SIZE_CHARS."""
        assert BudgetConfig().default_result_size == DEFAULT_RESULT_SIZE_CHARS

    def test_default_turn_budget(self):
        """turn_budget falls back to DEFAULT_TURN_BUDGET_CHARS."""
        assert BudgetConfig().turn_budget == DEFAULT_TURN_BUDGET_CHARS

    def test_default_preview_size(self):
        """preview_size falls back to DEFAULT_PREVIEW_SIZE_CHARS."""
        assert BudgetConfig().preview_size == DEFAULT_PREVIEW_SIZE_CHARS

    def test_default_tool_overrides_empty(self):
        """No per-tool overrides are configured out of the box."""
        overrides = BudgetConfig().tool_overrides
        assert overrides == {}

    def test_default_budget_singleton_matches(self):
        """DEFAULT_BUDGET compares equal to a freshly built BudgetConfig."""
        fresh = BudgetConfig()
        assert DEFAULT_BUDGET == fresh
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Immutability (frozen=True)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestBudgetConfigFrozen:
    """Assigning to any BudgetConfig field raises FrozenInstanceError."""

    def test_cannot_set_default_result_size(self):
        """default_result_size is read-only."""
        frozen = BudgetConfig()
        with pytest.raises(dataclasses.FrozenInstanceError):
            frozen.default_result_size = 999

    def test_cannot_set_turn_budget(self):
        """turn_budget is read-only."""
        frozen = BudgetConfig()
        with pytest.raises(dataclasses.FrozenInstanceError):
            frozen.turn_budget = 999

    def test_cannot_set_preview_size(self):
        """preview_size is read-only."""
        frozen = BudgetConfig()
        with pytest.raises(dataclasses.FrozenInstanceError):
            frozen.preview_size = 999

    def test_cannot_set_tool_overrides(self):
        """tool_overrides cannot be rebound to a new mapping."""
        frozen = BudgetConfig()
        with pytest.raises(dataclasses.FrozenInstanceError):
            frozen.tool_overrides = {"foo": 1}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Custom construction
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestBudgetConfigCustom:
    """BudgetConfig accepts and stores non-default values."""

    def test_custom_values(self):
        """Every explicitly supplied field is readable back unchanged."""
        overrides = {"my_tool": 42}

        custom = BudgetConfig(
            default_result_size=50_000,
            turn_budget=100_000,
            preview_size=500,
            tool_overrides=overrides,
        )

        assert custom.default_result_size == 50_000
        assert custom.turn_budget == 100_000
        assert custom.preview_size == 500
        assert custom.tool_overrides == {"my_tool": 42}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# resolve_threshold() priority chain
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestResolveThreshold:
    """resolve_threshold() precedence: pinned > tool_overrides > registry > default."""

    def test_pinned_wins_over_override(self):
        """A tool_overrides entry for read_file cannot displace the pinned value."""
        overridden = BudgetConfig(tool_overrides={"read_file": 1})
        assert overridden.resolve_threshold("read_file") == float("inf")

    def test_tool_override_wins_over_default(self):
        """A tool_overrides entry is returned before the registry is consulted."""
        overridden = BudgetConfig(tool_overrides={"my_tool": 42})
        assert overridden.resolve_threshold("my_tool") == 42

    @patch("tools.registry.registry")
    def test_falls_back_to_registry(self, mock_registry):
        """A tool that is neither pinned nor overridden is looked up in the registry."""
        lookup = mock_registry.get_max_result_size
        lookup.return_value = 77_777

        threshold = BudgetConfig().resolve_threshold("some_tool")

        lookup.assert_called_once_with(
            "some_tool", default=DEFAULT_RESULT_SIZE_CHARS
        )
        assert threshold == 77_777

    @patch("tools.registry.registry")
    def test_registry_receives_custom_default(self, mock_registry):
        """A custom default_result_size is forwarded as the registry's default."""
        lookup = mock_registry.get_max_result_size
        lookup.return_value = 50_000

        BudgetConfig(default_result_size=50_000).resolve_threshold("unknown_tool")

        lookup.assert_called_once_with("unknown_tool", default=50_000)

    def test_pinned_read_file_returns_inf(self):
        """Canonical case: read_file resolves to inf on a default config."""
        threshold = BudgetConfig().resolve_threshold("read_file")
        assert threshold == float("inf")

    @patch("tools.registry.registry")
    def test_registry_value_capped_at_default(self, mock_registry):
        """An oversized registry value is capped by a scaled-down budget (#23767).

        web/terminal/x_search register max_result_size_chars=100_000; that
        fixed value must not re-inflate a small model's scaled budget.
        """
        mock_registry.get_max_result_size.return_value = 100_000
        scaled = BudgetConfig(default_result_size=30_000)

        threshold = scaled.resolve_threshold("web_search")

        assert threshold == 30_000

    @patch("tools.registry.registry")
    def test_registry_inf_not_capped(self, mock_registry):
        """An inf registry value (e.g. a future pinned-like tool) is preserved."""
        mock_registry.get_max_result_size.return_value = float("inf")
        scaled = BudgetConfig(default_result_size=30_000)

        threshold = scaled.resolve_threshold("some_tool")

        assert threshold == float("inf")

    @patch("tools.registry.registry")
    def test_default_budget_unchanged_for_100k_tool(self, mock_registry):
        """The default budget leaves a 100K registry tool at 100K (no behavior change)."""
        mock_registry.get_max_result_size.return_value = 100_000
        default_cfg = BudgetConfig()  # default_result_size == 100_000

        threshold = default_cfg.resolve_threshold("web_search")

        assert threshold == 100_000
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# budget_for_context_window() — context-aware scaling (#23767)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestBudgetForContextWindow:
    """budget_for_context_window() sizes the tool-output budget to the model window."""

    def test_none_returns_default(self):
        """An unknown (None) window yields the DEFAULT_BUDGET object itself."""
        assert budget_for_context_window(None) is DEFAULT_BUDGET

    def test_zero_or_negative_returns_default(self):
        """Non-positive windows yield the DEFAULT_BUDGET object itself."""
        for window in (0, -5):
            assert budget_for_context_window(window) is DEFAULT_BUDGET

    def test_large_model_unchanged(self):
        """A 200K-token model keeps the historical 100K/200K char defaults."""
        scaled = budget_for_context_window(200_000)
        assert scaled.default_result_size == DEFAULT_RESULT_SIZE_CHARS
        assert scaled.turn_budget == DEFAULT_TURN_BUDGET_CHARS

    def test_very_large_model_still_capped_at_default(self):
        """A 1M-token model is capped at the historical defaults."""
        scaled = budget_for_context_window(1_000_000)
        assert scaled.default_result_size == DEFAULT_RESULT_SIZE_CHARS
        assert scaled.turn_budget == DEFAULT_TURN_BUDGET_CHARS

    def test_small_model_scaled_down(self):
        """A 65K-token model gets a budget proportional to its window.

        window_chars = 65_536*4 = 262_144; per_result = 15% = 39_321;
        per_turn = 30% = 78_643. Both sit below the 100K/200K defaults.
        """
        context_tokens = 65_536

        scaled = budget_for_context_window(context_tokens)

        assert scaled.default_result_size < DEFAULT_RESULT_SIZE_CHARS
        assert scaled.turn_budget < DEFAULT_TURN_BUDGET_CHARS
        assert scaled.default_result_size == int(context_tokens * 4 * 0.15)
        assert scaled.turn_budget == int(context_tokens * 4 * 0.30)

    def test_tiny_model_floored(self):
        """A tiny window cannot push the budget below the floor."""
        scaled = budget_for_context_window(8_000)
        assert scaled.default_result_size >= 8_000
        assert scaled.turn_budget >= 16_000

    def test_scaled_budget_constrains_oversized_result(self):
        """A 279K-char result exceeds a 65K model's scaled per-result threshold.

        It will therefore be persisted/truncated rather than sent whole.
        """
        oversized_chars = 279_549
        scaled = budget_for_context_window(65_536)

        per_tool_cap = scaled.resolve_threshold("mcp_firecrawl_firecrawl_search")

        assert per_tool_cap < oversized_chars
        assert scaled.default_result_size < oversized_chars
|