diff --git a/environments/agent_loop.py b/environments/agent_loop.py index ba2db0b57..cbf9c7742 100644 --- a/environments/agent_loop.py +++ b/environments/agent_loop.py @@ -140,6 +140,7 @@ class HermesAgentLoop: temperature: float = 1.0, max_tokens: Optional[int] = None, extra_body: Optional[Dict[str, Any]] = None, + budget_config: Optional["BudgetConfig"] = None, ): """ Initialize the agent loop. @@ -156,7 +157,11 @@ class HermesAgentLoop: extra_body: Extra parameters passed to the OpenAI client's create() call. Used for OpenRouter provider preferences, transforms, etc. e.g. {"provider": {"ignore": ["DeepInfra"]}} + budget_config: Tool result persistence budget. Controls per-tool + thresholds, per-turn aggregate budget, and preview size. + If None, uses DEFAULT_BUDGET (current hardcoded values). """ + from tools.budget_config import DEFAULT_BUDGET self.server = server self.tool_schemas = tool_schemas self.valid_tool_names = valid_tool_names @@ -165,6 +170,7 @@ class HermesAgentLoop: self.temperature = temperature self.max_tokens = max_tokens self.extra_body = extra_body + self.budget_config = budget_config or DEFAULT_BUDGET async def run(self, messages: List[Dict[str, Any]]) -> AgentResult: """ @@ -455,6 +461,8 @@ class HermesAgentLoop: tool_name=tool_name, tool_use_id=tc_id, env=get_active_env(self.task_id), + threshold=self.budget_config.resolve_threshold(tool_name), + preview_size=self.budget_config.preview_size, ) except Exception: pass # Persistence is best-effort in eval path @@ -470,7 +478,12 @@ class HermesAgentLoop: try: num_tcs = len(assistant_msg.tool_calls) if num_tcs > 0: - enforce_turn_budget(messages[-num_tcs:], env=get_active_env(self.task_id)) + enforce_turn_budget( + messages[-num_tcs:], + env=get_active_env(self.task_id), + budget=self.budget_config.turn_budget, + preview_size=self.budget_config.preview_size, + ) except Exception: pass diff --git a/environments/agentic_opd_env.py b/environments/agentic_opd_env.py index b96271237..44311f551 100644 --- a/environments/agentic_opd_env.py +++ b/environments/agentic_opd_env.py @@ -1048,6 +1048,7 @@ class AgenticOPDEnv(HermesAgentBaseEnv): temperature=0.0, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py index 2f0d9262f..c7eaff6c4 100644 --- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py +++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py @@ -541,6 +541,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): temperature=self.config.agent_temperature, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) else: @@ -553,6 +554,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): temperature=self.config.agent_temperature, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) diff --git a/environments/benchmarks/yc_bench/yc_bench_env.py b/environments/benchmarks/yc_bench/yc_bench_env.py index 5b6bf9ad3..4247ae56c 100644 --- a/environments/benchmarks/yc_bench/yc_bench_env.py +++ b/environments/benchmarks/yc_bench/yc_bench_env.py @@ -549,6 +549,7 @@ class YCBenchEvalEnv(HermesAgentBaseEnv): temperature=self.config.agent_temperature, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) diff --git a/environments/hermes_base_env.py b/environments/hermes_base_env.py index 651722ff1..ededab355 100644 --- a/environments/hermes_base_env.py +++ b/environments/hermes_base_env.py @@ -62,6 +62,11 @@ from atroposlib.type_definitions import Item from environments.agent_loop import AgentResult, HermesAgentLoop from environments.tool_context import ToolContext +from tools.budget_config import ( + DEFAULT_RESULT_SIZE_CHARS, + DEFAULT_TURN_BUDGET_CHARS, + DEFAULT_PREVIEW_SIZE_CHARS, +) # Import hermes-agent toolset infrastructure from model_tools import get_tool_definitions @@ -160,6 +165,32 @@ class HermesAgentEnvConfig(BaseEnvConfig): "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.", ) + # --- Tool result budget --- + # Defaults imported from tools.budget_config (single source of truth). + default_result_size_chars: int = Field( + default=DEFAULT_RESULT_SIZE_CHARS, + description="Default per-tool threshold (chars) for persisting large results " + "to sandbox. Results exceeding this are written to /tmp/hermes-results/ " + "and replaced with a preview. Per-tool registry values take precedence " + "unless overridden via tool_result_overrides.", + ) + turn_budget_chars: int = Field( + default=DEFAULT_TURN_BUDGET_CHARS, + description="Aggregate char budget per assistant turn. If all tool results " + "in a single turn exceed this, the largest are persisted to disk first.", + ) + preview_size_chars: int = Field( + default=DEFAULT_PREVIEW_SIZE_CHARS, + description="Size of the inline preview shown after a tool result is persisted.", + ) + tool_result_overrides: Optional[Dict[str, int]] = Field( + default=None, + description="Per-tool threshold overrides (chars). Keys are tool names, " + "values are char thresholds. Overrides both the default and registry " + "per-tool values. Example: {'terminal': 10000, 'search_files': 5000}. " + "Note: read_file is pinned to infinity and cannot be overridden.", + ) + # --- Provider-specific parameters --- # Passed as extra_body to the OpenAI client's chat.completions.create() call. # Useful for OpenRouter provider preferences, transforms, route settings, etc. @@ -176,6 +207,16 @@ class HermesAgentEnvConfig(BaseEnvConfig): "transforms, and other provider-specific settings.", ) + def build_budget_config(self): + """Build a BudgetConfig from env config fields.""" + from tools.budget_config import BudgetConfig + return BudgetConfig( + default_result_size=self.default_result_size_chars, + turn_budget=self.turn_budget_chars, + preview_size=self.preview_size_chars, + tool_overrides=dict(self.tool_result_overrides) if self.tool_result_overrides else {}, + ) + class HermesAgentBaseEnv(BaseEnv): """ @@ -490,6 +531,7 @@ class HermesAgentBaseEnv(BaseEnv): temperature=self.config.agent_temperature, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) except NotImplementedError: @@ -507,6 +549,7 @@ class HermesAgentBaseEnv(BaseEnv): temperature=self.config.agent_temperature, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) else: @@ -520,6 +563,7 @@ class HermesAgentBaseEnv(BaseEnv): temperature=self.config.agent_temperature, max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) diff --git a/environments/web_research_env.py b/environments/web_research_env.py index b234159f0..c637a7cbe 100644 --- a/environments/web_research_env.py +++ b/environments/web_research_env.py @@ -472,6 +472,7 @@ class WebResearchEnv(HermesAgentBaseEnv): temperature=0.0, # Deterministic for eval max_tokens=self.config.max_token_length, extra_body=self.config.extra_body, + budget_config=self.config.build_budget_config(), ) result = await agent.run(messages) diff --git a/tools/budget_config.py b/tools/budget_config.py new file mode 100644 index 000000000..52204cdf8 --- /dev/null +++ b/tools/budget_config.py @@ -0,0 +1,52 @@ +"""Configurable budget constants for tool result persistence. + +Overridable at the RL environment level via HermesAgentEnvConfig fields. +Per-tool resolution: pinned > config overrides > registry > default. +""" + +from dataclasses import dataclass, field +from typing import Dict + +# Tools whose thresholds must never be overridden. +# read_file=inf prevents infinite persist->read->persist loops. +PINNED_THRESHOLDS: Dict[str, float] = { + "read_file": float("inf"), +} + +# Defaults matching the current hardcoded values in tool_result_storage.py. +# Kept here as the single source of truth; tool_result_storage.py imports these. +DEFAULT_RESULT_SIZE_CHARS: int = 50_000 +DEFAULT_TURN_BUDGET_CHARS: int = 200_000 +DEFAULT_PREVIEW_SIZE_CHARS: int = 2_000 + + +@dataclass(frozen=True) +class BudgetConfig: + """Immutable budget constants for the 3-layer tool result persistence system. + + Layer 2 (per-result): resolve_threshold(tool_name) -> threshold in chars. + Layer 3 (per-turn): turn_budget -> aggregate char budget across all tool + results in a single assistant turn. + Preview: preview_size -> inline snippet size after persistence. + """ + + default_result_size: int = DEFAULT_RESULT_SIZE_CHARS + turn_budget: int = DEFAULT_TURN_BUDGET_CHARS + preview_size: int = DEFAULT_PREVIEW_SIZE_CHARS + tool_overrides: Dict[str, int] = field(default_factory=dict) + + def resolve_threshold(self, tool_name: str) -> int | float: + """Resolve the persistence threshold for a tool. + + Priority: pinned -> tool_overrides -> registry per-tool -> default. + """ + if tool_name in PINNED_THRESHOLDS: + return PINNED_THRESHOLDS[tool_name] + if tool_name in self.tool_overrides: + return self.tool_overrides[tool_name] + from tools.registry import registry + return registry.get_max_result_size(tool_name, default=self.default_result_size) + + +# Default config -- matches current hardcoded behavior exactly. +DEFAULT_BUDGET = BudgetConfig() diff --git a/tools/registry.py b/tools/registry.py index c01c60c09..9437a6b41 100644 --- a/tools/registry.py +++ b/tools/registry.py @@ -169,12 +169,14 @@ class ToolRegistry: # Query helpers (replace redundant dicts in model_tools.py) # ------------------------------------------------------------------ - def get_max_result_size(self, name: str) -> int | float: - """Return per-tool max result size, or global default.""" - from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS + def get_max_result_size(self, name: str, default: int | float | None = None) -> int | float: + """Return per-tool max result size, or *default* (or global default).""" entry = self._tools.get(name) if entry and entry.max_result_size_chars is not None: return entry.max_result_size_chars + if default is not None: + return default + from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS return DEFAULT_MAX_RESULT_SIZE_CHARS def get_all_tool_names(self) -> List[str]: diff --git a/tools/tool_result_storage.py b/tools/tool_result_storage.py index c478431be..8b2abb918 100644 --- a/tools/tool_result_storage.py +++ b/tools/tool_result_storage.py @@ -24,11 +24,13 @@ import json import logging import uuid -logger = logging.getLogger(__name__) +from tools.budget_config import ( + DEFAULT_RESULT_SIZE_CHARS as DEFAULT_MAX_RESULT_SIZE_CHARS, + DEFAULT_TURN_BUDGET_CHARS as MAX_TURN_BUDGET_CHARS, + DEFAULT_PREVIEW_SIZE_CHARS as PREVIEW_SIZE_CHARS, +) -DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000 -MAX_TURN_BUDGET_CHARS: int = 200_000 -PREVIEW_SIZE_CHARS: int = 2_000 +logger = logging.getLogger(__name__) PERSISTED_OUTPUT_TAG = "" PERSISTED_OUTPUT_CLOSING_TAG = "" STORAGE_DIR = "/tmp/hermes-results" @@ -112,6 +114,7 @@ def maybe_persist_tool_result( tool_use_id: str, env=None, threshold: int | float | None = None, + preview_size: int = PREVIEW_SIZE_CHARS, ) -> str: """Layer 2: persist oversized result into the sandbox, return preview + path. @@ -125,6 +128,7 @@ def maybe_persist_tool_result( tool_use_id: Unique ID for this tool call (used as filename). env: The active BaseEnvironment instance, or None. threshold: Override threshold; if None, looked up from registry. + preview_size: Max chars for the inline preview after persistence. Returns: Original content if small, or replacement. @@ -143,7 +147,7 @@ def maybe_persist_tool_result( remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt" # Write raw output (not JSON wrapper) so read_file returns readable text file_content = _extract_raw_output(content) - preview, has_more = generate_preview(file_content) + preview, has_more = generate_preview(file_content, max_chars=preview_size) # Try writing into the sandbox if env is not None: @@ -173,6 +177,7 @@ def enforce_turn_budget( tool_messages: list[dict], env=None, budget: int = MAX_TURN_BUDGET_CHARS, + preview_size: int = PREVIEW_SIZE_CHARS, ) -> list[dict]: """Layer 3: enforce aggregate budget across all tool results in a turn. @@ -210,6 +215,7 @@ def enforce_turn_budget( tool_use_id=tool_use_id, env=env, threshold=0, + preview_size=preview_size, ) if replacement != content: total_size -= size