diff --git a/environments/agent_loop.py b/environments/agent_loop.py
index ba2db0b57..cbf9c7742 100644
--- a/environments/agent_loop.py
+++ b/environments/agent_loop.py
@@ -140,6 +140,7 @@ class HermesAgentLoop:
temperature: float = 1.0,
max_tokens: Optional[int] = None,
extra_body: Optional[Dict[str, Any]] = None,
+ budget_config: Optional["BudgetConfig"] = None,
):
"""
Initialize the agent loop.
@@ -156,7 +157,11 @@ class HermesAgentLoop:
extra_body: Extra parameters passed to the OpenAI client's create() call.
Used for OpenRouter provider preferences, transforms, etc.
e.g. {"provider": {"ignore": ["DeepInfra"]}}
+ budget_config: Tool result persistence budget. Controls per-tool
+ thresholds, per-turn aggregate budget, and preview size.
+ If None, uses DEFAULT_BUDGET (current hardcoded values).
"""
+ from tools.budget_config import DEFAULT_BUDGET
self.server = server
self.tool_schemas = tool_schemas
self.valid_tool_names = valid_tool_names
@@ -165,6 +170,7 @@ class HermesAgentLoop:
self.temperature = temperature
self.max_tokens = max_tokens
self.extra_body = extra_body
+ self.budget_config = budget_config or DEFAULT_BUDGET
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
"""
@@ -455,6 +461,8 @@ class HermesAgentLoop:
tool_name=tool_name,
tool_use_id=tc_id,
env=get_active_env(self.task_id),
+ threshold=self.budget_config.resolve_threshold(tool_name),
+ preview_size=self.budget_config.preview_size,
)
except Exception:
pass # Persistence is best-effort in eval path
@@ -470,7 +478,12 @@ class HermesAgentLoop:
try:
num_tcs = len(assistant_msg.tool_calls)
if num_tcs > 0:
- enforce_turn_budget(messages[-num_tcs:], env=get_active_env(self.task_id))
+ enforce_turn_budget(
+ messages[-num_tcs:],
+ env=get_active_env(self.task_id),
+ budget=self.budget_config.turn_budget,
+ preview_size=self.budget_config.preview_size,
+ )
except Exception:
pass
diff --git a/environments/agentic_opd_env.py b/environments/agentic_opd_env.py
index b96271237..44311f551 100644
--- a/environments/agentic_opd_env.py
+++ b/environments/agentic_opd_env.py
@@ -1048,6 +1048,7 @@ class AgenticOPDEnv(HermesAgentBaseEnv):
temperature=0.0,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 2f0d9262f..c7eaff6c4 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -541,6 +541,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
else:
@@ -553,6 +554,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
diff --git a/environments/benchmarks/yc_bench/yc_bench_env.py b/environments/benchmarks/yc_bench/yc_bench_env.py
index 5b6bf9ad3..4247ae56c 100644
--- a/environments/benchmarks/yc_bench/yc_bench_env.py
+++ b/environments/benchmarks/yc_bench/yc_bench_env.py
@@ -549,6 +549,7 @@ class YCBenchEvalEnv(HermesAgentBaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
diff --git a/environments/hermes_base_env.py b/environments/hermes_base_env.py
index 651722ff1..ededab355 100644
--- a/environments/hermes_base_env.py
+++ b/environments/hermes_base_env.py
@@ -62,6 +62,11 @@ from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
+from tools.budget_config import (
+ DEFAULT_RESULT_SIZE_CHARS,
+ DEFAULT_TURN_BUDGET_CHARS,
+ DEFAULT_PREVIEW_SIZE_CHARS,
+)
# Import hermes-agent toolset infrastructure
from model_tools import get_tool_definitions
@@ -160,6 +165,32 @@ class HermesAgentEnvConfig(BaseEnvConfig):
"Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
)
+ # --- Tool result budget ---
+ # Defaults imported from tools.budget_config (single source of truth).
+ default_result_size_chars: int = Field(
+ default=DEFAULT_RESULT_SIZE_CHARS,
+ description="Default per-tool threshold (chars) for persisting large results "
+ "to sandbox. Results exceeding this are written to /tmp/hermes-results/ "
+ "and replaced with a preview. Per-tool registry values take precedence "
+ "unless overridden via tool_result_overrides.",
+ )
+ turn_budget_chars: int = Field(
+ default=DEFAULT_TURN_BUDGET_CHARS,
+ description="Aggregate char budget per assistant turn. If all tool results "
+ "in a single turn exceed this, the largest are persisted to disk first.",
+ )
+ preview_size_chars: int = Field(
+ default=DEFAULT_PREVIEW_SIZE_CHARS,
+ description="Size of the inline preview shown after a tool result is persisted.",
+ )
+ tool_result_overrides: Optional[Dict[str, int]] = Field(
+ default=None,
+ description="Per-tool threshold overrides (chars). Keys are tool names, "
+ "values are char thresholds. Overrides both the default and registry "
+ "per-tool values. Example: {'terminal': 10000, 'search_files': 5000}. "
+ "Note: read_file is pinned to infinity and cannot be overridden.",
+ )
+
# --- Provider-specific parameters ---
# Passed as extra_body to the OpenAI client's chat.completions.create() call.
# Useful for OpenRouter provider preferences, transforms, route settings, etc.
@@ -176,6 +207,16 @@ class HermesAgentEnvConfig(BaseEnvConfig):
"transforms, and other provider-specific settings.",
)
+    def build_budget_config(self):
+        """Translate this config's tool-result budget fields into a BudgetConfig.
+
+        The per-tool overrides mapping is copied; an unset or empty mapping
+        becomes ``{}``, so the returned object never aliases the config's dict.
+        """
+        from tools.budget_config import BudgetConfig
+
+        overrides = self.tool_result_overrides
+        return BudgetConfig(
+            default_result_size=self.default_result_size_chars,
+            turn_budget=self.turn_budget_chars,
+            preview_size=self.preview_size_chars,
+            tool_overrides=dict(overrides) if overrides else {},
+        )
+
class HermesAgentBaseEnv(BaseEnv):
"""
@@ -490,6 +531,7 @@ class HermesAgentBaseEnv(BaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
except NotImplementedError:
@@ -507,6 +549,7 @@ class HermesAgentBaseEnv(BaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
else:
@@ -520,6 +563,7 @@ class HermesAgentBaseEnv(BaseEnv):
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
diff --git a/environments/web_research_env.py b/environments/web_research_env.py
index b234159f0..c637a7cbe 100644
--- a/environments/web_research_env.py
+++ b/environments/web_research_env.py
@@ -472,6 +472,7 @@ class WebResearchEnv(HermesAgentBaseEnv):
temperature=0.0, # Deterministic for eval
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
+ budget_config=self.config.build_budget_config(),
)
result = await agent.run(messages)
diff --git a/tools/budget_config.py b/tools/budget_config.py
new file mode 100644
index 000000000..52204cdf8
--- /dev/null
+++ b/tools/budget_config.py
@@ -0,0 +1,52 @@
+"""Configurable budget constants for tool result persistence.
+
+Overridable at the RL environment level via HermesAgentEnvConfig fields.
+Per-tool resolution: pinned > config overrides > registry > default.
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict
+
+# Tools whose thresholds must never be overridden.
+# read_file=inf prevents infinite persist->read->persist loops.
+PINNED_THRESHOLDS: Dict[str, float] = {
+ "read_file": float("inf"),
+}
+
+# Defaults equal to the values previously hardcoded in tool_result_storage.py.
+# This module is now the single source of truth; tool_result_storage.py imports these.
+DEFAULT_RESULT_SIZE_CHARS: int = 50_000
+DEFAULT_TURN_BUDGET_CHARS: int = 200_000
+DEFAULT_PREVIEW_SIZE_CHARS: int = 2_000
+
+
+@dataclass(frozen=True)
+class BudgetConfig:
+    """Immutable budget constants for the 3-layer tool result persistence system.
+
+    Layer 2 (per-result): resolve_threshold(tool_name) -> threshold in chars.
+    Layer 3 (per-turn):   turn_budget -> aggregate char budget across all tool
+                          results in a single assistant turn.
+    Preview:              preview_size -> inline snippet size after persistence.
+
+    Attributes:
+        default_result_size: Threshold (chars) handed to the registry as the
+            fallback when a tool is neither pinned nor overridden.
+        turn_budget: Aggregate char budget for a single assistant turn.
+        preview_size: Size (chars) of the inline preview after persistence.
+        tool_overrides: Per-tool threshold overrides keyed by tool name.
+            Snapshotted at construction; ``None`` is treated as no overrides.
+    """
+
+    default_result_size: int = DEFAULT_RESULT_SIZE_CHARS
+    turn_budget: int = DEFAULT_TURN_BUDGET_CHARS
+    preview_size: int = DEFAULT_PREVIEW_SIZE_CHARS
+    tool_overrides: Dict[str, int] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Take a private copy of ``tool_overrides``."""
+        # Without the copy, mutating the dict the caller passed in would change
+        # thresholds on this supposedly frozen instance. frozen=True blocks
+        # normal attribute assignment, hence object.__setattr__.
+        object.__setattr__(self, "tool_overrides", dict(self.tool_overrides or {}))
+
+    def resolve_threshold(self, tool_name: str) -> int | float:
+        """Resolve the persistence threshold (in chars) for *tool_name*.
+
+        Priority: pinned -> tool_overrides -> registry per-tool -> default.
+
+        Args:
+            tool_name: Name of the tool whose result is being sized.
+
+        Returns:
+            The PINNED_THRESHOLDS entry if one exists, else the configured
+            override, else the value returned by the registry when given
+            ``default_result_size`` as its fallback.
+        """
+        if tool_name in PINNED_THRESHOLDS:
+            return PINNED_THRESHOLDS[tool_name]
+        if tool_name in self.tool_overrides:
+            return self.tool_overrides[tool_name]
+        # NOTE(review): imported at call time rather than at module top --
+        # presumably to avoid an import cycle with tools.registry; confirm
+        # before hoisting.
+        from tools.registry import registry
+        return registry.get_max_result_size(tool_name, default=self.default_result_size)
+
+
+# Default config: DEFAULT_* sizes and no per-tool overrides.
+DEFAULT_BUDGET = BudgetConfig()
diff --git a/tools/registry.py b/tools/registry.py
index c01c60c09..9437a6b41 100644
--- a/tools/registry.py
+++ b/tools/registry.py
@@ -169,12 +169,14 @@ class ToolRegistry:
# Query helpers (replace redundant dicts in model_tools.py)
# ------------------------------------------------------------------
- def get_max_result_size(self, name: str) -> int | float:
- """Return per-tool max result size, or global default."""
- from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
+ def get_max_result_size(self, name: str, default: int | float | None = None) -> int | float:
+ """Return the tool's registered max result size, else *default* if given, else the global default."""
entry = self._tools.get(name)
if entry and entry.max_result_size_chars is not None:
return entry.max_result_size_chars
+ if default is not None:
+ return default
+ from tools.tool_result_storage import DEFAULT_MAX_RESULT_SIZE_CHARS
return DEFAULT_MAX_RESULT_SIZE_CHARS
def get_all_tool_names(self) -> List[str]:
diff --git a/tools/tool_result_storage.py b/tools/tool_result_storage.py
index c478431be..8b2abb918 100644
--- a/tools/tool_result_storage.py
+++ b/tools/tool_result_storage.py
@@ -24,11 +24,13 @@ import json
import logging
import uuid
-logger = logging.getLogger(__name__)
+from tools.budget_config import (
+ DEFAULT_RESULT_SIZE_CHARS as DEFAULT_MAX_RESULT_SIZE_CHARS,
+ DEFAULT_TURN_BUDGET_CHARS as MAX_TURN_BUDGET_CHARS,
+ DEFAULT_PREVIEW_SIZE_CHARS as PREVIEW_SIZE_CHARS,
+)
-DEFAULT_MAX_RESULT_SIZE_CHARS: int = 50_000
-MAX_TURN_BUDGET_CHARS: int = 200_000
-PREVIEW_SIZE_CHARS: int = 2_000
+logger = logging.getLogger(__name__)
PERSISTED_OUTPUT_TAG = ""
PERSISTED_OUTPUT_CLOSING_TAG = ""
STORAGE_DIR = "/tmp/hermes-results"
@@ -112,6 +114,7 @@ def maybe_persist_tool_result(
tool_use_id: str,
env=None,
threshold: int | float | None = None,
+ preview_size: int = PREVIEW_SIZE_CHARS,
) -> str:
"""Layer 2: persist oversized result into the sandbox, return preview + path.
@@ -125,6 +128,7 @@ def maybe_persist_tool_result(
tool_use_id: Unique ID for this tool call (used as filename).
env: The active BaseEnvironment instance, or None.
threshold: Override threshold; if None, looked up from registry.
+ preview_size: Max chars for the inline preview after persistence.
Returns:
Original content if small, or replacement.
@@ -143,7 +147,7 @@ def maybe_persist_tool_result(
remote_path = f"{STORAGE_DIR}/{tool_use_id}.txt"
# Write raw output (not JSON wrapper) so read_file returns readable text
file_content = _extract_raw_output(content)
- preview, has_more = generate_preview(file_content)
+ preview, has_more = generate_preview(file_content, max_chars=preview_size)
# Try writing into the sandbox
if env is not None:
@@ -173,6 +177,7 @@ def enforce_turn_budget(
tool_messages: list[dict],
env=None,
budget: int = MAX_TURN_BUDGET_CHARS,
+ preview_size: int = PREVIEW_SIZE_CHARS,
) -> list[dict]:
"""Layer 3: enforce aggregate budget across all tool results in a turn.
@@ -210,6 +215,7 @@ def enforce_turn_budget(
tool_use_id=tool_use_id,
env=env,
threshold=0,
+ preview_size=preview_size,
)
if replacement != content:
total_size -= size