Add new environments and enhance tool context functionality

- Introduced new environments: Terminal Test Environment and SWE Environment, each with default configurations for testing and software engineering tasks.
- Added TerminalBench 2.0 evaluation environment with comprehensive setup for agentic LLMs, including task execution and verification.
- Enhanced ToolContext with methods for uploading and downloading files, ensuring binary-safe operations.
- Updated documentation across environments to reflect new features and usage instructions.
- Refactored existing environment configurations for consistency and clarity.
This commit is contained in:
teknium 2026-02-10 19:39:05 +00:00
parent e8343f2d87
commit 35ad3146a8
18 changed files with 1428 additions and 19 deletions

View file

@@ -39,19 +39,24 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
# Create environment OUTSIDE locks so we don't block other rollouts
# during slow Modal/Docker startup (~10s)
if needs_creation:
from tools.terminal_tool import _task_env_overrides
config = _get_env_config()
env_type = config["env_type"]
# Check per-task overrides (set by environments like TerminalBench2Env)
overrides = _task_env_overrides.get(task_id, {})
if env_type == "docker":
image = config["docker_image"]
image = overrides.get("docker_image") or config["docker_image"]
elif env_type == "singularity":
image = config["singularity_image"]
image = overrides.get("singularity_image") or config["singularity_image"]
elif env_type == "modal":
image = config["modal_image"]
image = overrides.get("modal_image") or config["modal_image"]
else:
image = ""
cwd = config["cwd"]
cwd = overrides.get("cwd") or config["cwd"]
_check_disk_usage_warning()
if not os.getenv("HERMES_QUIET"):
print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True)