wip: run tb2 and fix modal instantiation

This commit is contained in:
alt-glitch 2026-04-01 16:55:03 -07:00
parent 3baafea380
commit 3befb9389f
7 changed files with 90 additions and 23 deletions

View file

@ -225,20 +225,35 @@ class HermesAgentLoop:
chat_kwargs["extra_body"] = self.extra_body
# Make the API call -- standard OpenAI spec
# Retry on timeout/connection errors (provider queuing, rate limits)
api_start = _time.monotonic()
try:
response = await self.server.chat_completion(**chat_kwargs)
except Exception as e:
api_elapsed = _time.monotonic() - api_start
logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
tool_errors=tool_errors,
)
response = None
max_retries = 3
for attempt in range(max_retries):
try:
response = await self.server.chat_completion(**chat_kwargs)
break
except Exception as e:
api_elapsed = _time.monotonic() - api_start
is_retryable = "timeout" in type(e).__name__.lower() or "connection" in type(e).__name__.lower()
if is_retryable and attempt < max_retries - 1:
wait = 2 ** attempt
logger.warning(
"[%s] API call timed out on turn %d attempt %d (%.1fs), retrying in %ds: %s",
self.task_id[:8], turn + 1, attempt + 1, api_elapsed, wait, e,
)
await asyncio.sleep(wait)
api_start = _time.monotonic()
continue
logger.error("API call failed on turn %d (%.1fs): %s", turn + 1, api_elapsed, e)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
tool_errors=tool_errors,
)
api_elapsed = _time.monotonic() - api_start

View file

@ -19,11 +19,11 @@ env:
max_token_length: 32000
agent_temperature: 0.8
terminal_backend: "modal"
terminal_timeout: 300 # 5 min per command (builds, pip install)
tool_pool_size: 128 # thread pool for 89 parallel tasks
dataset_name: "NousResearch/terminal-bench-2"
terminal_timeout: 300 # 5 min per command (builds, pip install)
tool_pool_size: 128 # thread pool for 89 parallel tasks
dataset_name: "sidbin/terminal-bench-2-verified-flattened"
test_timeout: 600
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
use_wandb: true
wandb_name: "terminal-bench-2"
@ -36,7 +36,8 @@ env:
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-opus-4.6"
model_name: "openai/gpt-oss-120b:nitro"
server_type: "openai"
health_check: false
timeout: 300 # 5 min per API call (default 1200s causes 20min stalls)
# api_key loaded from OPENROUTER_API_KEY in .env

View file

@ -32,8 +32,8 @@ export PYTHONUNBUFFERED=1
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
export LOGLEVEL=INFO
python terminalbench2_env.py evaluate \
--config default.yaml \
uv run python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
--config environments/benchmarks/terminalbench_2/default.yaml \
"$@" \
2>&1 | tee "$LOG_FILE"

View file

@ -354,6 +354,16 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
for i, task in enumerate(self.all_eval_items):
self.category_index[task.get("category", "unknown")].append(i)
# Pre-compute which tasks need Modal's add_python (avoids re-decoding
# multi-MB environment_tar blobs during per-task rollouts).
self._needs_add_python: Dict[str, bool] = {
task["task_name"]: self._image_needs_add_python(task)
for task in self.all_eval_items
}
add_py_count = sum(self._needs_add_python.values())
if add_py_count:
print(f" {add_py_count} tasks need add_python (non-python base image)")
# Reward tracking for wandb logging
self.eval_metrics: List[Tuple[str, float]] = []
@ -414,6 +424,36 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
# Docker image resolution
# =========================================================================
@staticmethod
def _image_needs_add_python(item: Dict[str, Any]) -> bool:
    """Check if the task's base image lacks `python` on PATH.

    Decodes the base64 gzip tarball in ``item["environment_tar"]``, finds
    the first archive member whose name contains ``Dockerfile`` and that
    has a ``FROM`` instruction, and inspects the image named there.

    Args:
        item: Task record. Only the ``environment_tar`` key is read.

    Returns:
        True when the first ``FROM`` image is not a ``python`` image
        (ubuntu, debian, etc.), i.e. the task needs Modal's add_python
        parameter. False for python images, and also False when there is
        no tarball, no Dockerfile/FROM line, or the archive is unreadable.
    """
    environment_tar = item.get("environment_tar", "")
    if not environment_tar:
        return False

    try:
        raw = base64.b64decode(environment_tar)
        with tarfile.open(fileobj=io.BytesIO(raw), mode="r:gz") as tar:
            for member in tar:
                if not member.isfile() or "Dockerfile" not in member.name:
                    continue
                f = tar.extractfile(member)
                if not f:
                    continue
                text = f.read().decode("utf-8", errors="ignore")
                for line in text.splitlines():
                    tokens = line.split()
                    if len(tokens) < 2 or tokens[0].upper() != "FROM":
                        continue
                    # Options such as --platform=linux/amd64 may sit
                    # between FROM and the image reference; skip them.
                    image = next(
                        (t for t in tokens[1:] if not t.startswith("--")),
                        "",
                    ).lower()
                    if not image:
                        continue
                    # Reduce "registry:port/ns/python:tag@digest" to the
                    # bare repository name so that untagged, digest-pinned
                    # and registry-qualified python images are recognised.
                    repo = image.rsplit("/", 1)[-1]
                    name = repo.split("@", 1)[0].split(":", 1)[0]
                    return name != "python"
    except Exception:
        # Deliberately best-effort: a malformed blob must not abort setup.
        # Leave a trace so an unexpected "no add_python" can be diagnosed.
        import logging

        logging.getLogger(__name__).debug(
            "Could not inspect environment_tar for base image", exc_info=True
        )
    return False
def _resolve_task_image(
self, item: Dict[str, Any], task_name: str
) -> Tuple[str, Optional[Path]]:
@ -505,11 +545,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
# --- 2. Register per-task image override ---
# Set both modal_image and docker_image so the task image is used
# regardless of which backend is configured.
register_task_env_overrides(task_id, {
overrides = {
"modal_image": modal_image,
"docker_image": modal_image,
"cwd": "/app",
})
}
if self._needs_add_python.get(task_name, False):
overrides["add_python"] = "3.12"
register_task_env_overrides(task_id, overrides)
logger.info(
"Task %s: registered image override for task_id %s",
task_name, task_id[:8],