wip: run tb2 and fix modal instantiation

alt-glitch 2026-04-01 16:55:03 -07:00
parent 3baafea380
commit 3befb9389f
7 changed files with 90 additions and 23 deletions

View file

@@ -19,11 +19,11 @@ env:
   max_token_length: 32000
   agent_temperature: 0.8
   terminal_backend: "modal"
-  terminal_timeout: 300  # 5 min per command (builds, pip install)
-  tool_pool_size: 128  # thread pool for 89 parallel tasks
-  dataset_name: "NousResearch/terminal-bench-2"
+  terminal_timeout: 300  # 5 min per command (builds, pip install)
+  tool_pool_size: 128  # thread pool for 89 parallel tasks
+  dataset_name: "sidbin/terminal-bench-2-verified-flattened"
   test_timeout: 600
-  task_timeout: 1800  # 30 min wall-clock per task, auto-FAIL if exceeded
+  task_timeout: 1800  # 30 min wall-clock per task, auto-FAIL if exceeded
   tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
   use_wandb: true
   wandb_name: "terminal-bench-2"
@@ -36,7 +36,8 @@ env:
   openai:
     base_url: "https://openrouter.ai/api/v1"
-    model_name: "anthropic/claude-opus-4.6"
+    model_name: "openai/gpt-oss-120b:nitro"
     server_type: "openai"
     health_check: false
+    timeout: 300  # 5 min per API call (default 1200s causes 20min stalls)
     # api_key loaded from OPENROUTER_API_KEY in .env
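
The new `timeout: 300` line is the operative fix here. As a minimal sketch of how a per-call timeout like this can be wired into an OpenRouter-backed client, assuming the standard `openai` Python SDK (the client construction below is illustrative, not this repo's actual code):

    import os
    from openai import OpenAI

    # Illustrative client setup; the env presumably builds this from the
    # YAML above. `timeout` bounds each API call instead of the SDK default.
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.environ["OPENROUTER_API_KEY"],  # from .env, per the comment above
        timeout=300.0,  # 5 min, matching timeout: 300
    )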

View file

@@ -32,8 +32,8 @@ export PYTHONUNBUFFERED=1
 # These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
 export LOGLEVEL=INFO
-python terminalbench2_env.py evaluate \
-    --config default.yaml \
+uv run python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
+    --config environments/benchmarks/terminalbench_2/default.yaml \
     "$@" \
     2>&1 | tee "$LOG_FILE"
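For anyone driving this from Python rather than the shell wrapper, a hypothetical equivalent of the invocation above (paths and env vars copied from the script; the wrapper itself is not part of this commit):

    import os
    import subprocess

    # Mirror the script's exports so logging behaves the same way.
    env = {**os.environ, "PYTHONUNBUFFERED": "1", "LOGLEVEL": "INFO"}
    subprocess.run(
        [
            "uv", "run", "python",
            "environments/benchmarks/terminalbench_2/terminalbench2_env.py",
            "evaluate",
            "--config", "environments/benchmarks/terminalbench_2/default.yaml",
        ],
        env=env,
        check=True,  # surface a non-zero exit as an exception
    )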

View file

@@ -354,6 +354,16 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         for i, task in enumerate(self.all_eval_items):
             self.category_index[task.get("category", "unknown")].append(i)
+        # Pre-compute which tasks need Modal's add_python (avoids re-decoding
+        # multi-MB environment_tar blobs during per-task rollouts).
+        self._needs_add_python: Dict[str, bool] = {
+            task["task_name"]: self._image_needs_add_python(task)
+            for task in self.all_eval_items
+        }
+        add_py_count = sum(self._needs_add_python.values())
+        if add_py_count:
+            print(f"  {add_py_count} tasks need add_python (non-python base image)")
         # Reward tracking for wandb logging
         self.eval_metrics: List[Tuple[str, float]] = []
@@ -414,6 +424,36 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
     # Docker image resolution
     # =========================================================================
+    @staticmethod
+    def _image_needs_add_python(item: Dict[str, Any]) -> bool:
+        """Check if the task's base image lacks `python` on PATH.
+
+        Parses the Dockerfile FROM line in environment_tar. Returns True
+        for non-python base images (ubuntu, debian, etc.) that need
+        Modal's add_python parameter.
+        """
+        environment_tar = item.get("environment_tar", "")
+        if not environment_tar:
+            return False
+        try:
+            raw = base64.b64decode(environment_tar)
+            buf = io.BytesIO(raw)
+            with tarfile.open(fileobj=buf, mode="r:gz") as tar:
+                for member in tar:
+                    if not member.isfile() or "Dockerfile" not in member.name:
+                        continue
+                    f = tar.extractfile(member)
+                    if not f:
+                        continue
+                    for line in f.read().decode("utf-8", errors="ignore").splitlines():
+                        stripped = line.strip()
+                        if stripped.upper().startswith("FROM "):
+                            # Skip flags like --platform=... so the image ref,
+                            # not the flag, is inspected.
+                            refs = [t for t in stripped.split()[1:] if not t.startswith("--")]
+                            if not refs:
+                                continue
+                            return not refs[0].lower().startswith("python:")
+        except Exception:
+            pass
+        return False
+
     def _resolve_task_image(
         self, item: Dict[str, Any], task_name: str
     ) -> Tuple[str, Optional[Path]]:
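
A quick self-contained check of the FROM-line sniffing above. The `fake_item` helper and tar layout here are synthetic; only the `environment_tar` key matches what the method actually reads:

    import base64
    import io
    import tarfile

    def fake_item(dockerfile: str) -> dict:
        # Pack a one-file gzipped tar the same way the method unpacks it.
        buf = io.BytesIO()
        with tarfile.open(fileobj=buf, mode="w:gz") as tar:
            data = dockerfile.encode()
            info = tarfile.TarInfo(name="Dockerfile")
            info.size = len(data)
            tar.addfile(info, io.BytesIO(data))
        return {"environment_tar": base64.b64encode(buf.getvalue()).decode()}

    assert TerminalBench2EvalEnv._image_needs_add_python(fake_item("FROM ubuntu:22.04"))
    assert not TerminalBench2EvalEnv._image_needs_add_python(fake_item("FROM python:3.11-slim"))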
@@ -505,11 +545,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         # --- 2. Register per-task image override ---
         # Set both modal_image and docker_image so the task image is used
         # regardless of which backend is configured.
-        register_task_env_overrides(task_id, {
+        overrides = {
             "modal_image": modal_image,
             "docker_image": modal_image,
             "cwd": "/app",
-        })
+        }
+        if self._needs_add_python.get(task_name, False):
+            overrides["add_python"] = "3.12"
+        register_task_env_overrides(task_id, overrides)
         logger.info(
             "Task %s: registered image override for task_id %s",
             task_name, task_id[:8],
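
The consumer of these overrides is not shown in this diff. As a sketch of how a Modal backend might use them, assuming Modal's real `Image.from_registry` API with hypothetical glue code around it:

    import modal

    def build_task_image(overrides: dict) -> modal.Image:
        # add_python mounts a standalone interpreter onto bases (ubuntu,
        # debian) that ship without one; python:* images leave it as None.
        return modal.Image.from_registry(
            overrides["modal_image"],
            add_python=overrides.get("add_python"),  # e.g. "3.12"
        )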