feat: add eval_concurrency limit + Docker local config for TBLite

- Add eval_concurrency config field with asyncio.Semaphore
- Add local.yaml config using Docker backend (sandboxed, no cloud costs)
- Register docker_image alongside modal_image for backend flexibility
- Default: 8 parallel tasks for local runs
This commit is contained in:
dmahan93 2026-03-09 20:28:28 -05:00
parent 9f74d1f2ec
commit 136a64942d
2 changed files with 70 additions and 3 deletions

View file

@ -0,0 +1,38 @@
# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
#
# Runs tasks in Docker containers on the local machine.
# Sandboxed like Modal but no cloud costs. Good for dev/testing.
#
# Usage:
#   python environments/benchmarks/tblite/tblite_env.py evaluate \
#     --config environments/benchmarks/tblite/local.yaml
#
# Override concurrency:
#   python environments/benchmarks/tblite/tblite_env.py evaluate \
#     --config environments/benchmarks/tblite/local.yaml \
#     --env.eval_concurrency 4
env:
  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 60
  max_token_length: 32000
  agent_temperature: 0.8
  terminal_backend: "docker"  # local Docker sandbox instead of Modal
  terminal_timeout: 300
  tool_pool_size: 16
  dataset_name: "NousResearch/openthoughts-tblite"
  test_timeout: 600
  task_timeout: 1200
  eval_concurrency: 8  # max 8 tasks at once
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: false
  wandb_name: "openthoughts-tblite-local"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
openai:
  base_url: "https://openrouter.ai/api/v1"
  model_name: "anthropic/claude-sonnet-4"
  server_type: "openai"
  health_check: false
  # api_key loaded from OPENROUTER_API_KEY in .env

View file

@ -118,6 +118,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
)
# --- Eval concurrency ---
# Upper bound on tasks evaluated in parallel; enforced via an
# asyncio.Semaphore that evaluate() builds when this is > 0.
# NOTE(review): 0 (the default) keeps the historical unlimited behavior.
eval_concurrency: int = Field(
    default=0,
    description="Maximum number of tasks to evaluate in parallel. "
    "0 means unlimited (all tasks run concurrently). "
    "Set to 8 for local backends to avoid overwhelming the machine.",
)
# Tasks that cannot run properly on Modal and are excluded from scoring.
MODAL_INCOMPATIBLE_TASKS = {
@ -429,8 +437,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
"error": "no_image",
}
# --- 2. Register per-task Modal image override ---
register_task_env_overrides(task_id, {"modal_image": modal_image})
# --- 2. Register per-task image override ---
# Set both modal_image and docker_image so the task image is used
# regardless of which backend is configured.
register_task_env_overrides(task_id, {
"modal_image": modal_image,
"docker_image": modal_image,
})
logger.info(
"Task %s: registered image override for task_id %s",
task_name, task_id[:8],
@ -655,13 +668,19 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
"""
Wrap rollout_and_score_eval with a per-task wall-clock timeout.
Wrap rollout_and_score_eval with a per-task wall-clock timeout
and optional concurrency limit via semaphore.
If the task exceeds task_timeout seconds, it's automatically scored
as FAIL. This prevents any single task from hanging indefinitely.
"""
task_name = item.get("task_name", "unknown")
category = item.get("category", "unknown")
# Acquire concurrency semaphore if configured
if self._eval_semaphore:
await self._eval_semaphore.acquire()
try:
return await asyncio.wait_for(
self.rollout_and_score_eval(item),
@ -679,6 +698,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
}
self._save_result(out)
return out
finally:
if self._eval_semaphore:
self._eval_semaphore.release()
async def evaluate(self, *args, **kwargs) -> None:
"""
@ -696,6 +718,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
"""
start_time = time.time()
# Set up concurrency limit if configured
if self.config.eval_concurrency > 0:
self._eval_semaphore = asyncio.Semaphore(self.config.eval_concurrency)
print(f" Eval concurrency: {self.config.eval_concurrency} tasks at a time")
else:
self._eval_semaphore = None
# Route all logging through tqdm.write() so the progress bar stays
# pinned at the bottom while log lines scroll above it.
from tqdm import tqdm