feat: add eval_concurrency limit + Docker local config for TBLite

- Add eval_concurrency config field, enforced with an asyncio.Semaphore
- Add local.yaml config using the Docker backend (sandboxed, no cloud costs)
- Register docker_image alongside modal_image for backend flexibility
- local.yaml sets eval_concurrency to 8 parallel tasks for local runs; the field itself defaults to 0 (unlimited)
2026-05-18 04:41:56 +00:00 · 2026-03-09 20:28:28 -05:00 · 2026-03-09 20:28:28 -05:00 · 136a64942d
commit 136a64942d
parent 9f74d1f2ec
2 changed files with 70 additions and 3 deletions
--- a/environments/benchmarks/tblite/local.yaml
+++ b/environments/benchmarks/tblite/local.yaml
@ -0,0 +1,38 @@
 # OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
 #
 # Runs tasks in Docker containers on the local machine.
 # Sandboxed like Modal but no cloud costs. Good for dev/testing.
 #
 # Usage:
 #   python environments/benchmarks/tblite/tblite_env.py evaluate \
 #       --config environments/benchmarks/tblite/local.yaml
 #
 #   # Override concurrency:
 #   python environments/benchmarks/tblite/tblite_env.py evaluate \
 #       --config environments/benchmarks/tblite/local.yaml \
 #       --env.eval_concurrency 4
 env:
  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 60
  max_token_length: 32000
  agent_temperature: 0.8
  # Run tasks in Docker containers on the local machine instead of Modal.
  terminal_backend: "docker"
  terminal_timeout: 300
  tool_pool_size: 16
  dataset_name: "NousResearch/openthoughts-tblite"
  test_timeout: 600
  # Per-task wall-clock limit in seconds; a task that exceeds it is scored as FAIL.
  task_timeout: 1200
  # Cap on how many tasks are evaluated in parallel; 0 means unlimited.
  # Can be overridden on the command line with --env.eval_concurrency N.
  eval_concurrency: 8          # max 8 tasks at once
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: false
  wandb_name: "openthoughts-tblite-local"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
 openai:
  # Model requests are sent to the OpenRouter API endpoint.
  base_url: "https://openrouter.ai/api/v1"
  model_name: "anthropic/claude-sonnet-4"
  server_type: "openai"
  health_check: false
  # api_key loaded from OPENROUTER_API_KEY in .env
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@ -118,6 +118,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
        "Tasks exceeding this are scored as FAIL. Default 30 minutes.",
    )
    # --- Eval concurrency ---
    # Upper bound on how many eval tasks run at the same time. evaluate()
    # creates an asyncio.Semaphore from this value only when it is > 0; any
    # other value (0 or negative) leaves the semaphore unset, so no limit
    # is applied.
    eval_concurrency: int = Field(
        default=0,
        description="Maximum number of tasks to evaluate in parallel. "
        "0 means unlimited (all tasks run concurrently). "
        "Set to 8 for local backends to avoid overwhelming the machine.",
    )
 # Tasks that cannot run properly on Modal and are excluded from scoring.
 MODAL_INCOMPATIBLE_TASKS = {
@ -429,8 +437,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                    "error": "no_image",
                }
-            # --- 2. Register per-task Modal image override ---
+            # --- 2. Register per-task image override ---
-            register_task_env_overrides(task_id, {"modal_image": modal_image})
+            # Set both modal_image and docker_image so the task image is used
            # regardless of which backend is configured.
            register_task_env_overrides(task_id, {
                "modal_image": modal_image,
                "docker_image": modal_image,
            })
            logger.info(
                "Task %s: registered image override for task_id %s",
                task_name, task_id[:8],
@ -655,13 +668,19 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
    async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
        """
-        Wrap rollout_and_score_eval with a per-task wall-clock timeout.
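        Wrap rollout_and_score_eval with a per-task wall-clock timeout
        and an optional concurrency limit enforced by a semaphore.
        When a semaphore is configured, it is acquired before the timeout
        wrapper starts, so time spent waiting for a slot is not counted
        against the task's timeout.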
        If the task exceeds task_timeout seconds, it's automatically scored
        as FAIL. This prevents any single task from hanging indefinitely.
        """
        task_name = item.get("task_name", "unknown")
        category = item.get("category", "unknown")
        # Acquire concurrency semaphore if configured
        if self._eval_semaphore:
            await self._eval_semaphore.acquire()
        try:
            return await asyncio.wait_for(
                self.rollout_and_score_eval(item),
@ -679,6 +698,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
            }
            self._save_result(out)
            return out
        finally:
            if self._eval_semaphore:
                self._eval_semaphore.release()
    async def evaluate(self, *args, **kwargs) -> None:
        """
@ -696,6 +718,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
        """
        start_time = time.time()
        # Set up concurrency limit if configured
        if self.config.eval_concurrency > 0:
            self._eval_semaphore = asyncio.Semaphore(self.config.eval_concurrency)
            print(f"  Eval concurrency: {self.config.eval_concurrency} tasks at a time")
        else:
            self._eval_semaphore = None
        # Route all logging through tqdm.write() so the progress bar stays
        # pinned at the bottom while log lines scroll above it.
        from tqdm import tqdm