fix: limit concurrent Modal sandbox creations to avoid deadlocks

- Add max_concurrent_tasks config (default 8) with semaphore in TB2 eval
- Pass cwd: /app via register_task_env_overrides for TB2 tasks
- Add /home/ to host path prefixes as safety net for container backends

When all 86 TerminalBench2 tasks fire simultaneously, each creates a Modal sandbox via asyncio.run() inside a thread pool worker. Modal's blocking calls deadlock when too many sandboxes are created at once. The semaphore ensures at most 8 concurrent creations.

Co-Authored-By: hermes-agent[bot] <hermes-agent[bot]@users.noreply.github.com>
2026-04-25 00:51:20 +00:00 · 2026-03-07 21:34:06 +00:00 · 2026-03-07 21:34:06 +00:00 · c6df39955c
commit c6df39955c
parent 306d92a9d7
4 changed files with 32 additions and 3 deletions
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@@ -29,6 +29,10 @@ env:
  wandb_name: "terminal-bench-2"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
  # CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
  # Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
  # are created simultaneously inside thread pool workers via asyncio.run().
  max_concurrent_tasks: 8
 openai:
  base_url: "https://openrouter.ai/api/v1"
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
        "Tasks exceeding this are scored as FAIL. Default 30 minutes.",
    )
    # --- Concurrency control ---
    max_concurrent_tasks: int = Field(
        default=8,
        description="Maximum number of tasks to run concurrently. "
        "Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
        "Modal has internal limits and creating too many sandboxes simultaneously "
        "causes blocking calls to deadlock inside the thread pool.",
    )
 # Tasks that cannot run properly on Modal and are excluded from scoring.
 MODAL_INCOMPATIBLE_TASKS = {
@@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                }
            # --- 2. Register per-task Modal image override ---
-            register_task_env_overrides(task_id, {"modal_image": modal_image})
+            register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
            logger.info(
                "Task %s: registered image override for task_id %s",
                task_name, task_id[:8],
@@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
        print(f"  Tool thread pool: {self.config.tool_pool_size}")
        print(f"  Terminal timeout: {self.config.terminal_timeout}s/cmd")
        print(f"  Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
        print(f"  Max concurrent tasks: {self.config.max_concurrent_tasks}")
        print(f"{'='*60}\n")
        # Semaphore to limit concurrent Modal sandbox creations.
        # Without this, all 86 tasks fire simultaneously, each creating a Modal
        # sandbox via asyncio.run() inside a thread pool worker. Modal's blocking
        # calls (App.lookup, etc.) deadlock when too many are created at once.
        semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
        async def _eval_with_semaphore(item):
            async with semaphore:
                return await self._eval_with_timeout(item)
        # Fire all tasks with wall-clock timeout, track live accuracy on the bar
        total_tasks = len(self.all_eval_items)
        eval_tasks = [
-            asyncio.ensure_future(self._eval_with_timeout(item))
+            asyncio.ensure_future(_eval_with_semaphore(item))
            for item in self.all_eval_items
        ]
--- a/tools/environments/modal.py
+++ b/tools/environments/modal.py
@@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment):
    def cleanup(self):
        """Snapshot the filesystem (if persistent) then stop the sandbox."""
        # Check if _inner was ever set (init may have failed)
        if not hasattr(self, '_inner') or self._inner is None:
            return
        if self._persistent:
            try:
                sandbox = getattr(self._inner, 'deployment', None)
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@@ -424,7 +424,8 @@ def _get_env_config() -> Dict[str, Any]:
    # SSH is excluded since /home/ paths are valid on remote machines.
    cwd = os.getenv("TERMINAL_CWD", default_cwd)
    if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
-        host_prefixes = ("/Users/", "C:\\", "C:/")
+        # Host paths that won't exist inside containers
        host_prefixes = ("/Users/", "/home/", "C:\\", "C:/")
        if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
            logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
                        "(host path won't exist in sandbox). Using %r instead.",