fix: limit concurrent Modal sandbox creations to avoid deadlocks

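- Add max_concurrent_tasks config (default 8), enforced with an asyncio.Semaphore in the TerminalBench2 (TB2) eval
- Pass cwd: /app via register_task_env_overrides for TB2 tasks
- Add /home/ to the host path prefixes as a safety net for container backends
- Return early from ModalEnvironment.cleanup() when _inner was never set (init may have failed)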

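When all 86 TerminalBench2 tasks fire simultaneously, each one creates a Modal sandbox
via asyncio.run() inside a thread pool worker. Modal's blocking calls (App.lookup, etc.)
deadlock when too many sandboxes are created at once. The semaphore is held for the whole
task, so at most max_concurrent_tasks (default 8) tasks are in flight at a time, which in
turn bounds the number of concurrent sandbox creations.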

Co-Authored-By: hermes-agent[bot] <hermes-agent[bot]@users.noreply.github.com>
This commit is contained in:
Blake Johnson 2026-03-07 21:34:06 +00:00
parent 306d92a9d7
commit c6df39955c
4 changed files with 32 additions and 3 deletions

View file

@ -29,6 +29,10 @@ env:
wandb_name: "terminal-bench-2" wandb_name: "terminal-bench-2"
ensure_scores_are_not_same: false ensure_scores_are_not_same: false
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2" data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
# CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
# Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
# are created simultaneously inside thread pool workers via asyncio.run().
max_concurrent_tasks: 8
openai: openai:
base_url: "https://openrouter.ai/api/v1" base_url: "https://openrouter.ai/api/v1"

View file

@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
"Tasks exceeding this are scored as FAIL. Default 30 minutes.", "Tasks exceeding this are scored as FAIL. Default 30 minutes.",
) )
# --- Concurrency control ---
max_concurrent_tasks: int = Field(
default=8,
description="Maximum number of tasks to run concurrently. "
"Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
"Modal has internal limits and creating too many sandboxes simultaneously "
"causes blocking calls to deadlock inside the thread pool.",
)
# Tasks that cannot run properly on Modal and are excluded from scoring. # Tasks that cannot run properly on Modal and are excluded from scoring.
MODAL_INCOMPATIBLE_TASKS = { MODAL_INCOMPATIBLE_TASKS = {
@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
} }
# --- 2. Register per-task Modal image override --- # --- 2. Register per-task Modal image override ---
register_task_env_overrides(task_id, {"modal_image": modal_image}) register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
logger.info( logger.info(
"Task %s: registered image override for task_id %s", "Task %s: registered image override for task_id %s",
task_name, task_id[:8], task_name, task_id[:8],
@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
print(f" Tool thread pool: {self.config.tool_pool_size}") print(f" Tool thread pool: {self.config.tool_pool_size}")
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd") print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)") print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}")
print(f"{'='*60}\n") print(f"{'='*60}\n")
# Semaphore to limit concurrent Modal sandbox creations.
# Without this, all 86 tasks fire simultaneously, each creating a Modal
# sandbox via asyncio.run() inside a thread pool worker. Modal's blocking
# calls (App.lookup, etc.) deadlock when too many are created at once.
semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
async def _eval_with_semaphore(item):
async with semaphore:
return await self._eval_with_timeout(item)
# Fire all tasks with wall-clock timeout, track live accuracy on the bar # Fire all tasks with wall-clock timeout, track live accuracy on the bar
total_tasks = len(self.all_eval_items) total_tasks = len(self.all_eval_items)
eval_tasks = [ eval_tasks = [
asyncio.ensure_future(self._eval_with_timeout(item)) asyncio.ensure_future(_eval_with_semaphore(item))
for item in self.all_eval_items for item in self.all_eval_items
] ]

View file

@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment):
def cleanup(self): def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox.""" """Snapshot the filesystem (if persistent) then stop the sandbox."""
# Check if _inner was ever set (init may have failed)
if not hasattr(self, '_inner') or self._inner is None:
return
if self._persistent: if self._persistent:
try: try:
sandbox = getattr(self._inner, 'deployment', None) sandbox = getattr(self._inner, 'deployment', None)

View file

@ -424,7 +424,8 @@ def _get_env_config() -> Dict[str, Any]:
# SSH is excluded since /home/ paths are valid on remote machines. # SSH is excluded since /home/ paths are valid on remote machines.
cwd = os.getenv("TERMINAL_CWD", default_cwd) cwd = os.getenv("TERMINAL_CWD", default_cwd)
if env_type in ("modal", "docker", "singularity", "daytona") and cwd: if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
host_prefixes = ("/Users/", "C:\\", "C:/") # Host paths that won't exist inside containers
host_prefixes = ("/Users/", "/home/", "C:\\", "C:/")
if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd: if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
logger.info("Ignoring TERMINAL_CWD=%r for %s backend " logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
"(host path won't exist in sandbox). Using %r instead.", "(host path won't exist in sandbox). Using %r instead.",