mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: limit concurrent Modal sandbox creations to avoid deadlocks
- Add max_concurrent_tasks config (default 8) with semaphore in TB2 eval
- Pass cwd: /app via register_task_env_overrides for TB2 tasks
- Add /home/ to host path prefixes as safety net for container backends

When all 86 TerminalBench2 tasks fire simultaneously, each creates a Modal sandbox via asyncio.run() inside a thread pool worker. Modal's blocking calls deadlock when too many sandboxes are created at once. The semaphore ensures at most 8 concurrent creations.

Co-Authored-By: hermes-agent[bot] <hermes-agent[bot]@users.noreply.github.com>
This commit is contained in:
parent
306d92a9d7
commit
c6df39955c
4 changed files with 32 additions and 3 deletions
|
|
@@ -29,6 +29,10 @@ env:
|
||||||
wandb_name: "terminal-bench-2"
|
wandb_name: "terminal-bench-2"
|
||||||
ensure_scores_are_not_same: false
|
ensure_scores_are_not_same: false
|
||||||
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
|
data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
|
||||||
|
# CRITICAL: Limit concurrent Modal sandbox creations to avoid deadlocks.
|
||||||
|
# Modal's blocking calls (App.lookup, etc.) deadlock when too many sandboxes
|
||||||
|
# are created simultaneously inside thread pool workers via asyncio.run().
|
||||||
|
max_concurrent_tasks: 8
|
||||||
|
|
||||||
openai:
|
openai:
|
||||||
base_url: "https://openrouter.ai/api/v1"
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
|
|
||||||
|
|
@@ -118,6 +118,15 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
|
||||||
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
|
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --- Concurrency control ---
|
||||||
|
max_concurrent_tasks: int = Field(
|
||||||
|
default=8,
|
||||||
|
description="Maximum number of tasks to run concurrently. "
|
||||||
|
"Limits concurrent Modal sandbox creations to avoid async/threading deadlocks. "
|
||||||
|
"Modal has internal limits and creating too many sandboxes simultaneously "
|
||||||
|
"causes blocking calls to deadlock inside the thread pool.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Tasks that cannot run properly on Modal and are excluded from scoring.
|
# Tasks that cannot run properly on Modal and are excluded from scoring.
|
||||||
MODAL_INCOMPATIBLE_TASKS = {
|
MODAL_INCOMPATIBLE_TASKS = {
|
||||||
|
|
@@ -430,7 +439,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- 2. Register per-task Modal image override ---
|
# --- 2. Register per-task Modal image override ---
|
||||||
register_task_env_overrides(task_id, {"modal_image": modal_image})
|
register_task_env_overrides(task_id, {"modal_image": modal_image, "cwd": "/app"})
|
||||||
logger.info(
|
logger.info(
|
||||||
"Task %s: registered image override for task_id %s",
|
"Task %s: registered image override for task_id %s",
|
||||||
task_name, task_id[:8],
|
task_name, task_id[:8],
|
||||||
|
|
@@ -733,12 +742,23 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
print(f" Tool thread pool: {self.config.tool_pool_size}")
|
print(f" Tool thread pool: {self.config.tool_pool_size}")
|
||||||
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
|
print(f" Terminal timeout: {self.config.terminal_timeout}s/cmd")
|
||||||
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
|
print(f" Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
|
||||||
|
print(f" Max concurrent tasks: {self.config.max_concurrent_tasks}")
|
||||||
print(f"{'='*60}\n")
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
# Semaphore to limit concurrent Modal sandbox creations.
|
||||||
|
# Without this, all 86 tasks fire simultaneously, each creating a Modal
|
||||||
|
# sandbox via asyncio.run() inside a thread pool worker. Modal's blocking
|
||||||
|
# calls (App.lookup, etc.) deadlock when too many are created at once.
|
||||||
|
semaphore = asyncio.Semaphore(self.config.max_concurrent_tasks)
|
||||||
|
|
||||||
|
async def _eval_with_semaphore(item):
|
||||||
|
async with semaphore:
|
||||||
|
return await self._eval_with_timeout(item)
|
||||||
|
|
||||||
# Fire all tasks with wall-clock timeout, track live accuracy on the bar
|
# Fire all tasks with wall-clock timeout, track live accuracy on the bar
|
||||||
total_tasks = len(self.all_eval_items)
|
total_tasks = len(self.all_eval_items)
|
||||||
eval_tasks = [
|
eval_tasks = [
|
||||||
asyncio.ensure_future(self._eval_with_timeout(item))
|
asyncio.ensure_future(_eval_with_semaphore(item))
|
||||||
for item in self.all_eval_items
|
for item in self.all_eval_items
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -137,6 +137,10 @@ class ModalEnvironment(BaseEnvironment):
|
||||||
|
|
||||||
def cleanup(self):
|
def cleanup(self):
|
||||||
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
|
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
|
||||||
|
# Check if _inner was ever set (init may have failed)
|
||||||
|
if not hasattr(self, '_inner') or self._inner is None:
|
||||||
|
return
|
||||||
|
|
||||||
if self._persistent:
|
if self._persistent:
|
||||||
try:
|
try:
|
||||||
sandbox = getattr(self._inner, 'deployment', None)
|
sandbox = getattr(self._inner, 'deployment', None)
|
||||||
|
|
|
||||||
|
|
@@ -424,7 +424,8 @@ def _get_env_config() -> Dict[str, Any]:
|
||||||
# SSH is excluded since /home/ paths are valid on remote machines.
|
# SSH is excluded since /home/ paths are valid on remote machines.
|
||||||
cwd = os.getenv("TERMINAL_CWD", default_cwd)
|
cwd = os.getenv("TERMINAL_CWD", default_cwd)
|
||||||
if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
|
if env_type in ("modal", "docker", "singularity", "daytona") and cwd:
|
||||||
host_prefixes = ("/Users/", "C:\\", "C:/")
|
# Host paths that won't exist inside containers
|
||||||
|
host_prefixes = ("/Users/", "/home/", "C:\\", "C:/")
|
||||||
if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
|
if any(cwd.startswith(p) for p in host_prefixes) and cwd != default_cwd:
|
||||||
logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
|
logger.info("Ignoring TERMINAL_CWD=%r for %s backend "
|
||||||
"(host path won't exist in sandbox). Using %r instead.",
|
"(host path won't exist in sandbox). Using %r instead.",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue