mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
feat: add eval_concurrency limit + Docker local config for TBLite
- Add eval_concurrency config field with asyncio.Semaphore
- Add local.yaml config using Docker backend (sandboxed, no cloud costs)
- Register docker_image alongside modal_image for backend flexibility
- Default: 8 parallel tasks for local runs
This commit is contained in:
parent
9f74d1f2ec
commit
136a64942d
2 changed files with 70 additions and 3 deletions
38
environments/benchmarks/tblite/local.yaml
Normal file
38
environments/benchmarks/tblite/local.yaml
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
|
||||||
|
#
|
||||||
|
# Runs tasks in Docker containers on the local machine.
|
||||||
|
# Sandboxed like Modal but no cloud costs. Good for dev/testing.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/tblite/local.yaml
|
||||||
|
#
|
||||||
|
# # Override concurrency:
|
||||||
|
# python environments/benchmarks/tblite/tblite_env.py evaluate \
|
||||||
|
# --config environments/benchmarks/tblite/local.yaml \
|
||||||
|
# --env.eval_concurrency 4
|
||||||
|
|
||||||
|
env:
|
||||||
|
enabled_toolsets: ["terminal", "file"]
|
||||||
|
max_agent_turns: 60
|
||||||
|
max_token_length: 32000
|
||||||
|
agent_temperature: 0.8
|
||||||
|
terminal_backend: "docker"
|
||||||
|
terminal_timeout: 300
|
||||||
|
tool_pool_size: 16
|
||||||
|
dataset_name: "NousResearch/openthoughts-tblite"
|
||||||
|
test_timeout: 600
|
||||||
|
task_timeout: 1200
|
||||||
|
eval_concurrency: 8 # max 8 tasks at once
|
||||||
|
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||||
|
use_wandb: false
|
||||||
|
wandb_name: "openthoughts-tblite-local"
|
||||||
|
ensure_scores_are_not_same: false
|
||||||
|
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
|
||||||
|
|
||||||
|
openai:
|
||||||
|
base_url: "https://openrouter.ai/api/v1"
|
||||||
|
model_name: "anthropic/claude-sonnet-4"
|
||||||
|
server_type: "openai"
|
||||||
|
health_check: false
|
||||||
|
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||||
|
|
@ -118,6 +118,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
|
||||||
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
|
"Tasks exceeding this are scored as FAIL. Default 30 minutes.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# --- Eval concurrency ---
|
||||||
|
eval_concurrency: int = Field(
|
||||||
|
default=0,
|
||||||
|
description="Maximum number of tasks to evaluate in parallel. "
|
||||||
|
"0 means unlimited (all tasks run concurrently). "
|
||||||
|
"Set to 8 for local backends to avoid overwhelming the machine.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Tasks that cannot run properly on Modal and are excluded from scoring.
|
# Tasks that cannot run properly on Modal and are excluded from scoring.
|
||||||
MODAL_INCOMPATIBLE_TASKS = {
|
MODAL_INCOMPATIBLE_TASKS = {
|
||||||
|
|
@ -429,8 +437,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
"error": "no_image",
|
"error": "no_image",
|
||||||
}
|
}
|
||||||
|
|
||||||
# --- 2. Register per-task Modal image override ---
|
# --- 2. Register per-task image override ---
|
||||||
register_task_env_overrides(task_id, {"modal_image": modal_image})
|
# Set both modal_image and docker_image so the task image is used
|
||||||
|
# regardless of which backend is configured.
|
||||||
|
register_task_env_overrides(task_id, {
|
||||||
|
"modal_image": modal_image,
|
||||||
|
"docker_image": modal_image,
|
||||||
|
})
|
||||||
logger.info(
|
logger.info(
|
||||||
"Task %s: registered image override for task_id %s",
|
"Task %s: registered image override for task_id %s",
|
||||||
task_name, task_id[:8],
|
task_name, task_id[:8],
|
||||||
|
|
@ -655,13 +668,19 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
|
|
||||||
async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
|
||||||
"""
|
"""
|
||||||
Wrap rollout_and_score_eval with a per-task wall-clock timeout.
|
Wrap rollout_and_score_eval with a per-task wall-clock timeout
|
||||||
|
and optional concurrency limit via semaphore.
|
||||||
|
|
||||||
If the task exceeds task_timeout seconds, it's automatically scored
|
If the task exceeds task_timeout seconds, it's automatically scored
|
||||||
as FAIL. This prevents any single task from hanging indefinitely.
|
as FAIL. This prevents any single task from hanging indefinitely.
|
||||||
"""
|
"""
|
||||||
task_name = item.get("task_name", "unknown")
|
task_name = item.get("task_name", "unknown")
|
||||||
category = item.get("category", "unknown")
|
category = item.get("category", "unknown")
|
||||||
|
|
||||||
|
# Acquire concurrency semaphore if configured
|
||||||
|
if self._eval_semaphore:
|
||||||
|
await self._eval_semaphore.acquire()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return await asyncio.wait_for(
|
return await asyncio.wait_for(
|
||||||
self.rollout_and_score_eval(item),
|
self.rollout_and_score_eval(item),
|
||||||
|
|
@ -679,6 +698,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
}
|
}
|
||||||
self._save_result(out)
|
self._save_result(out)
|
||||||
return out
|
return out
|
||||||
|
finally:
|
||||||
|
if self._eval_semaphore:
|
||||||
|
self._eval_semaphore.release()
|
||||||
|
|
||||||
async def evaluate(self, *args, **kwargs) -> None:
|
async def evaluate(self, *args, **kwargs) -> None:
|
||||||
"""
|
"""
|
||||||
|
|
@ -696,6 +718,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
||||||
"""
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
|
# Set up concurrency limit if configured
|
||||||
|
if self.config.eval_concurrency > 0:
|
||||||
|
self._eval_semaphore = asyncio.Semaphore(self.config.eval_concurrency)
|
||||||
|
print(f" Eval concurrency: {self.config.eval_concurrency} tasks at a time")
|
||||||
|
else:
|
||||||
|
self._eval_semaphore = None
|
||||||
|
|
||||||
# Route all logging through tqdm.write() so the progress bar stays
|
# Route all logging through tqdm.write() so the progress bar stays
|
||||||
# pinned at the bottom while log lines scroll above it.
|
# pinned at the bottom while log lines scroll above it.
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue