mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-06 02:41:48 +00:00
wip: run tb2 and fix modal instantiation
This commit is contained in:
parent
3baafea380
commit
3befb9389f
7 changed files with 90 additions and 23 deletions
|
|
@ -19,11 +19,11 @@ env:
|
|||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "modal"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 89 parallel tasks
|
||||
dataset_name: "NousResearch/terminal-bench-2"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 89 parallel tasks
|
||||
dataset_name: "sidbin/terminal-bench-2-verified-flattened"
|
||||
test_timeout: 600
|
||||
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
|
||||
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "terminal-bench-2"
|
||||
|
|
@ -36,7 +36,8 @@ env:
|
|||
|
||||
openai:
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
model_name: "anthropic/claude-opus-4.6"
|
||||
model_name: "openai/gpt-oss-120b:nitro"
|
||||
server_type: "openai"
|
||||
health_check: false
|
||||
timeout: 300 # 5 min per API call (default 1200s causes 20min stalls)
|
||||
# api_key loaded from OPENROUTER_API_KEY in .env
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@ export PYTHONUNBUFFERED=1
|
|||
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
|
||||
export LOGLEVEL=INFO
|
||||
|
||||
python terminalbench2_env.py evaluate \
|
||||
--config default.yaml \
|
||||
uv run python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
||||
--config environments/benchmarks/terminalbench_2/default.yaml \
|
||||
"$@" \
|
||||
2>&1 | tee "$LOG_FILE"
|
||||
|
||||
|
|
|
|||
|
|
@ -354,6 +354,16 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
|||
for i, task in enumerate(self.all_eval_items):
|
||||
self.category_index[task.get("category", "unknown")].append(i)
|
||||
|
||||
# Pre-compute which tasks need Modal's add_python (avoids re-decoding
|
||||
# multi-MB environment_tar blobs during per-task rollouts).
|
||||
self._needs_add_python: Dict[str, bool] = {
|
||||
task["task_name"]: self._image_needs_add_python(task)
|
||||
for task in self.all_eval_items
|
||||
}
|
||||
add_py_count = sum(self._needs_add_python.values())
|
||||
if add_py_count:
|
||||
print(f" {add_py_count} tasks need add_python (non-python base image)")
|
||||
|
||||
# Reward tracking for wandb logging
|
||||
self.eval_metrics: List[Tuple[str, float]] = []
|
||||
|
||||
|
|
@ -414,6 +424,36 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
|||
# Docker image resolution
|
||||
# =========================================================================
|
||||
|
||||
@staticmethod
def _image_needs_add_python(item: Dict[str, Any]) -> bool:
    """Return True when the task's Docker base image is not a ``python`` image.

    ``item["environment_tar"]`` is expected to hold a base64-encoded,
    gzip-compressed tar archive.  The archive is scanned for the first
    regular file whose path contains ``Dockerfile`` and that has a ``FROM``
    instruction; the image named by that instruction decides the result.

    The image counts as a python image when its repository name is exactly
    ``python`` once any ``FROM`` flags (e.g. ``--platform=...``), registry /
    namespace prefix, tag and digest are stripped.  So ``python``,
    ``python:3.12-slim``, ``python@sha256:...`` and
    ``docker.io/library/python:3.12`` all return False, while ``ubuntu:22.04``
    returns True.

    Args:
        item: Task record; only ``environment_tar`` (and ``task_name``, for
            the warning message) are read.

    Returns:
        True if a ``FROM`` line was found and it names a non-python image.
        False if ``environment_tar`` is missing or empty, cannot be decoded,
        or contains no Dockerfile with a ``FROM`` instruction.
    """
    environment_tar = item.get("environment_tar", "")
    if not environment_tar:
        return False

    def _is_python_image(image_ref: str) -> bool:
        # Order matters: drop the digest first, then the registry/namespace
        # (which may itself contain a ``:port``), and only then the tag.
        repository = image_ref.lower().split("@", 1)[0]
        repository = repository.rsplit("/", 1)[-1]
        repository = repository.split(":", 1)[0]
        return repository == "python"

    try:
        archive_bytes = base64.b64decode(environment_tar)
        with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:gz") as tar:
            for member in tar:
                if not member.isfile() or "Dockerfile" not in member.name:
                    continue
                handle = tar.extractfile(member)
                if not handle:
                    continue
                text = handle.read().decode("utf-8", errors="ignore")
                for line in text.splitlines():
                    tokens = line.split()
                    # Tokenising (rather than matching "FROM ") accepts any
                    # whitespace after the keyword, including tabs.
                    if len(tokens) < 2 or tokens[0].upper() != "FROM":
                        continue
                    # Skip option flags such as --platform=linux/amd64 so the
                    # image reference itself is inspected.
                    operands = [t for t in tokens[1:] if not t.startswith("--")]
                    if not operands:
                        continue
                    # NOTE(review): a multi-stage Dockerfile is judged by its
                    # first FROM only — confirm that is the intended stage.
                    return not _is_python_image(operands[0])
    except Exception as exc:
        # Best-effort probe: a malformed archive must not abort setup, but
        # the failure should be visible instead of silently ignored.
        import logging

        logging.getLogger(__name__).warning(
            "Could not inspect environment_tar for task %s: %s",
            item.get("task_name", "<unknown>"),
            exc,
        )
    return False
|
||||
|
||||
def _resolve_task_image(
|
||||
self, item: Dict[str, Any], task_name: str
|
||||
) -> Tuple[str, Optional[Path]]:
|
||||
|
|
@ -505,11 +545,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
|
|||
# --- 2. Register per-task image override ---
|
||||
# Set both modal_image and docker_image so the task image is used
|
||||
# regardless of which backend is configured.
|
||||
register_task_env_overrides(task_id, {
|
||||
overrides = {
|
||||
"modal_image": modal_image,
|
||||
"docker_image": modal_image,
|
||||
"cwd": "/app",
|
||||
})
|
||||
}
|
||||
if self._needs_add_python.get(task_name, False):
|
||||
overrides["add_python"] = "3.12"
|
||||
register_task_env_overrides(task_id, overrides)
|
||||
logger.info(
|
||||
"Task %s: registered image override for task_id %s",
|
||||
task_name, task_id[:8],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue