Enhance TerminalBench 2 configuration and evaluation handling

- Added a task_timeout parameter to enforce a maximum wall-clock time for each task; a task that exceeds it is automatically scored as FAIL.
- Introduced terminal_timeout and tool_pool_size parameters to improve command execution and concurrency management.
- Updated logging to report detailed task execution times and timeout handling, improving overall monitoring.
- Removed the outdated evaluate_config.yaml file to streamline configuration management.
2026-04-25 00:51:20 +00:00 · 2026-02-10 22:53:24 +00:00 · 2026-02-10 22:53:24 +00:00 · ba3fea24f1
commit ba3fea24f1
parent 6b4a8d0b17
3 changed files with 130 additions and 74 deletions
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@@ -19,8 +19,11 @@ env:
  max_token_length: 32000
  agent_temperature: 0.8
  terminal_backend: "modal"
+  terminal_timeout: 300        # 5 min per command (builds, pip install)
+  tool_pool_size: 128          # thread pool for 89 parallel tasks
  dataset_name: "NousResearch/terminal-bench-2"
  test_timeout: 600
+  task_timeout: 1800           # 30 min wall-clock per task, auto-FAIL if exceeded
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: true
  wandb_name: "terminal-bench-2"