mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat: add OpenThoughts-TBLite evaluation script
Introduced a new evaluation script for the OpenThoughts-TBLite environment, enabling users to run evaluations with customizable options. The script includes logging capabilities and real-time output, enhancing the evaluation process for terminal agents. This addition complements the existing benchmarking tools and improves usability for users.
This commit is contained in:
parent
0ea6c34325
commit
ee7fde6531
2 changed files with 57 additions and 5 deletions
42
environments/benchmarks/tblite/run_eval.sh
Executable file
42
environments/benchmarks/tblite/run_eval.sh
Executable file
|
|
@ -0,0 +1,42 @@
|
||||||
|
#!/bin/bash
# OpenThoughts-TBLite Evaluation
#
# Run from repo root:
#   bash environments/benchmarks/tblite/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/tblite/run_eval.sh \
#     --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
#   bash environments/benchmarks/tblite/run_eval.sh \
#     --env.task_filter broken-python,pandas-etl
#
# All terminal settings (backend, timeout, lifetime, pool size) are
# configured via env config fields -- no env vars needed.

set -euo pipefail

# tblite_env.py, default.yaml, and the logs/ and evals/ output dirs are all
# referenced relative to this script's directory, so cd there first.
# Without this, the documented "run from repo root" invocation fails with
# "can't open file 'tblite_env.py'".
cd "$(dirname "${BASH_SOURCE[0]}")"

mkdir -p logs evals/openthoughts-tblite
LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"

echo "OpenThoughts-TBLite Evaluation"
echo "Log file: $LOG_FILE"
echo ""

# Unbuffered python output so logs are written in real-time
export PYTHONUNBUFFERED=1

# Show INFO-level agent loop timing (api/tool durations per turn)
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
export LOGLEVEL=INFO

# Capture the eval's exit status instead of letting set -e abort here:
# on failure the trailing echoes would otherwise be skipped, hiding the
# log path exactly when it is most needed. The status is re-propagated
# at the end so CI still sees the failure.
rv=0
python tblite_env.py evaluate \
  --config default.yaml \
  "$@" \
  2>&1 | tee "$LOG_FILE" || rv=$?

echo ""
echo "Log saved to: $LOG_FILE"
echo "Eval results: evals/openthoughts-tblite/"
exit "$rv"
|
||||||
|
|
@ -12,21 +12,31 @@
|
||||||
# Run a subset:
|
# Run a subset:
|
||||||
# bash environments/benchmarks/terminalbench_2/run_eval.sh \
|
# bash environments/benchmarks/terminalbench_2/run_eval.sh \
|
||||||
# --env.task_filter fix-git,git-multibranch
|
# --env.task_filter fix-git,git-multibranch
|
||||||
|
#
|
||||||
|
# All terminal settings (backend, timeout, lifetime, pool size) are
|
||||||
|
# configured via env config fields -- no env vars needed.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
mkdir -p logs evals/terminal-bench-2
|
mkdir -p logs evals/terminal-bench-2
|
||||||
LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"
|
LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"
|
||||||
|
|
||||||
echo "Terminal-Bench 2.0 Evaluation"
|
echo "Terminal-Bench 2.0 Evaluation"
|
||||||
echo "Log: $LOG_FILE"
|
echo "Log file: $LOG_FILE"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
export TERMINAL_ENV=modal
|
# Unbuffered python output so logs are written in real-time
|
||||||
export TERMINAL_TIMEOUT=300
|
export PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
|
# Show INFO-level agent loop timing (api/tool durations per turn)
|
||||||
--config environments/benchmarks/terminalbench_2/default.yaml \
|
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
|
||||||
|
export LOGLEVEL=INFO
|
||||||
|
|
||||||
|
python terminalbench2_env.py evaluate \
|
||||||
|
--config default.yaml \
|
||||||
"$@" \
|
"$@" \
|
||||||
2>&1 | tee "$LOG_FILE"
|
2>&1 | tee "$LOG_FILE"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Log saved to: $LOG_FILE"
|
echo "Log saved to: $LOG_FILE"
|
||||||
|
echo "Eval results: evals/terminal-bench-2/"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue