feat: add OpenThoughts-TBLite evaluation environment and configuration files

Introduced a new evaluation environment for OpenThoughts-TBLite, including the main evaluation script, configuration YAML, and README documentation. This environment provides a faster alternative to Terminal-Bench 2.0, featuring 100 difficulty-calibrated tasks for terminal agents. The setup allows for easy evaluation and configuration, enhancing the benchmarking capabilities for terminal agents.
2026-04-25 00:51:20 +00:00 · 2026-03-04 11:38:32 +00:00 · 2026-03-04 11:38:32 +00:00 · 0ea6c34325
commit 0ea6c34325
parent 3db3d60368
4 changed files with 231 additions and 0 deletions
--- a/environments/benchmarks/tblite/default.yaml
+++ b/environments/benchmarks/tblite/default.yaml
@ -0,0 +1,39 @@
+# OpenThoughts-TBLite Evaluation -- Default Configuration
+#
+# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated
+# terminal tasks, a faster proxy for Terminal-Bench 2.0).
+# Uses Modal terminal backend for per-task cloud-isolated sandboxes
+# and OpenRouter for inference.
+#
+# Usage:
+#   python environments/benchmarks/tblite/tblite_env.py evaluate \
+#       --config environments/benchmarks/tblite/default.yaml
+#
+#   # Override model:
+#   python environments/benchmarks/tblite/tblite_env.py evaluate \
+#       --config environments/benchmarks/tblite/default.yaml \
+#       --openai.model_name anthropic/claude-sonnet-4
+
+env:
+  enabled_toolsets: ["terminal", "file"]
+  max_agent_turns: 60
+  max_token_length: 32000
+  agent_temperature: 0.8
+  terminal_backend: "modal"
+  terminal_timeout: 300        # 5 min per command (builds, pip install)
+  tool_pool_size: 128          # thread pool for 100 parallel tasks
+  dataset_name: "NousResearch/openthoughts-tblite"
+  test_timeout: 600
+  task_timeout: 1200           # 20 min wall-clock per task (TBLite tasks are faster)
+  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
+  use_wandb: true
+  wandb_name: "openthoughts-tblite"
+  ensure_scores_are_not_same: false
+  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite"
+
+openai:
+  base_url: "https://openrouter.ai/api/v1"
+  model_name: "anthropic/claude-opus-4.6"
+  server_type: "openai"
+  health_check: false
+  # api_key loaded from OPENROUTER_API_KEY in .env