---
# Terminal-Bench 2.0 Evaluation -- Default Configuration
#
# Eval-only environment for the TB2 benchmark (89 terminal tasks).
# Uses Modal terminal backend for per-task cloud-isolated sandboxes
# and OpenRouter for inference.
#
# Usage:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml
#
# Override model:
#   python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
#     --config environments/benchmarks/terminalbench_2/default.yaml \
#     --openai.model_name anthropic/claude-sonnet-4

env:
  # Toolsets exposed to the agent inside the sandbox.
  enabled_toolsets: ["terminal", "file"]
  # Hard cap on agent conversation turns per task.
  max_agent_turns: 60
  max_token_length: 16000
  agent_temperature: 0.6
  # Terminal sandbox backend; "modal" gives per-task cloud isolation.
  terminal_backend: "modal"
  dataset_name: "NousResearch/terminal-bench-2"
  # NOTE(review): presumably seconds per task test run — confirm with consumer.
  test_timeout: 180
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  # Weights & Biases logging.
  use_wandb: true
  wandb_name: "terminal-bench-2"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "evals/terminal-bench-2"
  # Folded scalar: rendered as a single paragraph at runtime.
  system_prompt: >
    You are a skilled software engineer and system administrator with access
    to a terminal and file tools. You are working inside a Linux container
    environment. Complete the user's task by using the available tools.
    Be methodical: explore the environment first, plan your approach, then
    execute step by step. Verify your work before finishing.

openai:
  # OpenRouter's OpenAI-compatible endpoint.
  base_url: "https://openrouter.ai/api/v1"
  model_name: "anthropic/claude-opus-4.6"
  server_type: "openai"
  # Skip endpoint health probing before evaluation.
  health_check: false
  # api_key loaded from OPENROUTER_API_KEY in .env