#!/bin/bash

# Terminal-only evaluation run using Modal sandboxes
# Uses 10 sample tasks from nous-terminal-tasks

# Create logs directory if it doesn't exist
mkdir -p logs

# Generate log filename with timestamp
LOG_FILE="logs/terminal_eval_$(date +%Y%m%d_%H%M%S).log"

echo "📝 Logging output to: $LOG_FILE"
echo "🔧 Using Modal sandboxes (TERMINAL_ENV=modal)"

# Set terminal to use Modal
export TERMINAL_ENV=modal
export TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
export TERMINAL_TIMEOUT=300

python batch_runner.py \
  --dataset_file="nous-terminal-tasks_eval.jsonl" \
  --batch_size=5 \
  --run_name="terminal_eval" \
  --distribution="terminal_only" \
  --model="z-ai/glm-4.7" \
  --base_url="https://openrouter.ai/api/v1" \
  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
  --num_workers=2 \
  --max_turns=30 \
  --ephemeral_system_prompt="You have access to a terminal tool for executing commands. Use it to complete the task. Install any packages you need with apt-get or pip (use --break-system-packages if needed). Do not use interactive tools (vim, nano, python repl). If git output is large, pipe to cat." \
  2>&1 | tee "$LOG_FILE"

echo "✅ Log saved to: $LOG_FILE"