diff --git a/configs/run_browser_tasks.sh b/configs/run_browser_tasks.sh
new file mode 100755
index 0000000000..14e7ad2db9
--- /dev/null
+++ b/configs/run_browser_tasks.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Browser-focused data generation run
+# Uses browser-use-tasks.jsonl (6504 tasks)
+# Distribution: browser 97%, web 20%, vision 12%, terminal 15%
+
+# Make the `python ... | tee` pipeline return batch_runner.py's failure
+# status; by default a pipeline reports only the exit status of tee.
+set -o pipefail
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Generate log filename with timestamp
+LOG_FILE="logs/browser_tasks_$(date +%Y%m%d_%H%M%S).log"
+
+echo "📝 Logging output to: $LOG_FILE"
+echo "🌐 Running browser-focused tasks with browser_tasks distribution"
+
+python batch_runner.py \
+  --dataset_file="browser-use-tasks.jsonl" \
+  --batch_size=20 \
+  --run_name="browser_tasks" \
+  --distribution="browser_tasks" \
+  --model="moonshotai/kimi-k2.5" \
+  --verbose \
+  --base_url="https://openrouter.ai/api/v1" \
+  --num_workers=50 \
+  --max_turns=60 \
+  --resume \
+  --ephemeral_system_prompt="You are an AI assistant with browser automation capabilities. Your primary task is to navigate and interact with web pages to accomplish user goals.
+
+IMPORTANT GUIDELINES:
+
+1. SEARCHING: Do NOT try to search directly on Google or other search engines via the browser - they block automated searches. Instead, ALWAYS use the web_search tool first to find URLs for any pages you need to visit, then use browser tools to navigate to those URLs.
+
+2. COOKIE/PRIVACY DIALOGS: After navigating to a page, ALWAYS check if there are cookie consent dialogs, privacy popups, or overlay modals blocking the page. These appear in snapshots as 'dialog' elements with buttons like 'Close', 'Accept', 'Accept All', 'Decline', 'I Agree', 'Got it', 'OK', or 'X'. You MUST dismiss these dialogs FIRST by clicking the appropriate button before trying to interact with other page elements. After dismissing a dialog, take a fresh browser_snapshot to get updated element references.
+
+3. HANDLING TIMEOUTS: If an action times out, it often means the element is blocked by an overlay or the page state has changed. Take a new snapshot to see the current page state and look for any dialogs or popups that need to be dismissed. If there is no dialog box to bypass, then try a new method or report the error to the user and complete the task.
+
+4. GENERAL: Use browser tools to click elements, fill forms, extract information, and perform web-based tasks. If terminal is available, use it for any local file operations or computations needed to support your web tasks. Be thorough in verifying your actions and handle any errors gracefully by retrying or trying alternative approaches." \
+  2>&1 | tee "$LOG_FILE"
+run_status=$?  # saved immediately: the echo below would reset $?
+
+echo "✅ Log saved to: $LOG_FILE"
+exit "$run_status"  # non-zero when batch_runner.py (or tee) failed
+
+# --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
\ No newline at end of file
diff --git a/configs/run_eval_glm4.7_newterm.sh b/configs/run_eval_glm4.7_newterm.sh
new file mode 100755
index 0000000000..735758b6a6
--- /dev/null
+++ b/configs/run_eval_glm4.7_newterm.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Terminal-only evaluation run with z-ai/glm-4.7
+# Uses source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl
+
+# Make the `python ... | tee` pipeline return batch_runner.py's failure
+# status; by default a pipeline reports only the exit status of tee.
+set -o pipefail
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Generate log filename with timestamp
+LOG_FILE="logs/glm4.7-terminal-tasks-newterm_$(date +%Y%m%d_%H%M%S).log"
+
+echo "📝 Logging output to: $LOG_FILE"
+
+python batch_runner.py \
+  --dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl" \
+  --batch_size=1 \
+  --run_name="terminal-tasks-test-newterm" \
+  --distribution="terminal_only" \
+  --verbose \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=5 \
+  --max_turns=60 \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you complete coding, system administration, and general computing tasks. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal tool to execute commands, write code, install packages, and verify your work. You should test and validate everything you create. Always pip install any packages you need (use --break-system-packages if needed). If you need a tool that isn't available, you can use the terminal to install or create it. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Use web search when you need to look up documentation, APIs, or current best practices." \
+  2>&1 | tee "$LOG_FILE"
+run_status=$?  # saved immediately: the echo below would reset $?
+
+echo "✅ Log saved to: $LOG_FILE"
+exit "$run_status"  # non-zero when batch_runner.py (or tee) failed
+
+# --verbose \
+# --resume \
+
diff --git a/configs/run_eval_terminal.sh b/configs/run_eval_terminal.sh
new file mode 100755
index 0000000000..0cf6a1f654
--- /dev/null
+++ b/configs/run_eval_terminal.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Terminal-only evaluation run using Modal sandboxes
+# Uses 10 sample tasks from nous-terminal-tasks
+
+# Make the `python ... | tee` pipeline return batch_runner.py's failure
+# status; by default a pipeline reports only the exit status of tee.
+set -o pipefail
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Generate log filename with timestamp
+LOG_FILE="logs/terminal_eval_$(date +%Y%m%d_%H%M%S).log"
+
+echo "📝 Logging output to: $LOG_FILE"
+echo "🔧 Using Modal sandboxes (TERMINAL_ENV=modal)"
+
+# Set terminal to use Modal
+export TERMINAL_ENV=modal
+export TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
+export TERMINAL_TIMEOUT=300
+
+python batch_runner.py \
+  --dataset_file="nous-terminal-tasks_eval.jsonl" \
+  --batch_size=5 \
+  --run_name="terminal_eval" \
+  --distribution="terminal_only" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=2 \
+  --max_turns=30 \
+  --ephemeral_system_prompt="You have access to a terminal tool for executing commands. Use it to complete the task. Install any packages you need with apt-get or pip (use --break-system-packages if needed). Do not use interactive tools (vim, nano, python repl). If git output is large, pipe to cat." \
+  2>&1 | tee "$LOG_FILE"
+run_status=$?  # saved immediately: the echo below would reset $?
+
+echo "✅ Log saved to: $LOG_FILE"
+exit "$run_status"  # non-zero when batch_runner.py (or tee) failed
diff --git a/configs/run_mixed_tasks.sh b/configs/run_mixed_tasks.sh
new file mode 100755
index 0000000000..b072f6541b
--- /dev/null
+++ b/configs/run_mixed_tasks.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Mixed browser+terminal data generation run
+# Uses mixed-browser-terminal-tasks.jsonl (200 tasks)
+# Distribution: browser 92%, terminal 92%, web 35%, vision 15%, image_gen 15%
+
+# Make the `python ... | tee` pipeline return batch_runner.py's failure
+# status; by default a pipeline reports only the exit status of tee.
+set -o pipefail
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Generate log filename with timestamp
+LOG_FILE="logs/mixed_tasks_$(date +%Y%m%d_%H%M%S).log"
+
+echo "📝 Logging output to: $LOG_FILE"
+echo "🔀 Running mixed browser+terminal tasks with mixed_tasks distribution"
+
+# Set terminal environment (Modal sandboxes recommended for safety)
+export TERMINAL_ENV=modal
+export TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
+export TERMINAL_TIMEOUT=300
+
+python batch_runner.py \
+  --dataset_file="mixed-browser-terminal-tasks.jsonl" \
+  --batch_size=20 \
+  --run_name="mixed_tasks" \
+  --distribution="mixed_tasks" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=25 \
+  --max_turns=60 \
+  --ephemeral_system_prompt="You are an AI assistant capable of both browser automation and terminal operations. Use browser tools to navigate websites, interact with web pages, fill forms, and extract information. Use terminal tools to execute commands, write and run code, install packages (use --break-system-packages with pip if needed), and perform local computations. When web search is available, use it to find URLs, documentation, or current information. If vision is available, use it to analyze images or screenshots. If image generation is available, use it when the task requires creating images. Combine browser and terminal capabilities effectively - for example, you might use the browser to fetch data from a website and terminal to process or analyze it. Always verify your work and handle errors gracefully." \
+  2>&1 | tee "$LOG_FILE"
+run_status=$?  # saved immediately: the echo below would reset $?
+
+echo "✅ Log saved to: $LOG_FILE"
+exit "$run_status"  # non-zero when batch_runner.py (or tee) failed
diff --git a/configs/run_terminal_tasks.sh b/configs/run_terminal_tasks.sh
new file mode 100755
index 0000000000..e26945988a
--- /dev/null
+++ b/configs/run_terminal_tasks.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Terminal-focused data generation run
+# Uses nous-terminal-tasks.jsonl (597 tasks)
+# Distribution: terminal 97%, web 15%, browser 10%, vision 8%, image_gen 3%
+
+# Make the `python ... | tee` pipeline return batch_runner.py's failure
+# status; by default a pipeline reports only the exit status of tee.
+set -o pipefail
+
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Generate log filename with timestamp
+LOG_FILE="logs/terminal_tasks_$(date +%Y%m%d_%H%M%S).log"
+
+echo "📝 Logging output to: $LOG_FILE"
+echo "💻 Running terminal-focused tasks with terminal_tasks distribution"
+
+# Set terminal environment (Modal sandboxes recommended for safety)
+export TERMINAL_ENV=modal
+export TERMINAL_MODAL_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
+export TERMINAL_TIMEOUT=300
+
+python batch_runner.py \
+  --dataset_file="nous-terminal-tasks.jsonl" \
+  --batch_size=20 \
+  --run_name="terminal_tasks" \
+  --distribution="terminal_tasks" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=40 \
+  --max_turns=60 \
+  --ephemeral_system_prompt="You have access to a terminal tool for executing commands and completing coding, system administration, and computing tasks. Use the terminal to write code, run scripts, install packages (use --break-system-packages with pip if needed), manipulate files, and verify your work. Always test and validate code you create. Do not use interactive tools like vim, nano, or python REPL. If git output is large, pipe to cat. When web search is available, use it to look up documentation, APIs, or best practices. If browser tools are available, use them for web interactions that require page manipulation. Do not use the terminal to communicate with the user - only your final response will be shown to them." \
+  2>&1 | tee "$LOG_FILE"
+run_status=$?  # saved immediately: the echo below would reset $?
+
+echo "✅ Log saved to: $LOG_FILE"
+exit "$run_status"  # non-zero when batch_runner.py (or tee) failed