mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
Enhance TerminalBench 2 configuration and evaluation handling
- Added task_timeout parameter to enforce a maximum wall-clock time for each task, automatically scoring as FAIL if exceeded.
- Introduced terminal_timeout and tool_pool_size parameters to improve command execution and concurrency management.
- Updated logging to provide detailed task execution times and timeout handling, enhancing overall monitoring.
- Removed outdated evaluate_config.yaml file to streamline configuration management.
This commit is contained in:
parent
6b4a8d0b17
commit
ba3fea24f1
3 changed files with 130 additions and 74 deletions
|
|
@@ -19,8 +19,11 @@ env:
|
|||
max_token_length: 32000
|
||||
agent_temperature: 0.8
|
||||
terminal_backend: "modal"
|
||||
terminal_timeout: 300 # 5 min per command (builds, pip install)
|
||||
tool_pool_size: 128 # thread pool for 89 parallel tasks
|
||||
dataset_name: "NousResearch/terminal-bench-2"
|
||||
test_timeout: 600
|
||||
task_timeout: 1800 # 30 min wall-clock per task, auto-FAIL if exceeded
|
||||
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
|
||||
use_wandb: true
|
||||
wandb_name: "terminal-bench-2"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue