diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml
index 62f66316e3..0c3eeb6659 100644
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@@ -19,8 +19,11 @@ env:
   max_token_length: 32000
   agent_temperature: 0.8
   terminal_backend: "modal"
+  terminal_timeout: 300        # 5 min per command (builds, pip install)
+  tool_pool_size: 128          # thread pool for 89 parallel tasks
   dataset_name: "NousResearch/terminal-bench-2"
   test_timeout: 600
+  task_timeout: 1800           # 30 min wall-clock per task, auto-FAIL if exceeded
   tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
   use_wandb: true
   wandb_name: "terminal-bench-2"
diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 916fdad93f..ef8484f796 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -111,6 +111,12 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
         description="Comma-separated task names to skip (e.g., 'heavy-task,slow-task').",
     )
 
+    # --- Per-task wall-clock timeout ---
+    task_timeout: int = Field(
+        default=1800,
+        description="Maximum wall-clock seconds per task (agent loop + verification). "
+        "Tasks exceeding this are scored as FAIL. Default 30 minutes.",
+    )
 
 
 # =============================================================================
@@ -190,10 +196,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
 
             # Modal backend for per-task cloud-isolated sandboxes
             terminal_backend="modal",
+            terminal_timeout=300,   # 5 min per command (builds, pip install, etc.)
 
             # Test execution timeout (TB2 test scripts can install deps like pytest)
             test_timeout=180,
 
+            # 89 tasks run in parallel, each needs a thread for tool calls
+            tool_pool_size=128,
+
             # --- Eval-only Atropos settings ---
             # These settings make the env work as an eval-only environment:
             #   - STOP_TRAIN: pauses training during eval (standard for eval envs)
@@ -231,6 +241,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         """Load the Terminal-Bench 2.0 dataset from HuggingFace."""
         from datasets import load_dataset
 
+        # Auto-set terminal_lifetime to task_timeout + 120s so sandboxes
+        # never get killed during an active task, but still get cleaned up
+        # promptly after the task times out.
+        lifetime = self.config.task_timeout + 120
+        self.config.terminal_lifetime = lifetime
+        os.environ["TERMINAL_LIFETIME_SECONDS"] = str(lifetime)
+        print(f"  Terminal lifetime auto-set to {lifetime}s (task_timeout + 120s)")
+
         print(f"Loading TB2 dataset from: {self.config.dataset_name}")
         ds = load_dataset(self.config.dataset_name, split="train")
 
@@ -366,6 +384,10 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         task_id = str(uuid.uuid4())
         task_dir = None  # Set if we extract a Dockerfile (needs cleanup)
 
+        from tqdm import tqdm
+        tqdm.write(f"  [START] {task_name} (task_id={task_id[:8]})")
+        task_start = time.time()
+
         try:
             # --- 1. Resolve Docker image ---
             modal_image, task_dir = self._resolve_task_image(eval_item, task_name)
@@ -416,9 +438,16 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                 )
                 reward = 0.0
             else:
+                # Run tests in a thread so the blocking ctx.terminal() calls
+                # don't freeze the entire event loop (which would stall all
+                # other tasks, tqdm updates, and timeout timers).
                 ctx = ToolContext(task_id)
                 try:
-                    reward = self._run_tests(eval_item, ctx, task_name)
+                    loop = asyncio.get_running_loop()  # preferred over get_event_loop() in coroutines
+                    reward = await loop.run_in_executor(
+                        None,  # default thread pool
+                        self._run_tests, eval_item, ctx, task_name,
+                    )
                 except Exception as e:
                     logger.error("Task %s: test verification failed: %s", task_name, e)
                     reward = 0.0
@@ -427,7 +456,8 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
 
             passed = reward == 1.0
             status = "PASS" if passed else "FAIL"
-            print(f"  [{status}] {task_name} (turns={result.turns_used})")
+            elapsed = time.time() - task_start
+            tqdm.write(f"  [{status}] {task_name} (turns={result.turns_used}, {elapsed:.0f}s)")
             logger.info(
                 "Task %s: reward=%.1f, turns=%d, finished=%s",
                 task_name, reward, result.turns_used, result.finished_naturally,
@@ -443,8 +473,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             }
 
         except Exception as e:
+            elapsed = time.time() - task_start
             logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
-            print(f"  [ERROR] {task_name}: {e}")
+            tqdm.write(f"  [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
             return {
                 "passed": False, "reward": 0.0,
                 "task_name": task_name, "category": category,
@@ -586,6 +617,31 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
     # Evaluate -- main entry point for the eval subcommand
     # =========================================================================
 
+    async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
+        """
+        Run rollout_and_score_eval(item) with a config.task_timeout wall-clock cap.
+
+        Returns the rollout's result dict unchanged. If asyncio.wait_for times out,
+        the task is logged and scored FAIL (passed=False, reward=0.0, error set).
+        """
+        task_name = item.get("task_name", "unknown")
+        category = item.get("category", "unknown")
+        try:
+            return await asyncio.wait_for(
+                self.rollout_and_score_eval(item),
+                timeout=self.config.task_timeout,
+            )
+        except asyncio.TimeoutError:
+            from tqdm import tqdm
+            elapsed = self.config.task_timeout  # the configured limit, not a measured duration
+            tqdm.write(f"  [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
+            logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
+            return {
+                "passed": False, "reward": 0.0,
+                "task_name": task_name, "category": category,
+                "error": f"timeout ({elapsed}s)",
+            }
+
     async def evaluate(self, *args, **kwargs) -> None:
         """
         Run Terminal-Bench 2.0 evaluation over all tasks.
@@ -594,27 +650,88 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             python environments/terminalbench2_env.py evaluate
 
         Runs all tasks through rollout_and_score_eval() via asyncio.gather()
-        (same pattern as GPQA and other Atropos eval envs). Aggregates
-        per-task, per-category, and overall pass rates, then logs to wandb
-        and evaluate_log().
+        (same pattern as GPQA and other Atropos eval envs). Each task is
+        wrapped with a wall-clock timeout so hung tasks auto-fail.
+
+        Suppresses noisy Modal/terminal output (HERMES_QUIET) so the tqdm
+        bar stays visible.
         """
         start_time = time.time()
 
+        # Route all logging through tqdm.write() so the progress bar stays
+        # pinned at the bottom while log lines scroll above it.
+        from tqdm import tqdm
+
+        class _TqdmHandler(logging.Handler):
+            def emit(self, record):
+                try:
+                    tqdm.write(self.format(record))
+                except Exception:
+                    self.handleError(record)
+
+        handler = _TqdmHandler()
+        handler.setFormatter(logging.Formatter(
+            "%(asctime)s [%(name)s] %(levelname)s: %(message)s",
+            datefmt="%H:%M:%S",
+        ))
+        root = logging.getLogger()
+        root.handlers = [handler]  # Replace any existing handlers
+        root.setLevel(logging.INFO)
+
+        # Silence noisy third-party loggers that flood the output
+        logging.getLogger("httpx").setLevel(logging.WARNING)      # Every HTTP request
+        logging.getLogger("openai").setLevel(logging.WARNING)     # OpenAI client retries
+        logging.getLogger("rex-deploy").setLevel(logging.WARNING) # Swerex deployment
+        logging.getLogger("rex_image_builder").setLevel(logging.WARNING)  # Image builds
+
         print(f"\n{'='*60}")
         print("Starting Terminal-Bench 2.0 Evaluation")
         print(f"{'='*60}")
         print(f"  Dataset: {self.config.dataset_name}")
         print(f"  Total tasks: {len(self.all_eval_items)}")
         print(f"  Max agent turns: {self.config.max_agent_turns}")
+        print(f"  Task timeout: {self.config.task_timeout}s")
         print(f"  Terminal backend: {self.config.terminal_backend}")
+        print(f"  Tool thread pool: {self.config.tool_pool_size}")
+        print(f"  Terminal timeout: {self.config.terminal_timeout}s/cmd")
+        print(f"  Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
         print(f"{'='*60}\n")
 
-        # Fire all tasks -- Atropos / Modal handle scheduling
-        from tqdm.asyncio import tqdm_asyncio
+        # Fire all tasks with wall-clock timeout, track live accuracy on the bar
+        total_tasks = len(self.all_eval_items)
         eval_tasks = [
-            self.rollout_and_score_eval(item) for item in self.all_eval_items
+            asyncio.ensure_future(self._eval_with_timeout(item))
+            for item in self.all_eval_items
         ]
-        results = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating TB2")
+
+        results = []
+        passed_count = 0
+        pbar = tqdm(total=total_tasks, desc="Evaluating TB2", dynamic_ncols=True)
+        try:
+            for coro in asyncio.as_completed(eval_tasks):
+                result = await coro
+                results.append(result)
+                if result and result.get("passed"):
+                    passed_count += 1
+                done = len(results)
+                pct = (passed_count / done * 100) if done else 0
+                pbar.set_postfix_str(f"pass={passed_count}/{done} ({pct:.1f}%)")
+                pbar.update(1)
+        except (KeyboardInterrupt, asyncio.CancelledError):
+            pbar.close()
+            print(f"\n\nInterrupted! Cleaning up {len(eval_tasks)} tasks...")
+            # Cancel all pending tasks
+            for task in eval_tasks:
+                task.cancel()
+            # Let cancellations propagate (finally blocks run cleanup_vm)
+            await asyncio.gather(*eval_tasks, return_exceptions=True)
+            # Belt-and-suspenders: clean up any remaining sandboxes
+            from tools.terminal_tool import cleanup_all_environments
+            cleanup_all_environments()
+            print("All sandboxes cleaned up.")
+            return
+        finally:
+            pbar.close()
 
         end_time = time.time()
 
diff --git a/evals/terminal-bench-2/evaluate_config.yaml b/evals/terminal-bench-2/evaluate_config.yaml
deleted file mode 100644
index 1537d63ccc..0000000000
--- a/evals/terminal-bench-2/evaluate_config.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-env:
-  group_size: 1
-  max_num_workers: -1
-  max_eval_workers: 16
-  max_num_workers_per_node: 8
-  steps_per_eval: 1
-  max_token_length: 32000
-  eval_handling: STOP_TRAIN
-  eval_limit_ratio: 0.5
-  inference_weight: 1.0
-  batch_size: -1
-  max_batches_offpolicy: 3
-  tokenizer_name: NousResearch/Hermes-3-Llama-3.1-8B
-  use_wandb: false
-  rollout_server_url: http://localhost:8000
-  total_steps: 1
-  wandb_name: terminal-bench-2
-  num_rollouts_to_keep: 32
-  num_rollouts_per_group_for_logging: 1
-  ensure_scores_are_not_same: false
-  data_path_to_save_groups: null
-  data_dir_to_save_evals: evals/terminal-bench-2
-  min_items_sent_before_logging: 2
-  include_messages: false
-  min_batch_allocation: null
-  worker_timeout: 600.0
-  thinking_mode: false
-  reasoning_effort: null
-  max_reasoning_tokens: null
-  custom_thinking_prompt: null
-  enabled_toolsets:
-  - terminal
-  - file
-  disabled_toolsets: null
-  distribution: null
-  max_agent_turns: 60
-  system_prompt: 'You are a skilled software engineer and system administrator with
-    access to a terminal and file tools. You are working inside a Linux container
-    environment. Complete the user''s task by using the available tools. Be methodical:
-    explore the environment first, plan your approach, then execute step by step.
-    Verify your work before finishing.'
-  agent_temperature: 1.0
-  terminal_backend: modal
-  dataset_name: NousResearch/terminal-bench-2
-  dataset_split: train
-  prompt_field: prompt
-  tool_call_parser: hermes
-  test_timeout: 180
-  force_build: false
-  task_filter: fix-git
-  skip_tasks: null
-openai:
-- timeout: 1200
-  num_max_requests_at_once: 512
-  num_requests_for_eval: 64
-  model_name: anthropic/claude-sonnet-4
-  rolling_buffer_length: 1000
-  server_type: openai
-  api_key: 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
-  base_url: https://openrouter.ai/api/v1
-  n_kwarg_is_ignored: false
-  health_check: false
-slurm: false
-testing: false