From beac2ee06ab65dc2fb8d5ef734faf236631e399a Mon Sep 17 00:00:00 2001
From: Shannon Sands <shannon.sands.1979@gmail.com>
Date: Thu, 5 Feb 2026 14:54:34 +1000
Subject: [PATCH] Increase per-chat timeout cap to 240s (API slowness), default to
 scoring FAIL_TO_PASS tests, and log zero-tool-call rejections

---
 atropos/agent/atropos_agent.py       |  6 +++---
 atropos/envs/swe_smith_oracle_env.py | 11 +++++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/atropos/agent/atropos_agent.py b/atropos/agent/atropos_agent.py
index 88edc53d358..2de2446ee95 100644
--- a/atropos/agent/atropos_agent.py
+++ b/atropos/agent/atropos_agent.py
@@ -438,12 +438,12 @@ class AtroposAgent:
         - `ATROPOS_AGENT_CHAT_TIMEOUT_S`: if set, wraps the await in `asyncio.wait_for`.
         - `ATROPOS_DEBUG_AGENT_WAIT_EVERY_S`: if set, prints a heartbeat while waiting.
         """
-        # Hard guardrail: never allow a single chat completion to block for more than 2 minutes.
+        # Hard guardrail: never let a single chat completion block for more than 4 minutes (240 s); larger env values are clamped.
         # This is essential for RL data-gen stability; long hangs should be treated as failures (score=0).
         timeout_s_raw = os.getenv("ATROPOS_AGENT_CHAT_TIMEOUT_S")
-        timeout_s_default = 120.0
+        timeout_s_default = 240.0
         timeout_s = float(timeout_s_raw) if timeout_s_raw else timeout_s_default
-        timeout_s = min(timeout_s, 120.0)
+        timeout_s = min(timeout_s, 240.0)
 
         wait_every_raw = os.getenv("ATROPOS_DEBUG_AGENT_WAIT_EVERY_S")
         wait_every_s = float(wait_every_raw) if wait_every_raw else None
diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py
index fee0c9ab7bb..e35521a7c33 100644
--- a/atropos/envs/swe_smith_oracle_env.py
+++ b/atropos/envs/swe_smith_oracle_env.py
@@ -37,8 +37,11 @@ class SweSmithOracleEnvConfig(AgentEnvConfig):
 
     python_only: bool = Field(default=True, description="Filter to Python-evaluable rows")
     score_include_fail_to_pass: bool = Field(
-        default=False,
-        description="If true, score tests on PASS_TO_PASS ∪ FAIL_TO_PASS; else PASS_TO_PASS only.",
+        default=True,
+        description=(
+            "If true (default), score tests on PASS_TO_PASS ∪ FAIL_TO_PASS. "
+            "Disable to only run PASS_TO_PASS (faster but weaker signal)."
+        ),
     )
 
     prompt_mode: str = Field(
@@ -347,6 +350,10 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
 
         # Training correctness: do not reward trajectories that never actually used tools.
         if agent_result is not None and getattr(agent_result, "total_tool_calls", 0) <= 0:
+            print(
+                f"[SweSmithOracleEnv] tid={trajectory_id} verify (dataset_tests): no tool calls; score=0.0",
+                flush=True,
+            )
             return 0.0, {
                 "verification_mode": "dataset_tests",
                 "error": "No tool calls were made by the agent",