From 88286f6da3a237288935a8eecdf9111259ba2c29 Mon Sep 17 00:00:00 2001
From: Shannon Sands <shannon.sands.1979@gmail.com>
Date: Thu, 5 Feb 2026 10:57:13 +1000
Subject: [PATCH] slow completions over group_size 4, debugging added

---
 atropos/agent/atropos_agent.py       | 16 ++++++++++++----
 atropos/envs/agent_env.py            |  2 ++
 atropos/envs/swe_smith_oracle_env.py |  9 ++++++++-
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/atropos/agent/atropos_agent.py b/atropos/agent/atropos_agent.py
index 6d995c53b3..254a7953ae 100644
--- a/atropos/agent/atropos_agent.py
+++ b/atropos/agent/atropos_agent.py
@@ -353,17 +353,25 @@ class AtroposAgent:
             if not wait_every_s or wait_every_s <= 0:
                 return await managed.chat_completion(**chat_kwargs)
 
+            # Heartbeat mode: wait in chunks without cancelling the underlying request.
+            # NOTE: do NOT use `asyncio.wait_for(task, timeout=...)` here, because a timeout
+            # cancels the task, which then surfaces as `CancelledError` on the next loop iteration.
             task = asyncio.create_task(managed.chat_completion(**chat_kwargs))
             t0 = time.perf_counter()
-            while True:
-                try:
-                    return await asyncio.wait_for(task, timeout=wait_every_s)
-                except TimeoutError:
+            try:
+                while True:
+                    done, _pending = await asyncio.wait({task}, timeout=wait_every_s)
+                    if task in done:
+                        return task.result()
+
                     waited = time.perf_counter() - t0
                     print(
                         f"[AtroposAgent] step={step_num} still waiting for chat_completion... ({waited:.1f}s)",
                         flush=True,
                     )
+            except asyncio.CancelledError:
+                task.cancel()
+                raise
 
         try:
             if timeout_s and timeout_s > 0:
diff --git a/atropos/envs/agent_env.py b/atropos/envs/agent_env.py
index 5c18e2fb81..5596266eb3 100644
--- a/atropos/envs/agent_env.py
+++ b/atropos/envs/agent_env.py
@@ -448,6 +448,8 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
 
         if len(items) != self.config.group_size:
             return None, backlog
+        
+        # TODO: Make sure logprobs are included
 
         group: ScoredDataGroup = ScoredDataGroup(
             tokens=[],
diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py
index 69dd906a1b..587bd440d1 100644
--- a/atropos/envs/swe_smith_oracle_env.py
+++ b/atropos/envs/swe_smith_oracle_env.py
@@ -6,6 +6,8 @@ This environment is intentionally minimal:
 - runs an AtroposAgent tool loop to apply a fix
 - verifies by running pytest nodeids from the dataset (reward = pass/fail)
 - Python only (no multi-language support currently, need to properly bauild & add to dropbox)
+- TODO: Get the other non-Python sandboxes up and running, then add a config knob to switch between them per row
+- TODO: Also add them to Docker Hub
 
 Dataset: NousResearch/SWE-smith-oracle (train; does NOT use SWE-bench eval set).
 """
@@ -185,6 +187,8 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
         # The dataset "text" field can be extremely large (e.g. includes large code blobs
         # and long test lists). In local dev and bring-up runs this can make the first LLM
         # call appear "hung" while the model chews through a massive prompt. Keep a cap.
+
+        # TODO: Remove, smoke test only
         def _cap(s: str, n: int) -> tuple[str, bool]:
             if len(s) <= n:
                 return s, False
@@ -200,6 +204,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
 
         repo_dir = self._repo_name(item)
         verify_note = ""
+        # TODO: Remove, smoke testing only
         if self.config.verification_mode == "install":
             verify_note = (
                 "\nVerification for this run is INSTALL-ONLY:\n"
@@ -272,7 +277,9 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
 
         # Prefer a lightweight "fetch by sha" to avoid pulling full history.
         # If it fails (some servers disallow fetching unadvertised objects, or we hit
-        # shallow-object edge cases), fall back to a full clone.
+        # shallow-object edge cases), fall back to a full clone.
+
+        # TODO: Consider doing this before setting up the worktree and after the sandbox build.
         clone_attempts: list[tuple[str, str]] = []
         clone_attempts.append(
             (