slow completions over group_size 4, debugging added

This commit is contained in:
Shannon Sands 2026-02-05 10:57:13 +10:00
parent 5b82190460
commit 88286f6da3
3 changed files with 22 additions and 5 deletions

View file

@ -353,17 +353,25 @@ class AtroposAgent:
if not wait_every_s or wait_every_s <= 0: if not wait_every_s or wait_every_s <= 0:
return await managed.chat_completion(**chat_kwargs) return await managed.chat_completion(**chat_kwargs)
# Heartbeat mode: wait in chunks without cancelling the underlying request.
# NOTE: do NOT use `asyncio.wait_for(task, timeout=...)` here, because a timeout
# will cancel the task and surface as `CancelledError` on the next loop.
task = asyncio.create_task(managed.chat_completion(**chat_kwargs)) task = asyncio.create_task(managed.chat_completion(**chat_kwargs))
t0 = time.perf_counter() t0 = time.perf_counter()
while True: try:
try: while True:
return await asyncio.wait_for(task, timeout=wait_every_s) done, _pending = await asyncio.wait({task}, timeout=wait_every_s)
except TimeoutError: if task in done:
return task.result()
waited = time.perf_counter() - t0 waited = time.perf_counter() - t0
print( print(
f"[AtroposAgent] step={step_num} still waiting for chat_completion... ({waited:.1f}s)", f"[AtroposAgent] step={step_num} still waiting for chat_completion... ({waited:.1f}s)",
flush=True, flush=True,
) )
except asyncio.CancelledError:
task.cancel()
raise
try: try:
if timeout_s and timeout_s > 0: if timeout_s and timeout_s > 0:

View file

@ -448,6 +448,8 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
if len(items) != self.config.group_size: if len(items) != self.config.group_size:
return None, backlog return None, backlog
# TODO: Make sure logprobs included
group: ScoredDataGroup = ScoredDataGroup( group: ScoredDataGroup = ScoredDataGroup(
tokens=[], tokens=[],

View file

@ -6,6 +6,8 @@ This environment is intentionally minimal:
- runs an AtroposAgent tool loop to apply a fix - runs an AtroposAgent tool loop to apply a fix
- verifies by running pytest nodeids from the dataset (reward = pass/fail) - verifies by running pytest nodeids from the dataset (reward = pass/fail)
- Python only (no multi-language support currently, need to properly build & add to dropbox) - Python only (no multi-language support currently, need to properly build & add to dropbox)
- TODO: Get the other nonpython sandboxes up and running, then add a config knob to switch between them per row
- oh and add to dockerhub
Dataset: NousResearch/SWE-smith-oracle (train; does NOT use SWE-bench eval set). Dataset: NousResearch/SWE-smith-oracle (train; does NOT use SWE-bench eval set).
""" """
@ -185,6 +187,8 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
# The dataset "text" field can be extremely large (e.g. includes large code blobs # The dataset "text" field can be extremely large (e.g. includes large code blobs
# and long test lists). In local dev and bring-up runs this can make the first LLM # and long test lists). In local dev and bring-up runs this can make the first LLM
# call appear "hung" while the model chews through a massive prompt. Keep a cap. # call appear "hung" while the model chews through a massive prompt. Keep a cap.
# TODO: Remove, smoke test only
def _cap(s: str, n: int) -> tuple[str, bool]: def _cap(s: str, n: int) -> tuple[str, bool]:
if len(s) <= n: if len(s) <= n:
return s, False return s, False
@ -200,6 +204,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
repo_dir = self._repo_name(item) repo_dir = self._repo_name(item)
verify_note = "" verify_note = ""
# TODO: Remove, smoke testing only
if self.config.verification_mode == "install": if self.config.verification_mode == "install":
verify_note = ( verify_note = (
"\nVerification for this run is INSTALL-ONLY:\n" "\nVerification for this run is INSTALL-ONLY:\n"
@ -272,7 +277,9 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
# Prefer a lightweight "fetch by sha" to avoid pulling full history. # Prefer a lightweight "fetch by sha" to avoid pulling full history.
# If it fails (some servers disallow fetching unadvertised objects, or we hit # If it fails (some servers disallow fetching unadvertised objects, or we hit
# shallow-object edge cases), fall back to a full clone. # shallow-object edge cases), fall back to a full clone.
# TODO: tbh, should just do this before setting up worktree & after sandbox build
clone_attempts: list[tuple[str, str]] = [] clone_attempts: list[tuple[str, str]] = []
clone_attempts.append( clone_attempts.append(
( (