From 5b8219046055ab977c3a2619e9feea4658528def Mon Sep 17 00:00:00 2001
From: Shannon Sands
Date: Thu, 5 Feb 2026 08:59:14 +1000
Subject: [PATCH] adding some more debugging, hitting endpoint errors or some other slowdown

---
Review notes (this section is ignored by `git am`):
 * The heartbeat loop now polls with asyncio.wait() instead of
   asyncio.wait_for(): wait_for() cancels the awaited task when its timeout
   fires, so the in-flight chat_completion was being cancelled at the first
   heartbeat. asyncio.wait() also avoids relying on the builtin TimeoutError
   being the same class as asyncio.TimeoutError (only true on Python >= 3.11).
 * `import os` in agent_env.py is inserted in sorted position and the blank
   line after the __future__ import is kept.
 * NOTE(review): no import hunks are included for atropos_agent.py
   (asyncio, os, httpx) or swe_smith_oracle_env.py (os) -- confirm these
   modules are already imported there.
 * NOTE(review): leading whitespace of context lines was reconstructed;
   regenerate with `git format-patch` or apply with --ignore-whitespace.

 atropos/agent/atropos_agent.py       | 68 +++++++++++++++++++++++++++-
 atropos/envs/agent_env.py            |  9 ++++-
 atropos/envs/swe_smith_oracle_env.py |  2 +-
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/atropos/agent/atropos_agent.py b/atropos/agent/atropos_agent.py
index c033cc5f90..6d995c53b3 100644
--- a/atropos/agent/atropos_agent.py
+++ b/atropos/agent/atropos_agent.py
@@ -333,6 +333,68 @@ class AtroposAgent:
             text = str(dumped)
         print(text[:200_000], flush=True)
 
+    async def _chat_completion_with_debug(
+        self, *, managed: Any, step_num: int, chat_kwargs: Dict[str, Any]
+    ) -> Any:
+        """
+        Call `managed.chat_completion()` with optional timeout + richer failure logging.
+
+        Debug env vars:
+        - `ATROPOS_AGENT_CHAT_TIMEOUT_S`: if set, wraps the await in `asyncio.wait_for`.
+        - `ATROPOS_DEBUG_AGENT_WAIT_EVERY_S`: if set, prints a heartbeat while waiting.
+        """
+        timeout_s_raw = os.getenv("ATROPOS_AGENT_CHAT_TIMEOUT_S")
+        timeout_s = float(timeout_s_raw) if timeout_s_raw else None
+
+        wait_every_raw = os.getenv("ATROPOS_DEBUG_AGENT_WAIT_EVERY_S")
+        wait_every_s = float(wait_every_raw) if wait_every_raw else None
+
+        async def _await_call() -> Any:
+            if not wait_every_s or wait_every_s <= 0:
+                return await managed.chat_completion(**chat_kwargs)
+
+            task = asyncio.create_task(managed.chat_completion(**chat_kwargs))
+            t0 = time.perf_counter()
+            try:
+                while True:
+                    # asyncio.wait() leaves the task running when the timeout elapses;
+                    # asyncio.wait_for() would cancel the request at the first heartbeat.
+                    done, _ = await asyncio.wait({task}, timeout=wait_every_s)
+                    if done:
+                        return task.result()
+                    waited = time.perf_counter() - t0
+                    print(
+                        f"[AtroposAgent] step={step_num} still waiting for chat_completion... ({waited:.1f}s)",
+                        flush=True,
+                    )
+            finally:
+                # Cancelled from outside (e.g. by the outer timeout): do not leak the request.
+                if not task.done():
+                    task.cancel()
+
+        try:
+            if timeout_s and timeout_s > 0:
+                return await asyncio.wait_for(_await_call(), timeout=timeout_s)
+            return await _await_call()
+        except Exception as e:
+            detail: Dict[str, Any] = {
+                "step": step_num,
+                "exc_type": type(e).__name__,
+                "exc_str": str(e),
+            }
+            if isinstance(e, httpx.HTTPStatusError):
+                try:
+                    detail["status_code"] = e.response.status_code
+                    detail["response_text"] = e.response.text[:20_000]
+                except Exception:
+                    pass
+            elif isinstance(e, httpx.RequestError):
+                detail["request"] = repr(getattr(e, "request", None))
+
+            print("\n=== ATROPOS_DEBUG_AGENT_CHAT_FAILURE ===", flush=True)
+            print(detail, flush=True)
+            raise
+
     async def run(
         self,
         task: str,
@@ -386,7 +448,9 @@ class AtroposAgent:
                         flush=True,
                     )
                     self._debug_dump_request(step_num=step_num + 1, chat_kwargs=chat_kwargs)
-                    response = await managed.chat_completion(**chat_kwargs)
+                    response = await self._chat_completion_with_debug(
+                        managed=managed, step_num=step_num + 1, chat_kwargs=chat_kwargs
+                    )
                     self._debug_dump_response(step_num=step_num + 1, response=response)
                     print(
                         f"[AtroposAgent] step={step_num+1} chat_completion done in {time.perf_counter() - t_req:.2f}s",
@@ -541,7 +605,7 @@ class AtroposAgent:
             chat_kwargs["temperature"] = self.config.temperature
 
             self._debug_dump_request(step_num=1, chat_kwargs=chat_kwargs)
-            response = await managed.chat_completion(**chat_kwargs)
+            response = await self._chat_completion_with_debug(managed=managed, step_num=1, chat_kwargs=chat_kwargs)
             self._debug_dump_response(step_num=1, response=response)
 
             current_node = None
diff --git a/atropos/envs/agent_env.py b/atropos/envs/agent_env.py
index 10a863dd44..5c18e2fb81 100644
--- a/atropos/envs/agent_env.py
+++ b/atropos/envs/agent_env.py
@@ -8,5 +8,6 @@
 from __future__ import annotations
 
 import asyncio
+import os
 import time
 import uuid
@@ -320,7 +321,13 @@ class AgentEnv(BaseEnv, ABC, Generic[AgentEnvConfigT]):
         print(f"[AgentEnv] collect_trajectory(): tid={trajectory_id} start", flush=True)
         task = self.build_task(item)
         agent_config = self.build_agent_config(item)
-        print(f"Starting trajectory {trajectory_id} with task: {task}")
+        if os.getenv("ATROPOS_DEBUG_PRINT_TASK") == "1":
+            print(f"Starting trajectory {trajectory_id} with task: {task}", flush=True)
+        else:
+            # Avoid printing the full task prompt by default (can be huge/noisy).
+            one_line = " ".join(str(task).splitlines()).strip()
+            preview = one_line[:240] + ("…" if len(one_line) > 240 else "")
+            print(f"Starting trajectory {trajectory_id} (task preview): {preview}", flush=True)
 
         async def _exec(call):
             return await self._tool_executor.execute(trajectory_id, call)
diff --git a/atropos/envs/swe_smith_oracle_env.py b/atropos/envs/swe_smith_oracle_env.py
index b23d423a6b..69dd906a1b 100644
--- a/atropos/envs/swe_smith_oracle_env.py
+++ b/atropos/envs/swe_smith_oracle_env.py
@@ -110,7 +110,7 @@ class SweSmithOracleEnv(AgentEnv[SweSmithOracleEnvConfig]):
                     api_key=api_key,
                     num_max_requests_at_once=1,
                     num_requests_for_eval=1,
-                    timeout=300,
+                    timeout=int(os.getenv("ATROPOS_SERVER_TIMEOUT_S") or "300"),
                 ),
             ]
 