fix(codex-runtime): retire wedged sessions + post-tool watchdog + OAuth refresh classify (#25769)

Mirrors openclaw beta.8's app-server resilience fixes so a stuck codex
subprocess can't burn the full turn deadline and so users get a
`codex login` pointer instead of raw RPC errors when their token expires.

- TurnResult.should_retire signals the caller to drop+respawn codex.
- Deadline-hit path and dead-subprocess detection set should_retire so
  the next turn doesn't ride a CPU-spinning or auth-broken process.
- Post-tool watchdog (post_tool_quiet_timeout=90s): if a tool item
  completes and codex goes silent past the threshold without further
  output or turn/completed, fast-fail instead of waiting the full 600s.
  Resets on any non-tool activity so normal think-after-tool flows are
  not affected.
- <turn_aborted> and <turn_aborted/> in agent text are treated as
  terminal — some codex builds tear down a turn that way without
  emitting turn/completed.
- _classify_oauth_failure() inspects RPC error message + stderr tail
  for invalid_grant / token refresh / 401 / etc. and rewrites
  user-facing errors to 'run codex login'. Conservative: generic
  failures still surface verbatim. Fires at turn/start failure,
  turn/completed failure, and dead-subprocess paths.
- thread/start cross-fill: tolerate thread.id, thread.sessionId,
  top-level sessionId/threadId so future codex schema drift doesn't
  KeyError us at handshake.
- run_agent.py: when run_turn returns should_retire=True OR raises,
  close + null self._codex_session so the next turn respawns.

Tests: +30 cases across session + integration suites.
  tests/agent/transports/test_codex_app_server_session.py 50/50 pass
  tests/run_agent/test_codex_app_server_integration.py 27/27 pass
  Broader codex scope (transports + cli runtime/migration) 376/376 pass
This commit is contained in:
Teknium 2026-05-14 07:55:09 -07:00 committed by GitHub
parent 63991bbd97
commit 12f755c9eb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 711 additions and 7 deletions

View file

@ -342,3 +342,77 @@ class TestErrorHandling:
assert result["completed"] is False
assert result["partial"] is True
assert result["error"] == "user interrupted"
class TestSessionRetirementOnRunAgent:
    """Session-retirement behavior on the run_agent.py side.

    When a codex turn comes back with ``should_retire=True``, or when
    ``run_turn`` raises, the agent must close its codex session and reset
    ``_codex_session`` to ``None`` so the next turn respawns. A normal turn
    must leave the session attached.
    """

    def test_should_retire_drops_session(self, monkeypatch):
        """A turn flagged ``should_retire`` closes and clears the session."""
        close_calls = []

        def stub_ensure_started(self):
            return "th1"

        def fake_run_turn(self, user_input, **kwargs):
            # Simulates a turn that timed out and asks to be retired.
            return TurnResult(
                final_text="",
                projected_messages=[],
                tool_iterations=0,
                interrupted=True,
                error="turn timed out after 600.0s",
                turn_id="tu1",
                thread_id="th1",
                should_retire=True,
            )

        def fake_close(self):
            # Record each close() so the test can count invocations.
            close_calls.append(1)

        for attr_name, replacement in (
            ("ensure_started", stub_ensure_started),
            ("run_turn", fake_run_turn),
            ("close", fake_close),
        ):
            monkeypatch.setattr(CodexAppServerSession, attr_name, replacement)

        agent = _make_codex_agent()
        with patch.object(agent, "_spawn_background_review", return_value=None):
            result = agent.run_conversation("hi")

        # The session was closed exactly once and the reference was cleared.
        assert len(close_calls) == 1
        assert getattr(agent, "_codex_session", "MISSING") is None
        # The caller still receives the partial result carrying the error.
        assert result["partial"] is True
        assert result["error"] == "turn timed out after 600.0s"

    def test_normal_turn_keeps_session(self, fake_session):
        """A turn that does not ask for retirement keeps the session.

        Relies on the ``fake_session`` fixture (defined outside this class)
        producing turns with ``should_retire`` left at its default of False,
        so the session must remain attached for the next turn to reuse.
        """
        agent = _make_codex_agent()
        with patch.object(agent, "_spawn_background_review", return_value=None):
            agent.run_conversation("hi")

        # The lazily created session is still attached after the turn.
        session = getattr(agent, "_codex_session", None)
        assert session is not None

    def test_exception_path_also_drops_session(self, monkeypatch):
        """An exception raised by ``run_turn`` also drops the session.

        Even when ``run_turn`` raises instead of setting ``should_retire``,
        the session must be dropped -- a thrown exception is the strongest
        possible signal that the process is dead.
        """
        close_calls = []

        def stub_ensure_started(self):
            return "th1"

        def boom_run_turn(self, user_input, **kwargs):
            raise RuntimeError("codex segfaulted")

        def fake_close(self):
            # Record each close() so the test can count invocations.
            close_calls.append(1)

        for attr_name, replacement in (
            ("ensure_started", stub_ensure_started),
            ("run_turn", boom_run_turn),
            ("close", fake_close),
        ):
            monkeypatch.setattr(CodexAppServerSession, attr_name, replacement)

        agent = _make_codex_agent()
        with patch.object(agent, "_spawn_background_review", return_value=None):
            result = agent.run_conversation("hi")

        assert len(close_calls) == 1
        assert agent._codex_session is None
        assert result["completed"] is False
        assert "codex segfaulted" in result["error"]