mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(codex-runtime): retire wedged sessions + post-tool watchdog + OAuth refresh classify (#25769)
Mirrors openclaw beta.8's app-server resilience fixes so a stuck codex subprocess can't burn the full turn deadline and so users get a `codex login` pointer instead of raw RPC errors when their token expires.

- TurnResult.should_retire signals the caller to drop+respawn codex.
- Deadline-hit path and dead-subprocess detection set should_retire so the next turn doesn't ride a CPU-spinning or auth-broken process.
- Post-tool watchdog (post_tool_quiet_timeout=90s): if a tool item completes and codex goes silent past the threshold without further output or turn/completed, fast-fail instead of waiting the full 600s. Resets on any non-tool activity so normal think-after-tool flows are not affected.
- <turn_aborted> and <turn_aborted/> in agent text are treated as terminal — some codex builds tear down a turn that way without emitting turn/completed.
- _classify_oauth_failure() inspects RPC error message + stderr tail for invalid_grant / token refresh / 401 / etc. and rewrites user-facing errors to 'run codex login'. Conservative: generic failures still surface verbatim. Fires at turn/start failure, turn/completed failure, and dead-subprocess paths.
- thread/start cross-fill: tolerate thread.id, thread.sessionId, top-level sessionId/threadId so future codex schema drift doesn't KeyError us at handshake.
- run_agent.py: when run_turn returns should_retire=True OR raises, close + null self._codex_session so the next turn respawns.

Tests: +30 cases across session + integration suites.
- tests/agent/transports/test_codex_app_server_session.py: 50/50 pass
- tests/run_agent/test_codex_app_server_integration.py: 27/27 pass
- Broader codex scope (transports + cli runtime/migration): 376/376 pass
This commit is contained in:
parent
63991bbd97
commit
12f755c9eb
4 changed files with 711 additions and 7 deletions
|
|
@ -342,3 +342,77 @@ class TestErrorHandling:
|
|||
assert result["completed"] is False
|
||||
assert result["partial"] is True
|
||||
assert result["error"] == "user interrupted"
|
||||
|
||||
|
||||
class TestSessionRetirementOnRunAgent:
    """Session-retirement behavior as observed from the run_agent.py side.

    When ``run_turn`` hands back ``should_retire=True`` (or raises), the
    agent is expected to close the codex session and reset
    ``_codex_session`` to ``None`` so the following turn respawns it.
    A normal turn must leave the session attached.
    """

    def test_should_retire_drops_session(self, monkeypatch):
        """A turn flagged ``should_retire`` closes and clears the session."""
        # One entry is appended per close() invocation.
        close_calls = []

        def stub_ensure_started(self):
            return "th1"

        def stub_run_turn(self, user_input, **kwargs):
            # Simulates a turn that hit the deadline and asks to be retired.
            return TurnResult(
                final_text="",
                projected_messages=[],
                tool_iterations=0,
                interrupted=True,
                error="turn timed out after 600.0s",
                turn_id="tu1",
                thread_id="th1",
                should_retire=True,
            )

        def stub_close(self):
            close_calls.append(1)

        monkeypatch.setattr(
            CodexAppServerSession, "ensure_started", stub_ensure_started
        )
        monkeypatch.setattr(CodexAppServerSession, "run_turn", stub_run_turn)
        monkeypatch.setattr(CodexAppServerSession, "close", stub_close)

        agent = _make_codex_agent()
        with patch.object(agent, "_spawn_background_review", return_value=None):
            outcome = agent.run_conversation("hi")

        # Exactly one close, and the attribute exists but holds None.
        assert len(close_calls) == 1
        assert getattr(agent, "_codex_session", "MISSING") is None
        # The caller still receives the partial result carrying the error.
        assert outcome["partial"] is True
        assert outcome["error"] == "turn timed out after 600.0s"

    def test_normal_turn_keeps_session(self, fake_session):
        """A turn that does not request retirement keeps the session.

        Relies on the ``fake_session`` fixture, which (per the original
        test's note) yields results with the default
        ``should_retire=False``; the session must remain attached so the
        next turn can reuse it.
        """
        agent = _make_codex_agent()
        with patch.object(agent, "_spawn_background_review", return_value=None):
            agent.run_conversation("hi")

        # The lazily created session is still present on the agent.
        assert getattr(agent, "_codex_session", None) is not None

    def test_exception_path_also_drops_session(self, monkeypatch):
        """An exception escaping ``run_turn`` also retires the session.

        A raised error, not only an explicit ``should_retire`` flag, must
        lead to the session being closed and cleared.
        """
        # One entry is appended per close() invocation.
        close_calls = []

        def stub_ensure_started(self):
            return "th1"

        def exploding_run_turn(self, user_input, **kwargs):
            raise RuntimeError("codex segfaulted")

        def stub_close(self):
            close_calls.append(1)

        monkeypatch.setattr(
            CodexAppServerSession, "ensure_started", stub_ensure_started
        )
        monkeypatch.setattr(
            CodexAppServerSession, "run_turn", exploding_run_turn
        )
        monkeypatch.setattr(CodexAppServerSession, "close", stub_close)

        agent = _make_codex_agent()
        with patch.object(agent, "_spawn_background_review", return_value=None):
            outcome = agent.run_conversation("hi")

        assert len(close_calls) == 1
        assert agent._codex_session is None
        # The failure is reported through the result, not re-raised.
        assert outcome["completed"] is False
        assert "codex segfaulted" in outcome["error"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue