mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(codex-runtime): retire wedged sessions + post-tool watchdog + OAuth refresh classify (#25769)
Mirrors openclaw beta.8's app-server resilience fixes so that a stuck codex subprocess can't burn the full turn deadline, and so that users get a `codex login` pointer instead of raw RPC errors when their token expires.

- TurnResult.should_retire signals the caller to drop and respawn codex.
- The deadline-hit path and dead-subprocess detection set should_retire so the next turn doesn't ride a CPU-spinning or auth-broken process.
- Post-tool watchdog (post_tool_quiet_timeout=90s): if a tool item completes and codex goes silent past the threshold without further output or turn/completed, fast-fail instead of waiting the full 600s. The watchdog resets on any non-tool activity, so normal think-after-tool flows are not affected.
- <turn_aborted> and <turn_aborted/> in agent text are treated as terminal — some codex builds tear down a turn that way without emitting turn/completed.
- _classify_oauth_failure() inspects the RPC error message and the stderr tail for invalid_grant / token refresh / 401 / etc. and rewrites user-facing errors to 'run codex login'. It is conservative: generic failures still surface verbatim. It fires on the turn/start failure, turn/completed failure, and dead-subprocess paths.
- thread/start cross-fill: tolerate thread.id, thread.sessionId, and top-level sessionId/threadId so that future codex schema drift doesn't raise a KeyError at handshake.
- run_agent.py: when run_turn returns should_retire=True OR raises, close and null out self._codex_session so the next turn respawns.

Tests: +30 cases across the session and integration suites.
- tests/agent/transports/test_codex_app_server_session.py: 50/50 pass
- tests/run_agent/test_codex_app_server_integration.py: 27/27 pass
- Broader codex scope (transports + cli runtime/migration): 376/376 pass
This commit is contained in:
parent
63991bbd97
commit
12f755c9eb
4 changed files with 711 additions and 7 deletions
23
run_agent.py
23
run_agent.py
|
|
@ -15721,6 +15721,13 @@ class AIAgent:
|
|||
turn = self._codex_session.run_turn(user_input=user_message)
|
||||
except Exception as exc:
|
||||
logger.exception("codex app-server turn failed")
|
||||
# Crash → unconditionally drop the session so the next turn
|
||||
# respawns from scratch instead of reusing a dead client.
|
||||
try:
|
||||
self._codex_session.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._codex_session = None
|
||||
return {
|
||||
"final_response": (
|
||||
f"Codex app-server turn failed: {exc}. "
|
||||
|
|
@ -15733,6 +15740,22 @@ class AIAgent:
|
|||
"error": str(exc),
|
||||
}
|
||||
|
||||
# If the turn signalled the underlying client is wedged (deadline
|
||||
# blown, post-tool watchdog tripped, OAuth refresh died, subprocess
|
||||
# exited), retire the session so the next turn respawns codex
|
||||
# rather than riding the broken process. Mirrors openclaw beta.8's
|
||||
# "retire timed-out app-server clients" fix.
|
||||
if getattr(turn, "should_retire", False):
|
||||
logger.warning(
|
||||
"codex app-server session retired (turn error: %s)",
|
||||
turn.error,
|
||||
)
|
||||
try:
|
||||
self._codex_session.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._codex_session = None
|
||||
|
||||
# Splice projected messages into the conversation. The projector emits
|
||||
# standard {role, content, tool_calls, tool_call_id} entries, which
|
||||
# is exactly what curator.py / sessions DB expect.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue