mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(codex-runtime): retire wedged sessions + post-tool watchdog + OAuth refresh classify (#25769)
Mirrors openclaw beta.8's app-server resilience fixes so a stuck codex subprocess can't burn the full turn deadline and so users get a `codex login` pointer instead of raw RPC errors when their token expires. - TurnResult.should_retire signals the caller to drop+respawn codex. - Deadline-hit path and dead-subprocess detection set should_retire so the next turn doesn't ride a CPU-spinning or auth-broken process. - Post-tool watchdog (post_tool_quiet_timeout=90s): if a tool item completes and codex goes silent past the threshold without further output or turn/completed, fast-fail instead of waiting the full 600s. Resets on any non-tool activity so normal think-after-tool flows are not affected. - <turn_aborted> and <turn_aborted/> in agent text are treated as terminal — some codex builds tear down a turn that way without emitting turn/completed. - _classify_oauth_failure() inspects RPC error message + stderr tail for invalid_grant / token refresh / 401 / etc. and rewrites user-facing errors to 'run codex login'. Conservative: generic failures still surface verbatim. Fires at turn/start failure, turn/completed failure, and dead-subprocess paths. - thread/start cross-fill: tolerate thread.id, thread.sessionId, top-level sessionId/threadId so future codex schema drift doesn't KeyError us at handshake. - run_agent.py: when run_turn returns should_retire=True OR raises, close + null self._codex_session so the next turn respawns. Tests: +30 cases across session + integration suites. tests/agent/transports/test_codex_app_server_session.py 50/50 pass tests/run_agent/test_codex_app_server_integration.py 27/27 pass Broader codex scope (transports + cli runtime/migration) 376/376 pass
This commit is contained in:
parent
63991bbd97
commit
12f755c9eb
4 changed files with 711 additions and 7 deletions
|
|
@ -63,6 +63,73 @@ class TurnResult:
|
|||
error: Optional[str] = None # Set if turn ended in a non-recoverable error
|
||||
turn_id: Optional[str] = None
|
||||
thread_id: Optional[str] = None
|
||||
# Hint to the caller that the underlying codex subprocess is likely
|
||||
# wedged (turn-level timeout fired, post-tool watchdog tripped, or
|
||||
# token-refresh failure killed the child). The caller should retire
|
||||
# the session so the next turn respawns codex from scratch instead
|
||||
# of riding a CPU-spinning or auth-broken process. Mirrors openclaw
|
||||
# beta.8's "retire timed-out app-server clients" fix.
|
||||
should_retire: bool = False
|
||||
|
||||
|
||||
# Markers we accept as terminal even when codex never emits turn/completed.
|
||||
# Some codex versions stream `<turn_aborted>` as raw text in agentMessage
|
||||
# items when an interrupt or upstream error tears the turn down before the
|
||||
# normal completion path fires. Mirrors openclaw beta.8 fix.
|
||||
_TURN_ABORTED_MARKERS = ("<turn_aborted>", "<turn_aborted/>")
|
||||
|
||||
|
||||
# Substrings in codex stderr / JSON-RPC error messages that signal the
|
||||
# subprocess died because its OAuth credentials are no longer valid.
|
||||
# Kept conservative: we only redirect users to `codex login` when we're
|
||||
# reasonably sure that's the actual failure, otherwise we surface the
|
||||
# original error verbatim. Mirrors openclaw beta.8's auth-refresh
|
||||
# classification.
|
||||
_OAUTH_REFRESH_FAILURE_HINTS = (
|
||||
"invalid_grant",
|
||||
"invalid grant",
|
||||
"refresh token",
|
||||
"refresh_token",
|
||||
"token refresh",
|
||||
"token_refresh",
|
||||
"token has expired",
|
||||
"expired_token",
|
||||
"expired token",
|
||||
"not authenticated",
|
||||
"unauthenticated",
|
||||
"unauthorized",
|
||||
"401 unauthorized",
|
||||
"re-authenticate",
|
||||
"reauthenticate",
|
||||
"please log in",
|
||||
"please login",
|
||||
"auth profile",
|
||||
"no auth profile",
|
||||
"oauth",
|
||||
)
|
||||
|
||||
|
||||
def _classify_oauth_failure(*parts: str) -> Optional[str]:
|
||||
"""Return a user-friendly re-auth hint if any of the provided strings
|
||||
look like a codex OAuth/token-refresh failure; otherwise None.
|
||||
|
||||
Used for both `turn/start` JSON-RPC errors and post-mortem stderr
|
||||
inspection when the subprocess exits unexpectedly. Conservative on
|
||||
purpose — we only redirect users to `codex login` when the signal
|
||||
is strong, so unrelated runtime failures still surface verbatim.
|
||||
"""
|
||||
haystack = " ".join(p for p in parts if p).lower()
|
||||
if not haystack:
|
||||
return None
|
||||
for needle in _OAUTH_REFRESH_FAILURE_HINTS:
|
||||
if needle in haystack:
|
||||
return (
|
||||
"Codex authentication failed — your ChatGPT/Codex login "
|
||||
"looks expired or invalid. Run `codex login` to refresh, "
|
||||
"then retry. (Fall back to default runtime with "
|
||||
"`/codex-runtime auto` if the issue persists.)"
|
||||
)
|
||||
return None
|
||||
|
||||
|
||||
@dataclass
|
||||
|
|
@ -156,7 +223,26 @@ class CodexAppServerSession:
|
|||
# ~/.codex/config.toml the same way they would for any codex usage.
|
||||
params: dict[str, Any] = {"cwd": self._cwd}
|
||||
result = self._client.request("thread/start", params, timeout=15)
|
||||
self._thread_id = result["thread"]["id"]
|
||||
# Cross-fill thread.id/sessionId — different codex versions have
|
||||
# serialized this under either key. Mirrors openclaw beta.8's
|
||||
# tolerance fix so future codex drops/renames don't KeyError us
|
||||
# at handshake time.
|
||||
thread_obj = result.get("thread") or {}
|
||||
thread_id = (
|
||||
thread_obj.get("id")
|
||||
or thread_obj.get("sessionId")
|
||||
or result.get("sessionId")
|
||||
or result.get("threadId")
|
||||
)
|
||||
if not thread_id:
|
||||
raise CodexAppServerError(
|
||||
code=-32603,
|
||||
message=(
|
||||
"codex thread/start returned no thread id "
|
||||
f"(payload keys: {sorted(result.keys())})"
|
||||
),
|
||||
)
|
||||
self._thread_id = thread_id
|
||||
logger.info(
|
||||
"codex app-server thread started: id=%s profile=%s cwd=%s",
|
||||
self._thread_id[:8],
|
||||
|
|
@ -198,10 +284,18 @@ class CodexAppServerSession:
|
|||
*,
|
||||
turn_timeout: float = 600.0,
|
||||
notification_poll_timeout: float = 0.25,
|
||||
post_tool_quiet_timeout: float = 90.0,
|
||||
) -> TurnResult:
|
||||
"""Send a user message and block until turn/completed, while
|
||||
forwarding server-initiated approval requests and projecting items
|
||||
into Hermes' messages shape."""
|
||||
into Hermes' messages shape.
|
||||
|
||||
post_tool_quiet_timeout: if codex emits a tool completion and then
|
||||
goes quiet for this many seconds without emitting another item or
|
||||
`turn/completed`, fast-fail and mark the session for retirement.
|
||||
Mirrors openclaw beta.8's post-tool completion watchdog (#81697)
|
||||
so a wedged codex doesn't burn the full turn deadline.
|
||||
"""
|
||||
self.ensure_started()
|
||||
assert self._client is not None and self._thread_id is not None
|
||||
|
||||
|
|
@ -221,12 +315,36 @@ class CodexAppServerSession:
|
|||
timeout=10,
|
||||
)
|
||||
except CodexAppServerError as exc:
|
||||
result.error = f"turn/start failed: {exc}"
|
||||
# Classify auth/refresh failures so the user gets a clear
|
||||
# `codex login` pointer instead of a raw RPC error string.
|
||||
stderr_blob = "\n".join(self._client.stderr_tail(40))
|
||||
hint = _classify_oauth_failure(exc.message, stderr_blob)
|
||||
if hint is not None:
|
||||
result.error = hint
|
||||
# Subprocess is fine on a JSON-RPC level here, but the
|
||||
# token store is broken — retire so the next turn does a
|
||||
# clean handshake (and the user has a chance to re-auth
|
||||
# via `codex login` between turns).
|
||||
result.should_retire = True
|
||||
else:
|
||||
result.error = f"turn/start failed: {exc}"
|
||||
return result
|
||||
except TimeoutError as exc:
|
||||
# turn/start hanging is a strong signal the subprocess is wedged.
|
||||
stderr_blob = "\n".join(self._client.stderr_tail(40))
|
||||
hint = _classify_oauth_failure(stderr_blob)
|
||||
result.error = hint or f"turn/start timed out: {exc}"
|
||||
result.should_retire = True
|
||||
return result
|
||||
|
||||
result.turn_id = (ts.get("turn") or {}).get("id")
|
||||
deadline = time.time() + turn_timeout
|
||||
turn_complete = False
|
||||
# Post-tool watchdog state. last_tool_completion_at is set whenever
|
||||
# a tool-shaped item completes; if no further notification arrives
|
||||
# within post_tool_quiet_timeout and the turn hasn't completed, we
|
||||
# fast-fail and retire the session.
|
||||
last_tool_completion_at: Optional[float] = None
|
||||
|
||||
while time.time() < deadline and not turn_complete:
|
||||
if self._interrupt_event.is_set():
|
||||
|
|
@ -234,6 +352,38 @@ class CodexAppServerSession:
|
|||
result.interrupted = True
|
||||
break
|
||||
|
||||
# Detect a dead subprocess between iterations. If codex exited
|
||||
# (e.g. crashed, segfaulted, or its auth refresh thread killed
|
||||
# the process), we won't get any more notifications — bail out
|
||||
# rather than waiting for the full turn deadline.
|
||||
if not self._client.is_alive():
|
||||
stderr_blob = "\n".join(self._client.stderr_tail(60))
|
||||
hint = _classify_oauth_failure(stderr_blob)
|
||||
result.error = hint or (
|
||||
f"codex app-server subprocess exited unexpectedly: "
|
||||
f"{stderr_blob[-300:] if stderr_blob else '<no stderr>'}"
|
||||
)
|
||||
result.should_retire = True
|
||||
break
|
||||
|
||||
# Post-tool watchdog: if a tool completion was the most recent
|
||||
# signal and codex has been silent past the quiet timeout, give
|
||||
# up on this turn instead of waiting for the outer deadline.
|
||||
if (
|
||||
last_tool_completion_at is not None
|
||||
and (time.time() - last_tool_completion_at)
|
||||
> post_tool_quiet_timeout
|
||||
):
|
||||
self._issue_interrupt(result.turn_id)
|
||||
result.interrupted = True
|
||||
result.error = (
|
||||
f"codex went silent for "
|
||||
f"{post_tool_quiet_timeout:.0f}s after a tool result; "
|
||||
f"retiring app-server session."
|
||||
)
|
||||
result.should_retire = True
|
||||
break
|
||||
|
||||
# Drain any server-initiated requests (approvals) before
|
||||
# reading notifications, so the codex side isn't blocked.
|
||||
sreq = self._client.take_server_request(timeout=0)
|
||||
|
|
@ -252,9 +402,20 @@ class CodexAppServerSession:
|
|||
result.projected_messages.extend(proj.messages)
|
||||
if proj.is_tool_iteration:
|
||||
result.tool_iterations += 1
|
||||
last_tool_completion_at = time.time()
|
||||
if proj.final_text is not None:
|
||||
result.final_text = proj.final_text
|
||||
if _has_turn_aborted_marker(proj.final_text):
|
||||
turn_complete = True
|
||||
result.interrupted = True
|
||||
result.error = (
|
||||
result.error
|
||||
or "codex reported turn_aborted"
|
||||
)
|
||||
self._handle_server_request(sreq)
|
||||
# Activity counts as live signal — reset the post-tool
|
||||
# quiet timer so an approval round-trip doesn't trip it.
|
||||
last_tool_completion_at = None
|
||||
continue
|
||||
|
||||
note = self._client.take_notification(
|
||||
|
|
@ -282,10 +443,29 @@ class CodexAppServerSession:
|
|||
result.projected_messages.extend(projection.messages)
|
||||
if projection.is_tool_iteration:
|
||||
result.tool_iterations += 1
|
||||
# Arm/refresh the post-tool quiet watchdog whenever a
|
||||
# tool-shaped item completes.
|
||||
last_tool_completion_at = time.time()
|
||||
else:
|
||||
# Any non-tool projected activity (assistant message,
|
||||
# status update, etc.) means codex is still producing
|
||||
# output — clear the quiet timer so we don't fast-fail.
|
||||
if projection.messages or projection.final_text is not None:
|
||||
last_tool_completion_at = None
|
||||
if projection.final_text is not None:
|
||||
# Codex can emit multiple agentMessage items in one turn
|
||||
# (e.g. partial then final). Take the last one as canonical.
|
||||
result.final_text = projection.final_text
|
||||
# Some codex builds tear a turn down by emitting a
|
||||
# `<turn_aborted>` marker in the agent message text and
|
||||
# never sending turn/completed. Treat the marker itself
|
||||
# as terminal so we don't burn the full deadline.
|
||||
if _has_turn_aborted_marker(projection.final_text):
|
||||
turn_complete = True
|
||||
result.interrupted = True
|
||||
result.error = (
|
||||
result.error or "codex reported turn_aborted"
|
||||
)
|
||||
|
||||
if method == "turn/completed":
|
||||
turn_complete = True
|
||||
|
|
@ -297,16 +477,31 @@ class CodexAppServerSession:
|
|||
(note.get("params") or {}).get("turn") or {}
|
||||
).get("error")
|
||||
if err_obj:
|
||||
result.error = (
|
||||
f"turn ended status={turn_status}: "
|
||||
f"{err_obj.get('message') or err_obj}"
|
||||
err_msg = err_obj.get("message") or str(err_obj)
|
||||
# If the turn failed for an auth/refresh reason,
|
||||
# rewrite the error into a re-auth hint AND mark
|
||||
# the session for retirement.
|
||||
stderr_blob = "\n".join(
|
||||
self._client.stderr_tail(40)
|
||||
)
|
||||
hint = _classify_oauth_failure(err_msg, stderr_blob)
|
||||
if hint is not None:
|
||||
result.error = hint
|
||||
result.should_retire = True
|
||||
else:
|
||||
result.error = (
|
||||
f"turn ended status={turn_status}: {err_msg}"
|
||||
)
|
||||
|
||||
if not turn_complete and not result.interrupted:
|
||||
# Hit the deadline. Issue interrupt to stop wasted compute.
|
||||
# Hit the deadline. Issue interrupt to stop wasted compute, and
|
||||
# tell the caller to retire the session — a turn that never
|
||||
# finished is a strong sign codex is wedged in a way the next
|
||||
# turn shouldn't inherit.
|
||||
self._issue_interrupt(result.turn_id)
|
||||
result.interrupted = True
|
||||
result.error = result.error or f"turn timed out after {turn_timeout}s"
|
||||
result.should_retire = True
|
||||
|
||||
return result
|
||||
|
||||
|
|
@ -515,6 +710,24 @@ def _approval_choice_to_codex_decision(choice: str) -> str:
|
|||
return "decline"
|
||||
|
||||
|
||||
def _has_turn_aborted_marker(text: str) -> bool:
|
||||
"""Return True if `text` contains any of the raw markers codex uses
|
||||
to signal a turn was aborted without emitting `turn/completed`.
|
||||
|
||||
Codex emits `<turn_aborted>` (and sometimes `<turn_aborted/>`) as raw
|
||||
text inside agentMessage items when an interrupt or upstream error
|
||||
tears the turn down before the normal completion path fires. Mirrors
|
||||
openclaw beta.8's terminal-marker fix so we don't burn the full turn
|
||||
deadline waiting for a turn/completed that never comes.
|
||||
"""
|
||||
if not text:
|
||||
return False
|
||||
for marker in _TURN_ABORTED_MARKERS:
|
||||
if marker in text:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _get_hermes_version() -> str:
|
||||
"""Best-effort Hermes version string for codex's userAgent line."""
|
||||
try:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue