mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-24 05:41:40 +00:00
fix(codex-app-server): attach redacted stderr tail to generic failures (#25929)
When codex app-server fails outside the OAuth-classified path (non-auth turn/start errors, plain TimeoutErrors, generic turn-ended status, subprocess silently exits, hard deadline timeout), the user got a bare 'Internal error' / 'turn/start failed: ...' with no context. Diagnosing config/provider/auth-bridge issues forced a re-run with verbose codex flags. Add a _format_error_with_stderr helper that appends the last few stderr lines via agent.redact.redact_sensitive_text(force=True), and use it at every catch-all error site: - ensure_started() failures (codex init / thread/start) now return a TurnResult.error with should_retire=True instead of bubbling - non-OAuth turn/start CodexAppServerError / TimeoutError - subprocess-died branch (previously dumped raw stderr_blob[-300:] with no redaction — a leak risk) - turn ended with non-completed status - hard turn-timeout deadline OAuth-classified failures and the post-tool quiet watchdog already produce clean hints and stay unchanged. The redactor catches sk-*, gh*_*, Authorization: Bearer, query-string tokens, JWTs, private keys, etc., so provider error payloads can't leak into chat output or trajectories. Inspired by openclaw#80718, adapted for our app-server transport.
This commit is contained in:
parent
a28add199d
commit
fe83c4001b
2 changed files with 163 additions and 11 deletions
|
|
@ -31,6 +31,7 @@ import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Any, Callable, Optional
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
|
from agent.redact import redact_sensitive_text
|
||||||
from agent.transports.codex_app_server import (
|
from agent.transports.codex_app_server import (
|
||||||
CodexAppServerClient,
|
CodexAppServerClient,
|
||||||
CodexAppServerError,
|
CodexAppServerError,
|
||||||
|
|
@ -40,6 +41,13 @@ from agent.transports.codex_event_projector import CodexEventProjector
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# How many tailing stderr lines from the codex subprocess to attach to a
|
||||||
|
# user-facing error when we don't have a more specific classification (OAuth,
|
||||||
|
# wedge watchdog, etc.). Small enough to keep error messages legible, large
|
||||||
|
# enough to surface a config/provider/auth diagnostic.
|
||||||
|
_STDERR_TAIL_LINES = 12
|
||||||
|
|
||||||
|
|
||||||
# Permission profile mapping mirrors the docstring in PR proposal:
|
# Permission profile mapping mirrors the docstring in PR proposal:
|
||||||
# Hermes' tools.terminal.security_mode → Codex's permissions profile id.
|
# Hermes' tools.terminal.security_mode → Codex's permissions profile id.
|
||||||
# Defaults if config is missing → workspace-write (matches Codex's own default).
|
# Defaults if config is missing → workspace-write (matches Codex's own default).
|
||||||
|
|
@ -276,6 +284,45 @@ class CodexAppServerSession:
|
||||||
and unwind. Called by AIAgent's _interrupt_requested path."""
|
and unwind. Called by AIAgent's _interrupt_requested path."""
|
||||||
self._interrupt_event.set()
|
self._interrupt_event.set()
|
||||||
|
|
||||||
|
# ---------- diagnostics ----------
|
||||||
|
|
||||||
|
def _format_error_with_stderr(
|
||||||
|
self,
|
||||||
|
prefix: str,
|
||||||
|
exc: Any = "",
|
||||||
|
*,
|
||||||
|
tail_lines: int = _STDERR_TAIL_LINES,
|
||||||
|
) -> str:
|
||||||
|
"""Build a user-facing error string for codex failures.
|
||||||
|
|
||||||
|
Appends the last few lines of codex's stderr buffer when available,
|
||||||
|
passed through agent.redact with force=True so secrets in provider
|
||||||
|
error responses (auth headers, query-string tokens, sk-* keys) never
|
||||||
|
leak into chat output or trajectories. The codex CLI's own error
|
||||||
|
text ('Internal error', 'turn/start failed: ...') is otherwise
|
||||||
|
opaque and forces users to re-run with verbose flags to diagnose
|
||||||
|
config / provider / auth-bridge problems.
|
||||||
|
|
||||||
|
Use this for the generic / catch-all branches. Specific
|
||||||
|
classifications (OAuth via _classify_oauth_failure, post-tool wedge
|
||||||
|
watchdog) already produce a clean hint and should be used instead.
|
||||||
|
"""
|
||||||
|
exc_str = str(exc) if exc != "" and exc is not None else ""
|
||||||
|
base = f"{prefix}: {exc_str}" if exc_str else prefix
|
||||||
|
if self._client is None:
|
||||||
|
return base
|
||||||
|
try:
|
||||||
|
tail = self._client.stderr_tail(tail_lines)
|
||||||
|
except Exception: # pragma: no cover - diagnostic best-effort
|
||||||
|
return base
|
||||||
|
if not tail:
|
||||||
|
return base
|
||||||
|
joined = "\n".join(line.rstrip() for line in tail if line)
|
||||||
|
if not joined.strip():
|
||||||
|
return base
|
||||||
|
redacted = redact_sensitive_text(joined, force=True)
|
||||||
|
return f"{base}\ncodex stderr (last {len(tail)} lines):\n{redacted}"
|
||||||
|
|
||||||
# ---------- per-turn ----------
|
# ---------- per-turn ----------
|
||||||
|
|
||||||
def run_turn(
|
def run_turn(
|
||||||
|
|
@ -296,12 +343,27 @@ class CodexAppServerSession:
|
||||||
Mirrors openclaw beta.8's post-tool completion watchdog (#81697)
|
Mirrors openclaw beta.8's post-tool completion watchdog (#81697)
|
||||||
so a wedged codex doesn't burn the full turn deadline.
|
so a wedged codex doesn't burn the full turn deadline.
|
||||||
"""
|
"""
|
||||||
self.ensure_started()
|
# Pre-create the result so startup failures (codex subprocess can't
|
||||||
|
# spawn, initialize handshake rejects, thread/start blows up) surface
|
||||||
|
# the same way per-turn failures do — with a TurnResult.error string
|
||||||
|
# the caller can render — instead of bubbling raw codex exceptions
|
||||||
|
# up to AIAgent.run_conversation.
|
||||||
|
result = TurnResult()
|
||||||
|
try:
|
||||||
|
self.ensure_started()
|
||||||
|
except (CodexAppServerError, TimeoutError) as exc:
|
||||||
|
result.error = self._format_error_with_stderr(
|
||||||
|
"codex app-server startup failed", exc
|
||||||
|
)
|
||||||
|
# Subprocess almost certainly unhealthy — retire so the next
|
||||||
|
# turn re-spawns cleanly.
|
||||||
|
result.should_retire = True
|
||||||
|
return result
|
||||||
assert self._client is not None and self._thread_id is not None
|
assert self._client is not None and self._thread_id is not None
|
||||||
|
result.thread_id = self._thread_id
|
||||||
|
|
||||||
self._interrupt_event.clear()
|
self._interrupt_event.clear()
|
||||||
projector = CodexEventProjector()
|
projector = CodexEventProjector()
|
||||||
result = TurnResult(thread_id=self._thread_id)
|
|
||||||
|
|
||||||
# Send turn/start with the user input. Text-only for now (codex
|
# Send turn/start with the user input. Text-only for now (codex
|
||||||
# supports rich content but Hermes' text path is the common case).
|
# supports rich content but Hermes' text path is the common case).
|
||||||
|
|
@ -327,13 +389,17 @@ class CodexAppServerSession:
|
||||||
# via `codex login` between turns).
|
# via `codex login` between turns).
|
||||||
result.should_retire = True
|
result.should_retire = True
|
||||||
else:
|
else:
|
||||||
result.error = f"turn/start failed: {exc}"
|
result.error = self._format_error_with_stderr(
|
||||||
|
"turn/start failed", exc
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
except TimeoutError as exc:
|
except TimeoutError as exc:
|
||||||
# turn/start hanging is a strong signal the subprocess is wedged.
|
# turn/start hanging is a strong signal the subprocess is wedged.
|
||||||
stderr_blob = "\n".join(self._client.stderr_tail(40))
|
stderr_blob = "\n".join(self._client.stderr_tail(40))
|
||||||
hint = _classify_oauth_failure(stderr_blob)
|
hint = _classify_oauth_failure(stderr_blob)
|
||||||
result.error = hint or f"turn/start timed out: {exc}"
|
result.error = hint or self._format_error_with_stderr(
|
||||||
|
"turn/start timed out", exc
|
||||||
|
)
|
||||||
result.should_retire = True
|
result.should_retire = True
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
@ -359,10 +425,13 @@ class CodexAppServerSession:
|
||||||
if not self._client.is_alive():
|
if not self._client.is_alive():
|
||||||
stderr_blob = "\n".join(self._client.stderr_tail(60))
|
stderr_blob = "\n".join(self._client.stderr_tail(60))
|
||||||
hint = _classify_oauth_failure(stderr_blob)
|
hint = _classify_oauth_failure(stderr_blob)
|
||||||
result.error = hint or (
|
if hint is not None:
|
||||||
f"codex app-server subprocess exited unexpectedly: "
|
result.error = hint
|
||||||
f"{stderr_blob[-300:] if stderr_blob else '<no stderr>'}"
|
else:
|
||||||
)
|
result.error = self._format_error_with_stderr(
|
||||||
|
"codex app-server subprocess exited unexpectedly",
|
||||||
|
tail_lines=20,
|
||||||
|
)
|
||||||
result.should_retire = True
|
result.should_retire = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
@ -489,8 +558,8 @@ class CodexAppServerSession:
|
||||||
result.error = hint
|
result.error = hint
|
||||||
result.should_retire = True
|
result.should_retire = True
|
||||||
else:
|
else:
|
||||||
result.error = (
|
result.error = self._format_error_with_stderr(
|
||||||
f"turn ended status={turn_status}: {err_msg}"
|
f"turn ended status={turn_status}", err_msg
|
||||||
)
|
)
|
||||||
|
|
||||||
if not turn_complete and not result.interrupted:
|
if not turn_complete and not result.interrupted:
|
||||||
|
|
@ -500,7 +569,10 @@ class CodexAppServerSession:
|
||||||
# turn shouldn't inherit.
|
# turn shouldn't inherit.
|
||||||
self._issue_interrupt(result.turn_id)
|
self._issue_interrupt(result.turn_id)
|
||||||
result.interrupted = True
|
result.interrupted = True
|
||||||
result.error = result.error or f"turn timed out after {turn_timeout}s"
|
if not result.error:
|
||||||
|
result.error = self._format_error_with_stderr(
|
||||||
|
f"turn timed out after {turn_timeout}s"
|
||||||
|
)
|
||||||
result.should_retire = True
|
result.should_retire = True
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
|
||||||
|
|
@ -231,6 +231,86 @@ class TestRunTurn:
|
||||||
assert "bad input" in r.error
|
assert "bad input" in r.error
|
||||||
assert r.final_text == ""
|
assert r.final_text == ""
|
||||||
|
|
||||||
|
def test_turn_start_failure_attaches_redacted_stderr_tail(self):
|
||||||
|
"""When codex stderr has content (non-OAuth), the tail gets attached
|
||||||
|
to the user-facing error so config/provider problems are debuggable
|
||||||
|
instead of just 'Internal error'. Secrets in stderr are redacted
|
||||||
|
via agent.redact(force=True)."""
|
||||||
|
client = FakeClient()
|
||||||
|
client.set_stderr_tail([
|
||||||
|
"ERROR: provider auth failed",
|
||||||
|
"Authorization: Bearer sk-live-deadbeefdeadbeef",
|
||||||
|
"url=https://api.example.com/v1?token=querysecret12345",
|
||||||
|
])
|
||||||
|
from agent.transports.codex_app_server import CodexAppServerError
|
||||||
|
|
||||||
|
def boom(method, params):
|
||||||
|
if method == "turn/start":
|
||||||
|
raise CodexAppServerError(code=-32603, message="Internal error")
|
||||||
|
return {"thread": {"id": "t"}, "activePermissionProfile": {"id": "x"}}
|
||||||
|
|
||||||
|
client._request_handler = boom
|
||||||
|
s = make_session(client)
|
||||||
|
r = s.run_turn("hi", turn_timeout=2.0)
|
||||||
|
assert r.error is not None
|
||||||
|
assert "turn/start failed" in r.error
|
||||||
|
assert "Internal error" in r.error
|
||||||
|
# Stderr tail attached
|
||||||
|
assert "codex stderr" in r.error
|
||||||
|
assert "provider auth failed" in r.error
|
||||||
|
# Secrets redacted
|
||||||
|
assert "sk-live-deadbeefdeadbeef" not in r.error
|
||||||
|
assert "querysecret12345" not in r.error
|
||||||
|
# Non-OAuth → should NOT retire (subprocess JSON-RPC is still healthy).
|
||||||
|
assert r.should_retire is False
|
||||||
|
|
||||||
|
def test_turn_start_timeout_attaches_redacted_stderr_tail(self):
|
||||||
|
"""A non-OAuth TimeoutError on turn/start surfaces with codex stderr
|
||||||
|
context attached and marks the session for retirement."""
|
||||||
|
client = FakeClient()
|
||||||
|
client.set_stderr_tail([
|
||||||
|
"WARN: provider request stalled",
|
||||||
|
"Authorization: Bearer sk-stalled-secret-abc123",
|
||||||
|
])
|
||||||
|
|
||||||
|
def stall(method, params):
|
||||||
|
if method == "turn/start":
|
||||||
|
raise TimeoutError("codex method 'turn/start' timed out after 10s")
|
||||||
|
return {"thread": {"id": "t"}, "activePermissionProfile": {"id": "x"}}
|
||||||
|
|
||||||
|
client._request_handler = stall
|
||||||
|
s = make_session(client)
|
||||||
|
r = s.run_turn("hi", turn_timeout=2.0)
|
||||||
|
assert r.error is not None
|
||||||
|
assert "turn/start timed out" in r.error
|
||||||
|
assert "provider request stalled" in r.error
|
||||||
|
assert "sk-stalled-secret-abc123" not in r.error
|
||||||
|
assert r.should_retire is True
|
||||||
|
|
||||||
|
def test_startup_failure_returns_error_with_stderr(self):
|
||||||
|
"""Codex thread/start failures during ensure_started() used to bubble
|
||||||
|
up as uncaught exceptions. Now they return a TurnResult.error so
|
||||||
|
AIAgent surfaces a clean diagnostic instead of crashing the turn."""
|
||||||
|
client = FakeClient()
|
||||||
|
client.set_stderr_tail([
|
||||||
|
"FATAL: model_provider 'azure_foundry' not configured",
|
||||||
|
])
|
||||||
|
from agent.transports.codex_app_server import CodexAppServerError
|
||||||
|
|
||||||
|
def boom(method, params):
|
||||||
|
if method == "thread/start":
|
||||||
|
raise CodexAppServerError(code=-32603, message="Internal error")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
client._request_handler = boom
|
||||||
|
s = make_session(client)
|
||||||
|
r = s.run_turn("hi", turn_timeout=2.0)
|
||||||
|
assert r.error is not None
|
||||||
|
assert "startup failed" in r.error
|
||||||
|
assert "model_provider 'azure_foundry' not configured" in r.error
|
||||||
|
assert r.should_retire is True
|
||||||
|
assert r.final_text == ""
|
||||||
|
|
||||||
def test_interrupt_during_turn_issues_turn_interrupt(self):
|
def test_interrupt_during_turn_issues_turn_interrupt(self):
|
||||||
client = FakeClient()
|
client = FakeClient()
|
||||||
# Don't queue turn/completed — the loop has to interrupt out
|
# Don't queue turn/completed — the loop has to interrupt out
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue