fix(codex-app-server): attach redacted stderr tail to generic failures (#25929)

When codex app-server fails outside the OAuth-classified path (non-auth turn/start errors, plain TimeoutErrors, generic turn-ended status, subprocess silently exits, hard deadline timeout), the user got a bare 'Internal error' / 'turn/start failed: ...' with no context. Diagnosing config/provider/auth-bridge issues forced a re-run with verbose codex flags. Add a _format_error_with_stderr helper that appends the last few stderr lines via agent.redact.redact_sensitive_text(force=True), and use it at every catch-all error site: - ensure_started() failures (codex init / thread/start) now return a TurnResult.error with should_retire=True instead of bubbling - non-OAuth turn/start CodexAppServerError / TimeoutError - subprocess-died branch (previously dumped raw stderr_blob[-300:] with no redaction — a leak risk) - turn ended with non-completed status - hard turn-timeout deadline OAuth-classified failures and the post-tool quiet watchdog already produce clean hints and stay unchanged. The redactor catches sk-*, gh*_*, Authorization: Bearer, query-string tokens, JWTs, private keys, etc., so provider error payloads can't leak into chat output or trajectories. Inspired by openclaw#80718, adapted for our app-server transport.
2026-07-02 12:13:05 +00:00 · 2026-05-14 14:55:23 -07:00 · 2026-05-14 14:55:23 -07:00 · fe83c4001b
commit fe83c4001b
parent a28add199d
2 changed files with 163 additions and 11 deletions
--- a/agent/transports/codex_app_server_session.py
+++ b/agent/transports/codex_app_server_session.py
@ -31,6 +31,7 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional

+from agent.redact import redact_sensitive_text
 from agent.transports.codex_app_server import (
    CodexAppServerClient,
    CodexAppServerError,
@ -40,6 +41,13 @@ from agent.transports.codex_event_projector import CodexEventProjector
 logger = logging.getLogger(__name__)


+# How many tailing stderr lines from the codex subprocess to attach to a
+# user-facing error when we don't have a more specific classification (OAuth,
+# wedge watchdog, etc.). Small enough to keep error messages legible, large
+# enough to surface a config/provider/auth diagnostic.
+_STDERR_TAIL_LINES = 12
+
+
 # Permission profile mapping mirrors the docstring in PR proposal:
 # Hermes' tools.terminal.security_mode → Codex's permissions profile id.
 # Defaults if config is missing → workspace-write (matches Codex's own default).
@ -276,6 +284,45 @@ class CodexAppServerSession:
        and unwind. Called by AIAgent's _interrupt_requested path."""
        self._interrupt_event.set()

+    # ---------- diagnostics ----------
+
+    def _format_error_with_stderr(
+        self,
+        prefix: str,
+        exc: Any = "",
+        *,
+        tail_lines: int = _STDERR_TAIL_LINES,
+    ) -> str:
+        """Build a user-facing error string for codex failures.
+
+        Appends the last few lines of codex's stderr buffer when available,
+        passed through agent.redact with force=True so secrets in provider
+        error responses (auth headers, query-string tokens, sk-* keys) never
+        leak into chat output or trajectories. The codex CLI's own error
+        text ('Internal error', 'turn/start failed: ...') is otherwise
+        opaque and forces users to re-run with verbose flags to diagnose
+        config / provider / auth-bridge problems.
+
+        Use this for the generic / catch-all branches. Specific
+        classifications (OAuth via _classify_oauth_failure, post-tool wedge
+        watchdog) already produce a clean hint and should be used instead.
+        """
+        exc_str = str(exc) if exc != "" and exc is not None else ""
+        base = f"{prefix}: {exc_str}" if exc_str else prefix
+        if self._client is None:
+            return base
+        try:
+            tail = self._client.stderr_tail(tail_lines)
+        except Exception:  # pragma: no cover - diagnostic best-effort
+            return base
+        if not tail:
+            return base
+        joined = "\n".join(line.rstrip() for line in tail if line)
+        if not joined.strip():
+            return base
+        redacted = redact_sensitive_text(joined, force=True)
+        return f"{base}\ncodex stderr (last {len(tail)} lines):\n{redacted}"
+
    # ---------- per-turn ----------

    def run_turn(
@ -296,12 +343,27 @@ class CodexAppServerSession:
        Mirrors openclaw beta.8's post-tool completion watchdog (#81697)
        so a wedged codex doesn't burn the full turn deadline.
        """
-        self.ensure_started()
+        # Pre-create the result so startup failures (codex subprocess can't
+        # spawn, initialize handshake rejects, thread/start blows up) surface
+        # the same way per-turn failures do — with a TurnResult.error string
+        # the caller can render — instead of bubbling raw codex exceptions
+        # up to AIAgent.run_conversation.
+        result = TurnResult()
+        try:
+            self.ensure_started()
+        except (CodexAppServerError, TimeoutError) as exc:
+            result.error = self._format_error_with_stderr(
+                "codex app-server startup failed", exc
+            )
+            # Subprocess almost certainly unhealthy — retire so the next
+            # turn re-spawns cleanly.
+            result.should_retire = True
+            return result
        assert self._client is not None and self._thread_id is not None
+        result.thread_id = self._thread_id

        self._interrupt_event.clear()
        projector = CodexEventProjector()
-        result = TurnResult(thread_id=self._thread_id)

        # Send turn/start with the user input. Text-only for now (codex
        # supports rich content but Hermes' text path is the common case).
@ -327,13 +389,17 @@ class CodexAppServerSession:
                # via `codex login` between turns).
                result.should_retire = True
            else:
-                result.error = f"turn/start failed: {exc}"
+                result.error = self._format_error_with_stderr(
+                    "turn/start failed", exc
+                )
            return result
        except TimeoutError as exc:
            # turn/start hanging is a strong signal the subprocess is wedged.
            stderr_blob = "\n".join(self._client.stderr_tail(40))
            hint = _classify_oauth_failure(stderr_blob)
-            result.error = hint or f"turn/start timed out: {exc}"
+            result.error = hint or self._format_error_with_stderr(
+                "turn/start timed out", exc
+            )
            result.should_retire = True
            return result

@ -359,10 +425,13 @@ class CodexAppServerSession:
            if not self._client.is_alive():
                stderr_blob = "\n".join(self._client.stderr_tail(60))
                hint = _classify_oauth_failure(stderr_blob)
-                result.error = hint or (
-                    f"codex app-server subprocess exited unexpectedly: "
-                    f"{stderr_blob[-300:] if stderr_blob else '<no stderr>'}"
-                )
+                if hint is not None:
+                    result.error = hint
+                else:
+                    result.error = self._format_error_with_stderr(
+                        "codex app-server subprocess exited unexpectedly",
+                        tail_lines=20,
+                    )
                result.should_retire = True
                break

@ -489,8 +558,8 @@ class CodexAppServerSession:
                            result.error = hint
                            result.should_retire = True
                        else:
-                            result.error = (
-                                f"turn ended status={turn_status}: {err_msg}"
+                            result.error = self._format_error_with_stderr(
+                                f"turn ended status={turn_status}", err_msg
                            )

        if not turn_complete and not result.interrupted:
@ -500,7 +569,10 @@ class CodexAppServerSession:
            # turn shouldn't inherit.
            self._issue_interrupt(result.turn_id)
            result.interrupted = True
-            result.error = result.error or f"turn timed out after {turn_timeout}s"
+            if not result.error:
+                result.error = self._format_error_with_stderr(
+                    f"turn timed out after {turn_timeout}s"
+                )
            result.should_retire = True

        return result
--- a/tests/agent/transports/test_codex_app_server_session.py
+++ b/tests/agent/transports/test_codex_app_server_session.py
@ -231,6 +231,86 @@ class TestRunTurn:
        assert "bad input" in r.error
        assert r.final_text == ""

+    def test_turn_start_failure_attaches_redacted_stderr_tail(self):
+        """When codex stderr has content (non-OAuth), the tail gets attached
+        to the user-facing error so config/provider problems are debuggable
+        instead of just 'Internal error'. Secrets in stderr are redacted
+        via agent.redact(force=True)."""
+        client = FakeClient()
+        client.set_stderr_tail([
+            "ERROR: provider auth failed",
+            "Authorization: Bearer sk-live-deadbeefdeadbeef",
+            "url=https://api.example.com/v1?token=querysecret12345",
+        ])
+        from agent.transports.codex_app_server import CodexAppServerError
+
+        def boom(method, params):
+            if method == "turn/start":
+                raise CodexAppServerError(code=-32603, message="Internal error")
+            return {"thread": {"id": "t"}, "activePermissionProfile": {"id": "x"}}
+
+        client._request_handler = boom
+        s = make_session(client)
+        r = s.run_turn("hi", turn_timeout=2.0)
+        assert r.error is not None
+        assert "turn/start failed" in r.error
+        assert "Internal error" in r.error
+        # Stderr tail attached
+        assert "codex stderr" in r.error
+        assert "provider auth failed" in r.error
+        # Secrets redacted
+        assert "sk-live-deadbeefdeadbeef" not in r.error
+        assert "querysecret12345" not in r.error
+        # Non-OAuth → should NOT retire (subprocess JSON-RPC is still healthy).
+        assert r.should_retire is False
+
+    def test_turn_start_timeout_attaches_redacted_stderr_tail(self):
+        """A non-OAuth TimeoutError on turn/start surfaces with codex stderr
+        context attached and marks the session for retirement."""
+        client = FakeClient()
+        client.set_stderr_tail([
+            "WARN: provider request stalled",
+            "Authorization: Bearer sk-stalled-secret-abc123",
+        ])
+
+        def stall(method, params):
+            if method == "turn/start":
+                raise TimeoutError("codex method 'turn/start' timed out after 10s")
+            return {"thread": {"id": "t"}, "activePermissionProfile": {"id": "x"}}
+
+        client._request_handler = stall
+        s = make_session(client)
+        r = s.run_turn("hi", turn_timeout=2.0)
+        assert r.error is not None
+        assert "turn/start timed out" in r.error
+        assert "provider request stalled" in r.error
+        assert "sk-stalled-secret-abc123" not in r.error
+        assert r.should_retire is True
+
+    def test_startup_failure_returns_error_with_stderr(self):
+        """Codex thread/start failures during ensure_started() used to bubble
+        up as uncaught exceptions. Now they return a TurnResult.error so
+        AIAgent surfaces a clean diagnostic instead of crashing the turn."""
+        client = FakeClient()
+        client.set_stderr_tail([
+            "FATAL: model_provider 'azure_foundry' not configured",
+        ])
+        from agent.transports.codex_app_server import CodexAppServerError
+
+        def boom(method, params):
+            if method == "thread/start":
+                raise CodexAppServerError(code=-32603, message="Internal error")
+            return {}
+
+        client._request_handler = boom
+        s = make_session(client)
+        r = s.run_turn("hi", turn_timeout=2.0)
+        assert r.error is not None
+        assert "startup failed" in r.error
+        assert "model_provider 'azure_foundry' not configured" in r.error
+        assert r.should_retire is True
+        assert r.final_text == ""
+
    def test_interrupt_during_turn_issues_turn_interrupt(self):
        client = FakeClient()
        # Don't queue turn/completed — the loop has to interrupt out