feat(stream-retry): add upstream + timing diagnostics to drop log (#23005)

The previous PR (#22993) gave us a structured WARNING per stream drop but the only diagnostic was 'error_type=APIError error=Network connection lost.' — same nothing the user started with. To actually diagnose why subagents drop streams disproportionately we need to know WHERE the drop happened. Adds three breadcrumbs to the agent.log WARNING: 1. Inner exception chain. openai SDK wraps httpx errors as APIConnectionError / APIError so the catch site only sees the wrapper. _flatten_exception_chain walks __cause__/__context__ up to 4 levels deep and renders 'Outer(msg) <- Inner(msg)' so we can tell ConnectError vs RemoteProtocolError vs ReadError vs ProxyError without enabling verbose mode. 2. Upstream HTTP headers. Snapshots cf-ray, x-openrouter-provider, x-openrouter-model, x-openrouter-id, x-request-id, server, via, etc. from stream.response immediately after open (so they survive even when the stream dies before the first chunk). These answer 'is one CF edge / one downstream provider responsible, or random?' 3. Per-attempt counters. bytes streamed, chunk count, elapsed time on the dying attempt, and time-to-first-byte. Distinguishes 'couldn't connect at all' (0s, 0 bytes) from 'died after 30s mid-stream' (very different root causes — first is auth/routing, second is upstream idle-kill or proxy timeout). Plumbing: - _stream_diag_init / _stream_diag_capture_response live on AIAgent and produce a per-attempt dict held on request_client_holder['diag'] for closure access from the retry block. - _call_chat_completions and _call_anthropic both initialize the diag and increment counters per chunk/event (best-effort, never raises in the streaming hot path). - _log_stream_retry / _emit_stream_drop accept an optional diag and render the new fields. Final-exhaustion log goes through the same helper so it gets the same diagnostic dump. - UI status line gains a brief 'after Xs' suffix when timing is available — distinguishes 'connect failed' from 'died mid-stream' at a glance without grepping logs. Sample WARNING after this change: Stream drop mid tool-call on attempt 2/3 — retrying. subagent_id=sa-2-cafef00d depth=1 provider=openrouter base_url=https://openrouter.ai/api/v1 error_type=APIError error=Connection error. chain=APIError(Connection error.) <- RemoteProtocolError(peer closed connection without sending complete message body) http_status=200 bytes=12400 chunks=47 elapsed=12.00s ttfb=0.83s upstream=[cf-ray=8f1a2b3c4d5e6f7g-LAX x-openrouter-provider=Anthropic x-openrouter-id=gen-abc123 server=cloudflare] Tests: 10 covering diag init, header capture (whitelist enforced for PII), exception-chain walking + depth cap, log content with full diag, log content without diag (placeholders), UI elapsed-suffix on/off.
2026-05-18 04:41:56 +00:00 · 2026-05-09 22:49:35 -07:00 · 2026-05-09 22:49:35 -07:00 · 126cbffb8a
commit 126cbffb8a
parent 5a70d9b6be
2 changed files with 420 additions and 120 deletions
--- a/tests/run_agent/test_stream_drop_logging.py
+++ b/tests/run_agent/test_stream_drop_logging.py
@ -1,24 +1,24 @@
-"""Tests for the structured stream-drop log + clearer single-line status.
+"""Tests for richer stream-drop diagnostics in agent.log.

-Regression coverage for the change that:
+When a subagent's stream drops mid-tool-call, the WARNING in agent.log must
+carry enough breadcrumbs to answer "WHY did it drop" without requiring a
+verbose-mode rerun.  Specifically:

-1. Removed ``logger.setLevel(ERROR)`` on tools/run_agent/etc. in quiet mode.
-   It was clobbering the root logger's file handlers (agent.log/errors.log)
-   because Python checks logger-level before handler propagation, so
-   subagent stream-drop diagnostics were never written to disk.
-2. Replaced the two ``⚠️ Connection dropped …`` + ``🔄 Reconnected …``
-   ``_emit_status`` calls with a single ``_emit_stream_drop`` helper that:
-   - Always writes a structured WARNING to ``agent.log``.
-   - Always emits exactly ONE user-visible status line per drop (no
-     follow-up "Reconnected" line) that names the provider and error
-     class so multi-provider sessions can attribute it cleanly.
-   - Subagent lines get the ``[subagent-N]`` ``log_prefix`` automatically
-     via ``_emit_status`` → ``_vprint``.
+- Inner exception chain (httpx errors wrapped by openai SDK)
+- Upstream HTTP headers (cf-ray, x-openrouter-provider, x-openrouter-id, ...)
+- HTTP status of the dying response
+- Bytes streamed and chunks received before the drop
+- Elapsed time on the attempt + time-to-first-byte
+
+Plus the user-visible UI line gains an ``after Xs`` suffix when timing data
+is available, distinguishing "couldn't connect at all" from "died mid-stream
+after N seconds" (very different root causes).
 """

 from __future__ import annotations

 import logging
+import time
 from unittest.mock import patch

 import pytest
@ -37,124 +37,211 @@ def _make_agent() -> AIAgent:
    )


-def test_quiet_mode_does_not_clobber_runagent_logger_level():
-    """quiet_mode must not raise the ``run_agent`` logger above WARNING.
-
-    Setting ``setLevel(ERROR)`` on the logger filters records *before* root
-    logger handlers (agent.log/errors.log) ever see them.  Stream-drop
-    diagnostics must reach the file handlers regardless of console verbosity.
-    """
-    _ = _make_agent()
-    for name in ("run_agent", "tools", "trajectory_compressor", "cron", "hermes_cli"):
-        logger = logging.getLogger(name)
-        assert logger.getEffectiveLevel() <= logging.WARNING, (
-            f"{name} logger blocked at level {logger.getEffectiveLevel()} — "
-            f"file handlers will lose WARNING records"
-        )
+def test_stream_diag_init_returns_well_formed_dict():
+    diag = AIAgent._stream_diag_init()
+    assert "started_at" in diag
+    assert diag["chunks"] == 0
+    assert diag["bytes"] == 0
+    assert diag["first_chunk_at"] is None
+    assert diag["http_status"] is None
+    assert diag["headers"] == {}


-def test_log_stream_retry_writes_structured_warning(caplog):
+class _FakeHeaders:
+    def __init__(self, d): self._d = {k.lower(): v for k, v in d.items()}
+    def get(self, k, default=None): return self._d.get(k.lower(), default)
+
+
+class _FakeResponse:
+    def __init__(self, headers, status=200):
+        self.status_code = status
+        self.headers = _FakeHeaders(headers)
+
+
+def test_stream_diag_capture_response_collects_known_headers():
+    agent = _make_agent()
+    diag = AIAgent._stream_diag_init()
+    resp = _FakeResponse({
+        "cf-ray": "8f1a2b3c4d5e6f7g-LAX",
+        "x-openrouter-provider": "Anthropic",
+        "x-openrouter-id": "gen-abc123",
+        "x-request-id": "req-xyz",
+        "server": "cloudflare",
+        "irrelevant-header": "ignored",
+    })
+    agent._stream_diag_capture_response(diag, resp)
+    assert diag["http_status"] == 200
+    assert diag["headers"]["cf-ray"] == "8f1a2b3c4d5e6f7g-LAX"
+    assert diag["headers"]["x-openrouter-provider"] == "Anthropic"
+    assert diag["headers"]["x-openrouter-id"] == "gen-abc123"
+    assert diag["headers"]["server"] == "cloudflare"
+    # Headers not in _STREAM_DIAG_HEADERS must not be captured (PII surface).
+    assert "irrelevant-header" not in diag["headers"]
+
+
+def test_stream_diag_capture_response_safe_with_none():
+    agent = _make_agent()
+    diag = AIAgent._stream_diag_init()
+    agent._stream_diag_capture_response(diag, None)
+    # Must not raise; diag stays initialized.
+    assert diag["headers"] == {}
+
+
+def test_flatten_exception_chain_walks_cause():
+    inner = ConnectionError("upstream closed")
+    middle = TimeoutError("timed out")
+    middle.__cause__ = inner
+    outer = RuntimeError("wrapper")
+    outer.__cause__ = middle
+    chain = AIAgent._flatten_exception_chain(outer)
+    assert "RuntimeError" in chain
+    assert "TimeoutError" in chain
+    assert "ConnectionError" in chain
+    assert " <- " in chain
+
+
+def test_flatten_exception_chain_caps_depth():
+    """Chain renders no more than 4 deep so log lines stay bounded."""
+    e0 = ValueError("0")
+    prev = e0
+    for i in range(1, 8):
+        nxt = ValueError(str(i))
+        nxt.__cause__ = prev
+        prev = nxt
+    chain = AIAgent._flatten_exception_chain(prev)
+    # 4 layers + 3 separators max.
+    assert chain.count("<-") <= 3
+
+
+def test_log_stream_retry_includes_diagnostic_fields(caplog):
    agent = _make_agent()
    agent._delegate_depth = 1
-    agent._subagent_id = "sa-7-cafef00d"
+    agent._subagent_id = "sa-3-deadbeef"
    agent.provider = "openrouter"

-    err = ConnectionError("Network connection lost.")
+    diag = AIAgent._stream_diag_init()
+    diag["http_status"] = 200
+    diag["headers"] = {
+        "cf-ray": "8f1a2b3c4d5e6f7g-LAX",
+        "x-openrouter-provider": "Anthropic",
+        "x-openrouter-id": "gen-xyz789",
+    }
+    diag["chunks"] = 12
+    diag["bytes"] = 4096
+    # Simulate 5s elapsed with first chunk at 0.5s.
+    diag["started_at"] = time.time() - 5.0
+    diag["first_chunk_at"] = diag["started_at"] + 0.5
+
+    inner = ConnectionError("peer closed")
+    outer = RuntimeError("Connection error.")
+    outer.__cause__ = inner
+
    with caplog.at_level(logging.WARNING, logger="run_agent"):
        agent._log_stream_retry(
            kind="drop mid tool-call",
-            error=err,
+            error=outer,
            attempt=2,
            max_attempts=3,
            mid_tool_call=True,
+            diag=diag,
        )

-    matching = [r for r in caplog.records if "Stream drop mid tool-call" in r.getMessage()]
-    assert matching, f"no stream-drop WARNING captured; got {[r.getMessage() for r in caplog.records]}"
-    msg = matching[0].getMessage()
-    assert "subagent_id=sa-7-cafef00d" in msg
-    assert "depth=1" in msg
-    assert "provider=openrouter" in msg
-    assert "base_url=https://openrouter.ai/api/v1" in msg
-    assert "error_type=ConnectionError" in msg
-
-
-@pytest.mark.parametrize("depth", [0, 1])
-def test_emit_stream_drop_emits_status_line(depth):
-    """Both top-level and subagent paths emit exactly one status line.
-
-    Subagent lines get the ``[subagent-N]`` log_prefix via the parent's
-    ``_vprint`` plumbing — this test only checks that ``_emit_status`` is
-    called once with the right content.  No "Reconnected" follow-up.
-    """
-    agent = _make_agent()
-    agent._delegate_depth = depth
-    if depth > 0:
-        agent._subagent_id = "sa-2-cafe"
-    agent.provider = "openrouter"
-
-    with patch.object(agent, "_emit_status") as mock_emit:
-        agent._emit_stream_drop(
-            error=ConnectionError("boom"),
-            attempt=2,
-            max_attempts=3,
-            mid_tool_call=True,
-        )
-
-    assert mock_emit.call_count == 1, (
-        f"expected exactly one _emit_status call (no Reconnected follow-up), "
-        f"got {mock_emit.call_count}"
+    msg = next(
+        r.getMessage() for r in caplog.records
+        if "Stream drop mid tool-call" in r.getMessage()
    )
-    msg = mock_emit.call_args.args[0]
-    assert "openrouter" in msg, f"provider name missing from status: {msg}"
-    assert "stream drop" in msg
-    assert "ConnectionError" in msg
-    assert "retry 2/3" in msg
-    # Single line — no separate "Reconnected" message.  But the line itself
-    # should mention reconnecting so the user knows we're recovering.
-    assert "reconnect" in msg.lower()
+
+    # Identity
+    assert "subagent_id=sa-3-deadbeef" in msg
+    assert "provider=openrouter" in msg
+
+    # Inner-cause chain
+    assert "RuntimeError" in msg and "ConnectionError" in msg
+
+    # Counters and timing
+    assert "http_status=200" in msg
+    assert "bytes=4096" in msg
+    assert "chunks=12" in msg
+    # elapsed should be roughly 5s; allow some slack.
+    assert "elapsed=" in msg
+    assert "ttfb=0.50s" in msg
+
+    # Upstream headers
+    assert "cf-ray=8f1a2b3c4d5e6f7g-LAX" in msg
+    assert "x-openrouter-provider=Anthropic" in msg
+    assert "x-openrouter-id=gen-xyz789" in msg


-@pytest.mark.parametrize("mid_tool_call", [True, False])
-def test_emit_stream_drop_always_writes_to_log(caplog, mid_tool_call):
-    """Both subagent and top-level drops produce a WARNING in agent.log."""
+def test_log_stream_retry_works_without_diag(caplog):
+    """diag is optional — older callers / unit tests still work."""
    agent = _make_agent()
-    agent._delegate_depth = 1 if mid_tool_call else 0
+    agent._delegate_depth = 0
    agent.provider = "openrouter"
-    if mid_tool_call:
-        agent._subagent_id = "sa-99-feed"

    with caplog.at_level(logging.WARNING, logger="run_agent"):
-        agent._emit_stream_drop(
-            error=TimeoutError("read timeout"),
+        agent._log_stream_retry(
+            kind="drop",
+            error=ConnectionError("x"),
            attempt=2,
            max_attempts=3,
-            mid_tool_call=mid_tool_call,
+            mid_tool_call=False,
        )

-    found = [r for r in caplog.records if r.getMessage().startswith("Stream drop")]
-    assert found, "expected at least one Stream drop WARNING record"
-    msg = found[0].getMessage()
-    assert "error_type=TimeoutError" in msg
-    assert "provider=openrouter" in msg
+    msg = next(r.getMessage() for r in caplog.records if "Stream drop" in r.getMessage())
+    # Without diag, the structured fields show "-" placeholders.
+    assert "http_status=-" in msg
+    assert "upstream=[-]" in msg
+    assert "bytes=0" in msg
+    assert "chunks=0" in msg
+    assert "ttfb=-" in msg


-def test_emit_stream_drop_provider_named_when_multi_provider():
-    """The user-visible line must name the provider so multi-provider
-    sessions can tell which subagent dropped (the original two-line message
-    only said 'provider', forcing a log dive)."""
+def test_emit_stream_drop_ui_includes_elapsed_when_available():
    agent = _make_agent()
-    agent._delegate_depth = 1
-    agent._subagent_id = "sa-1"
-    agent.provider = "anthropic"
+    agent.provider = "openrouter"
+
+    diag = AIAgent._stream_diag_init()
+    diag["started_at"] = time.time() - 8.0  # 8s on the wire before drop

    with patch.object(agent, "_emit_status") as mock_emit:
        agent._emit_stream_drop(
            error=ConnectionError("x"),
-            attempt=3,
+            attempt=2,
+            max_attempts=3,
+            mid_tool_call=True,
+            diag=diag,
+        )
+
+    msg = mock_emit.call_args.args[0]
+    # Suffix with elapsed time helps distinguish "couldn't connect" (0s)
+    # from "died mid-stream after a while".
+    assert "after" in msg and "s" in msg
+
+
+def test_emit_stream_drop_ui_omits_suffix_without_diag():
+    """When there's no diag, no suffix — line stays compact."""
+    agent = _make_agent()
+    agent.provider = "openrouter"
+
+    with patch.object(agent, "_emit_status") as mock_emit:
+        agent._emit_stream_drop(
+            error=ConnectionError("x"),
+            attempt=2,
            max_attempts=3,
            mid_tool_call=False,
        )

    msg = mock_emit.call_args.args[0]
-    assert "anthropic" in msg
+    # No "after Xs" suffix when diag is not provided.
+    assert " after " not in msg
+    # Still names the provider and error class.
+    assert "openrouter" in msg
+    assert "ConnectionError" in msg
+
+
+def test_quiet_mode_does_not_clobber_runagent_logger_level():
+    """Regression guard for the parent fix — must persist across this PR."""
+    _ = _make_agent()
+    for name in ("run_agent", "tools", "trajectory_compressor", "cron", "hermes_cli"):
+        logger = logging.getLogger(name)
+        assert logger.getEffectiveLevel() <= logging.WARNING