fix(stream-retry): collapse two-line drop status, name provider, and let agent.log capture diagnostics (#22993)

Subagent stream drops were spamming the parent terminal with two lines per blip ('Connection dropped...' + 'Reconnected...') while leaving zero breadcrumb in agent.log to debug them. Two underlying bugs, fixed together: 1. quiet_mode raised the run_agent/tools/etc. loggers to ERROR, which filters records before root-logger file handlers see them. The comment claimed 'File handlers still capture everything' — that was wrong. Removed in both run_agent.py and cli.py; console quietness already comes from hermes_logging not installing a console StreamHandler in non-verbose mode. 2. The stream-retry blocks emitted two _emit_status calls per drop ('⚠️ Connection dropped... Reconnecting...' + '🔄 Reconnected — resuming…') with no provider name, so multi-provider sessions had to dig through agent.log to attribute a drop. Replaced both call sites with a single _emit_stream_drop helper that emits ONE line naming the provider and error class, and always writes a structured WARNING to agent.log with subagent_id, depth, provider, base_url, error_type. Net UX change: 6 lines per triple-subagent drop → 3 lines, each naming the provider. agent.log now has a structured breadcrumb per retry that didn't exist before. Tests: 6 new tests in tests/run_agent/test_stream_drop_logging.py covering the logger-level guard, structured WARNING content, single status line per drop (no Reconnected follow-up), and provider naming.
2026-05-18 04:41:56 +00:00 · 2026-05-09 22:35:35 -07:00 · 2026-05-09 22:35:35 -07:00 · 68e44642c8
commit 68e44642c8
parent 3800972dd0
3 changed files with 290 additions and 56 deletions
--- a/cli.py
+++ b/cli.py
@ -8267,8 +8267,13 @@ class HermesCLI:
                logging.getLogger(noisy).setLevel(logging.WARNING)
        else:
            logging.getLogger().setLevel(logging.INFO)
-            for quiet_logger in ('tools', 'run_agent', 'trajectory_compressor', 'cron', 'hermes_cli'):
-                logging.getLogger(quiet_logger).setLevel(logging.ERROR)
+            # NOTE: We deliberately do NOT raise per-logger levels for
+            # tools/run_agent/etc. in quiet mode. Setting logger.setLevel
+            # above the file handler level filters records before they
+            # reach handlers, so agent.log / errors.log lose visibility
+            # into stream-retry events, credential rotations, etc.
+            # Console quietness is enforced by hermes_logging not
+            # installing a console StreamHandler in non-verbose mode.

    def _show_insights(self, command: str = "/insights"):
        """Show usage insights and analytics from session history."""
--- a/run_agent.py
+++ b/run_agent.py
@ -1435,18 +1435,17 @@ class AIAgent:
            logger.info("Verbose logging enabled (third-party library logs suppressed)")
        else:
            if self.quiet_mode:
-                # In quiet mode (CLI default), suppress all tool/infra log
-                # noise on the *console*. The TUI has its own rich display
-                # for status; logger INFO/WARNING messages just clutter it.
-                # File handlers (agent.log, errors.log) still capture everything.
-                for quiet_logger in [
-                    'tools',               # all tools.* (terminal, browser, web, file, etc.)
-                    'run_agent',            # agent runner internals
-                    'trajectory_compressor',
-                    'cron',                 # scheduler (only relevant in daemon mode)
-                    'hermes_cli',           # CLI helpers
-                ]:
-                    logging.getLogger(quiet_logger).setLevel(logging.ERROR)
+                # In quiet mode (CLI default), keep console output clean —
+                # but DO NOT raise per-logger levels. Doing so prevents the
+                # root logger's file handlers (agent.log, errors.log) from
+                # ever seeing the records, because Python checks
+                # logger.isEnabledFor() before handler propagation. We rely
+                # on the fact that hermes_logging.setup_logging() does not
+                # install a console StreamHandler in quiet mode — so INFO
+                # records flow to the file handlers but never reach a
+                # console. Any future noise reduction belongs at the
+                # handler level inside hermes_logging.py, not here.
+                pass
        
        # Internal stream callback (set during streaming TTS).
        # Initialized here so _vprint can reference it before run_conversation.
@ -2823,6 +2822,90 @@ class AIAgent:
            except Exception:
                logger.debug("status_callback error in _emit_warning", exc_info=True)

+    def _log_stream_retry(
+        self,
+        *,
+        kind: str,
+        error: BaseException,
+        attempt: int,
+        max_attempts: int,
+        mid_tool_call: bool,
+    ) -> None:
+        """Record a transient stream-drop and retry to ``agent.log``.
+
+        Always logs a structured WARNING so users have a breadcrumb regardless
+        of UI verbosity.  Subagents in particular benefit because their
+        retries no longer spam the parent's terminal — but the file log keeps
+        full detail (provider, error class, attempt, base_url, subagent_id).
+        """
+        try:
+            try:
+                _summary = self._summarize_api_error(error)
+            except Exception:
+                _summary = str(error)
+            if _summary and len(_summary) > 240:
+                _summary = _summary[:240] + "…"
+            logger.warning(
+                "Stream %s on attempt %s/%s — retrying. "
+                "subagent_id=%s depth=%s provider=%s base_url=%s "
+                "error_type=%s error=%s",
+                kind,
+                attempt,
+                max_attempts,
+                getattr(self, "_subagent_id", None) or "-",
+                getattr(self, "_delegate_depth", 0),
+                self.provider or "-",
+                self.base_url or "-",
+                type(error).__name__,
+                _summary,
+                extra={"mid_tool_call": mid_tool_call},
+            )
+        except Exception:
+            logger.debug("stream-retry log emit failed", exc_info=True)
+
+    def _emit_stream_drop(
+        self,
+        *,
+        error: BaseException,
+        attempt: int,
+        max_attempts: int,
+        mid_tool_call: bool,
+    ) -> None:
+        """Emit a single user-visible line for a stream drop+retry.
+
+        Both top-level agents and subagents announce drops in the UI — the
+        parent prefixes subagent lines with ``[subagent-N]`` via ``log_prefix``
+        so they're easy to attribute.  All cases also write a structured
+        WARNING to ``agent.log`` via :meth:`_log_stream_retry` with the full
+        diagnostic detail (subagent_id, provider, base_url, error_type) for
+        post-hoc analysis.
+
+        Replaces the older two-line ``⚠️ Connection dropped …`` +
+        ``🔄 Reconnected …`` pair with a single information-dense line that
+        names the provider (so multi-provider sessions can tell who dropped)
+        and the error class without ambiguity.
+        """
+        kind = "drop mid tool-call" if mid_tool_call else "drop"
+        self._log_stream_retry(
+            kind=kind,
+            error=error,
+            attempt=attempt,
+            max_attempts=max_attempts,
+            mid_tool_call=mid_tool_call,
+        )
+        provider = self.provider or "provider"
+        try:
+            self._emit_status(
+                f"⚠️ {provider} stream {kind} ({type(error).__name__}) "
+                f"— reconnecting, retry {attempt}/{max_attempts}"
+            )
+            self._touch_activity(
+                f"stream retry {attempt}/{max_attempts} "
+                f"after {type(error).__name__}"
+            )
+        except Exception:
+            pass
+
    def _emit_auxiliary_failure(self, task: str, exc: BaseException) -> None:
        """Surface a compact warning for failed auxiliary work."""
        try:
@ -7719,17 +7802,9 @@ class AIAgent:
                            # retry silently.  Clear per-attempt state so the
                            # next stream starts clean.  Fire a "reconnecting"
                            # marker so the user sees why the preamble is
-                            # about to be re-streamed.
-                            logger.info(
-                                "Streaming attempt %s/%s died mid tool-call "
-                                "(%s: %s) after user-visible text; retrying "
-                                "silently to avoid losing the action. "
-                                "Preamble will re-stream.",
-                                _stream_attempt + 1,
-                                _max_stream_retries + 1,
-                                type(e).__name__,
-                                e,
-                            )
+                            # about to be re-streamed.  Structured WARNING is
+                            # emitted by ``_emit_stream_drop`` below; no
+                            # additional INFO line needed.
                            try:
                                self._fire_stream_delta(
                                    "\n\n⚠ Connection dropped mid tool-call; "
@ -7751,14 +7826,11 @@ class AIAgent:
                            result["partial_tool_names"] = []
                            deltas_were_sent["yes"] = False
                            first_delta_fired["done"] = False
-                            self._emit_status(
-                                f"⚠️ Connection dropped mid tool-call "
-                                f"({type(e).__name__}). Reconnecting… "
-                                f"(attempt {_stream_attempt + 2}/{_max_stream_retries + 1})"
-                            )
-                            self._touch_activity(
-                                f"stream retry {_stream_attempt + 2}/{_max_stream_retries + 1} "
-                                f"mid tool-call after {type(e).__name__}"
+                            self._emit_stream_drop(
+                                error=e,
+                                attempt=_stream_attempt + 2,
+                                max_attempts=_max_stream_retries + 1,
+                                mid_tool_call=True,
                            )
                            stale = request_client_holder.get("client")
                            if stale is not None:
@ -7772,7 +7844,6 @@ class AIAgent:
                                )
                            except Exception:
                                pass
-                            self._emit_status("🔄 Reconnected — resuming…")
                            continue

                        # SSE error events from proxies (e.g. OpenRouter sends
@ -7809,22 +7880,11 @@ class AIAgent:
                            # Transient network / timeout error. Retry the
                            # streaming request with a fresh connection first.
                            if _stream_attempt < _max_stream_retries:
-                                logger.info(
-                                    "Streaming attempt %s/%s failed (%s: %s), "
-                                    "retrying with fresh connection...",
-                                    _stream_attempt + 1,
-                                    _max_stream_retries + 1,
-                                    type(e).__name__,
-                                    e,
-                                )
-                                self._emit_status(
-                                    f"⚠️ Connection to provider dropped "
-                                    f"({type(e).__name__}). Reconnecting… "
-                                    f"(attempt {_stream_attempt + 2}/{_max_stream_retries + 1})"
-                                )
-                                self._touch_activity(
-                                    f"stream retry {_stream_attempt + 2}/{_max_stream_retries + 1} "
-                                    f"after {type(e).__name__}"
+                                self._emit_stream_drop(
+                                    error=e,
+                                    attempt=_stream_attempt + 2,
+                                    max_attempts=_max_stream_retries + 1,
+                                    mid_tool_call=False,
                                )
                                # Close the stale request client before retry
                                stale = request_client_holder.get("client")
@ -7841,19 +7901,28 @@ class AIAgent:
                                    )
                                except Exception:
                                    pass
-                                self._emit_status("🔄 Reconnected — resuming…")
                                continue
+                            # Retries exhausted. Log the final failure with
+                            # full diagnostic detail and surface a status
+                            # line — subagent lines get the ``[subagent-N]``
+                            # log_prefix so the parent can attribute them.
+                            logger.warning(
+                                "Streaming exhausted %s retries on transient "
+                                "error: subagent_id=%s depth=%s provider=%s "
+                                "error_type=%s error=%s",
+                                _max_stream_retries + 1,
+                                getattr(self, "_subagent_id", None) or "-",
+                                getattr(self, "_delegate_depth", 0),
+                                self.provider or "-",
+                                type(e).__name__,
+                                e,
+                            )
                            self._emit_status(
                                "❌ Connection to provider failed after "
                                f"{_max_stream_retries + 1} attempts. "
                                "The provider may be experiencing issues — "
                                "try again in a moment."
                            )
-                            logger.warning(
-                                "Streaming exhausted %s retries on transient error: %s",
-                                _max_stream_retries + 1,
-                                e,
-                            )
                        else:
                            _err_lower = str(e).lower()
                            _is_stream_unsupported = (
--- a/tests/run_agent/test_stream_drop_logging.py
+++ b/tests/run_agent/test_stream_drop_logging.py
@ -0,0 +1,160 @@
+"""Tests for the structured stream-drop log + clearer single-line status.
+
+Regression coverage for the change that:
+
+1. Removed ``logger.setLevel(ERROR)`` on tools/run_agent/etc. in quiet mode.
+   It was clobbering the root logger's file handlers (agent.log/errors.log)
+   because Python checks logger-level before handler propagation, so
+   subagent stream-drop diagnostics were never written to disk.
+2. Replaced the two ``⚠️ Connection dropped …`` + ``🔄 Reconnected …``
+   ``_emit_status`` calls with a single ``_emit_stream_drop`` helper that:
+   - Always writes a structured WARNING to ``agent.log``.
+   - Always emits exactly ONE user-visible status line per drop (no
+     follow-up "Reconnected" line) that names the provider and error
+     class so multi-provider sessions can attribute it cleanly.
+   - Subagent lines get the ``[subagent-N]`` ``log_prefix`` automatically
+     via ``_emit_status`` → ``_vprint``.
+"""
+
+from __future__ import annotations
+
+import logging
+from unittest.mock import patch
+
+import pytest
+
+import run_agent
+from run_agent import AIAgent
+
+
+def _make_agent() -> AIAgent:
+    return AIAgent(
+        api_key="test-key",
+        base_url="https://openrouter.ai/api/v1",
+        quiet_mode=True,
+        skip_context_files=True,
+        skip_memory=True,
+    )
+
+
+def test_quiet_mode_does_not_clobber_runagent_logger_level():
+    """quiet_mode must not raise the ``run_agent`` logger above WARNING.
+
+    Setting ``setLevel(ERROR)`` on the logger filters records *before* root
+    logger handlers (agent.log/errors.log) ever see them.  Stream-drop
+    diagnostics must reach the file handlers regardless of console verbosity.
+    """
+    _ = _make_agent()
+    for name in ("run_agent", "tools", "trajectory_compressor", "cron", "hermes_cli"):
+        logger = logging.getLogger(name)
+        assert logger.getEffectiveLevel() <= logging.WARNING, (
+            f"{name} logger blocked at level {logger.getEffectiveLevel()} — "
+            f"file handlers will lose WARNING records"
+        )
+
+
+def test_log_stream_retry_writes_structured_warning(caplog):
+    agent = _make_agent()
+    agent._delegate_depth = 1
+    agent._subagent_id = "sa-7-cafef00d"
+    agent.provider = "openrouter"
+
+    err = ConnectionError("Network connection lost.")
+    with caplog.at_level(logging.WARNING, logger="run_agent"):
+        agent._log_stream_retry(
+            kind="drop mid tool-call",
+            error=err,
+            attempt=2,
+            max_attempts=3,
+            mid_tool_call=True,
+        )
+
+    matching = [r for r in caplog.records if "Stream drop mid tool-call" in r.getMessage()]
+    assert matching, f"no stream-drop WARNING captured; got {[r.getMessage() for r in caplog.records]}"
+    msg = matching[0].getMessage()
+    assert "subagent_id=sa-7-cafef00d" in msg
+    assert "depth=1" in msg
+    assert "provider=openrouter" in msg
+    assert "base_url=https://openrouter.ai/api/v1" in msg
+    assert "error_type=ConnectionError" in msg
+
+
+@pytest.mark.parametrize("depth", [0, 1])
+def test_emit_stream_drop_emits_status_line(depth):
+    """Both top-level and subagent paths emit exactly one status line.
+
+    Subagent lines get the ``[subagent-N]`` log_prefix via the parent's
+    ``_vprint`` plumbing — this test only checks that ``_emit_status`` is
+    called once with the right content.  No "Reconnected" follow-up.
+    """
+    agent = _make_agent()
+    agent._delegate_depth = depth
+    if depth > 0:
+        agent._subagent_id = "sa-2-cafe"
+    agent.provider = "openrouter"
+
+    with patch.object(agent, "_emit_status") as mock_emit:
+        agent._emit_stream_drop(
+            error=ConnectionError("boom"),
+            attempt=2,
+            max_attempts=3,
+            mid_tool_call=True,
+        )
+
+    assert mock_emit.call_count == 1, (
+        f"expected exactly one _emit_status call (no Reconnected follow-up), "
+        f"got {mock_emit.call_count}"
+    )
+    msg = mock_emit.call_args.args[0]
+    assert "openrouter" in msg, f"provider name missing from status: {msg}"
+    assert "stream drop" in msg
+    assert "ConnectionError" in msg
+    assert "retry 2/3" in msg
+    # Single line — no separate "Reconnected" message.  But the line itself
+    # should mention reconnecting so the user knows we're recovering.
+    assert "reconnect" in msg.lower()
+
+
+@pytest.mark.parametrize("mid_tool_call", [True, False])
+def test_emit_stream_drop_always_writes_to_log(caplog, mid_tool_call):
+    """Both subagent and top-level drops produce a WARNING in agent.log."""
+    agent = _make_agent()
+    agent._delegate_depth = 1 if mid_tool_call else 0
+    agent.provider = "openrouter"
+    if mid_tool_call:
+        agent._subagent_id = "sa-99-feed"
+
+    with caplog.at_level(logging.WARNING, logger="run_agent"):
+        agent._emit_stream_drop(
+            error=TimeoutError("read timeout"),
+            attempt=2,
+            max_attempts=3,
+            mid_tool_call=mid_tool_call,
+        )
+
+    found = [r for r in caplog.records if r.getMessage().startswith("Stream drop")]
+    assert found, "expected at least one Stream drop WARNING record"
+    msg = found[0].getMessage()
+    assert "error_type=TimeoutError" in msg
+    assert "provider=openrouter" in msg
+
+
+def test_emit_stream_drop_provider_named_when_multi_provider():
+    """The user-visible line must name the provider so multi-provider
+    sessions can tell which subagent dropped (the original two-line message
+    only said 'provider', forcing a log dive)."""
+    agent = _make_agent()
+    agent._delegate_depth = 1
+    agent._subagent_id = "sa-1"
+    agent.provider = "anthropic"
+
+    with patch.object(agent, "_emit_status") as mock_emit:
+        agent._emit_stream_drop(
+            error=ConnectionError("x"),
+            attempt=3,
+            max_attempts=3,
+            mid_tool_call=False,
+        )
+
+    msg = mock_emit.call_args.args[0]
+    assert "anthropic" in msg