From 68e44642c8d126e058ed0bf2fa3c888fe98d7cc5 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 9 May 2026 22:35:35 -0700 Subject: [PATCH] fix(stream-retry): collapse two-line drop status, name provider, and let agent.log capture diagnostics (#22993) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Subagent stream drops were spamming the parent terminal with two lines per blip ('Connection dropped...' + 'Reconnected...') while leaving zero breadcrumb in agent.log to debug them. Two underlying bugs, fixed together: 1. quiet_mode raised the run_agent/tools/etc. loggers to ERROR, which filters records before root-logger file handlers see them. The comment claimed 'File handlers still capture everything' — that was wrong. Removed in both run_agent.py and cli.py; console quietness already comes from hermes_logging not installing a console StreamHandler in non-verbose mode. 2. The stream-retry blocks emitted two _emit_status calls per drop ('⚠️ Connection dropped... Reconnecting...' + '🔄 Reconnected — resuming…') with no provider name, so multi-provider sessions had to dig through agent.log to attribute a drop. Replaced both call sites with a single _emit_stream_drop helper that emits ONE line naming the provider and error class, and always writes a structured WARNING to agent.log with subagent_id, depth, provider, base_url, error_type. Net UX change: 6 lines per triple-subagent drop → 3 lines, each naming the provider. agent.log now has a structured breadcrumb per retry that didn't exist before. Tests: 6 new tests in tests/run_agent/test_stream_drop_logging.py covering the logger-level guard, structured WARNING content, single status line per drop (no Reconnected follow-up), and provider naming. --- cli.py | 9 +- run_agent.py | 177 ++++++++++++++------ tests/run_agent/test_stream_drop_logging.py | 160 ++++++++++++++++++ 3 files changed, 290 insertions(+), 56 deletions(-) create mode 100644 tests/run_agent/test_stream_drop_logging.py diff --git a/cli.py b/cli.py index d474e405a16..488d505b70b 100644 --- a/cli.py +++ b/cli.py @@ -8267,8 +8267,13 @@ class HermesCLI: logging.getLogger(noisy).setLevel(logging.WARNING) else: logging.getLogger().setLevel(logging.INFO) - for quiet_logger in ('tools', 'run_agent', 'trajectory_compressor', 'cron', 'hermes_cli'): - logging.getLogger(quiet_logger).setLevel(logging.ERROR) + # NOTE: We deliberately do NOT raise per-logger levels for + # tools/run_agent/etc. in quiet mode. Setting logger.setLevel + # above the file handler level filters records before they + # reach handlers, so agent.log / errors.log lose visibility + # into stream-retry events, credential rotations, etc. + # Console quietness is enforced by hermes_logging not + # installing a console StreamHandler in non-verbose mode. def _show_insights(self, command: str = "/insights"): """Show usage insights and analytics from session history.""" diff --git a/run_agent.py b/run_agent.py index a4c2465d04d..8413af96d72 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1435,18 +1435,17 @@ class AIAgent: logger.info("Verbose logging enabled (third-party library logs suppressed)") else: if self.quiet_mode: - # In quiet mode (CLI default), suppress all tool/infra log - # noise on the *console*. The TUI has its own rich display - # for status; logger INFO/WARNING messages just clutter it. - # File handlers (agent.log, errors.log) still capture everything. - for quiet_logger in [ - 'tools', # all tools.* (terminal, browser, web, file, etc.) - 'run_agent', # agent runner internals - 'trajectory_compressor', - 'cron', # scheduler (only relevant in daemon mode) - 'hermes_cli', # CLI helpers - ]: - logging.getLogger(quiet_logger).setLevel(logging.ERROR) + # In quiet mode (CLI default), keep console output clean — + # but DO NOT raise per-logger levels. Doing so prevents the + # root logger's file handlers (agent.log, errors.log) from + # ever seeing the records, because Python checks + # logger.isEnabledFor() before handler propagation. We rely + # on the fact that hermes_logging.setup_logging() does not + # install a console StreamHandler in quiet mode — so INFO + # records flow to the file handlers but never reach a + # console. Any future noise reduction belongs at the + # handler level inside hermes_logging.py, not here. + pass # Internal stream callback (set during streaming TTS). # Initialized here so _vprint can reference it before run_conversation. @@ -2823,6 +2822,90 @@ class AIAgent: except Exception: logger.debug("status_callback error in _emit_warning", exc_info=True) + def _log_stream_retry( + self, + *, + kind: str, + error: BaseException, + attempt: int, + max_attempts: int, + mid_tool_call: bool, + ) -> None: + """Record a transient stream-drop and retry to ``agent.log``. + + Always logs a structured WARNING so users have a breadcrumb regardless + of UI verbosity. Subagents in particular benefit because their + retries no longer spam the parent's terminal — but the file log keeps + full detail (provider, error class, attempt, base_url, subagent_id). + """ + try: + try: + _summary = self._summarize_api_error(error) + except Exception: + _summary = str(error) + if _summary and len(_summary) > 240: + _summary = _summary[:240] + "…" + logger.warning( + "Stream %s on attempt %s/%s — retrying. " + "subagent_id=%s depth=%s provider=%s base_url=%s " + "error_type=%s error=%s", + kind, + attempt, + max_attempts, + getattr(self, "_subagent_id", None) or "-", + getattr(self, "_delegate_depth", 0), + self.provider or "-", + self.base_url or "-", + type(error).__name__, + _summary, + extra={"mid_tool_call": mid_tool_call}, + ) + except Exception: + logger.debug("stream-retry log emit failed", exc_info=True) + + def _emit_stream_drop( + self, + *, + error: BaseException, + attempt: int, + max_attempts: int, + mid_tool_call: bool, + ) -> None: + """Emit a single user-visible line for a stream drop+retry. + + Both top-level agents and subagents announce drops in the UI — the + parent prefixes subagent lines with ``[subagent-N]`` via ``log_prefix`` + so they're easy to attribute. All cases also write a structured + WARNING to ``agent.log`` via :meth:`_log_stream_retry` with the full + diagnostic detail (subagent_id, provider, base_url, error_type) for + post-hoc analysis. + + Replaces the older two-line ``⚠️ Connection dropped …`` + + ``🔄 Reconnected …`` pair with a single information-dense line that + names the provider (so multi-provider sessions can tell who dropped) + and the error class without ambiguity. + """ + kind = "drop mid tool-call" if mid_tool_call else "drop" + self._log_stream_retry( + kind=kind, + error=error, + attempt=attempt, + max_attempts=max_attempts, + mid_tool_call=mid_tool_call, + ) + provider = self.provider or "provider" + try: + self._emit_status( + f"⚠️ {provider} stream {kind} ({type(error).__name__}) " + f"— reconnecting, retry {attempt}/{max_attempts}" + ) + self._touch_activity( + f"stream retry {attempt}/{max_attempts} " + f"after {type(error).__name__}" + ) + except Exception: + pass + def _emit_auxiliary_failure(self, task: str, exc: BaseException) -> None: """Surface a compact warning for failed auxiliary work.""" try: @@ -7719,17 +7802,9 @@ class AIAgent: # retry silently. Clear per-attempt state so the # next stream starts clean. Fire a "reconnecting" # marker so the user sees why the preamble is - # about to be re-streamed. - logger.info( - "Streaming attempt %s/%s died mid tool-call " - "(%s: %s) after user-visible text; retrying " - "silently to avoid losing the action. " - "Preamble will re-stream.", - _stream_attempt + 1, - _max_stream_retries + 1, - type(e).__name__, - e, - ) + # about to be re-streamed. Structured WARNING is + # emitted by ``_emit_stream_drop`` below; no + # additional INFO line needed. try: self._fire_stream_delta( "\n\n⚠ Connection dropped mid tool-call; " @@ -7751,14 +7826,11 @@ class AIAgent: result["partial_tool_names"] = [] deltas_were_sent["yes"] = False first_delta_fired["done"] = False - self._emit_status( - f"⚠️ Connection dropped mid tool-call " - f"({type(e).__name__}). Reconnecting… " - f"(attempt {_stream_attempt + 2}/{_max_stream_retries + 1})" - ) - self._touch_activity( - f"stream retry {_stream_attempt + 2}/{_max_stream_retries + 1} " - f"mid tool-call after {type(e).__name__}" + self._emit_stream_drop( + error=e, + attempt=_stream_attempt + 2, + max_attempts=_max_stream_retries + 1, + mid_tool_call=True, ) stale = request_client_holder.get("client") if stale is not None: @@ -7772,7 +7844,6 @@ class AIAgent: ) except Exception: pass - self._emit_status("🔄 Reconnected — resuming…") continue # SSE error events from proxies (e.g. OpenRouter sends @@ -7809,22 +7880,11 @@ class AIAgent: # Transient network / timeout error. Retry the # streaming request with a fresh connection first. if _stream_attempt < _max_stream_retries: - logger.info( - "Streaming attempt %s/%s failed (%s: %s), " - "retrying with fresh connection...", - _stream_attempt + 1, - _max_stream_retries + 1, - type(e).__name__, - e, - ) - self._emit_status( - f"⚠️ Connection to provider dropped " - f"({type(e).__name__}). Reconnecting… " - f"(attempt {_stream_attempt + 2}/{_max_stream_retries + 1})" - ) - self._touch_activity( - f"stream retry {_stream_attempt + 2}/{_max_stream_retries + 1} " - f"after {type(e).__name__}" + self._emit_stream_drop( + error=e, + attempt=_stream_attempt + 2, + max_attempts=_max_stream_retries + 1, + mid_tool_call=False, ) # Close the stale request client before retry stale = request_client_holder.get("client") @@ -7841,19 +7901,28 @@ class AIAgent: ) except Exception: pass - self._emit_status("🔄 Reconnected — resuming…") continue + # Retries exhausted. Log the final failure with + # full diagnostic detail and surface a status + # line — subagent lines get the ``[subagent-N]`` + # log_prefix so the parent can attribute them. + logger.warning( + "Streaming exhausted %s retries on transient " + "error: subagent_id=%s depth=%s provider=%s " + "error_type=%s error=%s", + _max_stream_retries + 1, + getattr(self, "_subagent_id", None) or "-", + getattr(self, "_delegate_depth", 0), + self.provider or "-", + type(e).__name__, + e, + ) self._emit_status( "❌ Connection to provider failed after " f"{_max_stream_retries + 1} attempts. " "The provider may be experiencing issues — " "try again in a moment." ) - logger.warning( - "Streaming exhausted %s retries on transient error: %s", - _max_stream_retries + 1, - e, - ) else: _err_lower = str(e).lower() _is_stream_unsupported = ( diff --git a/tests/run_agent/test_stream_drop_logging.py b/tests/run_agent/test_stream_drop_logging.py new file mode 100644 index 00000000000..b319725f76b --- /dev/null +++ b/tests/run_agent/test_stream_drop_logging.py @@ -0,0 +1,160 @@ +"""Tests for the structured stream-drop log + clearer single-line status. + +Regression coverage for the change that: + +1. Removed ``logger.setLevel(ERROR)`` on tools/run_agent/etc. in quiet mode. + It was clobbering the root logger's file handlers (agent.log/errors.log) + because Python checks logger-level before handler propagation, so + subagent stream-drop diagnostics were never written to disk. +2. Replaced the two ``⚠️ Connection dropped …`` + ``🔄 Reconnected …`` + ``_emit_status`` calls with a single ``_emit_stream_drop`` helper that: + - Always writes a structured WARNING to ``agent.log``. + - Always emits exactly ONE user-visible status line per drop (no + follow-up "Reconnected" line) that names the provider and error + class so multi-provider sessions can attribute it cleanly. + - Subagent lines get the ``[subagent-N]`` ``log_prefix`` automatically + via ``_emit_status`` → ``_vprint``. +""" + +from __future__ import annotations + +import logging +from unittest.mock import patch + +import pytest + +import run_agent +from run_agent import AIAgent + + +def _make_agent() -> AIAgent: + return AIAgent( + api_key="test-key", + base_url="https://openrouter.ai/api/v1", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + + +def test_quiet_mode_does_not_clobber_runagent_logger_level(): + """quiet_mode must not raise the ``run_agent`` logger above WARNING. + + Setting ``setLevel(ERROR)`` on the logger filters records *before* root + logger handlers (agent.log/errors.log) ever see them. Stream-drop + diagnostics must reach the file handlers regardless of console verbosity. + """ + _ = _make_agent() + for name in ("run_agent", "tools", "trajectory_compressor", "cron", "hermes_cli"): + logger = logging.getLogger(name) + assert logger.getEffectiveLevel() <= logging.WARNING, ( + f"{name} logger blocked at level {logger.getEffectiveLevel()} — " + f"file handlers will lose WARNING records" + ) + + +def test_log_stream_retry_writes_structured_warning(caplog): + agent = _make_agent() + agent._delegate_depth = 1 + agent._subagent_id = "sa-7-cafef00d" + agent.provider = "openrouter" + + err = ConnectionError("Network connection lost.") + with caplog.at_level(logging.WARNING, logger="run_agent"): + agent._log_stream_retry( + kind="drop mid tool-call", + error=err, + attempt=2, + max_attempts=3, + mid_tool_call=True, + ) + + matching = [r for r in caplog.records if "Stream drop mid tool-call" in r.getMessage()] + assert matching, f"no stream-drop WARNING captured; got {[r.getMessage() for r in caplog.records]}" + msg = matching[0].getMessage() + assert "subagent_id=sa-7-cafef00d" in msg + assert "depth=1" in msg + assert "provider=openrouter" in msg + assert "base_url=https://openrouter.ai/api/v1" in msg + assert "error_type=ConnectionError" in msg + + +@pytest.mark.parametrize("depth", [0, 1]) +def test_emit_stream_drop_emits_status_line(depth): + """Both top-level and subagent paths emit exactly one status line. + + Subagent lines get the ``[subagent-N]`` log_prefix via the parent's + ``_vprint`` plumbing — this test only checks that ``_emit_status`` is + called once with the right content. No "Reconnected" follow-up. + """ + agent = _make_agent() + agent._delegate_depth = depth + if depth > 0: + agent._subagent_id = "sa-2-cafe" + agent.provider = "openrouter" + + with patch.object(agent, "_emit_status") as mock_emit: + agent._emit_stream_drop( + error=ConnectionError("boom"), + attempt=2, + max_attempts=3, + mid_tool_call=True, + ) + + assert mock_emit.call_count == 1, ( + f"expected exactly one _emit_status call (no Reconnected follow-up), " + f"got {mock_emit.call_count}" + ) + msg = mock_emit.call_args.args[0] + assert "openrouter" in msg, f"provider name missing from status: {msg}" + assert "stream drop" in msg + assert "ConnectionError" in msg + assert "retry 2/3" in msg + # Single line — no separate "Reconnected" message. But the line itself + # should mention reconnecting so the user knows we're recovering. + assert "reconnect" in msg.lower() + + +@pytest.mark.parametrize("mid_tool_call", [True, False]) +def test_emit_stream_drop_always_writes_to_log(caplog, mid_tool_call): + """Both subagent and top-level drops produce a WARNING in agent.log.""" + agent = _make_agent() + agent._delegate_depth = 1 if mid_tool_call else 0 + agent.provider = "openrouter" + if mid_tool_call: + agent._subagent_id = "sa-99-feed" + + with caplog.at_level(logging.WARNING, logger="run_agent"): + agent._emit_stream_drop( + error=TimeoutError("read timeout"), + attempt=2, + max_attempts=3, + mid_tool_call=mid_tool_call, + ) + + found = [r for r in caplog.records if r.getMessage().startswith("Stream drop")] + assert found, "expected at least one Stream drop WARNING record" + msg = found[0].getMessage() + assert "error_type=TimeoutError" in msg + assert "provider=openrouter" in msg + + +def test_emit_stream_drop_provider_named_when_multi_provider(): + """The user-visible line must name the provider so multi-provider + sessions can tell which subagent dropped (the original two-line message + only said 'provider', forcing a log dive).""" + agent = _make_agent() + agent._delegate_depth = 1 + agent._subagent_id = "sa-1" + agent.provider = "anthropic" + + with patch.object(agent, "_emit_status") as mock_emit: + agent._emit_stream_drop( + error=ConnectionError("x"), + attempt=3, + max_attempts=3, + mid_tool_call=False, + ) + + msg = mock_emit.call_args.args[0] + assert "anthropic" in msg