mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(stream-retry): collapse two-line drop status, name provider, and let agent.log capture diagnostics (#22993)
Subagent stream drops were spamming the parent terminal with two lines
per blip ('Connection dropped...' + 'Reconnected...') while leaving zero
breadcrumb in agent.log to debug them.
Two underlying bugs, fixed together:
1. quiet_mode raised the run_agent/tools/etc. loggers to ERROR, which
filters records before root-logger file handlers see them. The comment
claimed 'File handlers still capture everything' — that was wrong.
Removed in both run_agent.py and cli.py; console quietness already
comes from hermes_logging not installing a console StreamHandler in
non-verbose mode.
2. The stream-retry blocks emitted two _emit_status calls per drop
('⚠️ Connection dropped... Reconnecting...' + '🔄 Reconnected —
resuming…') with no provider name, so multi-provider sessions had to
dig through agent.log to attribute a drop. Replaced both call sites
with a single _emit_stream_drop helper that emits ONE line naming the
provider and error class, and always writes a structured WARNING to
agent.log with subagent_id, depth, provider, base_url, error_type.
Net UX change: 6 lines per triple-subagent drop → 3 lines, each
naming the provider. agent.log now has a structured breadcrumb per
retry that didn't exist before.
Tests: 6 new tests in tests/run_agent/test_stream_drop_logging.py
covering the logger-level guard, structured WARNING content, single
status line per drop (no Reconnected follow-up), and provider naming.
This commit is contained in:
parent
3800972dd0
commit
68e44642c8
3 changed files with 290 additions and 56 deletions
9
cli.py
9
cli.py
|
|
@ -8267,8 +8267,13 @@ class HermesCLI:
|
|||
logging.getLogger(noisy).setLevel(logging.WARNING)
|
||||
else:
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
for quiet_logger in ('tools', 'run_agent', 'trajectory_compressor', 'cron', 'hermes_cli'):
|
||||
logging.getLogger(quiet_logger).setLevel(logging.ERROR)
|
||||
# NOTE: We deliberately do NOT raise per-logger levels for
|
||||
# tools/run_agent/etc. in quiet mode. Setting logger.setLevel
|
||||
# above the file handler level filters records before they
|
||||
# reach handlers, so agent.log / errors.log lose visibility
|
||||
# into stream-retry events, credential rotations, etc.
|
||||
# Console quietness is enforced by hermes_logging not
|
||||
# installing a console StreamHandler in non-verbose mode.
|
||||
|
||||
def _show_insights(self, command: str = "/insights"):
|
||||
"""Show usage insights and analytics from session history."""
|
||||
|
|
|
|||
177
run_agent.py
177
run_agent.py
|
|
@ -1435,18 +1435,17 @@ class AIAgent:
|
|||
logger.info("Verbose logging enabled (third-party library logs suppressed)")
|
||||
else:
|
||||
if self.quiet_mode:
|
||||
# In quiet mode (CLI default), suppress all tool/infra log
|
||||
# noise on the *console*. The TUI has its own rich display
|
||||
# for status; logger INFO/WARNING messages just clutter it.
|
||||
# File handlers (agent.log, errors.log) still capture everything.
|
||||
for quiet_logger in [
|
||||
'tools', # all tools.* (terminal, browser, web, file, etc.)
|
||||
'run_agent', # agent runner internals
|
||||
'trajectory_compressor',
|
||||
'cron', # scheduler (only relevant in daemon mode)
|
||||
'hermes_cli', # CLI helpers
|
||||
]:
|
||||
logging.getLogger(quiet_logger).setLevel(logging.ERROR)
|
||||
# In quiet mode (CLI default), keep console output clean —
|
||||
# but DO NOT raise per-logger levels. Doing so prevents the
|
||||
# root logger's file handlers (agent.log, errors.log) from
|
||||
# ever seeing the records, because Python checks
|
||||
# logger.isEnabledFor() before handler propagation. We rely
|
||||
# on the fact that hermes_logging.setup_logging() does not
|
||||
# install a console StreamHandler in quiet mode — so INFO
|
||||
# records flow to the file handlers but never reach a
|
||||
# console. Any future noise reduction belongs at the
|
||||
# handler level inside hermes_logging.py, not here.
|
||||
pass
|
||||
|
||||
# Internal stream callback (set during streaming TTS).
|
||||
# Initialized here so _vprint can reference it before run_conversation.
|
||||
|
|
@ -2823,6 +2822,90 @@ class AIAgent:
|
|||
except Exception:
|
||||
logger.debug("status_callback error in _emit_warning", exc_info=True)
|
||||
|
||||
def _log_stream_retry(
|
||||
self,
|
||||
*,
|
||||
kind: str,
|
||||
error: BaseException,
|
||||
attempt: int,
|
||||
max_attempts: int,
|
||||
mid_tool_call: bool,
|
||||
) -> None:
|
||||
"""Record a transient stream-drop and retry to ``agent.log``.
|
||||
|
||||
Always logs a structured WARNING so users have a breadcrumb regardless
|
||||
of UI verbosity. Subagents in particular benefit because their
|
||||
retries no longer spam the parent's terminal — but the file log keeps
|
||||
full detail (provider, error class, attempt, base_url, subagent_id).
|
||||
"""
|
||||
try:
|
||||
try:
|
||||
_summary = self._summarize_api_error(error)
|
||||
except Exception:
|
||||
_summary = str(error)
|
||||
if _summary and len(_summary) > 240:
|
||||
_summary = _summary[:240] + "…"
|
||||
logger.warning(
|
||||
"Stream %s on attempt %s/%s — retrying. "
|
||||
"subagent_id=%s depth=%s provider=%s base_url=%s "
|
||||
"error_type=%s error=%s",
|
||||
kind,
|
||||
attempt,
|
||||
max_attempts,
|
||||
getattr(self, "_subagent_id", None) or "-",
|
||||
getattr(self, "_delegate_depth", 0),
|
||||
self.provider or "-",
|
||||
self.base_url or "-",
|
||||
type(error).__name__,
|
||||
_summary,
|
||||
extra={"mid_tool_call": mid_tool_call},
|
||||
)
|
||||
except Exception:
|
||||
logger.debug("stream-retry log emit failed", exc_info=True)
|
||||
|
||||
def _emit_stream_drop(
|
||||
self,
|
||||
*,
|
||||
error: BaseException,
|
||||
attempt: int,
|
||||
max_attempts: int,
|
||||
mid_tool_call: bool,
|
||||
) -> None:
|
||||
"""Emit a single user-visible line for a stream drop+retry.
|
||||
|
||||
Both top-level agents and subagents announce drops in the UI — the
|
||||
parent prefixes subagent lines with ``[subagent-N]`` via ``log_prefix``
|
||||
so they're easy to attribute. All cases also write a structured
|
||||
WARNING to ``agent.log`` via :meth:`_log_stream_retry` with the full
|
||||
diagnostic detail (subagent_id, provider, base_url, error_type) for
|
||||
post-hoc analysis.
|
||||
|
||||
Replaces the older two-line ``⚠️ Connection dropped …`` +
|
||||
``🔄 Reconnected …`` pair with a single information-dense line that
|
||||
names the provider (so multi-provider sessions can tell who dropped)
|
||||
and the error class without ambiguity.
|
||||
"""
|
||||
kind = "drop mid tool-call" if mid_tool_call else "drop"
|
||||
self._log_stream_retry(
|
||||
kind=kind,
|
||||
error=error,
|
||||
attempt=attempt,
|
||||
max_attempts=max_attempts,
|
||||
mid_tool_call=mid_tool_call,
|
||||
)
|
||||
provider = self.provider or "provider"
|
||||
try:
|
||||
self._emit_status(
|
||||
f"⚠️ {provider} stream {kind} ({type(error).__name__}) "
|
||||
f"— reconnecting, retry {attempt}/{max_attempts}"
|
||||
)
|
||||
self._touch_activity(
|
||||
f"stream retry {attempt}/{max_attempts} "
|
||||
f"after {type(error).__name__}"
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _emit_auxiliary_failure(self, task: str, exc: BaseException) -> None:
|
||||
"""Surface a compact warning for failed auxiliary work."""
|
||||
try:
|
||||
|
|
@ -7719,17 +7802,9 @@ class AIAgent:
|
|||
# retry silently. Clear per-attempt state so the
|
||||
# next stream starts clean. Fire a "reconnecting"
|
||||
# marker so the user sees why the preamble is
|
||||
# about to be re-streamed.
|
||||
logger.info(
|
||||
"Streaming attempt %s/%s died mid tool-call "
|
||||
"(%s: %s) after user-visible text; retrying "
|
||||
"silently to avoid losing the action. "
|
||||
"Preamble will re-stream.",
|
||||
_stream_attempt + 1,
|
||||
_max_stream_retries + 1,
|
||||
type(e).__name__,
|
||||
e,
|
||||
)
|
||||
# about to be re-streamed. Structured WARNING is
|
||||
# emitted by ``_emit_stream_drop`` below; no
|
||||
# additional INFO line needed.
|
||||
try:
|
||||
self._fire_stream_delta(
|
||||
"\n\n⚠ Connection dropped mid tool-call; "
|
||||
|
|
@ -7751,14 +7826,11 @@ class AIAgent:
|
|||
result["partial_tool_names"] = []
|
||||
deltas_were_sent["yes"] = False
|
||||
first_delta_fired["done"] = False
|
||||
self._emit_status(
|
||||
f"⚠️ Connection dropped mid tool-call "
|
||||
f"({type(e).__name__}). Reconnecting… "
|
||||
f"(attempt {_stream_attempt + 2}/{_max_stream_retries + 1})"
|
||||
)
|
||||
self._touch_activity(
|
||||
f"stream retry {_stream_attempt + 2}/{_max_stream_retries + 1} "
|
||||
f"mid tool-call after {type(e).__name__}"
|
||||
self._emit_stream_drop(
|
||||
error=e,
|
||||
attempt=_stream_attempt + 2,
|
||||
max_attempts=_max_stream_retries + 1,
|
||||
mid_tool_call=True,
|
||||
)
|
||||
stale = request_client_holder.get("client")
|
||||
if stale is not None:
|
||||
|
|
@ -7772,7 +7844,6 @@ class AIAgent:
|
|||
)
|
||||
except Exception:
|
||||
pass
|
||||
self._emit_status("🔄 Reconnected — resuming…")
|
||||
continue
|
||||
|
||||
# SSE error events from proxies (e.g. OpenRouter sends
|
||||
|
|
@ -7809,22 +7880,11 @@ class AIAgent:
|
|||
# Transient network / timeout error. Retry the
|
||||
# streaming request with a fresh connection first.
|
||||
if _stream_attempt < _max_stream_retries:
|
||||
logger.info(
|
||||
"Streaming attempt %s/%s failed (%s: %s), "
|
||||
"retrying with fresh connection...",
|
||||
_stream_attempt + 1,
|
||||
_max_stream_retries + 1,
|
||||
type(e).__name__,
|
||||
e,
|
||||
)
|
||||
self._emit_status(
|
||||
f"⚠️ Connection to provider dropped "
|
||||
f"({type(e).__name__}). Reconnecting… "
|
||||
f"(attempt {_stream_attempt + 2}/{_max_stream_retries + 1})"
|
||||
)
|
||||
self._touch_activity(
|
||||
f"stream retry {_stream_attempt + 2}/{_max_stream_retries + 1} "
|
||||
f"after {type(e).__name__}"
|
||||
self._emit_stream_drop(
|
||||
error=e,
|
||||
attempt=_stream_attempt + 2,
|
||||
max_attempts=_max_stream_retries + 1,
|
||||
mid_tool_call=False,
|
||||
)
|
||||
# Close the stale request client before retry
|
||||
stale = request_client_holder.get("client")
|
||||
|
|
@ -7841,19 +7901,28 @@ class AIAgent:
|
|||
)
|
||||
except Exception:
|
||||
pass
|
||||
self._emit_status("🔄 Reconnected — resuming…")
|
||||
continue
|
||||
# Retries exhausted. Log the final failure with
|
||||
# full diagnostic detail and surface a status
|
||||
# line — subagent lines get the ``[subagent-N]``
|
||||
# log_prefix so the parent can attribute them.
|
||||
logger.warning(
|
||||
"Streaming exhausted %s retries on transient "
|
||||
"error: subagent_id=%s depth=%s provider=%s "
|
||||
"error_type=%s error=%s",
|
||||
_max_stream_retries + 1,
|
||||
getattr(self, "_subagent_id", None) or "-",
|
||||
getattr(self, "_delegate_depth", 0),
|
||||
self.provider or "-",
|
||||
type(e).__name__,
|
||||
e,
|
||||
)
|
||||
self._emit_status(
|
||||
"❌ Connection to provider failed after "
|
||||
f"{_max_stream_retries + 1} attempts. "
|
||||
"The provider may be experiencing issues — "
|
||||
"try again in a moment."
|
||||
)
|
||||
logger.warning(
|
||||
"Streaming exhausted %s retries on transient error: %s",
|
||||
_max_stream_retries + 1,
|
||||
e,
|
||||
)
|
||||
else:
|
||||
_err_lower = str(e).lower()
|
||||
_is_stream_unsupported = (
|
||||
|
|
|
|||
160
tests/run_agent/test_stream_drop_logging.py
Normal file
160
tests/run_agent/test_stream_drop_logging.py
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
"""Tests for the structured stream-drop log + clearer single-line status.
|
||||
|
||||
Regression coverage for the change that:
|
||||
|
||||
1. Removed ``logger.setLevel(ERROR)`` on tools/run_agent/etc. in quiet mode.
|
||||
It was clobbering the root logger's file handlers (agent.log/errors.log)
|
||||
because Python checks logger-level before handler propagation, so
|
||||
subagent stream-drop diagnostics were never written to disk.
|
||||
2. Replaced the two ``⚠️ Connection dropped …`` + ``🔄 Reconnected …``
|
||||
``_emit_status`` calls with a single ``_emit_stream_drop`` helper that:
|
||||
- Always writes a structured WARNING to ``agent.log``.
|
||||
- Always emits exactly ONE user-visible status line per drop (no
|
||||
follow-up "Reconnected" line) that names the provider and error
|
||||
class so multi-provider sessions can attribute it cleanly.
|
||||
- Subagent lines get the ``[subagent-N]`` ``log_prefix`` automatically
|
||||
via ``_emit_status`` → ``_vprint``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
import run_agent
|
||||
from run_agent import AIAgent
|
||||
|
||||
|
||||
def _make_agent() -> AIAgent:
|
||||
return AIAgent(
|
||||
api_key="test-key",
|
||||
base_url="https://openrouter.ai/api/v1",
|
||||
quiet_mode=True,
|
||||
skip_context_files=True,
|
||||
skip_memory=True,
|
||||
)
|
||||
|
||||
|
||||
def test_quiet_mode_does_not_clobber_runagent_logger_level():
|
||||
"""quiet_mode must not raise the ``run_agent`` logger above WARNING.
|
||||
|
||||
Setting ``setLevel(ERROR)`` on the logger filters records *before* root
|
||||
logger handlers (agent.log/errors.log) ever see them. Stream-drop
|
||||
diagnostics must reach the file handlers regardless of console verbosity.
|
||||
"""
|
||||
_ = _make_agent()
|
||||
for name in ("run_agent", "tools", "trajectory_compressor", "cron", "hermes_cli"):
|
||||
logger = logging.getLogger(name)
|
||||
assert logger.getEffectiveLevel() <= logging.WARNING, (
|
||||
f"{name} logger blocked at level {logger.getEffectiveLevel()} — "
|
||||
f"file handlers will lose WARNING records"
|
||||
)
|
||||
|
||||
|
||||
def test_log_stream_retry_writes_structured_warning(caplog):
|
||||
agent = _make_agent()
|
||||
agent._delegate_depth = 1
|
||||
agent._subagent_id = "sa-7-cafef00d"
|
||||
agent.provider = "openrouter"
|
||||
|
||||
err = ConnectionError("Network connection lost.")
|
||||
with caplog.at_level(logging.WARNING, logger="run_agent"):
|
||||
agent._log_stream_retry(
|
||||
kind="drop mid tool-call",
|
||||
error=err,
|
||||
attempt=2,
|
||||
max_attempts=3,
|
||||
mid_tool_call=True,
|
||||
)
|
||||
|
||||
matching = [r for r in caplog.records if "Stream drop mid tool-call" in r.getMessage()]
|
||||
assert matching, f"no stream-drop WARNING captured; got {[r.getMessage() for r in caplog.records]}"
|
||||
msg = matching[0].getMessage()
|
||||
assert "subagent_id=sa-7-cafef00d" in msg
|
||||
assert "depth=1" in msg
|
||||
assert "provider=openrouter" in msg
|
||||
assert "base_url=https://openrouter.ai/api/v1" in msg
|
||||
assert "error_type=ConnectionError" in msg
|
||||
|
||||
|
||||
@pytest.mark.parametrize("depth", [0, 1])
|
||||
def test_emit_stream_drop_emits_status_line(depth):
|
||||
"""Both top-level and subagent paths emit exactly one status line.
|
||||
|
||||
Subagent lines get the ``[subagent-N]`` log_prefix via the parent's
|
||||
``_vprint`` plumbing — this test only checks that ``_emit_status`` is
|
||||
called once with the right content. No "Reconnected" follow-up.
|
||||
"""
|
||||
agent = _make_agent()
|
||||
agent._delegate_depth = depth
|
||||
if depth > 0:
|
||||
agent._subagent_id = "sa-2-cafe"
|
||||
agent.provider = "openrouter"
|
||||
|
||||
with patch.object(agent, "_emit_status") as mock_emit:
|
||||
agent._emit_stream_drop(
|
||||
error=ConnectionError("boom"),
|
||||
attempt=2,
|
||||
max_attempts=3,
|
||||
mid_tool_call=True,
|
||||
)
|
||||
|
||||
assert mock_emit.call_count == 1, (
|
||||
f"expected exactly one _emit_status call (no Reconnected follow-up), "
|
||||
f"got {mock_emit.call_count}"
|
||||
)
|
||||
msg = mock_emit.call_args.args[0]
|
||||
assert "openrouter" in msg, f"provider name missing from status: {msg}"
|
||||
assert "stream drop" in msg
|
||||
assert "ConnectionError" in msg
|
||||
assert "retry 2/3" in msg
|
||||
# Single line — no separate "Reconnected" message. But the line itself
|
||||
# should mention reconnecting so the user knows we're recovering.
|
||||
assert "reconnect" in msg.lower()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mid_tool_call", [True, False])
|
||||
def test_emit_stream_drop_always_writes_to_log(caplog, mid_tool_call):
|
||||
"""Both subagent and top-level drops produce a WARNING in agent.log."""
|
||||
agent = _make_agent()
|
||||
agent._delegate_depth = 1 if mid_tool_call else 0
|
||||
agent.provider = "openrouter"
|
||||
if mid_tool_call:
|
||||
agent._subagent_id = "sa-99-feed"
|
||||
|
||||
with caplog.at_level(logging.WARNING, logger="run_agent"):
|
||||
agent._emit_stream_drop(
|
||||
error=TimeoutError("read timeout"),
|
||||
attempt=2,
|
||||
max_attempts=3,
|
||||
mid_tool_call=mid_tool_call,
|
||||
)
|
||||
|
||||
found = [r for r in caplog.records if r.getMessage().startswith("Stream drop")]
|
||||
assert found, "expected at least one Stream drop WARNING record"
|
||||
msg = found[0].getMessage()
|
||||
assert "error_type=TimeoutError" in msg
|
||||
assert "provider=openrouter" in msg
|
||||
|
||||
|
||||
def test_emit_stream_drop_provider_named_when_multi_provider():
|
||||
"""The user-visible line must name the provider so multi-provider
|
||||
sessions can tell which subagent dropped (the original two-line message
|
||||
only said 'provider', forcing a log dive)."""
|
||||
agent = _make_agent()
|
||||
agent._delegate_depth = 1
|
||||
agent._subagent_id = "sa-1"
|
||||
agent.provider = "anthropic"
|
||||
|
||||
with patch.object(agent, "_emit_status") as mock_emit:
|
||||
agent._emit_stream_drop(
|
||||
error=ConnectionError("x"),
|
||||
attempt=3,
|
||||
max_attempts=3,
|
||||
mid_tool_call=False,
|
||||
)
|
||||
|
||||
msg = mock_emit.call_args.args[0]
|
||||
assert "anthropic" in msg
|
||||
Loading…
Add table
Add a link
Reference in a new issue