From fde1c8570ffe1bcd1d352efffb4eeafdfd975f0c Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sun, 28 Jun 2026 02:35:01 -0700
Subject: [PATCH] fix(tui_gateway): suppress WS peer-hangup teardown error flood (#50005) (#54126)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the Desktop forcibly closes its WebSocket mid-write, asyncio logs a full
traceback for every pending connection-lost callback — 50+ identical WinError
10054 (ConnectionResetError) lines per disconnect on Windows, the equivalent
ConnectionResetError/BrokenPipeError on POSIX. These are not actionable: they
are the expected side effect of the peer hanging up before our writes drained.

Install a loop exception handler on the gateway serving loop that collapses
exactly this teardown class (ConnectionResetError/ConnectionAbortedError/
BrokenPipeError originating from _call_connection_lost) to a single debug line,
forwarding every other loop error to the existing/default handler unchanged so
genuine loop bugs still surface.

Idempotent: the filter tags its own handler function, so re-installing while
the filter is still the loop's current handler is a no-op, and re-installing
after something else replaced the handler wraps the new handler again.
---
 hermes_cli/web_server.py             |  13 ++++
 tests/test_tui_gateway_loop_noise.py | 134 +++++++++++++++++++++++++++++++++++++++++++
 tui_gateway/loop_noise.py            |  85 +++++++++++++++++++++++++++
 3 files changed, 232 insertions(+)
 create mode 100644 tests/test_tui_gateway_loop_noise.py
 create mode 100644 tui_gateway/loop_noise.py

diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py
index 308e5f697b8..ccb20c60fc7 100644
--- a/hermes_cli/web_server.py
+++ b/hermes_cli/web_server.py
@@ -13686,6 +13686,19 @@ def start_server(
         print(f" Hermes Web UI → http://{host}:{actual_port}")
         _maybe_open_browser(host, actual_port, open_browser, initial_profile)

+        # Collapse the peer-hangup teardown flood (#50005). When the Desktop
+        # forcibly closes its WebSocket mid-write, asyncio logs a full
+        # traceback per pending connection-lost callback — 50+ identical
+        # WinError 10054 (ConnectionResetError) lines per disconnect on
+        # Windows. This filter downgrades exactly that class to one debug
+        # line and passes every other loop error through unchanged.
+        try:
+            from tui_gateway.loop_noise import install_loop_noise_filter
+
+            install_loop_noise_filter(asyncio.get_running_loop())
+        except Exception as exc:  # pragma: no cover - best-effort
+            _log.debug("loop noise filter install skipped: %s", exc)
+
         await server.main_loop()
         if server.started:
             await server.shutdown()
diff --git a/tests/test_tui_gateway_loop_noise.py b/tests/test_tui_gateway_loop_noise.py
new file mode 100644
index 00000000000..6172c937c01
--- /dev/null
+++ b/tests/test_tui_gateway_loop_noise.py
@@ -0,0 +1,134 @@
+"""Tests for tui_gateway.loop_noise — the WS peer-hangup teardown filter (#50005)."""
+
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+
+from tui_gateway.loop_noise import (
+    _is_benign_teardown,
+    install_loop_noise_filter,
+)
+
+
+class _FakeConnectionLostCallback:
+    """Stand-in whose repr matches asyncio's ``_call_connection_lost`` flood."""
+
+    def __repr__(self) -> str:
+        return "<Handle _SelectorSocketTransport._call_connection_lost(None)>"
+
+
+def test_benign_teardown_matches_reset_in_connection_lost():
+    ctx = {
+        "exception": ConnectionResetError(10054, "forcibly closed"),
+        "handle": _FakeConnectionLostCallback(),
+    }
+    assert _is_benign_teardown(ctx) is True
+
+
+def test_benign_teardown_matches_aborted_and_broken_pipe():
+    for exc in (
+        ConnectionAbortedError(10053, "aborted"),
+        BrokenPipeError("epipe"),
+    ):
+        ctx = {"exception": exc, "callback": _FakeConnectionLostCallback()}
+        assert _is_benign_teardown(ctx) is True
+
+
+def test_reset_outside_connection_lost_is_not_suppressed():
+    # Same error type, but NOT from the connection-lost teardown path — must
+    # fall through to the default handler.
+    ctx = {
+        "exception": ConnectionResetError("reset in a real handler"),
+        "handle": "<Handle some_real_callback()>",
+    }
+    assert _is_benign_teardown(ctx) is False
+
+
+def test_unrelated_exception_is_not_suppressed():
+    ctx = {
+        "exception": ValueError("boom"),
+        "handle": _FakeConnectionLostCallback(),
+    }
+    assert _is_benign_teardown(ctx) is False
+
+
+def test_no_exception_is_not_suppressed():
+    assert _is_benign_teardown({"message": "loop warning, no exc"}) is False
+
+
+def test_install_suppresses_flood_and_forwards_real_errors():
+    loop = asyncio.new_event_loop()
+    try:
+        forwarded: list[dict] = []
+        loop.set_exception_handler(lambda _loop, ctx: forwarded.append(ctx))
+
+        install_loop_noise_filter(loop)
+
+        # Benign teardown flood → swallowed, not forwarded.
+        loop.call_exception_handler(
+            {
+                "exception": ConnectionResetError(10054, "forcibly closed"),
+                "handle": _FakeConnectionLostCallback(),
+            }
+        )
+        assert forwarded == []
+
+        # Genuine loop error → forwarded to the previous handler unchanged.
+        real_ctx = {"exception": RuntimeError("genuine loop bug")}
+        loop.call_exception_handler(real_ctx)
+        assert len(forwarded) == 1
+        assert forwarded[0] is real_ctx
+    finally:
+        loop.close()
+
+
+def test_install_is_idempotent():
+    loop = asyncio.new_event_loop()
+    try:
+        install_loop_noise_filter(loop)
+        first = loop.get_exception_handler()
+        install_loop_noise_filter(loop)
+        # Second install must NOT wrap again — same handler object.
+        assert loop.get_exception_handler() is first
+    finally:
+        loop.close()
+
+
+def test_install_falls_back_to_default_handler_when_none_set():
+    loop = asyncio.new_event_loop()
+    try:
+        # No previous handler installed; benign flood still swallowed, and a
+        # real error must not raise out of the filter.
+        install_loop_noise_filter(loop)
+        loop.call_exception_handler(
+            {
+                "exception": ConnectionResetError(10054, "reset"),
+                "handle": _FakeConnectionLostCallback(),
+            }
+        )
+        # A genuine error routes to default_exception_handler — should not raise.
+        loop.call_exception_handler({"message": "some loop warning"})
+    finally:
+        loop.close()
+
+
+def test_install_reapplies_after_handler_is_replaced():
+    loop = asyncio.new_event_loop()
+    try:
+        install_loop_noise_filter(loop)
+        forwarded: list[dict] = []
+        # Something else clobbers the handler, dropping the filter.
+        loop.set_exception_handler(lambda _loop, ctx: forwarded.append(ctx))
+        # Re-install must notice the filter is gone and wrap the new handler.
+        install_loop_noise_filter(loop)
+        loop.call_exception_handler(
+            {
+                "exception": ConnectionResetError(10054, "forcibly closed"),
+                "handle": _FakeConnectionLostCallback(),
+            }
+        )
+        assert forwarded == []
+    finally:
+        loop.close()
diff --git a/tui_gateway/loop_noise.py b/tui_gateway/loop_noise.py
new file mode 100644
index 00000000000..321509747e6
--- /dev/null
+++ b/tui_gateway/loop_noise.py
@@ -0,0 +1,85 @@
+"""Suppress benign event-loop teardown noise on the gateway serving loop.
+
+When the Desktop client forcibly closes its WebSocket while the gateway still
+has pending socket operations, asyncio's transport teardown logs a full
+traceback for every pending ``_call_connection_lost`` callback. On Windows this
+surfaces as ``ConnectionResetError: [WinError 10054]`` (and the rarer
+``ConnectionAbortedError: [WinError 10053]``); on POSIX it is the equivalent
+``ConnectionResetError``/``BrokenPipeError``. A single client disconnect can
+emit 50+ identical tracebacks into ``errors.log`` (#50005).
+
+These are not actionable — they are the expected side effect of the peer
+hanging up before our writes drained. We install a loop exception handler that
+collapses exactly this class of teardown error to one debug line and forwards
+everything else to asyncio's default handler unchanged, so genuine loop bugs
+still surface.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any
+
+_log = logging.getLogger(__name__)
+
+# Connection-teardown errors that mean "the peer hung up mid-write". WinError
+# 10054 (connection reset) and 10053 (connection aborted) raise as these.
+_BENIGN_TEARDOWN_ERRORS = (
+    ConnectionResetError,
+    ConnectionAbortedError,
+    BrokenPipeError,
+)
+
+
+def _is_benign_teardown(context: dict[str, Any]) -> bool:
+    """True when the loop error is a peer-hangup during transport teardown.
+
+    Gated on BOTH the exception type AND the ``_call_connection_lost``
+    callback so we only swallow the disconnect flood — any other place these
+    errors surface (a real handler, a custom callback) still goes to the
+    default handler.
+    """
+    exc = context.get("exception")
+    if not isinstance(exc, _BENIGN_TEARDOWN_ERRORS):
+        return False
+    # The flood originates from the transport's connection-lost callback. Match
+    # on its repr so we don't suppress the same error type raised elsewhere.
+    callback = context.get("callback")
+    handle = context.get("handle")
+    marker = "_call_connection_lost"
+    return marker in repr(callback) or marker in repr(handle)
+
+
+def install_loop_noise_filter(loop: asyncio.AbstractEventLoop) -> None:
+    """Chain a teardown-noise filter ahead of the loop's existing handler.
+
+    Idempotent: if the loop's current exception handler is already this
+    filter, the call is a no-op, so it's safe to call on every
+    reconnect/serve entry.
+    """
+    previous = loop.get_exception_handler()
+    # The filter tags itself (see below); seeing the tag on the current
+    # handler means we are already installed and must not wrap again.
+    if getattr(previous, "_hermes_noise_filter", False):
+        return
+
+    def _handler(
+        active_loop: asyncio.AbstractEventLoop, context: dict[str, Any]
+    ) -> None:
+        if _is_benign_teardown(context):
+            _log.debug(
+                "ws peer hangup during teardown (suppressed): %s",
+                context.get("exception"),
+            )
+            return
+        if previous is not None:
+            previous(active_loop, context)
+        else:
+            active_loop.default_exception_handler(context)
+
+    # Tag the handler rather than the loop: a function attribute can always
+    # be set (loop implementations may reject foreign attributes), and the tag
+    # goes away with the handler if something else replaces it later.
+    _handler._hermes_noise_filter = True  # type: ignore[attr-defined]
+    loop.set_exception_handler(_handler)