fix(tui_gateway): suppress WS peer-hangup teardown error flood (#50005) (#54126)

When the Desktop forcibly closes its WebSocket mid-write, asyncio logs a
full traceback for every pending connection-lost callback — 50+ identical
WinError 10054 (ConnectionResetError) lines per disconnect on Windows, the
equivalent ConnectionResetError/BrokenPipeError on POSIX. These are not
actionable: they are the expected side effect of the peer hanging up before
our writes drained.

Install a loop exception handler on the gateway serving loop that collapses
exactly this teardown class (ConnectionResetError/ConnectionAbortedError/
BrokenPipeError originating from _call_connection_lost) to a single debug
line, forwarding every other loop error to the existing/default handler
unchanged so genuine loop bugs still surface. Idempotent per loop.
This commit is contained in:
Teknium 2026-06-28 02:35:01 -07:00 committed by GitHub
parent 6eec0d4f08
commit fde1c8570f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 210 additions and 0 deletions

View file

@ -13686,6 +13686,19 @@ def start_server(
print(f" Hermes Web UI → http://{host}:{actual_port}")
_maybe_open_browser(host, actual_port, open_browser, initial_profile)
# Collapse the peer-hangup teardown flood (#50005). When the Desktop
# forcibly closes its WebSocket mid-write, asyncio logs a full
# traceback per pending connection-lost callback — 50+ identical
# WinError 10054 (ConnectionResetError) lines per disconnect on
# Windows. This filter downgrades exactly that class to one debug
# line and passes every other loop error through unchanged.
try:
from tui_gateway.loop_noise import install_loop_noise_filter
install_loop_noise_filter(asyncio.get_running_loop())
except Exception as exc: # pragma: no cover - best-effort
_log.debug("loop noise filter install skipped: %s", exc)
await server.main_loop()
if server.started:
await server.shutdown()

View file

@ -0,0 +1,114 @@
"""Tests for tui_gateway.loop_noise — the WS peer-hangup teardown filter (#50005)."""
from __future__ import annotations
import asyncio
import pytest
from tui_gateway.loop_noise import (
_is_benign_teardown,
install_loop_noise_filter,
)
class _FakeConnectionLostCallback:
"""Stand-in whose repr matches asyncio's ``_call_connection_lost`` flood."""
def __repr__(self) -> str:
return "<Handle _ProactorBasePipeTransport._call_connection_lost(None)>"
def test_benign_teardown_matches_reset_in_connection_lost():
    """A connection reset reported through the connection-lost handle is benign."""
    reset_error = ConnectionResetError(10054, "forcibly closed")
    context = {"exception": reset_error, "handle": _FakeConnectionLostCallback()}

    assert _is_benign_teardown(context) is True
def test_benign_teardown_matches_aborted_and_broken_pipe():
    """Aborted-connection and broken-pipe errors match via the ``callback`` key too."""
    teardown_errors = [
        ConnectionAbortedError(10053, "aborted"),
        BrokenPipeError("epipe"),
    ]
    for error in teardown_errors:
        # These contexts carry the marker under "callback" rather than "handle".
        context = {"exception": error, "callback": _FakeConnectionLostCallback()}
        verdict = _is_benign_teardown(context)
        assert verdict is True
def test_reset_outside_connection_lost_is_not_suppressed():
    """A reset raised outside the connection-lost path must not be swallowed."""
    # The exception type alone would qualify, but the handle repr lacks the
    # ``_call_connection_lost`` marker, so the filter has to let it through.
    unrelated_handle = "<Handle some_other_handler()>"
    reset_error = ConnectionResetError("reset in a real handler")
    context = {"exception": reset_error, "handle": unrelated_handle}

    assert _is_benign_teardown(context) is False
def test_unrelated_exception_is_not_suppressed():
    """The connection-lost marker alone is not enough; the exception type must match."""
    unrelated_error = ValueError("boom")
    context = {"exception": unrelated_error, "handle": _FakeConnectionLostCallback()}

    assert _is_benign_teardown(context) is False
def test_no_exception_is_not_suppressed():
    """A message-only context (no ``exception`` key) is never treated as benign."""
    message_only_context = {"message": "loop warning, no exc"}

    assert _is_benign_teardown(message_only_context) is False
def test_install_suppresses_flood_and_forwards_real_errors():
    """The installed filter swallows the flood and chains to the prior handler."""
    forwarded: list[dict] = []

    def record(_loop, context):
        # Stands in for whatever handler was installed before the filter.
        forwarded.append(context)

    loop = asyncio.new_event_loop()
    try:
        loop.set_exception_handler(record)
        install_loop_noise_filter(loop)

        # Peer-hangup teardown error: must stop at the filter.
        benign_ctx = {
            "exception": ConnectionResetError(10054, "forcibly closed"),
            "handle": _FakeConnectionLostCallback(),
        }
        loop.call_exception_handler(benign_ctx)
        assert forwarded == []

        # Anything else must reach the previous handler as the same object.
        real_ctx = {"exception": RuntimeError("genuine loop bug")}
        loop.call_exception_handler(real_ctx)
        assert len(forwarded) == 1
        assert forwarded[0] is real_ctx
    finally:
        loop.close()
def test_install_is_idempotent():
    """Installing twice on one loop leaves the first filter in place, unwrapped."""
    loop = asyncio.new_event_loop()
    try:
        install_loop_noise_filter(loop)
        handler_after_first = loop.get_exception_handler()

        install_loop_noise_filter(loop)
        handler_after_second = loop.get_exception_handler()

        # Identity, not equality: a re-wrap would produce a new function object.
        assert handler_after_second is handler_after_first
    finally:
        loop.close()
def test_install_falls_back_to_default_handler_when_none_set():
    """With no prior handler, non-benign contexts go to the loop's default handler.

    ``loop.call_exception_handler`` catches anything a custom handler raises,
    so merely calling it proves nothing. Instead the loop's
    ``default_exception_handler`` is replaced with a recorder and the routing
    is asserted explicitly.
    """
    loop = asyncio.new_event_loop()
    try:
        routed_to_default: list[dict] = []
        # Instance-level override; the filter looks the method up on the loop
        # it is called with, so this captures the fallback path.
        loop.default_exception_handler = routed_to_default.append  # type: ignore[method-assign]

        # No previous handler installed before the filter.
        install_loop_noise_filter(loop)

        # Benign flood is swallowed: nothing reaches the default handler.
        loop.call_exception_handler(
            {
                "exception": ConnectionResetError(10054, "reset"),
                "handle": _FakeConnectionLostCallback(),
            }
        )
        assert routed_to_default == []

        # A genuine loop warning is handed to the default handler untouched.
        # Had the filter raised, asyncio would report a *different* context
        # here, and the identity check below would fail.
        warning_ctx = {"message": "some loop warning"}
        loop.call_exception_handler(warning_ctx)
        assert len(routed_to_default) == 1
        assert routed_to_default[0] is warning_ctx
    finally:
        loop.close()

83
tui_gateway/loop_noise.py Normal file
View file

@ -0,0 +1,83 @@
"""Suppress benign event-loop teardown noise on the gateway serving loop.
When the Desktop client forcibly closes its WebSocket while the gateway still
has pending socket operations, asyncio's transport teardown logs a full
traceback for every pending ``_call_connection_lost`` callback. On Windows this
surfaces as ``ConnectionResetError: [WinError 10054]`` (and the rarer
``ConnectionAbortedError: [WinError 10053]``); on POSIX it is the equivalent
``ConnectionResetError``/``BrokenPipeError``. A single client disconnect can
emit 50+ identical tracebacks into ``errors.log`` (#50005).
These are not actionable: they are the expected side effect of the peer
hanging up before our writes drained. We install a loop exception handler that
downgrades each error of exactly this teardown class to a single debug line and
forwards everything else unchanged to the previously installed handler (or to
asyncio's default handler when none was set), so genuine loop bugs still surface.
"""
from __future__ import annotations
import asyncio
import logging
from typing import Any
# Module logger; suppressed teardown errors are reported on it at DEBUG level.
_log = logging.getLogger(__name__)
# Connection-teardown errors that mean "the peer hung up mid-write". WinError
# 10054 (connection reset) and 10053 (connection aborted) raise as these.
_BENIGN_TEARDOWN_ERRORS = (
ConnectionResetError,
ConnectionAbortedError,
BrokenPipeError,
)
def _is_benign_teardown(context: dict[str, Any]) -> bool:
"""True when the loop error is a peer-hangup during transport teardown.
Gated on BOTH the exception type AND the ``_call_connection_lost``
callback so we only swallow the disconnect flood any other place these
errors surface (a real handler, a custom callback) still goes to the
default handler.
"""
exc = context.get("exception")
if not isinstance(exc, _BENIGN_TEARDOWN_ERRORS):
return False
# The flood originates from the transport's connection-lost callback. Match
# on its repr so we don't suppress the same error type raised elsewhere.
callback = context.get("callback")
handle = context.get("handle")
marker = "_call_connection_lost"
return marker in repr(callback) or marker in repr(handle)
def install_loop_noise_filter(loop: asyncio.AbstractEventLoop) -> None:
    """Chain a teardown-noise filter ahead of the loop's existing handler.

    The filter logs at DEBUG and drops every context that
    ``_is_benign_teardown`` recognises. All other contexts are forwarded
    unchanged to the handler that was installed before the filter, or to
    ``loop.default_exception_handler`` when there was none.

    Idempotent: the call is a no-op when either the loop or its currently
    installed exception handler carries the ``_hermes_noise_filter_installed``
    marker, so it is safe to call on every reconnect/serve entry. The handler
    is tagged in addition to the loop so that repeated installs do not stack
    filters on loop implementations that reject new instance attributes.
    """
    marker = "_hermes_noise_filter_installed"

    if getattr(loop, marker, False):
        return

    previous = loop.get_exception_handler()
    # ``is True`` so that a stand-in handler which fabricates attributes on
    # access (e.g. a mock) is not mistaken for our filter.
    if getattr(previous, marker, False) is True:
        return

    def _handler(loop: asyncio.AbstractEventLoop, context: dict[str, Any]) -> None:
        if _is_benign_teardown(context):
            _log.debug(
                "ws peer hangup during teardown (suppressed): %s",
                context.get("exception"),
            )
            return
        if previous is not None:
            previous(loop, context)
        else:
            loop.default_exception_handler(context)

    # Tag the handler itself; this marker survives even if the loop refuses
    # the attribute assignment below.
    setattr(_handler, marker, True)
    loop.set_exception_handler(_handler)

    # Also mark the loop instance so a second install (reconnect, re-serve) is
    # a no-op rather than stacking handlers.
    try:
        setattr(loop, marker, True)
    except (AttributeError, TypeError):  # pragma: no cover - exotic loop impls
        pass