mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
Closes #31066. Closes #31110. An unhandled `telegram.error.TimedOut` (or peer `NetworkError` / `httpx` connection error) propagating to the asyncio event loop killed the entire gateway process, taking down every profile attached to the same runner. systemd restarted the service after ~5s but the active conversation turn was lost. Public adapter methods (`adapter.send`, `adapter.edit_message`, `adapter.send_voice`, …) are individually try/except-wrapped on current main, but at least one async path was reaching the loop with TimedOut unhandled — the report's traceback ends at the deepest httpx frame and doesn't pinpoint the caller. Rather than audit 30+ call sites blind, install a loop-level safety net: `_gateway_loop_exception_handler` is set as the loop's exception handler in `start_gateway()` after `asyncio.get_running_loop()`. It classifies the exception via `_is_transient_network_error()` (walks the __cause__/__context__ chain, matches on class name so the test suite doesn't need the real telegram/httpx packages installed). Transient errors are logged at WARNING with full traceback so the originating call site stays diagnosable; everything else forwards to `loop.default_exception_handler` so real bugs still surface. Tests cover the classifier (known transients accepted, real bugs rejected, cause/context chain unwrap, cyclic-cause termination) and the handler (swallow + log warning, forward unknowns, missing-exception context). One end-to-end test schedules an orphan task raising TimedOut and asserts `asyncio.run` returns cleanly.
210 lines
6.8 KiB
Python
210 lines
6.8 KiB
Python
"""Tests for the gateway loop-level transient-network-error safety net.

Issues #31066 / #31110: unhandled ``telegram.error.TimedOut`` (or peer
``NetworkError`` / ``httpx`` connection error) propagating to the
asyncio event loop killed the gateway process, taking down every
profile attached to the same runner. The safety net installed in
:func:`gateway.run.start_gateway` catches the transient crash class
and logs+swallows it; non-transient errors still surface.

These tests pin the classifier and the loop handler so the safety net
can't silently regress to swallowing every exception.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
|
|
import pytest
|
|
|
|
from gateway.run import (
|
|
_gateway_loop_exception_handler,
|
|
_is_transient_network_error,
|
|
)
|
|
|
|
|
|
# ----- Fake exception classes that mimic the real wire types ----------
# We avoid importing telegram / httpx here so the test runs in environments
# without those packages installed (the classifier matches on class name).
|
|
|
|
class TimedOut(Exception):
    """Stand-in for ``telegram.error.TimedOut``.

    A local fake so the tests do not need the real ``telegram`` package;
    only the class name has to match the real exception type.
    """
|
|
|
|
|
|
class NetworkError(Exception):
    """Stand-in for ``telegram.error.NetworkError``.

    A local fake so the tests do not need the real ``telegram`` package;
    only the class name has to match the real exception type.
    """
|
|
|
|
|
|
class ConnectError(Exception):
    """Stand-in for ``httpx.ConnectError``.

    A local fake so the tests do not need the real ``httpx`` package;
    only the class name has to match the real exception type.
    """
|
|
|
|
|
|
class ReadTimeout(Exception):
    """Stand-in for ``httpx.ReadTimeout``.

    A local fake so the tests do not need the real ``httpx`` package;
    only the class name has to match the real exception type.
    """
|
|
|
|
|
|
class PoolTimeout(Exception):
    """Stand-in for ``httpx.PoolTimeout``.

    A local fake so the tests do not need the real ``httpx`` package;
    only the class name has to match the real exception type.
    """
|
|
|
|
|
|
class ClientConnectorError(Exception):
    """Stand-in for ``aiohttp.ClientConnectorError``.

    A local fake so the tests do not need the real ``aiohttp`` package;
    only the class name has to match the real exception type.
    """
|
|
|
|
|
|
class SomeUnrelatedBug(Exception):
    """A non-transient error that should NOT be swallowed.

    Used as the negative case: its class name is not one of the
    transient network error names exercised by these tests.
    """
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Classifier
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
@pytest.mark.parametrize(
    "exc_cls",
    [
        TimedOut,
        NetworkError,
        ConnectError,
        ReadTimeout,
        PoolTimeout,
        ClientConnectorError,
    ],
)
def test_transient_classifier_matches_known_network_errors(exc_cls):
    """Every well-known transient network exception class is classified.

    Parametrized over the local stand-in classes; each one is
    instantiated with a dummy message and must be reported as transient.
    """
    # ``is True`` pins a real boolean result, not just a truthy value.
    assert _is_transient_network_error(exc_cls("boom")) is True
|
|
|
|
|
|
def test_transient_classifier_rejects_unrelated_errors():
    """Real bugs (ValueError, KeyError, custom app errors) are NOT swallowed."""
    # Two builtin errors plus a custom application error: none of them
    # may be classified as transient.
    for exc in (ValueError("bad"), KeyError("missing"), SomeUnrelatedBug("x")):
        assert _is_transient_network_error(exc) is False
|
|
|
|
|
|
def test_transient_classifier_unwraps_cause_chain():
    """A NetworkError wrapping a ConnectError is still classified."""
    inner = ConnectError("connection refused")
    outer = NetworkError("upstream failed")
    # Link the exceptions by hand, as ``raise outer from inner`` would.
    outer.__cause__ = inner
    assert _is_transient_network_error(outer) is True
|
|
|
|
|
|
def test_transient_classifier_unwraps_context_chain():
    """Implicit ``__context__`` wrapping is also unwrapped."""
    try:
        try:
            raise TimedOut("upstream timeout")
        except TimedOut:
            # Re-raise something else with the original as implicit context
            raise SomeUnrelatedBug("wrapper")
    except SomeUnrelatedBug as e:
        # Keep a reference: ``e`` is unbound once the except block exits.
        wrapped = e
    # The wrapper class name is not transient, but the chained context is.
    assert _is_transient_network_error(wrapped) is True
|
|
|
|
|
|
def test_transient_classifier_does_not_infinite_loop_on_cyclic_cause():
    """A pathological self-referential cause chain terminates."""
    exc = SomeUnrelatedBug("loop")
    exc.__cause__ = exc  # cycle
    # Must return without hanging; nothing in the cycle is transient,
    # so the expected answer is False.
    assert _is_transient_network_error(exc) is False
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# Loop handler
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_handler_swallows_transient_error_and_logs_warning(caplog):
    """Transient errors are logged at WARNING but not re-raised.

    Calls the handler directly with a context dict of the shape asyncio
    passes to loop exception handlers, then inspects the captured log
    records for the exception class name.
    """
    loop = asyncio.new_event_loop()
    try:
        with caplog.at_level(logging.WARNING, logger="gateway.run"):
            _gateway_loop_exception_handler(
                loop,
                {
                    "message": "Task exception was never retrieved",
                    "exception": TimedOut("Timed out"),
                },
            )
        # Warning emitted, exception class name appears in the log.
        # ``getMessage()`` is the documented accessor and does not rely
        # on a formatter having populated ``record.message`` first.
        assert any("TimedOut" in r.getMessage() for r in caplog.records)
    finally:
        # Always release the loop created for this test.
        loop.close()
|
|
|
|
|
|
def test_handler_delegates_unknown_errors_to_default(monkeypatch):
    """A non-transient error is forwarded to ``loop.default_exception_handler``."""
    loop = asyncio.new_event_loop()
    try:
        forwarded: list[dict] = []

        def fake_default(ctx):
            # Record what the handler forwards instead of logging it.
            forwarded.append(ctx)

        monkeypatch.setattr(loop, "default_exception_handler", fake_default)

        context = {
            "message": "Something else broke",
            "exception": SomeUnrelatedBug("real bug"),
        }
        _gateway_loop_exception_handler(loop, context)
        # Forwarded exactly once, with the same context.
        assert forwarded == [context]
    finally:
        # Always release the loop created for this test.
        loop.close()
|
|
|
|
|
|
def test_handler_tolerates_missing_exception_key(monkeypatch):
    """Contexts without an ``exception`` key fall through to the default handler."""
    loop = asyncio.new_event_loop()
    try:
        forwarded: list[dict] = []
        # Replace the default handler with a recorder so the forward
        # can be observed.
        monkeypatch.setattr(
            loop, "default_exception_handler", lambda ctx: forwarded.append(ctx)
        )
        ctx = {"message": "warning without exception"}
        _gateway_loop_exception_handler(loop, ctx)
        assert forwarded == [ctx]
    finally:
        # Always release the loop created for this test.
        loop.close()
|
|
|
|
|
|
# ---------------------------------------------------------------------
|
|
# End-to-end: task-level
|
|
# ---------------------------------------------------------------------
|
|
|
|
|
|
def test_unhandled_transient_error_in_task_does_not_propagate_to_loop():
    """Smoke test the wiring as a loop would actually use it.

    Schedules a task that raises TimedOut and is never awaited. With the
    handler installed, the loop completes normally and logs a warning
    instead of dying. Without the handler, asyncio would emit
    ``Task exception was never retrieved`` and (depending on Python's
    debug mode) potentially escalate.
    """

    async def raiser():
        raise TimedOut("upstream timeout")

    async def main():
        import gc

        loop = asyncio.get_running_loop()
        loop.set_exception_handler(_gateway_loop_exception_handler)
        task = loop.create_task(raiser())
        # Give the task a tick to run and raise.
        await asyncio.sleep(0)
        # Don't await ``task`` — let it become an unhandled-exception task.
        del task
        # Force collection so the dropped task is finalized before
        # ``main`` returns.
        gc.collect()
        await asyncio.sleep(0)

    # If the safety net works, this returns cleanly. If not, the test
    # would still pass (asyncio's default is a warning, not a crash) —
    # the real assertion is that no unhandled exception escapes the
    # ``run`` boundary.
    asyncio.run(main())
|