hermes-agent/tests/agent/test_cascading_interrupt_6600.py
teknium1 dd0d1222a2 fix(agent): don't retry interrupt-induced transport errors (cascading-interrupt hang)
When agent.interrupt() fires during an active LLM call, the main poll loop
force-closes the worker-local httpx client to stop token generation. That
raises a transport error (RemoteProtocolError) on the worker thread — the
EXPECTED consequence of our own close, not a network bug.

The streaming retry loop misclassified it as a transient connection error
and retried; each doomed retry stalled for the full stream-stale timeout
(up to 300s). Because the gateway caches AIAgent instances per session, the
stale worker outlived the interrupted turn and raced the next turn's request
on shared client state — the root of the multi-minute cascading-interrupt
hang reported in the wild.

Fix: a request-local _request_cancelled token set by the poll loop right
before the force-close, in both interruptible_api_call (non-streaming) and
interruptible_streaming_api_call. The worker's exception handler checks the
token and exits cleanly — no retry, no fallback, no 'reconnecting' status —
instead of treating the forced error as transient. The token is request-
local (not agent._interrupt_requested, which is cleared at turn boundaries)
so a stale worker outliving its turn still recognizes its own forced close.

Original diagnosis and fix by @kristianvast (PR #6600), against the then-
inline methods in run_agent.py. Those were since extracted into
agent/chat_completion_helpers.py, so the fix is reapplied there.

Co-authored-by: Kristian Vastveit <kristianvast@users.noreply.github.com>
2026-06-08 02:19:13 -07:00

134 lines
5.5 KiB
Python

"""Regression guard for the cascading-interrupt hang (PR #6600).
Original diagnosis and fix by Kristian Vastveit (@kristianvast) in PR #6600,
against the then-inline ``_interruptible_api_call`` /
``_interruptible_streaming_api_call`` methods in run_agent.py. Those methods
have since been extracted into ``agent/chat_completion_helpers.py``, so the
fix is reapplied there and these tests target the extracted functions.

The bug: when ``agent.interrupt()`` fires during an active LLM call, the main
poll loop force-closes the worker-local httpx client to stop token generation.
That raises a transport error (RemoteProtocolError) on the worker — the
EXPECTED consequence of our own close, not a network bug. The streaming retry
loop misclassified it as a transient connection error and retried, each doomed
retry stalling for the full stream-stale timeout (up to 300s). Because the
gateway caches AIAgent instances per session, the stale worker outlived the
turn and raced the next turn's request — the root of the multi-minute
cascading-interrupt hang.

The fix: a request-local ``_request_cancelled`` token set by the poll loop
right before the force-close. The worker's exception handler checks it and
exits cleanly (no retry, no fallback, no "reconnecting" status) instead of
treating the forced error as transient.
"""
import threading
import time
import types
from unittest.mock import MagicMock
import httpx
import pytest
from agent import chat_completion_helpers as cch
class _FakeInterruptError(Exception):
    """Stand-in for the transport error a force-close raises on the worker."""
def _make_agent() -> MagicMock:
    """Build a MagicMock agent wired with just enough surface for the helpers.

    Returns:
        A fresh ``MagicMock`` with ``api_mode`` set to ``"chat_completions"``,
        ``_interrupt_requested`` and ``verbose_logging`` set to ``False``, and
        ``_compute_non_stream_stale_timeout()`` returning ``5.0``. Every other
        attribute is an auto-created mock that individual tests override.
    """
    agent = MagicMock()
    agent.api_mode = "chat_completions"
    agent._interrupt_requested = False
    agent.verbose_logging = False
    # _compute_non_stream_stale_timeout / streaming setup helpers return
    # benign values; the real call path is mocked per-test.
    agent._compute_non_stream_stale_timeout.return_value = 5.0
    return agent
def test_non_streaming_cancel_does_not_surface_network_error():
    """A force-close during a non-streaming call must raise InterruptedError,
    not the swallowed transport error.

    The fake ``create`` flips ``agent._interrupt_requested`` from the worker,
    waits briefly, then raises ``httpx.RemoteProtocolError`` to mimic the
    error a force-closed connection produces. The test then checks that the
    caller sees ``InterruptedError``, that ``create`` ran exactly once, and
    that the call returned in well under the 3 second bound.
    """
    agent = _make_agent()
    create_calls = {"n": 0}
    fake_client = MagicMock()

    def _create(**kwargs):
        create_calls["n"] += 1
        # Simulate the main thread firing an interrupt mid-call, then the
        # force-close raising a transport error on this worker.
        agent._interrupt_requested = True
        time.sleep(0.3)  # let the poll loop observe the interrupt + force-close
        raise httpx.RemoteProtocolError("peer closed connection")

    fake_client.chat.completions.create.side_effect = _create
    agent._create_request_openai_client.return_value = fake_client
    agent._close_request_openai_client = MagicMock()
    agent._abort_request_openai_client = MagicMock()

    # Use the monotonic clock: wall-clock adjustments (NTP, DST) must not be
    # able to skew the elapsed-time assertion below.
    started = time.monotonic()
    with pytest.raises(InterruptedError):
        cch.interruptible_api_call(agent, {"model": "x", "messages": []})
    elapsed = time.monotonic() - started

    # The forced RemoteProtocolError must NOT surface as the raised error,
    # and the cancelled request must not have been attempted a second time.
    assert create_calls["n"] == 1
    assert elapsed < 3.0, f"interrupt took {elapsed:.1f}s — should be near-instant"
def test_normal_transient_error_still_raises_when_not_cancelled():
    """Regression guard: a real transport error with NO interrupt must still
    surface to the caller (so the outer retry loop can recover).

    The fake ``create`` raises ``httpx.RemoteProtocolError`` immediately and
    the interrupt flag stays ``False`` throughout, so the helper is expected
    to propagate that same exception type rather than ``InterruptedError``.
    """
    agent = _make_agent()
    fake_client = MagicMock()
    fake_client.chat.completions.create.side_effect = httpx.RemoteProtocolError(
        "genuine network drop"
    )
    agent._create_request_openai_client.return_value = fake_client
    agent._close_request_openai_client = MagicMock()
    agent._abort_request_openai_client = MagicMock()
    # Stated explicitly for the reader: no interrupt is pending in this test.
    agent._interrupt_requested = False

    with pytest.raises(httpx.RemoteProtocolError):
        cch.interruptible_api_call(agent, {"model": "x", "messages": []})
def test_request_cancelled_token_is_request_local():
    """Cancellation state from one call must not leak into the next call.

    The cancellation token is meant to be created per call, not shared on the
    agent, so that a stale worker from a previous turn cannot mistake its own
    forced error for a network bug once the agent-level interrupt flag has
    been reset. This test exercises that through two sequential calls on the
    SAME agent:

    1. Turn A is interrupted mid-call and must raise ``InterruptedError``.
    2. Turn B runs with the interrupt flag cleared and a client that raises
       ``httpx.RemoteProtocolError``; that genuine error must surface instead
       of being treated as a consequence of turn A's cancellation.
    """
    agent = _make_agent()

    # First call: interrupted.
    fake_client_1 = MagicMock()

    def _create_1(**kwargs):
        # Flip the flag from the worker, then fail the way a force-closed
        # connection would.
        agent._interrupt_requested = True
        time.sleep(0.3)
        raise httpx.RemoteProtocolError("forced close turn A")

    fake_client_1.chat.completions.create.side_effect = _create_1
    agent._create_request_openai_client.return_value = fake_client_1
    agent._close_request_openai_client = MagicMock()
    agent._abort_request_openai_client = MagicMock()

    with pytest.raises(InterruptedError):
        cch.interruptible_api_call(agent, {"model": "x", "messages": []})

    # Second call: NOT interrupted (turn boundary cleared the flag). A genuine
    # error must still surface — the previous call's cancellation must not leak.
    agent._interrupt_requested = False
    fake_client_2 = MagicMock()
    fake_client_2.chat.completions.create.side_effect = httpx.RemoteProtocolError(
        "genuine drop turn B"
    )
    agent._create_request_openai_client.return_value = fake_client_2

    with pytest.raises(httpx.RemoteProtocolError):
        cch.interruptible_api_call(agent, {"model": "x", "messages": []})