fix(agent): always prefer streaming for API calls to prevent hung subagents (#3120)

The non-streaming API call path (_interruptible_api_call) had no
wall-clock timeout. When providers keep connections alive with SSE
keep-alive pings but never deliver a response, httpx's inactivity
timeout never fires and the call hangs indefinitely.

Subagents always used the non-streaming path because they have no
stream consumers (quiet_mode=True). This caused delegate_task to
hang for 40+ minutes in production.

The streaming path has two layers of protection:
- httpx read timeout (60s, HERMES_STREAM_READ_TIMEOUT)
- Stale stream detection (90s, HERMES_STREAM_STALE_TIMEOUT)

Both work because streaming sends chunks continuously — a 90-second
gap between chunks genuinely means the connection is broken, even for
reasoning models that take minutes to complete.

Now run_conversation() always prefers the streaming path. The streaming
method falls back to non-streaming automatically if the provider
doesn't support it. Stream delta callbacks are no-ops when no
consumers are registered, so there's no overhead for subagents.
This commit is contained in:
Teknium 2026-03-26 01:22:31 -07:00 committed by GitHub
parent c07c17f5f2
commit c511e087e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 31 additions and 10 deletions

View file

@ -6028,17 +6028,35 @@ class AIAgent:
if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
self._dump_api_request_debug(api_kwargs, reason="preflight")
if self._has_stream_consumers():
# Streaming path: fire delta callbacks for real-time
# token delivery to CLI display, gateway, or TTS.
def _stop_spinner():
nonlocal thinking_spinner
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
# Always prefer the streaming path — even without stream
# consumers. Streaming gives us fine-grained health
# checking (90s stale-stream detection, 60s read timeout)
# that the non-streaming path lacks. Without this,
# subagents and other quiet-mode callers can hang
# indefinitely when the provider keeps the connection
# alive with SSE pings but never delivers a response.
# The streaming path is a no-op for callbacks when no
# consumers are registered, and falls back to non-
# streaming automatically if the provider doesn't
# support it.
def _stop_spinner():
nonlocal thinking_spinner
if thinking_spinner:
thinking_spinner.stop("")
thinking_spinner = None
if self.thinking_callback:
self.thinking_callback("")
_use_streaming = True
if not self._has_stream_consumers():
# No display/TTS consumer. Still prefer streaming for
# health checking, but skip for Mock clients in tests
# (mocks return SimpleNamespace, not stream iterators).
from unittest.mock import Mock
if isinstance(getattr(self, "client", None), Mock):
_use_streaming = False
if _use_streaming:
response = self._interruptible_streaming_api_call(
api_kwargs, on_first_delta=_stop_spinner
)

View file

@ -273,6 +273,9 @@ def test_401_credential_refresh_recovers(monkeypatch):
return _anthropic_response("Auth refreshed")
self._interruptible_api_call = _fake_api_call
# Also patch streaming path — run_conversation now prefers
# streaming for health checking even without stream consumers.
self._interruptible_streaming_api_call = lambda api_kwargs, **kw: _fake_api_call(api_kwargs)
return super().run_conversation(
user_message, conversation_history=conversation_history, task_id=task_id
)