mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(agent): always prefer streaming for API calls to prevent hung subagents (#3120)
The non-streaming API call path (_interruptible_api_call) had no wall-clock timeout. When providers keep connections alive with SSE keep-alive pings but never deliver a response, httpx's inactivity timeout never fires and the call hangs indefinitely. Subagents always used the non-streaming path because they have no stream consumers (quiet_mode=True). This caused delegate_task to hang for 40+ minutes in production. The streaming path has two layers of protection: - httpx read timeout (60s, HERMES_STREAM_READ_TIMEOUT) - Stale stream detection (90s, HERMES_STREAM_STALE_TIMEOUT) Both work because streaming sends chunks continuously — a 90-second gap between chunks genuinely means the connection is broken, even for reasoning models that take minutes to complete. Now run_conversation() always prefers the streaming path. The streaming method falls back to non-streaming automatically if the provider doesn't support it. Stream delta callbacks are no-ops when no consumers are registered, so there's no overhead for subagents.
This commit is contained in:
parent
c07c17f5f2
commit
c511e087e0
2 changed files with 31 additions and 10 deletions
38
run_agent.py
38
run_agent.py
|
|
@ -6028,17 +6028,35 @@ class AIAgent:
|
|||
if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
|
||||
self._dump_api_request_debug(api_kwargs, reason="preflight")
|
||||
|
||||
if self._has_stream_consumers():
|
||||
# Streaming path: fire delta callbacks for real-time
|
||||
# token delivery to CLI display, gateway, or TTS.
|
||||
def _stop_spinner():
|
||||
nonlocal thinking_spinner
|
||||
if thinking_spinner:
|
||||
thinking_spinner.stop("")
|
||||
thinking_spinner = None
|
||||
if self.thinking_callback:
|
||||
self.thinking_callback("")
|
||||
# Always prefer the streaming path — even without stream
|
||||
# consumers. Streaming gives us fine-grained health
|
||||
# checking (90s stale-stream detection, 60s read timeout)
|
||||
# that the non-streaming path lacks. Without this,
|
||||
# subagents and other quiet-mode callers can hang
|
||||
# indefinitely when the provider keeps the connection
|
||||
# alive with SSE pings but never delivers a response.
|
||||
# The streaming path is a no-op for callbacks when no
|
||||
# consumers are registered, and falls back to non-
|
||||
# streaming automatically if the provider doesn't
|
||||
# support it.
|
||||
def _stop_spinner():
|
||||
nonlocal thinking_spinner
|
||||
if thinking_spinner:
|
||||
thinking_spinner.stop("")
|
||||
thinking_spinner = None
|
||||
if self.thinking_callback:
|
||||
self.thinking_callback("")
|
||||
|
||||
_use_streaming = True
|
||||
if not self._has_stream_consumers():
|
||||
# No display/TTS consumer. Still prefer streaming for
|
||||
# health checking, but skip for Mock clients in tests
|
||||
# (mocks return SimpleNamespace, not stream iterators).
|
||||
from unittest.mock import Mock
|
||||
if isinstance(getattr(self, "client", None), Mock):
|
||||
_use_streaming = False
|
||||
|
||||
if _use_streaming:
|
||||
response = self._interruptible_streaming_api_call(
|
||||
api_kwargs, on_first_delta=_stop_spinner
|
||||
)
|
||||
|
|
|
|||
|
|
@ -273,6 +273,9 @@ def test_401_credential_refresh_recovers(monkeypatch):
|
|||
return _anthropic_response("Auth refreshed")
|
||||
|
||||
self._interruptible_api_call = _fake_api_call
|
||||
# Also patch streaming path — run_conversation now prefers
|
||||
# streaming for health checking even without stream consumers.
|
||||
self._interruptible_streaming_api_call = lambda api_kwargs, **kw: _fake_api_call(api_kwargs)
|
||||
return super().run_conversation(
|
||||
user_message, conversation_history=conversation_history, task_id=task_id
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue