fix(gateway): retry transient send failures and notify user on exhaustion (#3288)

When send() fails due to a network error (ConnectError, ReadTimeout, etc.),
the failure was silently logged and the user received no feedback — appearing
as a hang. In one reported case, a user waited 1+ hour for a response that
had already been generated but failed to deliver (#2910).

Adds _send_with_retry() to BasePlatformAdapter:
- Transient errors: retry up to 2x with exponential backoff + jitter
- On exhaustion: send delivery-failure notice so user knows to retry
- Permanent errors: fall back to plain-text version (preserves existing behavior)
- SendResult.retryable flag for platform-specific transient errors

All adapters benefit automatically via BasePlatformAdapter inheritance.

Cherry-picked from PR #3108 by Mibayy.

Co-authored-by: Mibayy <mibayy@users.noreply.github.com>
This commit is contained in:
Teknium 2026-03-26 17:37:10 -07:00 committed by GitHub
parent 716e616d28
commit bde45f5a2a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 336 additions and 14 deletions

View file

@ -8,6 +8,7 @@ and implement the required methods.
import asyncio
import logging
import os
import random
import re
import uuid
from abc import ABC, abstractmethod
@ -329,6 +330,24 @@ class SendResult:
message_id: Optional[str] = None
error: Optional[str] = None
raw_response: Any = None
retryable: bool = False # True for transient errors (network, timeout) — base will retry automatically
# Error substrings that indicate a transient network failure worth retrying
_RETRYABLE_ERROR_PATTERNS = (
"connecterror",
"connectionerror",
"connectionreset",
"connectionrefused",
"timeout",
"timed out",
"network",
"broken pipe",
"remotedisconnected",
"eoferror",
"readtimeout",
"writetimeout",
)
# Type for message handlers
@ -833,6 +852,91 @@ class BasePlatformAdapter(ABC):
except Exception:
pass
@staticmethod
def _is_retryable_error(error: Optional[str]) -> bool:
"""Return True if the error string looks like a transient network failure."""
if not error:
return False
lowered = error.lower()
return any(pat in lowered for pat in _RETRYABLE_ERROR_PATTERNS)
async def _send_with_retry(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Any = None,
max_retries: int = 2,
base_delay: float = 2.0,
) -> "SendResult":
"""
Send a message with automatic retry for transient network errors.
On permanent failures (e.g. formatting / permission errors) falls back
to a plain-text version before giving up. If all attempts fail due to
network errors, sends the user a brief delivery-failure notice so they
know to retry rather than waiting indefinitely.
"""
result = await self.send(
chat_id=chat_id,
content=content,
reply_to=reply_to,
metadata=metadata,
)
if result.success:
return result
error_str = result.error or ""
is_network = result.retryable or self._is_retryable_error(error_str)
if is_network:
# Retry with exponential backoff for transient errors
for attempt in range(1, max_retries + 1):
delay = base_delay * (2 ** (attempt - 1)) + random.uniform(0, 1)
logger.warning(
"[%s] Send failed (attempt %d/%d, retrying in %.1fs): %s",
self.name, attempt, max_retries, delay, error_str,
)
await asyncio.sleep(delay)
result = await self.send(
chat_id=chat_id,
content=content,
reply_to=reply_to,
metadata=metadata,
)
if result.success:
logger.info("[%s] Send succeeded on retry %d", self.name, attempt)
return result
error_str = result.error or ""
if not (result.retryable or self._is_retryable_error(error_str)):
break # error switched to non-transient — fall through to plain-text fallback
else:
# All retries exhausted (loop completed without break) — notify user
logger.error("[%s] Failed to deliver response after %d retries: %s", self.name, max_retries, error_str)
notice = (
"\u26a0\ufe0f Message delivery failed after multiple attempts. "
"Please try again \u2014 your request was processed but the response could not be sent."
)
try:
await self.send(chat_id=chat_id, content=notice, reply_to=reply_to, metadata=metadata)
except Exception as notify_err:
logger.debug("[%s] Could not send delivery-failure notice: %s", self.name, notify_err)
return result
# Non-network / post-retry formatting failure: try plain text as fallback
logger.warning("[%s] Send failed: %s — trying plain-text fallback", self.name, error_str)
fallback_result = await self.send(
chat_id=chat_id,
content=f"(Response formatting failed, plain text:)\n\n{content[:3500]}",
reply_to=reply_to,
metadata=metadata,
)
if not fallback_result.success:
logger.error("[%s] Fallback send also failed: %s", self.name, fallback_result.error)
return fallback_result
async def handle_message(self, event: MessageEvent) -> None:
"""
Process an incoming message.
@ -982,26 +1086,13 @@ class BasePlatformAdapter(ABC):
# Send the text portion
if text_content:
logger.info("[%s] Sending response (%d chars) to %s", self.name, len(text_content), event.source.chat_id)
result = await self.send(
result = await self._send_with_retry(
chat_id=event.source.chat_id,
content=text_content,
reply_to=event.message_id,
metadata=_thread_metadata,
)
# Log send failures (don't raise - user already saw tool progress)
if not result.success:
print(f"[{self.name}] Failed to send response: {result.error}")
# Try sending without markdown as fallback
fallback_result = await self.send(
chat_id=event.source.chat_id,
content=f"(Response formatting failed, plain text:)\n\n{text_content[:3500]}",
reply_to=event.message_id,
metadata=_thread_metadata,
)
if not fallback_result.success:
print(f"[{self.name}] Fallback send also failed: {fallback_result.error}")
# Human-like pacing delay between text and media
human_delay = self._get_human_delay()