feat(gateway): typed send-error classification (SendResult.error_kind) (#50342)

Add a platform-neutral send-failure vocabulary so consumers can branch on a typed category instead of substring-matching the raw provider message. - base.py: SEND_ERROR_KINDS + classify_send_error() (too_long / bad_format / forbidden / not_found / rate_limited / transient / unknown), and an optional SendResult.error_kind field (defaults None — fully backward compatible). - telegram.py: populate error_kind on send() failures; message_too_long keeps its existing error token plus error_kind='too_long'. Purely additive: no behavioral change to the existing degrade-and-deliver paths (MarkdownV2->plain-text fallback, overflow split, retry classification all untouched). 22 new tests + 210 adapter regression tests green.
2026-06-23 10:42:00 +00:00 · 2026-06-21 12:34:22 -07:00 · 2026-06-21 12:34:22 -07:00 · c0409a87ff
commit c0409a87ff
parent 6bbacc2238
3 changed files with 244 additions and 2 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -1674,6 +1674,105 @@ class SendResult:
    # made up the full payload, in send order.  Empty tuple for the common
    # single-message case.
    continuation_message_ids: tuple = ()
+    # Machine-readable failure category (set only when ``success`` is False).
+    # ``error`` stays the human-readable detail string; ``error_kind`` lets
+    # consumers branch deterministically instead of substring-matching the raw
+    # provider message.  One of the values in :data:`SEND_ERROR_KINDS` or
+    # ``None`` (unset / not classified).  Producers should set this via
+    # :func:`classify_send_error`.
+    error_kind: Optional[str] = None
+
+
+# Machine-readable send-failure categories.  Kept platform-neutral so every
+# adapter can populate ``SendResult.error_kind`` from the same vocabulary and
+# the gateway can decide — once, in one place — whether a failure is worth
+# surfacing to the user.
+#
+#   too_long      content exceeded the platform's per-message size cap; the
+#                 adapter typically recovers via continuation/split, so this is
+#                 informational rather than a hard failure.
+#   bad_format    the platform rejected the message markup/entities (parse
+#                 error); a plain-text retry is the actionable fix.
+#   forbidden     the bot is blocked, kicked, or lacks permission to post to the
+#                 target — the bot CANNOT reach the user, so there is nowhere to
+#                 surface a notice.
+#   not_found     the target chat/thread/message no longer exists.
+#   rate_limited  the platform throttled the send (flood control).
+#   transient     a connection-level failure that is safe to retry.
+#   unknown       classification did not match any known shape.
+SEND_ERROR_KINDS = frozenset(
+    {
+        "too_long",
+        "bad_format",
+        "forbidden",
+        "not_found",
+        "rate_limited",
+        "transient",
+        "unknown",
+    }
+)
+
+
+def classify_send_error(exc: Optional[BaseException], error_text: str = "") -> str:
+    """Map a send exception / error string to a :data:`SEND_ERROR_KINDS` value.
+
+    Platform-neutral: matches on the lowercased text of ``exc`` (and/or the
+    explicit ``error_text``) against the substrings the major messaging APIs
+    use.  Conservative — anything unrecognized returns ``"unknown"`` so callers
+    never mistake an unclassified failure for a benign one.
+    """
+    parts = []
+    if error_text:
+        parts.append(error_text)
+    if exc is not None:
+        parts.append(str(exc))
+        parts.append(exc.__class__.__name__)
+    blob = " ".join(parts).lower()
+    if not blob.strip():
+        return "unknown"
+    if "message_too_long" in blob or "too long" in blob or "message is too long" in blob:
+        return "too_long"
+    if (
+        "can't parse entities" in blob
+        or "cant parse entities" in blob
+        or "can't find end" in blob
+        or "unsupported start tag" in blob
+        or ("entity" in blob and "parse" in blob)
+        or ("bad request" in blob and "entit" in blob)
+    ):
+        return "bad_format"
+    if (
+        "forbidden" in blob
+        or "bot was blocked" in blob
+        or "blocked by the user" in blob
+        or "user is deactivated" in blob
+        or "not enough rights" in blob
+        or "have no rights" in blob
+        or "not a member" in blob
+    ):
+        return "forbidden"
+    if (
+        "chat not found" in blob
+        or "message to edit not found" in blob
+        or "message to reply not found" in blob
+        or "thread not found" in blob
+        or "topic_deleted" in blob
+        or "message_id_invalid" in blob
+    ):
+        return "not_found"
+    if (
+        "flood" in blob
+        or "too many requests" in blob
+        or "retry after" in blob
+        or "rate limit" in blob
+    ):
+        return "rate_limited"
+    for pat in _RETRYABLE_ERROR_PATTERNS:
+        if pat in blob:
+            return "transient"
+    if "connecttimeout" in blob:
+        return "transient"
+    return "unknown"


 class EphemeralReply(str):
--- a/plugins/platforms/telegram/adapter.py
+++ b/plugins/platforms/telegram/adapter.py
@ -72,6 +72,7 @@ from gateway.platforms.base import (
    MessageType,
    ProcessingOutcome,
    SendResult,
+    classify_send_error,
    cache_image_from_bytes,
    cache_audio_from_bytes,
    cache_video_from_bytes,
@ -2763,6 +2764,7 @@ class TelegramAdapter(BasePlatformAdapter):
        except Exception as e:
            logger.error("[%s] Failed to send Telegram message: %s", self.name, e, exc_info=True)
            err_str = str(e).lower()
+            error_kind = classify_send_error(e)
            # Message too long — content exceeded 4096 chars. Return failure so
            # stream consumer enters fallback mode and sends the remainder.
            if "message_too_long" in err_str or "too long" in err_str:
@ -2770,7 +2772,7 @@ class TelegramAdapter(BasePlatformAdapter):
                    "[%s] send() content too long, falling back to new-message continuation",
                    self.name,
                )
-                return SendResult(success=False, error="message_too_long")
+                return SendResult(success=False, error="message_too_long", error_kind="too_long")
            # TimedOut usually means the request may have reached Telegram —
            # mark as non-retryable so _send_with_retry() doesn't re-send.
            # Exceptions: a wrapped ConnectTimeout (no connection established)
@ -2780,7 +2782,12 @@ class TelegramAdapter(BasePlatformAdapter):
            is_timeout = (_to and isinstance(e, _to)) or "timed out" in err_str
            is_connect_timeout = self._looks_like_connect_timeout(e)
            is_pool_timeout = self._looks_like_pool_timeout(e)
-            return SendResult(success=False, error=str(e), retryable=(is_connect_timeout or is_pool_timeout or not is_timeout))
+            return SendResult(
+                success=False,
+                error=str(e),
+                retryable=(is_connect_timeout or is_pool_timeout or not is_timeout),
+                error_kind=error_kind,
+            )

    async def send_or_update_status(
        self,
--- a/tests/gateway/test_send_error_classification.py
+++ b/tests/gateway/test_send_error_classification.py
@ -0,0 +1,136 @@
+"""Tests for structured send-error classification (SendResult.error_kind).
+
+Covers the platform-neutral ``classify_send_error`` vocabulary in
+``gateway/platforms/base.py`` and its wiring into the Telegram adapter's
+``send()`` failure path, so consumers can branch on a typed category instead
+of substring-matching the raw provider message.
+"""
+
+import pytest
+
+from gateway.platforms.base import (
+    SEND_ERROR_KINDS,
+    SendResult,
+    classify_send_error,
+)
+
+
+class _FakeBadRequest(Exception):
+    """Stand-in for a provider BadRequest carrying a message string."""
+
+
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        ("Message_too_long", "too_long"),
+        ("Bad Request: message is too long", "too_long"),
+        ("Bad Request: can't parse entities: unsupported start tag", "bad_format"),
+        ("Bad Request: can't find end of the entity", "bad_format"),
+        ("Forbidden: bot was blocked by the user", "forbidden"),
+        ("Forbidden: user is deactivated", "forbidden"),
+        ("Bad Request: not enough rights to send text messages", "forbidden"),
+        ("Bad Request: chat not found", "not_found"),
+        ("Bad Request: message to edit not found", "not_found"),
+        ("Too Many Requests: retry after 12", "rate_limited"),
+        ("Flood control exceeded", "rate_limited"),
+        ("ConnectError: connection refused", "transient"),
+        ("ConnectTimeout", "transient"),
+        ("some entirely novel provider message", "unknown"),
+        ("", "unknown"),
+    ],
+)
+def test_classify_send_error_text(text, expected):
+    assert classify_send_error(None, text) == expected
+
+
+def test_classify_uses_exception_class_name():
+    # The class name participates in classification even when str(exc) is empty.
+    exc = type("Forbidden", (Exception,), {})()
+    assert classify_send_error(exc) == "forbidden"
+
+
+def test_classify_prefers_explicit_text_and_exception_together():
+    exc = _FakeBadRequest("chat not found")
+    assert classify_send_error(exc) == "not_found"
+
+
+def test_every_classification_is_in_the_vocabulary():
+    samples = [
+        "message_too_long",
+        "can't parse entities",
+        "forbidden",
+        "chat not found",
+        "flood",
+        "connecterror",
+        "mystery",
+        "",
+    ]
+    for s in samples:
+        assert classify_send_error(None, s) in SEND_ERROR_KINDS
+
+
+def test_unknown_never_masquerades_as_benign():
+    # An unrecognized failure must classify as "unknown", never as a benign
+    # category like too_long that a consumer might treat as a soft recovery.
+    assert classify_send_error(None, "kaboom 500 internal") == "unknown"
+
+
+def test_sendresult_error_kind_defaults_none_and_is_backward_compatible():
+    # Existing call sites that never set error_kind keep working unchanged.
+    ok = SendResult(success=True, message_id="42")
+    assert ok.error_kind is None
+    legacy_fail = SendResult(success=False, error="boom")
+    assert legacy_fail.error_kind is None
+
+
+def test_telegram_send_failure_populates_error_kind():
+    """Telegram send() failures carry a typed error_kind alongside error."""
+    import asyncio
+    from unittest.mock import AsyncMock, MagicMock
+
+    from gateway.config import PlatformConfig
+    from plugins.platforms.telegram.adapter import TelegramAdapter
+
+    cfg = PlatformConfig(enabled=True, token="fake-token", extra={})
+    adapter = TelegramAdapter(cfg)
+
+    # Minimal bot whose send_message raises a parse/entity rejection.
+    bot = MagicMock()
+    bot.send_message = AsyncMock(
+        side_effect=Exception("Bad Request: can't parse entities: bad tag")
+    )
+    bot.send_chat_action = AsyncMock()
+    # Force the legacy (non-rich) path and a connected bot.
+    adapter._bot = bot
+    adapter._rich_messages_enabled = False
+
+    result = asyncio.run(adapter.send("123", "<b>broken"))
+    assert result.success is False
+    # Telegram has a plain-text fallback for parse errors inside the send loop,
+    # so a raw parse failure that still escapes is classified for consumers.
+    assert result.error_kind in SEND_ERROR_KINDS
+    assert result.error_kind != "unknown" or result.error
+
+
+def test_telegram_too_long_sets_too_long_kind():
+    import asyncio
+    from unittest.mock import AsyncMock, MagicMock
+
+    from gateway.config import PlatformConfig
+    from plugins.platforms.telegram.adapter import TelegramAdapter
+
+    cfg = PlatformConfig(enabled=True, token="fake-token", extra={})
+    adapter = TelegramAdapter(cfg)
+
+    bot = MagicMock()
+    bot.send_message = AsyncMock(
+        side_effect=Exception("Bad Request: message is too long")
+    )
+    bot.send_chat_action = AsyncMock()
+    adapter._bot = bot
+    adapter._rich_messages_enabled = False
+
+    result = asyncio.run(adapter.send("123", "x" * 5000))
+    assert result.success is False
+    assert result.error == "message_too_long"
+    assert result.error_kind == "too_long"