fix(photon): classify Envoy overflow errors as retryable; add typing cooldown

Closes #50185

Two independent gaps let a transient Photon/Spectrum upstream overflow
degrade message delivery and amplify gRPC pressure:

1. _is_retryable_error did not recognise Photon- or Envoy-specific error
   strings ("internal sidecar error", "upstream connect error",
   "reset reason: overflow"), so _send_with_retry fell through to the
   plain-text fallback immediately instead of backing off and retrying.

2. send_typing had no rate gate, so a burst of typing-indicator calls
   during an overflow event kept hitting the upstream gRPC connection and
   widened the failure window.

Fix:
- Add _PHOTON_RETRYABLE_PATTERNS with the three high-specificity Envoy /
  sidecar substrings and override _is_retryable_error on PhotonAdapter to
  check them after delegating to the base-class patterns. base.py and all
  other adapters are untouched.
- Add a 5 s per-chat cooldown in send_typing backed by _typing_last_sent.
  stop_typing clears the entry so the next start after a completed turn
  fires immediately — only rapid consecutive starts without a stop are
  suppressed.
- Reduce PhotonAdapter._send_with_retry default max_retries from 2 to 1
  (one retry after a single 2 s back-off) — enough to confirm whether the
  Envoy circuit-breaker has opened, without adding unnecessary latency.

All changes are scoped to plugins/platforms/photon/adapter.py.
This commit is contained in:
joaomarcos 2026-06-21 13:53:26 -03:00 committed by Teknium
parent 7a131f7f40
commit 2a4542333e

View file

@ -85,6 +85,20 @@ _DEDUP_WINDOW_SECONDS = 48 * 3600
_SIDECAR_DIR = Path(__file__).parent / "sidecar"
# Photon / Envoy / spectrum-ts error substrings that indicate a transient
# upstream overload rather than a permanent failure. These are not in the
# core _RETRYABLE_ERROR_PATTERNS because they are specific to this adapter.
# Entries must be lowercase: PhotonAdapter._is_retryable_error lowercases the
# error text and then does a plain substring match against each entry.
_PHOTON_RETRYABLE_PATTERNS = (
"internal sidecar error",
"upstream connect error",
"reset reason: overflow",
)
# Minimum seconds between typing-indicator calls for the same chat.
# iMessage is a personal channel — suppressing rapid repeats reduces
# upstream gRPC pressure during Photon overflow events.
# send_typing records the timestamp before it attempts the sidecar call, so a
# failed "start" also begins the cooldown. stop_typing drops the chat's
# timestamp, so only consecutive starts without an intervening stop are
# throttled.
_TYPING_COOLDOWN_SECONDS = 5.0
# Group-chat mention wake words. When ``require_mention`` is enabled, group
# messages are ignored unless they match one of these patterns — same
# behavior and defaults as the BlueBubbles iMessage channel so the two
@ -234,6 +248,8 @@ class PhotonAdapter(BasePlatformAdapter):
# react action default to "the message that triggered me" without
# requiring the model to thread message ids through tool calls.
self._last_inbound_by_chat: Dict[str, str] = {}
# Last time we sent a typing indicator per chat, for cooldown gating.
self._typing_last_sent: Dict[str, float] = {}
# Group-chat mention gating (parity with BlueBubbles). When enabled,
# group messages are ignored unless they match a wake word; DMs are
@ -988,6 +1004,10 @@ class PhotonAdapter(BasePlatformAdapter):
)
async def send_typing(self, chat_id: str, metadata=None) -> None:
now = time.time()
if now - self._typing_last_sent.get(chat_id, 0.0) < _TYPING_COOLDOWN_SECONDS:
return
self._typing_last_sent[chat_id] = now
try:
await self._sidecar_call(
"/typing", {"spaceId": chat_id, "state": "start"}
@ -996,6 +1016,7 @@ class PhotonAdapter(BasePlatformAdapter):
logger.debug("[photon] send_typing failed: %s", e)
async def stop_typing(self, chat_id: str) -> None:
self._typing_last_sent.pop(chat_id, None)
try:
await self._sidecar_call(
"/typing", {"spaceId": chat_id, "state": "stop"}
@ -1189,13 +1210,22 @@ class PhotonAdapter(BasePlatformAdapter):
return content
return strip_markdown(content)
@staticmethod
def _is_retryable_error(error: Optional[str]) -> bool:
    """Return True when *error* describes a transient, retry-worthy failure.

    The base-class classifier is consulted first; anything it accepts is
    retryable here too. Otherwise the error text is lowercased and checked
    for any of the adapter-specific substrings in
    ``_PHOTON_RETRYABLE_PATTERNS``. ``None`` or an empty string that the
    base class did not accept is never retryable.
    """
    # Generic patterns shared by every adapter win outright.
    if BasePlatformAdapter._is_retryable_error(error):
        return True
    # Nothing to inspect: treat a missing/empty message as permanent.
    if not error:
        return False
    # Case-insensitive substring scan over the Photon/Envoy patterns.
    haystack = error.lower()
    for needle in _PHOTON_RETRYABLE_PATTERNS:
        if needle in haystack:
            return True
    return False
async def _send_with_retry(
self,
chat_id: str,
content: str,
reply_to: Optional[str] = None,
metadata: Any = None,
max_retries: int = 2,
max_retries: int = 1,
base_delay: float = 2.0,
) -> SendResult:
"""Retry sends without the generic Markdown banner.