hermes-agent/gateway/platforms/whatsapp_cloud.py
teknium1 52c7976f40
fix(whatsapp-cloud): review follow-ups for #43921
- nous_subscription: gate the STT managed-default flip on openai-audio
  entitlement and skip when a local backend (faster-whisper or custom
  command) works; new _local_stt_backend_available() helper + tests
- whatsapp_cloud: WHATSAPP_CLOUD_{DM_POLICY,ALLOW_FROM,GROUP_POLICY,
  GROUP_ALLOW_FROM} env overrides so both adapters can run in parallel;
  normalize allowlist entries (JID/punctuation) to bare wa_id
- whatsapp_cloud: wrap per-message event build in try/except (dedup-marked
  wamids would be silently dropped on Meta's batch retry otherwise)
- whatsapp_cloud: validate media_id before URL/filename interpolation,
  delete transient .ogg after voice upload, FIFO-cap interactive-button
  state dicts and per-chat wamid cache
- whatsapp_common: '# **Title**' headers no longer double-wrap asterisks
- setup wizard: read access token / app secret via getpass on TTYs
- docs: new WHATSAPP_CLOUD_* gating env vars
2026-06-11 07:51:01 -07:00

1956 lines
81 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
WhatsApp Cloud API adapter — official Meta WhatsApp Business Platform.
This adapter is a *complement* to ``whatsapp.py`` (the Baileys bridge), not
a replacement. The two are independent:
- ``whatsapp.py`` — unofficial Baileys bridge, personal accounts, no
public URL needed, account-ban risk.
- ``whatsapp_cloud.py`` (this file) — official Meta Cloud API, Business
account required, public webhook URL required,
token-based auth.
Both share gating / mention / formatting behavior via ``WhatsAppBehaviorMixin``.
Phase scope (this file evolves across phases):
- Phase 2 — outbound text via Graph API + webhook server with verify-token
handshake.
- Phase 3 — X-Hub-Signature-256 HMAC verification (raw body, constant-time)
+ wamid replay protection + dispatch via handle_message. Phase 3
adapter is end-to-end usable for text DMs.
- Phase 4 — media upload + send (image/video/audio/document), inbound
media download via the Graph media endpoint, voice-note opus
conversion via ffmpeg with graceful MP3 fallback when ffmpeg
isn't on PATH. Document text injection for readable types.
- Phase 5 — 24-hour conversation window + template fallback.
Required env vars to enable the adapter:
- WHATSAPP_CLOUD_PHONE_NUMBER_ID (the Graph URL path component)
- WHATSAPP_CLOUD_ACCESS_TOKEN (System User permanent token)
Optional / Phase-3+:
- WHATSAPP_CLOUD_APP_ID
- WHATSAPP_CLOUD_APP_SECRET (HMAC key for X-Hub-Signature-256)
- WHATSAPP_CLOUD_WABA_ID (analytics / future use)
- WHATSAPP_CLOUD_VERIFY_TOKEN (hub.verify_token shared secret)
- WHATSAPP_CLOUD_WEBHOOK_HOST (default 0.0.0.0)
- WHATSAPP_CLOUD_WEBHOOK_PORT (default 8090)
- WHATSAPP_CLOUD_WEBHOOK_PATH (default /whatsapp/webhook)
- WHATSAPP_CLOUD_API_VERSION (default v20.0)
"""
from __future__ import annotations
import asyncio
import hashlib
import hmac
import logging
import mimetypes
import os
import re
import shutil
import uuid
from collections import OrderedDict
from pathlib import Path
from typing import Any, Dict, Optional
try:
from aiohttp import web
AIOHTTP_AVAILABLE = True
except ImportError:
AIOHTTP_AVAILABLE = False
web = None # type: ignore[assignment]
try:
import httpx
HTTPX_AVAILABLE = True
except ImportError:
HTTPX_AVAILABLE = False
httpx = None # type: ignore[assignment]
from gateway.config import Platform, PlatformConfig
from gateway.platforms.base import (
BasePlatformAdapter,
MessageEvent,
MessageType,
SendResult,
SUPPORTED_DOCUMENT_TYPES,
)
from gateway.platforms.whatsapp_common import WhatsAppBehaviorMixin
from hermes_constants import get_hermes_dir
logger = logging.getLogger(__name__)
# Fallback values used when the adapter's config does not supply the
# corresponding ``api_version`` / ``webhook_host`` / ``webhook_port`` /
# ``webhook_path`` entry.
DEFAULT_API_VERSION = "v20.0"
DEFAULT_WEBHOOK_HOST = "0.0.0.0"
DEFAULT_WEBHOOK_PORT = 8090
DEFAULT_WEBHOOK_PATH = "/whatsapp/webhook"
# Root of every Graph API URL built by the adapter (version and phone-number
# id are appended per request).
GRAPH_API_BASE = "https://graph.facebook.com"
# Meta retries failed webhooks for up to 7 days. We don't need to remember
# every wamid for the full retry window — the practical risk is duplicate
# delivery within minutes, not days. 5000 entries with FIFO eviction is
# plenty for normal traffic and bounds memory.
WAMID_DEDUP_CACHE_SIZE = 5000
# Cap for the interactive-button state dicts and the per-chat last-wamid
# cache. Generous for any realistic number of in-flight prompts / chats.
INTERACTIVE_STATE_CACHE_SIZE = 1000
# Per-type size caps documented by Meta for the Cloud API /media endpoint.
# These are the hard limits; we refuse uploads above them with a clean
# error instead of round-tripping to Graph just to be rejected.
# https://developers.facebook.com/docs/whatsapp/cloud-api/reference/media
# Values are in bytes, keyed by the Cloud API media kind.
_MEDIA_SIZE_LIMITS = {
    "image": 5 * 1024 * 1024,  # 5 MB (JPEG, PNG)
    "video": 16 * 1024 * 1024,  # 16 MB
    "audio": 16 * 1024 * 1024,  # 16 MB (MP3, AAC, AMR, OGG opus)
    "document": 100 * 1024 * 1024,  # 100 MB
    # The stricter (animated) limit is the one enforced for all stickers.
    "sticker": 100 * 1024,  # 100 KB animated, 500 KB static
}
# Default mime types when we can't guess from the path's extension.
# Keyed by the same media kinds as ``_MEDIA_SIZE_LIMITS``.
_DEFAULT_MIME = {
    "image": "image/jpeg",
    "video": "video/mp4",
    "audio": "audio/mpeg",
    "document": "application/octet-stream",
    "sticker": "image/webp",
}
# ffmpeg location at import time. ``shutil.which`` honours PATHEXT on
# Windows so a user's ``ffmpeg.exe`` is picked up. None means MP3 voice
# falls back to "audio file attachment" rendering in WhatsApp.
# Resolved once: installing ffmpeg later requires re-importing this module.
_FFMPEG_PATH = shutil.which("ffmpeg")
# Python's mimetypes module returns RFC-correct but real-world-uncommon
# extensions for some types (audio/ogg → .oga since RFC 5334; audio/mp4
# → .mp4 instead of the de-facto .m4a for voice notes). Our downstream
# STT pipeline whitelists the common-in-the-wild extensions, so override
# the few Meta sends that don't match those defaults.
# Keys are lower-case mime types without parameters; values include the
# leading dot.
_WHATSAPP_MIME_EXTENSION_OVERRIDES: Dict[str, str] = {
    # WhatsApp voice notes — opus codec inside an Ogg container.
    "audio/ogg": ".ogg",
    "audio/x-opus+ogg": ".ogg",
    "audio/opus": ".ogg",
    # iOS voice memos — AAC inside an MP4 container; STT tools expect .m4a.
    "audio/mp4": ".m4a",
    "audio/x-m4a": ".m4a",
    # Image — mimetypes occasionally returns .jpe (legacy IANA) instead
    # of .jpg, which trips up tools that switch on extension.
    "image/jpeg": ".jpg",
}
def _ext_for_mime(mime: str) -> Optional[str]:
    """Pick the on-disk file extension for a mime type.

    Any ``;``-separated parameters are dropped and the remaining type is
    lower-cased before lookup. The pinned override table wins (so
    ``audio/ogg`` maps to ``.ogg`` rather than ``.oga``); anything not
    pinned is delegated to ``mimetypes.guess_extension``.

    Returns:
        The extension including its leading dot, or ``None`` when ``mime``
        is empty or no extension is known.
    """
    if mime:
        base_type = mime.split(";")[0].strip().lower()
        pinned = _WHATSAPP_MIME_EXTENSION_OVERRIDES.get(base_type)
        return pinned or mimetypes.guess_extension(base_type) or None
    return None
# Inbound media cache lives under the user's hermes dir so it survives
# restarts and gateway reloads — same convention the Baileys bridge uses.
# Resolved once, at import time.
# NOTE(review): the two path arguments look like a preferred and a legacy
# location — confirm against ``get_hermes_dir``'s signature.
_INBOUND_MEDIA_CACHE = Path(get_hermes_dir("platforms/whatsapp_cloud/media", "whatsapp_cloud/media"))
def check_whatsapp_cloud_requirements() -> bool:
    """Report whether both transport libraries could be imported.

    The webhook server (inbound) needs aiohttp and Graph API calls
    (outbound) need httpx. Both ship with hermes-agent's default
    dependency set, so a normal install is expected to return True.
    """
    if not AIOHTTP_AVAILABLE:
        return False
    return HTTPX_AVAILABLE
class WhatsAppCloudAdapter(WhatsAppBehaviorMixin, BasePlatformAdapter):
"""WhatsApp Business Cloud API adapter.
Outbound: HTTPS POST to ``graph.facebook.com/<api_version>/<phone_id>/messages``.
Inbound: aiohttp server accepting Meta's webhook payloads.
The mixin must come first in the bases list so its ``format_message``
overrides ``BasePlatformAdapter.format_message`` (the base provides a
generic implementation that does not convert markdown to WhatsApp
syntax). The Baileys adapter does the same.
"""
def __init__(self, config: PlatformConfig):
    """Read adapter settings from ``config.extra`` and initialise state.

    No network or socket work happens here: the HTTP client and the
    webhook runner start out as ``None``.

    Args:
        config: Platform configuration. ``config.extra`` (a mapping, or
            ``None``) supplies credentials, webhook settings and the
            gating overrides read below.
    """
    super().__init__(config, Platform.WHATSAPP_CLOUD)
    extra = config.extra or {}

    # Required
    self._phone_number_id: str = str(extra.get("phone_number_id", "")).strip()
    self._access_token: str = str(extra.get("access_token", "")).strip()

    # Optional / used in later phases
    self._app_id: str = str(extra.get("app_id", "")).strip()
    self._app_secret: str = str(extra.get("app_secret", "")).strip()
    self._waba_id: str = str(extra.get("waba_id", "")).strip()
    self._verify_token: str = str(extra.get("verify_token", "")).strip()

    # Webhook server config
    self._webhook_host: str = str(extra.get("webhook_host", DEFAULT_WEBHOOK_HOST))
    self._webhook_port: int = int(extra.get("webhook_port", DEFAULT_WEBHOOK_PORT))
    self._webhook_path: str = self._normalize_path(
        extra.get("webhook_path", DEFAULT_WEBHOOK_PATH)
    )
    self._health_path: str = self._normalize_path(
        extra.get("health_path", "/health")
    )

    # Graph API
    self._api_version: str = str(extra.get("api_version", DEFAULT_API_VERSION))

    # Behavior-mixin contract: these names are read by the mixin's
    # gating methods. WHATSAPP_CLOUD_* env vars take precedence so the
    # two adapters can run in parallel with independent policies; the
    # shared WHATSAPP_* names remain as fallback for single-adapter
    # setups. (``os`` comes from the module-level import.)
    self._reply_prefix: Optional[str] = extra.get("reply_prefix")
    self._dm_policy: str = str(
        extra.get("dm_policy")
        or os.getenv("WHATSAPP_CLOUD_DM_POLICY")
        or os.getenv("WHATSAPP_DM_POLICY", "open")
    ).strip().lower()
    self._allow_from: set[str] = self._normalize_allow_ids(
        self._coerce_allow_list(
            extra.get("allow_from")
            or extra.get("allowFrom")
            or os.getenv("WHATSAPP_CLOUD_ALLOW_FROM")
        )
    )
    self._group_policy: str = str(
        extra.get("group_policy")
        or os.getenv("WHATSAPP_CLOUD_GROUP_POLICY")
        or os.getenv("WHATSAPP_GROUP_POLICY", "open")
    ).strip().lower()
    self._group_allow_from: set[str] = self._normalize_allow_ids(
        self._coerce_allow_list(
            extra.get("group_allow_from")
            or extra.get("groupAllowFrom")
            or os.getenv("WHATSAPP_CLOUD_GROUP_ALLOW_FROM")
        )
    )
    self._mention_patterns = self._compile_mention_patterns()

    # Webhook dedup state — wamid → True. OrderedDict gives O(1) FIFO
    # eviction. In-memory only; Phase 5 may promote to SessionDB if we
    # decide we need replay protection across gateway restarts.
    self._seen_wamids: "OrderedDict[str, bool]" = OrderedDict()
    self._duplicate_count: int = 0
    self._accepted_count: int = 0
    self._rejected_signature_count: int = 0

    # One-shot flags for warnings that would otherwise spam the log.
    self._warned_no_ffmpeg: bool = False

    # Per-chat cache of the latest inbound wamid. Meta's typing
    # indicator + read-receipt API requires a specific message_id
    # to attach to (typically "the latest message in the
    # conversation"). We refresh this on every accepted inbound
    # message so ``send_typing`` always has a valid target without
    # threading an extra kwarg through the gateway's base contract.
    # In-memory only; on gateway restart the next inbound message
    # repopulates it.
    self._last_inbound_wamid_by_chat: "OrderedDict[str, str]" = OrderedDict()

    # Interactive-button state. Each maps a short id (embedded in the
    # outbound button payload) → the session/correlation key needed
    # by the gateway's resolver. See ``_handle_interactive_reply`` for
    # the dispatch table. Entries are popped when the user taps a
    # button; ignored prompts would otherwise accumulate forever, so
    # each dict is FIFO-capped via _bounded_put (oldest pending prompt
    # evicted first — an evicted button tap degrades to the plain-text
    # fallback path, same as after a gateway restart).
    #   _clarify_state:       clarify_id → session_key (resolves via
    #                         tools.clarify_gateway.resolve_gateway_clarify)
    #   _exec_approval_state: approval_id → session_key (resolves via
    #                         tools.approval.resolve_gateway_approval)
    #   _slash_confirm_state: confirm_id → session_key (resolves via
    #                         tools.slash_confirm.resolve)
    self._clarify_state: "OrderedDict[str, str]" = OrderedDict()
    self._exec_approval_state: "OrderedDict[str, str]" = OrderedDict()
    self._slash_confirm_state: "OrderedDict[str, str]" = OrderedDict()

    # Runtime
    self._runner = None
    self._http_client: Optional["httpx.AsyncClient"] = None
# ------------------------------------------------------------------ helpers
@staticmethod
def _normalize_path(path: Any) -> str:
    """Coerce ``path`` to a URL path that starts with a slash.

    Falsy or whitespace-only input becomes ``"/"``; any other value is
    stringified, stripped, and prefixed with ``/`` when it lacks one.
    """
    cleaned = str(path or "").strip()
    if not cleaned:
        return "/"
    if cleaned.startswith("/"):
        return cleaned
    return f"/{cleaned}"
def _graph_url(self, path: str) -> str:
    """Return the Graph API URL for ``path`` under this phone-number id.

    A single leading slash on ``path`` is dropped so callers may pass
    either ``"messages"`` or ``"/messages"``.
    """
    relative = path[1:] if path.startswith("/") else path
    return f"{GRAPH_API_BASE}/{self._api_version}/{self._phone_number_id}/{relative}"
@staticmethod
def _bounded_put(cache: "OrderedDict[str, str]", key: str, value: str) -> None:
    """Store ``key`` → ``value`` and trim the cache back to its size cap.

    Entries beyond ``INTERACTIVE_STATE_CACHE_SIZE`` are dropped from the
    front of the OrderedDict (oldest insertion first).
    """
    cache[key] = value
    overflow = len(cache) - INTERACTIVE_STATE_CACHE_SIZE
    # range() of a non-positive number is empty, so this is a no-op
    # while the cache is within its cap.
    for _ in range(overflow):
        cache.popitem(last=False)
def _effective_reply_prefix(self) -> str:
    """Return the reply prefix to prepend to outgoing messages.

    Cloud API has no self-chat concept, so unlike the mixin default
    (which keys off the Baileys-only WHATSAPP_MODE=self-chat setting)
    there is no implicit prefix: the result is empty unless a
    ``reply_prefix`` was explicitly configured. Literal ``\\n`` sequences
    in the configured value are expanded to real newlines.
    """
    configured = self._reply_prefix
    if configured is None:
        return ""
    return configured.replace("\\n", "\n")
@staticmethod
def _normalize_allow_ids(ids: set[str]) -> set[str]:
    """Reduce allowlist entries to the bare wa_id form.

    Cloud API senders are identified by a digits-only wa_id, whereas
    Baileys uses ``<digits>@s.whatsapp.net`` JIDs. So that one allowlist
    can serve both adapters (and tolerate ``+`` or separators in pasted
    phone numbers), everything from the first ``@`` onward is discarded
    and non-digit characters are removed. An entry that contains no
    digits at all is kept exactly as given.
    """

    def _bare_id(entry: str) -> str:
        local_part = entry.split("@", 1)[0]
        return re.sub(r"\D", "", local_part) or entry

    return {_bare_id(entry) for entry in ids}
def _is_dm_allowed(self, sender_id: str) -> bool:
    """Decide whether a direct message from ``sender_id`` is accepted.

    Under the ``allowlist`` policy the sender is reduced to its bare
    wa_id (``@...`` suffix and non-digits removed) and looked up in the
    normalized allowlist; a sender without any digits is looked up
    verbatim. Every other policy is delegated to the parent class.
    """
    if self._dm_policy != "allowlist":
        return super()._is_dm_allowed(sender_id)
    local_part = str(sender_id).split("@", 1)[0]
    wa_id = re.sub(r"\D", "", local_part)
    return (wa_id or sender_id) in self._allow_from
# ------------------------------------------------------------------ lifecycle
async def connect(self) -> bool:
    """Create the outbound HTTP client and start the webhook server.

    Returns:
        ``True`` once the webhook server is listening. ``False`` (with a
        non-retryable fatal error recorded) when aiohttp/httpx are
        missing or the phone-number id / access token are not configured.

    Raises:
        Exception: whatever the webhook server raised while starting
            (for example ``OSError`` when the port is already bound).
            The HTTP client and runner created here are released before
            the exception propagates, so a failed start does not leak
            them.
    """
    if not check_whatsapp_cloud_requirements():
        self._set_fatal_error(
            "whatsapp_cloud_deps_missing",
            "aiohttp and httpx are required for whatsapp_cloud — "
            "reinstall hermes-agent.",
            retryable=False,
        )
        return False
    if not self._phone_number_id or not self._access_token:
        self._set_fatal_error(
            "whatsapp_cloud_unconfigured",
            "WHATSAPP_CLOUD_PHONE_NUMBER_ID and WHATSAPP_CLOUD_ACCESS_TOKEN "
            "are required.",
            retryable=False,
        )
        return False

    # Outbound HTTP client. Tighter keepalive matches other platform
    # adapters so idle CLOSE_WAIT drains promptly (#18451).
    from gateway.platforms._http_client_limits import platform_httpx_limits
    self._http_client = httpx.AsyncClient(
        timeout=30.0, limits=platform_httpx_limits()
    )

    # Inbound webhook server.
    app = web.Application()
    app.router.add_get(self._health_path, self._handle_health)
    app.router.add_get(self._webhook_path, self._handle_verify)
    app.router.add_post(self._webhook_path, self._handle_webhook)
    self._runner = web.AppRunner(app)
    try:
        await self._runner.setup()
        site = web.TCPSite(self._runner, self._webhook_host, self._webhook_port)
        await site.start()
    except Exception:
        # Startup failed: undo the partial setup so the half-started
        # runner and the HTTP client are not leaked, then let the
        # original error reach the caller.
        runner, self._runner = self._runner, None
        client, self._http_client = self._http_client, None
        try:
            await runner.cleanup()
        except Exception:
            logger.exception("[whatsapp_cloud] webhook server cleanup failed")
        try:
            await client.aclose()
        except Exception:
            logger.exception("[whatsapp_cloud] http client close failed")
        raise

    self._mark_connected()
    logger.info(
        "[whatsapp_cloud] Listening on %s:%d%s (Graph %s, phone_id=%s)",
        self._webhook_host,
        self._webhook_port,
        self._webhook_path,
        self._api_version,
        self._phone_number_id,
    )
    if not self._verify_token:
        logger.warning(
            "[whatsapp_cloud] WHATSAPP_CLOUD_VERIFY_TOKEN is not set — "
            "the GET subscription handshake will fail until it is."
        )
    if not self._app_secret:
        logger.warning(
            "[whatsapp_cloud] WHATSAPP_CLOUD_APP_SECRET is not set — "
            "incoming webhook POSTs will be refused with 503. Set "
            "the app secret to enable inbound message delivery."
        )
    return True
async def disconnect(self) -> None:
    """Stop the webhook server, close the HTTP client, mark disconnected.

    Each resource is released independently: a failure while closing one
    is logged and does not prevent the other from being closed. Both
    attributes are reset to ``None`` afterwards.
    """
    teardown_plan = (
        ("_runner", "cleanup", "[whatsapp_cloud] webhook server cleanup failed"),
        ("_http_client", "aclose", "[whatsapp_cloud] http client close failed"),
    )
    for attr_name, closer_name, failure_message in teardown_plan:
        resource = getattr(self, attr_name)
        if resource is None:
            continue
        try:
            await getattr(resource, closer_name)()
        except Exception:
            logger.exception(failure_message)
        setattr(self, attr_name, None)
    self._mark_disconnected()
# ------------------------------------------------------------------ outbound
async def send(
    self,
    chat_id: str,
    content: str,
    reply_to: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> SendResult:
    """Send a text message via Graph API.

    The content is formatted, split into chunks no longer than the
    adapter's outgoing chunk limit, and posted one chunk at a time.
    Sending stops at the first chunk that fails.

    Args:
        chat_id: The recipient's WhatsApp ID (``wa_id``) — typically
            their phone number with country code, no plus sign.
        content: Message text. Empty or whitespace-only content is
            treated as a successful no-op (nothing is sent).
        reply_to: Optional wamid to quote; applied to the first chunk only.
        metadata: Accepted for interface compatibility; not used here.

    Returns:
        A ``SendResult``. On success ``message_id`` is the id reported
        for the last chunk (``None`` if the response carried none); on
        failure ``error`` holds the transport or Graph error text.
    """
    if self._http_client is None:
        return SendResult(success=False, error="Not connected")
    if not content or not content.strip():
        return SendResult(success=True, message_id=None)

    formatted = self.format_message(content)
    chunks = self.truncate_message(formatted, self._outgoing_chunk_limit())
    url = self._graph_url("messages")
    headers = {
        "Authorization": f"Bearer {self._access_token}",
        "Content-Type": "application/json",
    }
    last_message_id: Optional[str] = None
    for idx, chunk in enumerate(chunks):
        payload: Dict[str, Any] = {
            "messaging_product": "whatsapp",
            "recipient_type": "individual",
            "to": chat_id,
            "type": "text",
            "text": {"body": chunk, "preview_url": True},
        }
        if reply_to and idx == 0:
            # Quote the user's message on the first chunk only.
            payload["context"] = {"message_id": reply_to}
        try:
            resp = await self._http_client.post(url, headers=headers, json=payload)
        except Exception as exc:
            logger.exception("[whatsapp_cloud] send failed")
            return SendResult(success=False, error=str(exc))
        if resp.status_code != 200:
            # Meta returns structured errors in the body — surface them
            # to the caller so log lines have actionable context.
            try:
                body = resp.json()
            except Exception:
                body = {"raw": resp.text[:500]}
            error_msg = self._format_graph_error(body, resp.status_code)
            logger.warning(
                "[whatsapp_cloud] send rejected (status=%d): %s",
                resp.status_code,
                error_msg,
            )
            return SendResult(success=False, error=error_msg)
        try:
            data = resp.json()
            ids = data.get("messages") or []
            if ids:
                last_message_id = ids[0].get("id")
        except Exception:
            # The chunk was accepted (HTTP 200); an unreadable body only
            # costs us the message id, so stay best-effort but leave a
            # trace instead of swallowing the failure silently.
            logger.debug(
                "[whatsapp_cloud] send accepted but message id could not "
                "be parsed from the response",
                exc_info=True,
            )
    return SendResult(success=True, message_id=last_message_id)
# ------------------------------------------------------------------ typing indicator + read receipts
#
# Meta couples these into a single API call: a POST to /messages
# with ``status: "read"`` marks the message read (blue double
# checkmarks), and the optional ``typing_indicator`` field
# additionally shows the user a "typing..." pip in their chat UI.
# The indicator auto-dismisses when we respond OR after 25 seconds,
# whichever comes first — so this matches "I see your message and
# I'm working on a reply" UX exactly.
#
# The API requires a specific message_id to attach to. We cache the
# latest inbound wamid per chat in _last_inbound_wamid_by_chat
# (refreshed in _build_message_event_from_cloud) so this method can
# look it up without needing the gateway base contract to plumb
# event.message_id into send_typing's signature.
async def send_typing(self, chat_id: str, metadata=None) -> None:
    """Mark the chat's latest inbound message as read and show "typing".

    A single POST to ``/messages`` carries both the read status and the
    typing indicator, attached to the most recent inbound wamid cached
    for ``chat_id``.

    Best-effort by design: when there is no client, no cached wamid, a
    network failure, or a non-200 reply, the method just returns (non-200
    replies are logged) so the main reply path is never blocked.
    """
    client = self._http_client
    if client is None:
        return
    latest_wamid = self._last_inbound_wamid_by_chat.get(chat_id)
    if not latest_wamid:
        # Nothing received from this chat yet (or the cache was lost on
        # restart); the next inbound message repopulates it.
        return

    url = self._graph_url("messages")
    request_headers = {
        "Authorization": f"Bearer {self._access_token}",
        "Content-Type": "application/json",
    }
    request_body = {
        "messaging_product": "whatsapp",
        "status": "read",
        "message_id": latest_wamid,
        "typing_indicator": {"type": "text"},
    }
    try:
        resp = await client.post(url, headers=request_headers, json=request_body)
    except Exception:
        # Connection-level failure: stay silent, typing UX must never
        # block message dispatch.
        return

    if resp.status_code == 200:
        return

    # Non-200: log for ops visibility, never raise.
    try:
        parsed = resp.json()
        error_code = ((parsed or {}).get("error") or {}).get("code")
    except Exception:
        error_code = None

    if error_code != 131009:
        logger.debug(
            "[whatsapp_cloud] typing/read indicator returned %d (%s)",
            resp.status_code, error_code,
        )
        return
    # 131009 = "Parameter value is not valid", typically a wamid older
    # than 30 days — expected after a long-quiet conversation, so info
    # rather than warning.
    logger.info(
        "[whatsapp_cloud] typing/read indicator rejected: "
        "wamid %s likely older than 30 days", latest_wamid,
    )
# ------------------------------------------------------------------ interactive messages
#
# WhatsApp Cloud supports two interactive primitives we use here:
# * ``interactive.type=button`` — up to 3 quick-reply buttons. Each
# button has an ``id`` (≤256 chars, returned verbatim on tap) and
# a ``title`` (≤20 chars, the label shown). Used for clarify with
# ≤3 choices, exec_approval, and slash_confirm.
# * ``interactive.type=list`` — a single "Tap to choose" button
# that opens a sheet with up to 10 rows. Used for clarify with
# >3 choices and the model picker.
#
# Unlike utility templates these are FREE-FORM and need no Meta-side
# approval. They only work *inside* the 24-hour conversation window —
# which is fine because all five senders below fire in direct response
# to a user message (clarify mid-conversation, approval mid-tool-call,
# etc.) so we're always inside the window when they're invoked.
async def _post_interactive(
    self,
    chat_id: str,
    interactive_body: Dict[str, Any],
    reply_to: Optional[str] = None,
) -> SendResult:
    """Low-level POST for an ``interactive`` message payload.

    This wrapper handles auth, error mapping, and message_id extraction
    so each send_* method stays focused on its own button shape.

    Args:
        chat_id: Recipient wa_id.
        interactive_body: The inner ``interactive: {...}`` dict — the
            caller supplies ``type``, ``body``, and ``action``.
        reply_to: Optional wamid to quote.

    Returns:
        A ``SendResult``; on success ``message_id`` is the id reported by
        Graph (``None`` if the response carried none), on failure
        ``error`` holds the transport or Graph error text.
    """
    if self._http_client is None:
        return SendResult(success=False, error="Not connected")
    url = self._graph_url("messages")
    headers = {
        "Authorization": f"Bearer {self._access_token}",
        "Content-Type": "application/json",
    }
    payload: Dict[str, Any] = {
        "messaging_product": "whatsapp",
        "recipient_type": "individual",
        "to": chat_id,
        "type": "interactive",
        "interactive": interactive_body,
    }
    if reply_to:
        payload["context"] = {"message_id": reply_to}
    try:
        resp = await self._http_client.post(url, headers=headers, json=payload)
    except Exception as exc:
        logger.exception("[whatsapp_cloud] interactive send failed")
        return SendResult(success=False, error=str(exc))
    if resp.status_code != 200:
        try:
            body = resp.json()
        except Exception:
            body = {"raw": resp.text[:500]}
        error_msg = self._format_graph_error(body, resp.status_code)
        logger.warning(
            "[whatsapp_cloud] interactive rejected (status=%d): %s",
            resp.status_code, error_msg,
        )
        return SendResult(success=False, error=error_msg)
    last_message_id: Optional[str] = None
    try:
        data = resp.json()
        ids = data.get("messages") or []
        if ids:
            last_message_id = ids[0].get("id")
    except Exception:
        # The message was accepted (HTTP 200); an unreadable body only
        # costs us the message id, so stay best-effort but leave a trace
        # instead of swallowing the failure silently.
        logger.debug(
            "[whatsapp_cloud] interactive accepted but message id could "
            "not be parsed from the response",
            exc_info=True,
        )
    return SendResult(success=True, message_id=last_message_id)
@staticmethod
def _truncate_button_label(text: str, limit: int = 20) -> str:
    """Fit a label into WhatsApp's title length caps.

    WhatsApp caps quick-reply button titles at 20 chars and list-row
    titles at 24. Labels that are too long are cut and end with a
    single-character ellipsis so the user can see the text was shortened.

    Args:
        text: The label; ``None`` is treated as empty. Surrounding
            whitespace is stripped before measuring.
        limit: Maximum length of the returned string, ellipsis included.

    Returns:
        The stripped label, truncated to at most ``limit`` characters.
    """
    text = str(text or "").strip()
    if len(text) <= limit:
        return text
    if limit <= 1:
        # No room for any text plus an ellipsis; a hard cut keeps the
        # result within the cap.
        return text[: max(0, limit)]
    # Reserve 1 char for the ellipsis. WhatsApp counts the ellipsis
    # toward the limit.
    return text[: limit - 1] + "\u2026"
@staticmethod
def _truncate_body(text: str, limit: int = 1024) -> str:
    """Clamp interactive body text to ``limit`` characters.

    ``interactive.body.text`` caps at 1024 chars. Text over the limit is
    cut to ``limit - 3`` characters and suffixed with ``...``; ``None``
    is treated as an empty string.
    """
    body = str(text or "")
    return body if len(body) <= limit else body[: limit - 3] + "..."
async def send_clarify(
    self,
    chat_id: str,
    question: str,
    choices: Optional[list],
    clarify_id: str,
    session_key: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> SendResult:
    """Render a clarify prompt as native WhatsApp interactive buttons.

    - 1-3 choices → ``interactive.type=button`` (inline pill buttons).
    - 4+ choices → ``interactive.type=list`` (tap-to-open sheet).
      WhatsApp allows at most 10 rows per list, so up to 9 choices are
      shown and Telegram's "Other (type answer)" escape hatch is appended
      as the final row; picking it flips the entry into text-capture mode
      handled by the gateway's text intercept.
    - 0 usable choices (open-ended, or every choice blank) → plain text
      question; the next message in the session is captured by the
      gateway and resolves clarify.

    The button ``id`` field carries ``cl:<clarify_id>:<idx>`` (or
    ``:other``); inbound webhook parsing dispatches on the prefix.

    Args:
        chat_id: Recipient wa_id.
        question: Prompt text shown above the options.
        choices: Candidate answers; blank entries are dropped.
        clarify_id: Correlation id embedded in each button id.
        session_key: Stored under ``clarify_id`` once the prompt is sent.
        metadata: Optional; ``reply_to_message_id`` is used as the quote
            target when present.

    Returns:
        The ``SendResult`` of the underlying send.
    """
    if self._http_client is None:
        return SendResult(success=False, error="Not connected")
    question = (question or "").strip()
    reply_to = (metadata or {}).get("reply_to_message_id")

    # Defensive cap (the tool layer is expected to bound this already),
    # then drop blank entries.
    choices_list = [str(c).strip() for c in (choices or [])[:10] if str(c).strip()]

    # Open-ended → just send the question, gateway captures next msg.
    # Also covers "every choice was blank", which would otherwise build
    # an interactive message with zero buttons.
    if not choices_list:
        return await self.send(chat_id, f"{question}", reply_to=reply_to)

    use_list = len(choices_list) > 3
    if use_list:
        # A list may hold 10 rows in total and the last one is reserved
        # for "Other", so at most 9 choices can be offered.
        max_list_rows = 10
        choices_list = choices_list[: max_list_rows - 1]

    # Mirror Telegram: render full choice text in body so long
    # options aren't truncated to the 20-char button label cap.
    option_lines = "\n".join(
        f"{i + 1}. {c}" for i, c in enumerate(choices_list)
    )
    body_text = self._truncate_body(f"{question}\n\n{option_lines}")

    if not use_list:
        buttons = [
            {
                "type": "reply",
                "reply": {
                    "id": f"cl:{clarify_id}:{idx}",
                    "title": self._truncate_button_label(str(idx + 1)),
                },
            }
            for idx in range(len(choices_list))
        ]
        interactive: Dict[str, Any] = {
            "type": "button",
            "body": {"text": body_text},
            "action": {"buttons": buttons},
        }
    else:
        # List mode: rows must each have id + title (≤24 chars).
        # Description (≤72 chars) renders below the title — we put
        # the truncated choice text there for skimmability.
        rows = [
            {
                "id": f"cl:{clarify_id}:{idx}",
                "title": self._truncate_button_label(f"{idx + 1}", limit=24),
                "description": self._truncate_button_label(choice_text, limit=72),
            }
            for idx, choice_text in enumerate(choices_list)
        ]
        rows.append({
            "id": f"cl:{clarify_id}:other",
            "title": "✏️ Other",
            "description": "Type your own answer",
        })
        interactive = {
            "type": "list",
            "body": {"text": body_text},
            "action": {
                "button": "Choose",
                "sections": [{"title": "Options", "rows": rows}],
            },
        }

    result = await self._post_interactive(chat_id, interactive, reply_to=reply_to)
    if result.success:
        # Remember which session this prompt belongs to so a later
        # button tap can be resolved.
        self._bounded_put(self._clarify_state, clarify_id, session_key)
    return result
async def send_exec_approval(
    self,
    chat_id: str,
    command: str,
    session_key: str,
    description: str = "dangerous command",
    metadata: Optional[Dict[str, Any]] = None,
) -> SendResult:
    """Ask the user to approve or deny a dangerous command via buttons.

    Sends two quick-reply buttons (Approve / Deny) under a body showing
    the command and the reason. A tap resolves the waiting agent via
    ``tools.approval.resolve_gateway_approval`` — the same mechanism as
    the text ``/approve`` flow. A fresh 12-hex-character approval id is
    generated per prompt and, once the send succeeds, mapped to
    ``session_key`` for the inbound resolver.
    """
    if self._http_client is None:
        return SendResult(success=False, error="Not connected")

    # The body caps at 1024 chars; keep the command to 800 so the
    # surrounding framing prose still fits.
    raw_command = command or ""
    if len(raw_command) > 800:
        shown_command = raw_command[:800] + "..."
    else:
        shown_command = raw_command
    prompt_text = self._truncate_body(
        f"⚠️ *Command Approval Required*\n\n"
        f"```\n{shown_command}\n```\n\n"
        f"Reason: {description}"
    )

    approval_id = uuid.uuid4().hex[:12]
    reply_to = metadata.get("reply_to_message_id") if metadata else None

    verdict_buttons = [
        {
            "type": "reply",
            "reply": {"id": f"appr:{approval_id}:{verdict}", "title": label},
        }
        for verdict, label in (("approve", "✅ Approve"), ("deny", "❌ Deny"))
    ]
    interactive = {
        "type": "button",
        "body": {"text": prompt_text},
        "action": {"buttons": verdict_buttons},
    }

    outcome = await self._post_interactive(chat_id, interactive, reply_to=reply_to)
    if outcome.success:
        self._bounded_put(self._exec_approval_state, approval_id, session_key)
    return outcome
async def send_slash_confirm(
    self,
    chat_id: str,
    title: str,
    message: str,
    session_key: str,
    confirm_id: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> SendResult:
    """Send a three-button confirmation prompt for a slash command.

    Mirrors Telegram's send_slash_confirm: Approve Once / Always /
    Cancel. ``confirm_id`` comes from the caller (the slash command
    handler); after a successful send it is mapped to ``session_key`` so
    the inbound resolver can look the session up.
    """
    if self._http_client is None:
        return SendResult(success=False, error="Not connected")

    prompt_text = self._truncate_body(f"*{title}*\n\n{message}")
    reply_to = metadata.get("reply_to_message_id") if metadata else None

    button_specs = (
        ("once", "✅ Approve Once"),
        ("always", "🔒 Always"),
        ("cancel", "❌ Cancel"),
    )
    interactive = {
        "type": "button",
        "body": {"text": prompt_text},
        "action": {
            "buttons": [
                {
                    "type": "reply",
                    "reply": {"id": f"sc:{action}:{confirm_id}", "title": label},
                }
                for action, label in button_specs
            ],
        },
    }

    outcome = await self._post_interactive(chat_id, interactive, reply_to=reply_to)
    if outcome.success:
        self._bounded_put(self._slash_confirm_state, confirm_id, session_key)
    return outcome
@staticmethod
def _format_graph_error(body: Dict[str, Any], status_code: int) -> str:
    """Turn a Graph API error response into a one-line description.

    Expected Graph error shape:
        {"error": {"message": "...", "type": "...", "code": ..., "fbtrace_id": "..."}}

    Tolerates malformed input so error reporting itself never raises:
    ``body`` may be ``None`` or a non-dict JSON value, and ``error`` may
    be a bare string instead of an object.

    Args:
        body: Parsed response body, or ``{"raw": <text>}`` when the body
            was not JSON.
        status_code: HTTP status of the response.

    Returns:
        ``"graph error <code> (HTTP <status>): <message>"`` when Graph
        supplied an error code, otherwise ``"HTTP <status>: <message>"``.
        ``<message>`` falls back to the raw body text, then to
        ``"unknown error"``.
    """
    payload = body if isinstance(body, dict) else {}
    err = payload.get("error")
    if not isinstance(err, dict):
        # Some gateways/proxies return {"error": "text"}; keep the text.
        err = {"message": str(err)} if err else {}
    message = err.get("message") or payload.get("raw") or "unknown error"
    code = err.get("code")
    if code is not None:
        return f"graph error {code} (HTTP {status_code}): {message}"
    return f"HTTP {status_code}: {message}"
async def get_chat_info(self, chat_id: str) -> Dict[str, Any]:
    """Return minimal chat metadata for ``chat_id``.

    Cloud API has no direct "chat info" endpoint the way Slack/Discord
    do, so the wa_id is echoed back as the name and the chat is always
    reported as a DM. Profile names (when known) arrive through webhook
    ``contacts[].profile.name`` and live on the MessageEvent, not here.
    """
    return dict(name=chat_id, type="dm")
# ------------------------------------------------------------------ outbound media
async def _upload_media(
    self,
    file_path: str,
    media_kind: str,
    mime_type: Optional[str] = None,
) -> tuple[Optional[str], Optional[str]]:
    """Upload a local file to the Graph ``/media`` endpoint.

    First half of the two-step media send: this obtains a media id that
    ``_send_media`` then references. Used when there is a local file and
    no public URL.

    Args:
        file_path: Path of the file to upload.
        media_kind: One of "image", "video", "audio", "document",
            "sticker" — selects the size cap and the default mime type.
            Unknown kinds use the document size cap.
        mime_type: Explicit mime type; when omitted it is guessed from
            the path, then defaulted per ``media_kind``.

    Returns:
        ``(media_id, None)`` on success, ``(None, error_string)`` on
        failure. Does not raise for transport errors.
    """
    client = self._http_client
    if client is None:
        return None, "Not connected"
    if not os.path.exists(file_path):
        return None, f"File not found: {file_path}"

    display_name = os.path.basename(file_path)
    byte_count = os.path.getsize(file_path)
    byte_cap = _MEDIA_SIZE_LIMITS.get(media_kind, _MEDIA_SIZE_LIMITS["document"])
    if byte_count > byte_cap:
        # Refuse locally rather than round-tripping to Graph.
        return None, (
            f"File {display_name} is {byte_count} bytes; "
            f"Cloud API {media_kind} cap is {byte_cap} bytes"
        )

    # Explicit type → extension-based guess → per-kind default.
    mime_type = (
        mime_type
        or mimetypes.guess_type(file_path)[0]
        or _DEFAULT_MIME.get(media_kind, "application/octet-stream")
    )

    url = self._graph_url("media")
    auth_headers = {"Authorization": f"Bearer {self._access_token}"}
    try:
        with open(file_path, "rb") as handle:
            # The file handle must stay open until the POST completes.
            multipart = {
                "file": (display_name, handle, mime_type),
                "messaging_product": (None, "whatsapp"),
                "type": (None, mime_type),
            }
            resp = await client.post(url, headers=auth_headers, files=multipart)
    except Exception as exc:
        logger.exception("[whatsapp_cloud] media upload failed")
        return None, str(exc)

    if resp.status_code != 200:
        try:
            error_body = resp.json()
        except Exception:
            error_body = {"raw": resp.text[:500]}
        return None, self._format_graph_error(error_body, resp.status_code)

    try:
        media_id = resp.json().get("id")
    except Exception:
        media_id = None
    if media_id:
        return media_id, None
    return None, "Upload response missing 'id'"
async def _send_media(
    self,
    chat_id: str,
    media_kind: str,
    *,
    media_id: Optional[str] = None,
    media_link: Optional[str] = None,
    caption: Optional[str] = None,
    filename: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """POST a media message to the Graph ``messages`` endpoint.

    The media is referenced either by a previously uploaded ``media_id``
    or by a public ``media_link``; exactly one of the two must be truthy.

    Args:
        chat_id: Value placed in the payload's ``to`` field.
        media_kind: Message ``type``; also the key of the media object.
        media_id: Id of an uploaded media object.
        media_link: Public URL of the media.
        caption: Included only for image / video / document.
        filename: Included only for document.
        reply_to: When set, sent as ``context.message_id``.

    Returns:
        ``SendResult`` with ``success=True`` and the first returned
        message id (or ``None`` if it cannot be read) on HTTP 200,
        otherwise ``success=False`` with an error string.
    """
    if self._http_client is None:
        return SendResult(success=False, error="Not connected")
    if bool(media_id) == bool(media_link):
        return SendResult(
            success=False,
            error="Exactly one of media_id or media_link must be set",
        )

    endpoint = self._graph_url("messages")
    request_headers = {
        "Authorization": f"Bearer {self._access_token}",
        "Content-Type": "application/json",
    }

    media_block: Dict[str, Any] = (
        {"id": media_id} if media_id else {"link": media_link}
    )
    if caption and media_kind in {"image", "video", "document"}:
        media_block["caption"] = caption
    if filename and media_kind == "document":
        media_block["filename"] = filename

    payload: Dict[str, Any] = {
        "messaging_product": "whatsapp",
        "recipient_type": "individual",
        "to": chat_id,
        "type": media_kind,
        media_kind: media_block,
    }
    if reply_to:
        payload["context"] = {"message_id": reply_to}

    try:
        resp = await self._http_client.post(
            endpoint, headers=request_headers, json=payload
        )
    except Exception as exc:
        logger.exception("[whatsapp_cloud] media send failed")
        return SendResult(success=False, error=str(exc))

    if resp.status_code == 200:
        # Success: pull the id of the first message, tolerating any
        # unexpected response shape.
        try:
            sent = resp.json().get("messages") or []
            wamid = sent[0].get("id") if sent else None
        except Exception:
            wamid = None
        return SendResult(success=True, message_id=wamid)

    try:
        error_body = resp.json()
    except Exception:
        error_body = {"raw": resp.text[:500]}
    error_msg = self._format_graph_error(error_body, resp.status_code)
    logger.warning(
        "[whatsapp_cloud] media send rejected (status=%d, kind=%s): %s",
        resp.status_code, media_kind, error_msg,
    )
    return SendResult(success=False, error=error_msg)
async def _send_media_from_path_or_link(
    self,
    chat_id: str,
    source: str,
    media_kind: str,
    *,
    caption: Optional[str] = None,
    filename: Optional[str] = None,
    reply_to: Optional[str] = None,
    mime_type: Optional[str] = None,
) -> SendResult:
    """Send media from either a URL or a local path.

    A ``source`` starting with ``http://`` or ``https://`` is sent by
    ``link`` directly. Anything else is treated as a local path: it is
    uploaded first and then sent by the resulting media id. ``mime_type``
    is only forwarded to the upload step.

    Returns:
        The ``SendResult`` from ``_send_media``, or a failed
        ``SendResult`` carrying the upload error.
    """
    shared = {"caption": caption, "filename": filename, "reply_to": reply_to}

    if source.startswith(("http://", "https://")):
        # Link mode: no upload round trip needed.
        return await self._send_media(
            chat_id, media_kind, media_link=source, **shared
        )

    uploaded_id, upload_error = await self._upload_media(
        source, media_kind, mime_type
    )
    if upload_error:
        return SendResult(success=False, error=upload_error)
    return await self._send_media(
        chat_id, media_kind, media_id=uploaded_id, **shared
    )
async def send_image(
    self,
    chat_id: str,
    image_url: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
    **kwargs,
) -> SendResult:
    """Send an image identified by ``image_url``.

    Delegates to ``_send_media_from_path_or_link`` with kind ``"image"``.
    Extra keyword arguments are accepted and ignored so callers can pass
    arguments this adapter has no use for.
    """
    outcome = await self._send_media_from_path_or_link(
        chat_id,
        image_url,
        "image",
        caption=caption,
        reply_to=reply_to,
    )
    return outcome
async def send_image_file(
    self,
    chat_id: str,
    image_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
    **kwargs,
) -> SendResult:
    """Send an image from ``image_path``.

    Delegates to ``_send_media_from_path_or_link`` with kind ``"image"``;
    a local path is uploaded first and then sent by id. Extra keyword
    arguments are accepted and ignored.
    """
    outcome = await self._send_media_from_path_or_link(
        chat_id,
        image_path,
        "image",
        caption=caption,
        reply_to=reply_to,
    )
    return outcome
async def send_video(
    self,
    chat_id: str,
    video_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
    **kwargs,
) -> SendResult:
    """Send a video from a local path or an HTTP(S) URL.

    Delegates to ``_send_media_from_path_or_link`` with kind ``"video"``.
    Extra keyword arguments are accepted and ignored.
    """
    outcome = await self._send_media_from_path_or_link(
        chat_id,
        video_path,
        "video",
        caption=caption,
        reply_to=reply_to,
    )
    return outcome
async def send_voice(
    self,
    chat_id: str,
    audio_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
    **kwargs,
) -> SendResult:
    """Send audio, converting a local MP3 to opus first when possible.

    Three outcomes:

    * ``audio_path`` is a URL, is not a ``.mp3``, or does not exist on
      disk: it is sent unchanged with no explicit mime type.
    * It is an existing local ``.mp3`` and ``_convert_to_opus`` yields a
      file: that file is sent as ``audio/ogg; codecs=opus`` and then
      deleted, whether or not the send succeeded.
    * It is an existing local ``.mp3`` but conversion returned nothing:
      the MP3 itself is sent as ``audio/mpeg``.

    Extra keyword arguments are accepted and ignored.
    """
    is_remote = audio_path.startswith(("http://", "https://"))
    if (
        is_remote
        or not audio_path.lower().endswith(".mp3")
        or not os.path.exists(audio_path)
    ):
        return await self._send_media_from_path_or_link(
            chat_id, audio_path, "audio",
            caption=caption, reply_to=reply_to, mime_type=None,
        )

    converted = await self._convert_to_opus(audio_path)
    if not converted:
        # Conversion unavailable or failed: deliver the MP3 as a plain
        # audio attachment instead.
        return await self._send_media_from_path_or_link(
            chat_id, audio_path, "audio",
            caption=caption, reply_to=reply_to, mime_type="audio/mpeg",
        )

    try:
        return await self._send_media_from_path_or_link(
            chat_id, converted, "audio",
            caption=caption, reply_to=reply_to,
            mime_type="audio/ogg; codecs=opus",
        )
    finally:
        # The converted file is a throwaway artifact; remove it so each
        # voice send does not leave a file behind.
        try:
            os.unlink(converted)
        except OSError:
            pass
async def send_document(
    self,
    chat_id: str,
    file_path: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
    reply_to: Optional[str] = None,
    **kwargs,
) -> SendResult:
    """Send a document attachment.

    The displayed filename is ``file_name`` when given, otherwise the
    basename of ``file_path``. Delegates to
    ``_send_media_from_path_or_link`` with kind ``"document"``. Extra
    keyword arguments are accepted and ignored.
    """
    display_name = file_name or os.path.basename(file_path)
    outcome = await self._send_media_from_path_or_link(
        chat_id,
        file_path,
        "document",
        caption=caption,
        filename=display_name,
        reply_to=reply_to,
    )
    return outcome
# ------------------------------------------------------------------ opus conversion
async def _convert_to_opus(self, mp3_path: str) -> Optional[str]:
"""Convert an MP3 to ``audio/ogg; codecs=opus`` for voice bubbles.
Returns the path to the converted file, or None if ffmpeg is
missing / conversion fails (caller falls back to sending the
original MP3 as an audio file).
``-application voip`` tunes the opus encoder for speech.
``-b:a 32k -vbr on`` matches the bitrate WhatsApp produces for
native voice notes (small files, good intelligibility).
"""
if not _FFMPEG_PATH:
self._warn_once_no_ffmpeg()
return None
out_path = mp3_path.rsplit(".", 1)[0] + ".ogg"
try:
proc = await asyncio.create_subprocess_exec(
_FFMPEG_PATH, "-y", "-i", mp3_path,
"-c:a", "libopus", "-b:a", "32k", "-vbr", "on",
"-application", "voip", out_path,
stdout=asyncio.subprocess.DEVNULL,
stderr=asyncio.subprocess.PIPE,
)
_, stderr = await proc.communicate()
if proc.returncode != 0 or not Path(out_path).exists():
logger.error(
"[whatsapp_cloud] ffmpeg opus conversion failed "
"(returncode=%s): %s",
proc.returncode,
(stderr or b"").decode("utf-8", errors="replace")[:500],
)
return None
return out_path
except Exception:
logger.exception("[whatsapp_cloud] ffmpeg subprocess raised")
return None
def _warn_once_no_ffmpeg(self) -> None:
if self._warned_no_ffmpeg:
return
self._warned_no_ffmpeg = True
logger.warning(
"[whatsapp_cloud] ffmpeg not found on PATH — voice messages will "
"be delivered as MP3 audio attachments instead of native voice "
"notes (green waveform bubble). Install ffmpeg to enable: "
"Windows `winget install Gyan.FFmpeg`, macOS `brew install ffmpeg`, "
"Linux package manager."
)
# ------------------------------------------------------------------ inbound media
async def _download_media_to_cache(
self,
media_id: str,
*,
ext_hint: Optional[str] = None,
) -> tuple[Optional[str], Optional[str]]:
"""Two-step Graph media download: ``GET /<id>`` → temp URL → bytes.
Returns ``(local_path, mime_type)`` on success. ``mime_type``
falls back to what Graph reports in the metadata response.
Returns ``(None, None)`` on any failure (logged).
The temporary URL from step 1 is signed and expires in ~5
minutes; we download immediately and never persist the URL.
"""
if self._http_client is None:
return None, None
# Defense in depth: media_id comes from the (signature-verified)
# webhook payload, but it's interpolated into both a Graph URL and
# a cache filename below — refuse anything that isn't a plain
# Meta-style media id so a hostile payload can't traverse paths.
media_id = str(media_id).strip()
if not re.fullmatch(r"[A-Za-z0-9._-]+", media_id):
logger.warning(
"[whatsapp_cloud] refusing malformed media id %r", media_id[:64]
)
return None, None
headers = {"Authorization": f"Bearer {self._access_token}"}
# Step 1 — metadata (gives us a temporary signed URL + mime)
try:
meta_resp = await self._http_client.get(
f"{GRAPH_API_BASE}/{self._api_version}/{media_id}",
headers=headers,
)
except Exception:
logger.exception(
"[whatsapp_cloud] media metadata fetch raised (id=%s)", media_id
)
return None, None
if meta_resp.status_code != 200:
logger.warning(
"[whatsapp_cloud] media metadata fetch failed (id=%s, status=%d)",
media_id, meta_resp.status_code,
)
return None, None
try:
meta = meta_resp.json()
except Exception:
return None, None
temp_url = meta.get("url")
mime = meta.get("mime_type") or ""
if not temp_url:
return None, None
# Step 2 — bytes (auth required even though URL is signed; Meta
# documents this explicitly — the URL alone is not enough).
try:
blob_resp = await self._http_client.get(temp_url, headers=headers)
except Exception:
logger.exception(
"[whatsapp_cloud] media bytes fetch raised (id=%s)", media_id
)
return None, None
if blob_resp.status_code != 200:
logger.warning(
"[whatsapp_cloud] media bytes fetch failed (id=%s, status=%d)",
media_id, blob_resp.status_code,
)
return None, None
# Decide the extension. Prefer the override map so audio/ogg
# produces .ogg (not the technically-correct-but-broken .oga
# mimetypes returns by default). Fall back to ext_hint then
# ``.bin`` for unknown types.
ext = ext_hint
if not ext and mime:
ext = _ext_for_mime(mime)
if not ext:
ext = ".bin"
_INBOUND_MEDIA_CACHE.mkdir(parents=True, exist_ok=True)
out_path = _INBOUND_MEDIA_CACHE / f"{media_id}{ext}"
try:
out_path.write_bytes(blob_resp.content)
except OSError:
logger.exception(
"[whatsapp_cloud] failed to write cached media (id=%s)", media_id
)
return None, None
return str(out_path), mime or None
# ------------------------------------------------------------------ inbound
async def _handle_health(self, request: "web.Request") -> "web.Response":
    """Return a JSON snapshot of adapter configuration and counters.

    Secrets are never echoed: the verify token and app secret are
    reported only as booleans saying whether they are configured.
    """
    snapshot = {
        "status": "ok",
        "platform": self.platform.value,
        "phone_number_id": self._phone_number_id,
        "webhook_path": self._webhook_path,
        "verify_token_configured": bool(self._verify_token),
        "app_secret_configured": bool(self._app_secret),
        "ffmpeg_present": _FFMPEG_PATH is not None,
        "accepted": self._accepted_count,
        "duplicates": self._duplicate_count,
        "rejected_signature": self._rejected_signature_count,
    }
    return web.json_response(snapshot)
async def _handle_verify(self, request: "web.Request") -> "web.Response":
"""Meta subscription verification handshake.
Meta calls GET ``<webhook>?hub.mode=subscribe&hub.verify_token=...
&hub.challenge=...``. We must echo the challenge as plain text iff
``hub.mode == "subscribe"`` AND ``hub.verify_token`` matches the
shared secret. Constant-time comparison.
"""
if not self._verify_token:
# Misconfigured server — refuse rather than silently accepting
# any verify_token, which would let an attacker subscribe.
return web.Response(status=503, text="verify_token not configured")
mode = request.query.get("hub.mode", "")
token = request.query.get("hub.verify_token", "")
challenge = request.query.get("hub.challenge", "")
if mode != "subscribe":
return web.Response(status=400, text="bad mode")
# Constant-time compare to avoid token-length / token-content leaks
# via timing. ``hmac.compare_digest`` works on str.
import hmac as _hmac
if not _hmac.compare_digest(token, self._verify_token):
return web.Response(status=403, text="verify_token mismatch")
if not challenge:
return web.Response(status=400, text="missing challenge")
return web.Response(text=challenge, content_type="text/plain")
async def _handle_webhook(self, request: "web.Request") -> "web.Response":
    """Handle an inbound webhook POST.

    Processing order:

    1. Read the raw body; the signature covers these exact bytes, so
       nothing is parsed before verification.
    2. Reject bodies larger than 3 MiB (413).
    3. Refuse everything when no app secret is configured (503), since
       the sender could not be authenticated.
    4. Verify ``X-Hub-Signature-256`` (401 on failure, counted in
       ``_rejected_signature_count``).
    5. Parse the JSON; anything that is not a JSON object yields 400.
    6. Hand the payload to ``_dispatch_payload`` and answer 200.
    """
    try:
        body_bytes = await request.read()
    except Exception:
        return web.Response(status=400)

    # Size gate comes first so no HMAC is computed over oversized junk.
    max_body = 3 * 1024 * 1024
    if len(body_bytes) > max_body:
        return web.Response(status=413)

    if not self._app_secret:
        # Without the secret this endpoint would be an unauthenticated
        # injection point, mirroring the GET handshake's refusal when
        # the verify token is empty.
        logger.error(
            "[whatsapp_cloud] webhook POST refused: app_secret unset. "
            "Set WHATSAPP_CLOUD_APP_SECRET to enable inbound delivery."
        )
        return web.Response(status=503, text="app_secret not configured")

    sig_header = request.headers.get("X-Hub-Signature-256", "")
    if not self._verify_signature(body_bytes, sig_header):
        self._rejected_signature_count += 1
        logger.warning(
            "[whatsapp_cloud] rejected webhook: invalid X-Hub-Signature-256 "
            "(header=%r, body_len=%d)",
            sig_header,
            len(body_bytes),
        )
        return web.Response(status=401)

    # Only authenticated bodies reach the parser.
    import json as _json
    try:
        payload = _json.loads(body_bytes)
    except Exception:
        logger.warning("[whatsapp_cloud] webhook body is not valid JSON")
        return web.Response(status=400)
    if not isinstance(payload, dict):
        return web.Response(status=400)

    await self._dispatch_payload(payload)
    return web.Response(status=200)
# ------------------------------------------------------------------ signature
def _verify_signature(self, raw_body: bytes, header: str) -> bool:
"""Verify the X-Hub-Signature-256 HMAC.
Meta sends ``sha256=<hex>``; we compute the same HMAC with
``app_secret`` as the key and ``raw_body`` (UTF-8 bytes, not
re-serialized JSON) as the message. Constant-time compare.
"""
if not self._app_secret or not header:
return False
if not header.startswith("sha256="):
return False
expected_hex = header[len("sha256="):].strip()
if not expected_hex:
return False
computed = hmac.new(
self._app_secret.encode("utf-8"),
raw_body,
hashlib.sha256,
).hexdigest()
return hmac.compare_digest(computed.lower(), expected_hex.lower())
# ------------------------------------------------------------------ dispatch
def _dedup_wamid(self, wamid: str) -> bool:
"""Return True if this wamid is being seen for the first time.
Returns False (and increments duplicate counter) if the wamid is
already in the in-memory cache. Cache is FIFO-evicted at
``WAMID_DEDUP_CACHE_SIZE``.
"""
if not wamid:
# No wamid means we can't dedup — let it through. Meta should
# always populate ``id``, but be defensive.
return True
if wamid in self._seen_wamids:
self._duplicate_count += 1
return False
self._seen_wamids[wamid] = True
# Trim oldest entries to stay under the cap.
while len(self._seen_wamids) > WAMID_DEDUP_CACHE_SIZE:
self._seen_wamids.popitem(last=False)
return True
async def _dispatch_payload(self, payload: Dict[str, Any]) -> None:
    """Walk a verified webhook payload and dispatch its messages.

    Expected shape (abridged)::

        {object, entry: [{changes: [{field: "messages",
            value: {messages, contacts, statuses, metadata}}]}]}

    Payloads whose ``object`` is not ``whatsapp_business_account`` are
    ignored, as are changes whose ``field`` is not ``messages``. Each
    message is deduplicated by its id, converted with
    ``_build_message_event_from_cloud`` and passed to ``handle_message``.
    Exceptions from either step are logged and swallowed so that one bad
    message cannot abort the rest of the batch. ``statuses`` entries are
    only logged at debug level.
    """
    object_kind = payload.get("object")
    if object_kind != "whatsapp_business_account":
        logger.debug(
            "[whatsapp_cloud] ignoring non-WABA payload (object=%r)",
            object_kind,
        )
        return

    for entry in payload.get("entry") or []:
        if not isinstance(entry, dict):
            continue
        for change in entry.get("changes") or []:
            # Non-message fields (alerts, template updates, ...) are not
            # message ingress and are skipped silently.
            if not isinstance(change, dict) or change.get("field") != "messages":
                continue

            value = change.get("value") or {}
            contacts = value.get("contacts") or []
            metadata = value.get("metadata") or {}

            # wa_id → profile name, for the messages in this change.
            names_by_waid: Dict[str, str] = {}
            for contact in contacts:
                if isinstance(contact, dict):
                    wa_id = str(contact.get("wa_id") or "").strip()
                    display = str(
                        (contact.get("profile") or {}).get("name") or ""
                    ).strip()
                    if wa_id:
                        names_by_waid[wa_id] = display

            for raw_message in value.get("messages") or []:
                if not isinstance(raw_message, dict):
                    continue
                wamid = str(raw_message.get("id") or "").strip()
                if not self._dedup_wamid(wamid):
                    logger.debug(
                        "[whatsapp_cloud] duplicate wamid %s, skipping",
                        wamid,
                    )
                    continue

                try:
                    event = await self._build_message_event_from_cloud(
                        raw_message, names_by_waid, metadata
                    )
                except Exception:
                    # The wamid is already marked as seen, so letting
                    # this escape would fail the request and a retried
                    # batch would then be dropped as duplicates. Log and
                    # carry on with the next message.
                    logger.exception(
                        "[whatsapp_cloud] failed to build event for wamid %s",
                        wamid,
                    )
                    continue
                if event is None:
                    continue

                self._accepted_count += 1
                try:
                    await self.handle_message(event)
                except Exception:
                    # Same reasoning: a dispatch failure must not fail
                    # the whole webhook request.
                    logger.exception(
                        "[whatsapp_cloud] handle_message raised for wamid %s",
                        wamid,
                    )

            # Delivery receipts: debug-level trace only, never dispatched.
            for status in (
                s for s in (value.get("statuses") or []) if isinstance(s, dict)
            ):
                logger.debug(
                    "[whatsapp_cloud] status %s for %s",
                    status.get("status"),
                    status.get("id"),
                )
async def _dispatch_interactive_reply(
self,
raw_message: Dict[str, Any],
contacts_by_waid: Dict[str, str],
) -> bool:
"""Route an inbound interactive reply to the matching resolver.
Returns True if the tap was claimed (caller should drop the
webhook entry without dispatching a fresh conversation turn).
Returns False when the id has no recognized prefix, no live
state entry, or the resolver itself reports no waiter — in
those cases the caller falls back to standard text-event
dispatch, which treats the button title as a normal user
message. That graceful fallback covers stale-tap and
cross-process-restart scenarios.
Dispatch table:
``cl:<clarify_id>:<idx|other>`` → resolve_gateway_clarify
``appr:<approval_id>:approve|deny`` → resolve_gateway_approval
``sc:<once|always|cancel>:<confirm_id>`` → slash_confirm.resolve
"""
inter = raw_message.get("interactive") or {}
# button_reply (interactive.type=button) and list_reply
# (interactive.type=list) carry id+title in different sub-objects.
inner = inter.get("button_reply") or inter.get("list_reply") or {}
button_id = str(inner.get("id") or "").strip()
if not button_id:
return False
# Clarify: cl:<clarify_id>:<idx|other>
if button_id.startswith("cl:"):
parts = button_id.split(":", 2)
if len(parts) != 3:
return False
_, clarify_id, choice = parts
session_key = self._clarify_state.pop(clarify_id, None)
if not session_key:
logger.info(
"[whatsapp_cloud] clarify tap with no matching state "
"(clarify_id=%s) — likely stale; falling back to text",
clarify_id,
)
return False
try:
from tools.clarify_gateway import resolve_gateway_clarify
except ImportError:
logger.warning(
"[whatsapp_cloud] clarify resolver unavailable; "
"falling back to text dispatch"
)
return False
if choice == "other":
# User wants to type a free-form answer. Flip the entry
# into text-capture mode so the gateway's text-intercept
# (in _handle_message) picks up their next message and
# resolves the clarify. Without this flip,
# ``get_pending_for_session`` won't return the entry —
# the next text would fall through to the regular agent
# path, which collides with the agent thread still
# blocked in clarify and produces an "Interrupting
# current task" loop.
try:
from tools.clarify_gateway import mark_awaiting_text
flipped = mark_awaiting_text(clarify_id)
except Exception:
logger.exception(
"[whatsapp_cloud] mark_awaiting_text failed for %s",
clarify_id,
)
flipped = False
if not flipped:
# Entry vanished between the user tap and our handler
# (timeout, /new, gateway restart). Drop the stale
# state and fall through to text dispatch so the
# user's tap isn't completely ignored.
logger.info(
"[whatsapp_cloud] clarify 'Other' tap but entry "
"missing (clarify_id=%s); falling back to text",
clarify_id,
)
return False
# Put state back since we popped it earlier — keep the
# clarify_id → session_key mapping live in case future
# taps land on the same prompt.
self._clarify_state[clarify_id] = session_key
try:
await self.send(
str(raw_message.get("from") or ""),
"✏️ Type your answer:",
)
except Exception:
logger.exception("[whatsapp_cloud] clarify other-prompt failed")
return True # claim so we don't also dispatch the tap as text
try:
idx = int(choice)
except ValueError:
logger.warning(
"[whatsapp_cloud] clarify tap had non-int choice: %r",
choice,
)
# Put state back so a follow-up text can still resolve.
self._clarify_state[clarify_id] = session_key
return False
# Use the title text as the resolved response so the agent
# sees the human-readable answer, not the index. Title is
# the numeric label ("1", "2", ...) so we look up the
# full choice from the original prompt — but we didn't
# persist that. Fall back to passing the index; the agent
# has the prompt in context and can interpret it.
response_text = str(inner.get("title") or str(idx + 1))
resolved = resolve_gateway_clarify(clarify_id, response_text)
if not resolved:
# Resolver couldn't find a waiter (e.g. agent already
# timed out). Fall through to text dispatch.
logger.info(
"[whatsapp_cloud] clarify resolver reported no waiter "
"(clarify_id=%s) — falling back to text", clarify_id,
)
return False
return True
# Exec approval: appr:<approval_id>:approve|deny
if button_id.startswith("appr:"):
parts = button_id.split(":", 2)
if len(parts) != 3:
return False
_, approval_id, choice = parts
session_key = self._exec_approval_state.pop(approval_id, None)
if not session_key:
logger.info(
"[whatsapp_cloud] approval tap with no matching state "
"(approval_id=%s) — likely stale; falling back to text",
approval_id,
)
return False
if choice not in ("approve", "deny"):
self._exec_approval_state[approval_id] = session_key
return False
try:
from tools.approval import resolve_gateway_approval
except ImportError:
logger.warning(
"[whatsapp_cloud] approval resolver unavailable"
)
return False
count = resolve_gateway_approval(session_key, choice)
if not count:
logger.info(
"[whatsapp_cloud] approval resolver reported no waiter "
"(session_key=%s) — likely already resolved",
session_key,
)
# Send confirmation message — paralleling Telegram's UX.
try:
confirm_text = (
"✅ Approved." if choice == "approve" else "❌ Denied."
)
await self.send(str(raw_message.get("from") or ""), confirm_text)
except Exception:
logger.exception("[whatsapp_cloud] approval confirm failed")
return True
# Slash confirm: sc:<once|always|cancel>:<confirm_id>
if button_id.startswith("sc:"):
parts = button_id.split(":", 2)
if len(parts) != 3:
return False
_, choice, confirm_id = parts
session_key = self._slash_confirm_state.pop(confirm_id, None)
if not session_key:
logger.info(
"[whatsapp_cloud] slash_confirm tap with no matching state "
"(confirm_id=%s) — likely stale", confirm_id,
)
return False
if choice not in ("once", "always", "cancel"):
self._slash_confirm_state[confirm_id] = session_key
return False
try:
from tools import slash_confirm as _slash_confirm_mod
except ImportError:
logger.warning(
"[whatsapp_cloud] slash_confirm resolver unavailable"
)
return False
try:
result_text = await _slash_confirm_mod.resolve(
session_key, confirm_id, choice
)
except Exception:
logger.exception("[whatsapp_cloud] slash_confirm.resolve failed")
return True # still claim the tap; surfacing it as text wouldn't help
if result_text:
try:
await self.send(str(raw_message.get("from") or ""), result_text)
except Exception:
logger.exception("[whatsapp_cloud] slash_confirm reply failed")
return True
# Unknown prefix — let text dispatch handle the title as a
# regular message. Could be a tap from a plugin-defined adapter
# we don't know about; treating it as text is the safe default.
return False
async def _build_message_event_from_cloud(
    self,
    raw_message: Dict[str, Any],
    contacts_by_waid: Dict[str, str],
    metadata: Dict[str, Any],
) -> Optional[MessageEvent]:
    """Convert a Cloud-API message object into a Hermes MessageEvent.

    Processing order:

    1. Extract the text body (plain text, button/interactive title, or
       media caption) and the sender's wa_id.
    2. Drop group-shaped payloads (message object carries a ``chat``
       field) — only DMs are handled here.
    3. Apply the mixin's gating via ``_should_process_message``.
    4. Route interactive replies (button taps) to
       ``_dispatch_interactive_reply``. This happens only AFTER steps 2
       and 3 so a tap from a sender that would be filtered out cannot
       resolve an approval / slash-confirm or trigger an outbound
       confirmation message.
    5. Download inbound media (image, video, audio/voice, document,
       sticker) by ``media_id`` and record the cached path in
       ``media_urls`` / ``media_types``.
    6. For text-readable documents (.txt, .md, .json, source code, etc.)
       no larger than 100KB, prepend the file content to the body.

    Args:
        raw_message: One entry of the webhook's ``messages`` array.
        contacts_by_waid: Display names keyed by sender wa_id.
        metadata: Webhook metadata for the batch (not read here).

    Returns:
        The built MessageEvent, or None when the message is a group-shaped
        payload, is rejected by the gating, or is an interactive reply
        that the dispatcher fully handled.
    """
    msg_type_str = str(raw_message.get("type") or "text").lower()
    media_kinds = {"image", "video", "audio", "voice", "document", "sticker"}

    # --- Body extraction (pure dict reads, no side effects) -------------
    body = ""
    if msg_type_str == "text":
        text = raw_message.get("text") or {}
        body = str(text.get("body") or "")
    elif msg_type_str == "button":
        # Quick-reply buttons. Treat the button payload as text so the
        # agent can reason about the user's choice.
        body = str((raw_message.get("button") or {}).get("text") or "")
    elif msg_type_str == "interactive":
        inter = raw_message.get("interactive") or {}
        # button_reply / list_reply both expose ``title``
        inner = inter.get("button_reply") or inter.get("list_reply") or {}
        body = str(inner.get("title") or "")
    elif msg_type_str in media_kinds:
        # Captions live on image / video / document. Other media types
        # don't carry a caption in Meta's spec, but be defensive.
        inner = raw_message.get(msg_type_str) or {}
        body = str(inner.get("caption") or "")

    sender_id = str(raw_message.get("from") or "").strip()
    sender_name = contacts_by_waid.get(sender_id, "")

    # Cloud API doesn't have a separate "chat" entity for DMs — chat_id
    # equals the sender's wa_id. Group support is deferred to v2.
    #
    # Defensive guard: if Meta ever delivers a group-shaped payload
    # (group support is capability-tier gated by Meta; some WABAs
    # have it enabled), refuse rather than silently treating it as
    # a DM. Group messages carry a ``chat`` field on the message
    # object identifying the group JID — its absence signals DM.
    chat_field = raw_message.get("chat")
    if chat_field:
        logger.warning(
            "[whatsapp_cloud] received group-shaped message (chat=%s, "
            "wamid=%s) — group support is not yet implemented; dropping. "
            "Use the Baileys whatsapp adapter for group chats.",
            chat_field, raw_message.get("id"),
        )
        return None
    chat_id = sender_id

    # Build the data dict the mixin's _should_process_message expects.
    # Cloud API uses different field names from Baileys, so we adapt.
    gating_data = {
        "chatId": chat_id,
        "senderId": sender_id,
        "isGroup": False,  # Phase 3 = DM only
        "body": body,
    }
    if not self._should_process_message(gating_data):
        return None

    # Interactive replies (button taps, list selections) carry an ``id``
    # we set when sending the prompt. Route those to the appropriate
    # gateway resolver BEFORE falling through to text dispatch — the
    # resolver unblocks the waiting agent thread, so we don't want to
    # also kick a fresh conversation turn off the same tap. This runs
    # only after the group guard and gating above, so taps are held to
    # the same sender filtering as every other inbound message.
    if msg_type_str == "interactive":
        handled = await self._dispatch_interactive_reply(
            raw_message, contacts_by_waid
        )
        if handled:
            return None

    message_type = {
        "text": MessageType.TEXT,
        "image": MessageType.PHOTO,
        "video": MessageType.VIDEO,
        "audio": MessageType.VOICE,
        "voice": MessageType.VOICE,
        "document": MessageType.DOCUMENT,
        "sticker": MessageType.PHOTO,
        "button": MessageType.TEXT,
        "interactive": MessageType.TEXT,
        "location": MessageType.TEXT,
        "contacts": MessageType.TEXT,
    }.get(msg_type_str, MessageType.TEXT)

    # Download media if this is a non-text message type. Inbound media
    # arrives as ``{type: "image", image: {id, mime_type, sha256, ...}}``.
    media_urls: list[str] = []
    media_types: list[str] = []
    if msg_type_str in media_kinds:
        inner = raw_message.get(msg_type_str) or {}
        media_id = str(inner.get("id") or "").strip()
        inbound_mime = str(inner.get("mime_type") or "").strip()
        if media_id:
            ext_hint = None
            if inbound_mime:
                ext_hint = _ext_for_mime(inbound_mime)
            local_path, dl_mime = await self._download_media_to_cache(
                media_id, ext_hint=ext_hint
            )
            if local_path:
                media_urls.append(local_path)
                media_types.append(dl_mime or inbound_mime or "application/octet-stream")
                logger.info(
                    "[whatsapp_cloud] cached inbound %s media: %s",
                    msg_type_str, local_path,
                )
            else:
                logger.warning(
                    "[whatsapp_cloud] failed to download inbound %s (id=%s) — "
                    "agent will see message metadata but not the binary",
                    msg_type_str, media_id,
                )
        # Document: original filename for the agent's UX. Only used when
        # the sender supplied no caption.
        if msg_type_str == "document":
            fname = str(inner.get("filename") or "").strip()
            if fname and not body:
                body = f"[Document: {fname}]"

    # For text-readable documents, inject the file content directly into
    # the message body so the agent can reason about it without a
    # separate read_file call. Same heuristic the Baileys adapter uses.
    # 100KB cap matches Telegram/Discord/Slack.
    MAX_TEXT_INJECT_BYTES = 100 * 1024
    text_exts = {
        ".txt", ".md", ".csv", ".json", ".xml", ".yaml", ".yml",
        ".log", ".py", ".js", ".ts", ".html", ".css",
    }
    if msg_type_str == "document" and media_urls:
        for doc_path in media_urls:
            if Path(doc_path).suffix.lower() not in text_exts:
                continue
            try:
                file_size = Path(doc_path).stat().st_size
                if file_size > MAX_TEXT_INJECT_BYTES:
                    logger.info(
                        "[whatsapp_cloud] skipping text injection for %s "
                        "(%d bytes > %d)",
                        doc_path, file_size, MAX_TEXT_INJECT_BYTES,
                    )
                    continue
                # errors="replace" keeps a mis-encoded file from aborting
                # the whole event build.
                content = Path(doc_path).read_text(
                    encoding="utf-8", errors="replace"
                )
                display_name = Path(doc_path).name
                injection = f"[Content of {display_name}]:\n{content}"
                body = f"{injection}\n\n{body}" if body else injection
            except OSError:
                logger.exception(
                    "[whatsapp_cloud] failed to read document text: %s",
                    doc_path,
                )

    # context.id is set when the user replied to one of our messages.
    context = raw_message.get("context") or {}
    reply_to_id = str(context.get("id") or "").strip() or None

    source = self.build_source(
        chat_id=chat_id,
        chat_name=sender_name or chat_id,
        chat_type="dm",
        user_id=sender_id,
        user_name=sender_name or None,
    )

    # NOTE(review): the Cloud API ``timestamp`` field (unix seconds, as a
    # string) is not forwarded — MessageEvent is built below without an
    # explicit timestamp argument.
    wamid = str(raw_message.get("id") or "") or None
    if wamid and chat_id:
        # Refresh the per-chat latest-wamid cache so a subsequent
        # send_typing call can attach the indicator + read receipt
        # to this message. Done HERE (after _should_process_message
        # gating) so filtered messages don't leak typing on
        # unwanted inbound traffic.
        self._bounded_put(self._last_inbound_wamid_by_chat, chat_id, wamid)

    return MessageEvent(
        text=body,
        message_type=message_type,
        source=source,
        raw_message=raw_message,
        message_id=wamid,
        reply_to_message_id=reply_to_id,
        media_urls=media_urls,
        media_types=media_types,
    )