diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 7ba1679fc..0decffa68 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -216,6 +216,23 @@ def get_image_cache_dir() -> Path: return IMAGE_CACHE_DIR +def _looks_like_image(data: bytes) -> bool: + """Return True if *data* starts with a known image magic-byte sequence.""" + if len(data) < 4: + return False + if data[:8] == b"\x89PNG\r\n\x1a\n": + return True + if data[:3] == b"\xff\xd8\xff": + return True + if data[:6] in (b"GIF87a", b"GIF89a"): + return True + if data[:2] == b"BM": + return True + if data[:4] == b"RIFF" and len(data) >= 12 and data[8:12] == b"WEBP": + return True + return False + + def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str: """ Save raw image bytes to the cache and return the absolute file path. @@ -226,7 +243,17 @@ def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str: Returns: Absolute path to the cached image file as a string. + + Raises: + ValueError: If *data* does not look like a valid image (e.g. an HTML + error page returned by the upstream server). """ + if not _looks_like_image(data): + snippet = data[:80].decode("utf-8", errors="replace") + raise ValueError( + f"Refusing to cache non-image data as {ext} " + f"(starts with: {snippet!r})" + ) cache_dir = get_image_cache_dir() filename = f"img_{uuid.uuid4().hex[:12]}{ext}" filepath = cache_dir / filename diff --git a/gateway/platforms/email.py b/gateway/platforms/email.py index a54bd94bb..d4261ccfb 100644 --- a/gateway/platforms/email.py +++ b/gateway/platforms/email.py @@ -195,7 +195,11 @@ def _extract_attachments( ext = Path(filename).suffix.lower() if ext in _IMAGE_EXTS: - cached_path = cache_image_from_bytes(payload, ext) + try: + cached_path = cache_image_from_bytes(payload, ext) + except ValueError: + logger.debug("Skipping non-image attachment %s (invalid magic bytes)", filename) + continue attachments.append({ "path": cached_path, "filename": filename, diff --git a/gateway/platforms/slack.py b/gateway/platforms/slack.py index b4973bbbd..906b54ed5 100644 --- a/gateway/platforms/slack.py +++ b/gateway/platforms/slack.py @@ -1596,6 +1596,18 @@ class SlackAdapter(BasePlatformAdapter): ) response.raise_for_status() + # Slack may return an HTML sign-in/redirect page + # instead of actual media bytes (e.g. expired token, + # restricted file access). Detect this early so we + # don't cache bogus data and confuse downstream tools. + ct = response.headers.get("content-type", "") + if "text/html" in ct: + raise ValueError( + "Slack returned HTML instead of media " + f"(content-type: {ct}); " + "check bot token scopes and file permissions" + ) + if audio: from gateway.platforms.base import cache_audio_from_bytes return cache_audio_from_bytes(response.content, ext) diff --git a/gateway/platforms/wecom.py b/gateway/platforms/wecom.py index 70dcc1887..6fde73927 100644 --- a/gateway/platforms/wecom.py +++ b/gateway/platforms/wecom.py @@ -696,7 +696,11 @@ class WeComAdapter(BasePlatformAdapter): if kind == "image": ext = self._detect_image_ext(raw) - return cache_image_from_bytes(raw, ext), self._mime_for_ext(ext, fallback="image/jpeg") + try: + return cache_image_from_bytes(raw, ext), self._mime_for_ext(ext, fallback="image/jpeg") + except ValueError as exc: + logger.warning("[%s] Rejected non-image bytes: %s", self.name, exc) + return None filename = str(media.get("filename") or media.get("name") or "wecom_file") return cache_document_from_bytes(raw, filename), mimetypes.guess_type(filename)[0] or "application/octet-stream" @@ -722,7 +726,11 @@ class WeComAdapter(BasePlatformAdapter): content_type = str(headers.get("content-type") or "").split(";", 1)[0].strip() or "application/octet-stream" if kind == "image": ext = self._guess_extension(url, content_type, fallback=self._detect_image_ext(raw)) - return cache_image_from_bytes(raw, ext), content_type or self._mime_for_ext(ext, fallback="image/jpeg") + try: + return cache_image_from_bytes(raw, ext), content_type or self._mime_for_ext(ext, fallback="image/jpeg") + except ValueError as exc: + logger.warning("[%s] Rejected non-image bytes from %s: %s", self.name, url, exc) + return None filename = self._guess_filename(url, headers.get("content-disposition"), content_type) return cache_document_from_bytes(raw, filename), content_type diff --git a/tests/gateway/test_media_download_retry.py b/tests/gateway/test_media_download_retry.py index f0147dfb4..8a5e16953 100644 --- a/tests/gateway/test_media_download_retry.py +++ b/tests/gateway/test_media_download_retry.py @@ -34,6 +34,45 @@ def _make_timeout_error() -> httpx.TimeoutException: return httpx.TimeoutException("timed out") +# --------------------------------------------------------------------------- +# cache_image_from_bytes (base.py) +# --------------------------------------------------------------------------- + + +class TestCacheImageFromBytes: + """Tests for gateway.platforms.base.cache_image_from_bytes""" + + def test_caches_valid_jpeg(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + path = cache_image_from_bytes(b"\xff\xd8\xff fake jpeg data", ".jpg") + assert path.endswith(".jpg") + + def test_caches_valid_png(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + path = cache_image_from_bytes(b"\x89PNG\r\n\x1a\n fake png data", ".png") + assert path.endswith(".png") + + def test_rejects_html_content(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + with pytest.raises(ValueError, match="non-image data"): + cache_image_from_bytes(b"