diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py index 7ba1679fc..0decffa68 100644 --- a/gateway/platforms/base.py +++ b/gateway/platforms/base.py @@ -216,6 +216,23 @@ def get_image_cache_dir() -> Path: return IMAGE_CACHE_DIR +def _looks_like_image(data: bytes) -> bool: + """Return True if *data* starts with a known image magic-byte sequence.""" + if len(data) < 4: + return False + if data[:8] == b"\x89PNG\r\n\x1a\n": + return True + if data[:3] == b"\xff\xd8\xff": + return True + if data[:6] in (b"GIF87a", b"GIF89a"): + return True + if data[:2] == b"BM": + return True + if data[:4] == b"RIFF" and len(data) >= 12 and data[8:12] == b"WEBP": + return True + return False + + def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str: """ Save raw image bytes to the cache and return the absolute file path. @@ -226,7 +243,17 @@ def cache_image_from_bytes(data: bytes, ext: str = ".jpg") -> str: Returns: Absolute path to the cached image file as a string. + + Raises: + ValueError: If *data* does not look like a valid image (e.g. an HTML + error page returned by the upstream server). """ + if not _looks_like_image(data): + snippet = data[:80].decode("utf-8", errors="replace") + raise ValueError( + f"Refusing to cache non-image data as {ext} " + f"(starts with: {snippet!r})" + ) cache_dir = get_image_cache_dir() filename = f"img_{uuid.uuid4().hex[:12]}{ext}" filepath = cache_dir / filename diff --git a/gateway/platforms/email.py b/gateway/platforms/email.py index a54bd94bb..d4261ccfb 100644 --- a/gateway/platforms/email.py +++ b/gateway/platforms/email.py @@ -195,7 +195,11 @@ def _extract_attachments( ext = Path(filename).suffix.lower() if ext in _IMAGE_EXTS: - cached_path = cache_image_from_bytes(payload, ext) + try: + cached_path = cache_image_from_bytes(payload, ext) + except ValueError: + logger.debug("Skipping non-image attachment %s (invalid magic bytes)", filename) + continue attachments.append({ "path": cached_path, "filename": filename, diff --git a/gateway/platforms/slack.py b/gateway/platforms/slack.py index b4973bbbd..906b54ed5 100644 --- a/gateway/platforms/slack.py +++ b/gateway/platforms/slack.py @@ -1596,6 +1596,18 @@ class SlackAdapter(BasePlatformAdapter): ) response.raise_for_status() + # Slack may return an HTML sign-in/redirect page + # instead of actual media bytes (e.g. expired token, + # restricted file access). Detect this early so we + # don't cache bogus data and confuse downstream tools. + ct = response.headers.get("content-type", "") + if "text/html" in ct: + raise ValueError( + "Slack returned HTML instead of media " + f"(content-type: {ct}); " + "check bot token scopes and file permissions" + ) + if audio: from gateway.platforms.base import cache_audio_from_bytes return cache_audio_from_bytes(response.content, ext) diff --git a/gateway/platforms/wecom.py b/gateway/platforms/wecom.py index 70dcc1887..6fde73927 100644 --- a/gateway/platforms/wecom.py +++ b/gateway/platforms/wecom.py @@ -696,7 +696,11 @@ class WeComAdapter(BasePlatformAdapter): if kind == "image": ext = self._detect_image_ext(raw) - return cache_image_from_bytes(raw, ext), self._mime_for_ext(ext, fallback="image/jpeg") + try: + return cache_image_from_bytes(raw, ext), self._mime_for_ext(ext, fallback="image/jpeg") + except ValueError as exc: + logger.warning("[%s] Rejected non-image bytes: %s", self.name, exc) + return None filename = str(media.get("filename") or media.get("name") or "wecom_file") return cache_document_from_bytes(raw, filename), mimetypes.guess_type(filename)[0] or "application/octet-stream" @@ -722,7 +726,11 @@ class WeComAdapter(BasePlatformAdapter): content_type = str(headers.get("content-type") or "").split(";", 1)[0].strip() or "application/octet-stream" if kind == "image": ext = self._guess_extension(url, content_type, fallback=self._detect_image_ext(raw)) - return cache_image_from_bytes(raw, ext), content_type or self._mime_for_ext(ext, fallback="image/jpeg") + try: + return cache_image_from_bytes(raw, ext), content_type or self._mime_for_ext(ext, fallback="image/jpeg") + except ValueError as exc: + logger.warning("[%s] Rejected non-image bytes from %s: %s", self.name, url, exc) + return None filename = self._guess_filename(url, headers.get("content-disposition"), content_type) return cache_document_from_bytes(raw, filename), content_type diff --git a/tests/gateway/test_media_download_retry.py b/tests/gateway/test_media_download_retry.py index f0147dfb4..8a5e16953 100644 --- a/tests/gateway/test_media_download_retry.py +++ b/tests/gateway/test_media_download_retry.py @@ -34,6 +34,45 @@ def _make_timeout_error() -> httpx.TimeoutException: return httpx.TimeoutException("timed out") +# --------------------------------------------------------------------------- +# cache_image_from_bytes (base.py) +# --------------------------------------------------------------------------- + + +class TestCacheImageFromBytes: + """Tests for gateway.platforms.base.cache_image_from_bytes""" + + def test_caches_valid_jpeg(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + path = cache_image_from_bytes(b"\xff\xd8\xff fake jpeg data", ".jpg") + assert path.endswith(".jpg") + + def test_caches_valid_png(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + path = cache_image_from_bytes(b"\x89PNG\r\n\x1a\n fake png data", ".png") + assert path.endswith(".png") + + def test_rejects_html_content(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + with pytest.raises(ValueError, match="non-image data"): + cache_image_from_bytes(b"Slack", ".png") + + def test_rejects_empty_data(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + with pytest.raises(ValueError, match="non-image data"): + cache_image_from_bytes(b"", ".jpg") + + def test_rejects_plain_text(self, tmp_path, monkeypatch): + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + from gateway.platforms.base import cache_image_from_bytes + with pytest.raises(ValueError, match="non-image data"): + cache_image_from_bytes(b"just some text, not an image", ".jpg") + + # --------------------------------------------------------------------------- # cache_image_from_url (base.py) # --------------------------------------------------------------------------- @@ -71,7 +110,7 @@ class TestCacheImageFromUrl: monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") fake_response = MagicMock() - fake_response.content = b"image data" + fake_response.content = b"\xff\xd8\xff image data" fake_response.raise_for_status = MagicMock() mock_client = AsyncMock() @@ -101,7 +140,7 @@ class TestCacheImageFromUrl: monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") ok_response = MagicMock() - ok_response.content = b"image data" + ok_response.content = b"\xff\xd8\xff image data" ok_response.raise_for_status = MagicMock() mock_client = AsyncMock() @@ -395,8 +434,9 @@ class TestSlackDownloadSlackFile: adapter = _make_slack_adapter() fake_response = MagicMock() - fake_response.content = b"fake image bytes" + fake_response.content = b"\x89PNG\r\n\x1a\n fake png" fake_response.raise_for_status = MagicMock() + fake_response.headers = {"content-type": "image/png"} mock_client = AsyncMock() mock_client.get = AsyncMock(return_value=fake_response) @@ -413,14 +453,44 @@ class TestSlackDownloadSlackFile: assert path.endswith(".jpg") mock_client.get.assert_called_once() + def test_rejects_html_response(self, tmp_path, monkeypatch): + """An HTML sign-in page from Slack is rejected, not cached as image.""" + monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") + adapter = _make_slack_adapter() + + fake_response = MagicMock() + fake_response.content = b"Slack" + fake_response.raise_for_status = MagicMock() + fake_response.headers = {"content-type": "text/html; charset=utf-8"} + + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=fake_response) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=False) + + async def run(): + with patch("httpx.AsyncClient", return_value=mock_client): + await adapter._download_slack_file( + "https://files.slack.com/img.jpg", ext=".jpg" + ) + + with pytest.raises(ValueError, match="HTML instead of media"): + asyncio.run(run()) + + # Verify nothing was cached + img_dir = tmp_path / "img" + if img_dir.exists(): + assert list(img_dir.iterdir()) == [] + def test_retries_on_timeout_then_succeeds(self, tmp_path, monkeypatch): """Timeout on first attempt triggers retry; success on second.""" monkeypatch.setattr("gateway.platforms.base.IMAGE_CACHE_DIR", tmp_path / "img") adapter = _make_slack_adapter() fake_response = MagicMock() - fake_response.content = b"image bytes" + fake_response.content = b"\x89PNG\r\n\x1a\n image bytes" fake_response.raise_for_status = MagicMock() + fake_response.headers = {"content-type": "image/png"} mock_client = AsyncMock() mock_client.get = AsyncMock(