@staticmethod
def _merge_caption(existing_text: Optional[str], new_text: str) -> str:
    """Merge a new caption into the accumulated caption text, skipping duplicates.

    Accumulated captions are stored as paragraphs separated by a blank line.
    The new caption is treated as a duplicate only when its paragraph(s),
    compared after stripping surrounding whitespace, equal a contiguous run
    of whole paragraphs already present in ``existing_text``.

    Whole-paragraph comparison (instead of a substring test) keeps a short
    caption such as "Meeting" from being silently dropped just because it
    occurs inside "Meeting agenda".  Matching a run of paragraphs, rather
    than a single one, lets a caption that itself contains blank lines still
    be recognised when it is repeated.

    Args:
        existing_text: Caption text gathered so far; ``None`` or an empty
            string when nothing has been collected yet.
        new_text: Caption to merge in.

    Returns:
        ``new_text`` unchanged when there is no existing text;
        ``existing_text`` unchanged when ``new_text`` is blank or already
        present; otherwise both joined by a blank line with outer
        whitespace stripped.
    """
    if not existing_text:
        return new_text

    candidate = new_text.strip()
    if not candidate:
        # A whitespace-only caption adds nothing; keep what we already have.
        return existing_text

    known = [part.strip() for part in existing_text.split("\n\n")]
    wanted = [part.strip() for part in candidate.split("\n\n")]
    span = len(wanted)

    # For a single-paragraph caption this is plain list membership; for a
    # multi-paragraph caption it looks for the same paragraphs in sequence.
    already_present = any(
        known[start:start + span] == wanted
        for start in range(len(known) - span + 1)
    )
    if already_present:
        return existing_text
    return f"{existing_text}\n\n{new_text}".strip()
"""Tests for TelegramAdapter._merge_caption caption deduplication logic."""

import pytest

from gateway.platforms.telegram import TelegramAdapter

# _merge_caption is a static method, so it can be called without an adapter instance.
merge = TelegramAdapter._merge_caption


class TestMergeCaptionBasic:
    """First-caption handling plus the plain duplicate and distinct cases."""

    def test_no_existing_text(self):
        assert merge(None, "Hello") == "Hello"

    def test_empty_existing_text(self):
        assert merge("", "Hello") == "Hello"

    def test_exact_duplicate_dropped(self):
        assert merge("Revenue", "Revenue") == "Revenue"

    def test_different_captions_merged(self):
        result = merge("Q3 Results", "Q4 Projections")
        assert result == "Q3 Results\n\nQ4 Projections"


class TestMergeCaptionSubstringBug:
    """Captions that overlap as substrings must still be kept as separate captions.

    The previous photo-batch check was ``event.text not in existing.text``, a
    substring test, so a new caption contained in the accumulated text was lost.
    """

    def test_shorter_caption_not_dropped_when_substring(self):
        # Old bug: "Meeting" in "Meeting agenda" → True → caption was silently lost
        result = merge("Meeting agenda", "Meeting")
        assert result == "Meeting agenda\n\nMeeting"

    def test_longer_caption_not_dropped_when_contains_existing(self):
        # "Revenue and Profit" contains "Revenue", but they are different captions
        result = merge("Revenue", "Revenue and Profit")
        assert result == "Revenue\n\nRevenue and Profit"

    def test_prefix_caption_not_dropped(self):
        result = merge("Q3 Results - Revenue", "Q3 Results")
        assert result == "Q3 Results - Revenue\n\nQ3 Results"


class TestMergeCaptionWhitespace:
    """Comparison ignores leading/trailing whitespace around a caption."""

    def test_trailing_space_treated_as_duplicate(self):
        assert merge("Revenue", "Revenue ") == "Revenue"

    def test_leading_space_treated_as_duplicate(self):
        assert merge("Revenue", " Revenue") == "Revenue"

    def test_whitespace_only_new_text_not_added(self):
        # Callers guard with `if event.text:`, but a whitespace-only caption is
        # truthy and still reaches _merge_caption. It must not leave a dangling
        # separator or an empty paragraph behind.
        assert merge("Revenue", " ") == "Revenue"


class TestMergeCaptionMultipleItems:
    """Repeated merging, as happens when several album items carry captions."""

    def test_three_unique_captions_all_present(self):
        text = merge(None, "A")
        text = merge(text, "B")
        text = merge(text, "C")
        assert text == "A\n\nB\n\nC"

    def test_duplicate_in_middle_dropped(self):
        text = merge(None, "A")
        text = merge(text, "B")
        text = merge(text, "A")  # duplicate
        assert text == "A\n\nB"

    def test_album_scenario_revenue_profit(self):
        # Album Item 1: "Revenue and Profit", Item 2: "Revenue"
        # Old bug: "Revenue" in "Revenue and Profit" (substring) → True → lost
        text = merge(None, "Revenue and Profit")
        text = merge(text, "Revenue")
        assert text == "Revenue and Profit\n\nRevenue"