refactor(telegram): use entity-only mention detection

Replaces the word-boundary regex scan with pure MessageEntity-based
detection. Telegram's server emits MENTION entities for real @username
mentions and TEXT_MENTION entities for @FirstName mentions; the text-
scanning fallback was both redundant (entities are always present for
real mentions) and broken (matched raw substrings like email addresses,
URLs, code-block contents, and forwarded literal text).

Entity-only detection:
- Closes bug #12545 ("foo@hermes_bot.example" false positive).
- Also fixes edge cases the regex fix would still miss: @handles inside
  URLs and code blocks, where Telegram does not emit mention entities.

Tests rewritten to exercise realistic Telegram payloads (real mentions
carry entities; substring false positives don't).
This commit is contained in:
Teknium 2026-04-19 22:51:56 -07:00 committed by Teknium
parent 1e18e0503f
commit e330112aa8
2 changed files with 151 additions and 72 deletions

View file

@ -2258,23 +2258,27 @@ class TelegramAdapter(BasePlatformAdapter):
bot_username = (getattr(self._bot, "username", None) or "").lstrip("@").lower() bot_username = (getattr(self._bot, "username", None) or "").lstrip("@").lower()
bot_id = getattr(self._bot, "id", None) bot_id = getattr(self._bot, "id", None)
expected = f"@{bot_username}" if bot_username else None
def _iter_sources(): def _iter_sources():
yield getattr(message, "text", None) or "", getattr(message, "entities", None) or [] yield getattr(message, "text", None) or "", getattr(message, "entities", None) or []
yield getattr(message, "caption", None) or "", getattr(message, "caption_entities", None) or [] yield getattr(message, "caption", None) or "", getattr(message, "caption_entities", None) or []
# Telegram parses mentions server-side and emits MessageEntity objects
# (type=mention for @username, type=text_mention for @FirstName targeting
# a user without a public username). Only those entities are authoritative —
# raw substring matches like "foo@hermes_bot.example" are not mentions
# (bug #12545). Entities also correctly handle @handles inside URLs, code
# blocks, and quoted text, where a regex scan would over-match.
for source_text, entities in _iter_sources(): for source_text, entities in _iter_sources():
if bot_username:
if re.search(rf'(?<!\w)@{re.escape(bot_username)}(?!\w)', source_text, re.IGNORECASE):
return True
for entity in entities: for entity in entities:
entity_type = str(getattr(entity, "type", "")).split(".")[-1].lower() entity_type = str(getattr(entity, "type", "")).split(".")[-1].lower()
if entity_type == "mention" and bot_username: if entity_type == "mention" and expected:
offset = int(getattr(entity, "offset", -1)) offset = int(getattr(entity, "offset", -1))
length = int(getattr(entity, "length", 0)) length = int(getattr(entity, "length", 0))
if offset < 0 or length <= 0: if offset < 0 or length <= 0:
continue continue
if source_text[offset:offset + length].strip().lower() == f"@{bot_username}": if source_text[offset:offset + length].strip().lower() == expected:
return True return True
elif entity_type == "text_mention": elif entity_type == "text_mention":
user = getattr(entity, "user", None) user = getattr(entity, "user", None)

View file

@ -1,19 +1,23 @@
"""Tests for Telegram bot mention word-boundary detection (bug #12545). """Tests for Telegram bot mention detection (bug #12545).
The old implementation used a naive substring check (`f"@{bot_username}" in text.lower()`), The old implementation used a naive substring check
which incorrectly matched partial substrings like 'foo@hermes_bot.example'. (`f"@{bot_username}" in text.lower()`), which incorrectly matched partial
substrings like 'foo@hermes_bot.example'.
These tests verify that the regex-based word-boundary check correctly delimits mentions. Detection now relies entirely on the MessageEntity objects Telegram's server
emits for real mentions. A bare `@username` substring in message text without
a corresponding `MENTION` entity is NOT a mention this correctly ignores
@handles that appear inside URLs, code blocks, email-like strings, or quoted
text, because Telegram's parser does not emit mention entities for any of
those contexts.
""" """
from types import SimpleNamespace from types import SimpleNamespace
from gateway.config import Platform, PlatformConfig
from gateway.platforms.telegram import TelegramAdapter from gateway.platforms.telegram import TelegramAdapter
def _make_adapter(): def _make_adapter():
"""Build a minimal TelegramAdapter with a mocked bot."""
from gateway.config import Platform, PlatformConfig
adapter = object.__new__(TelegramAdapter) adapter = object.__new__(TelegramAdapter)
adapter.platform = Platform.TELEGRAM adapter.platform = Platform.TELEGRAM
adapter.config = PlatformConfig(enabled=True, token="***", extra={}) adapter.config = PlatformConfig(enabled=True, token="***", extra={})
@ -21,8 +25,23 @@ def _make_adapter():
return adapter return adapter
def _group_message(text, entities=None, caption=None, caption_entities=None): def _mention_entity(text, mention="@hermes_bot"):
"""Produce a minimal group-message-like SimpleNamespace.""" """Build a MENTION entity pointing at a literal `@username` in `text`."""
offset = text.index(mention)
return SimpleNamespace(type="mention", offset=offset, length=len(mention))
def _text_mention_entity(offset, length, user_id):
"""Build a TEXT_MENTION entity (used when the target user has no public @handle)."""
return SimpleNamespace(
type="text_mention",
offset=offset,
length=length,
user=SimpleNamespace(id=user_id),
)
def _message(text=None, caption=None, entities=None, caption_entities=None):
return SimpleNamespace( return SimpleNamespace(
text=text, text=text,
caption=caption, caption=caption,
@ -34,77 +53,133 @@ def _group_message(text, entities=None, caption=None, caption_entities=None):
) )
class TestTelegramMentionBoundaries: class TestRealMentionsAreDetected:
"""Test that _message_mentions_bot correctly respects word boundaries.""" """A real Telegram mention always comes with a MENTION entity — detect those."""
def test_exact_mention_is_recognized(self): def test_mention_at_start_of_message(self):
"""'@hermes_bot' at any position should be detected."""
adapter = _make_adapter() adapter = _make_adapter()
msg = _group_message("hello @hermes_bot") text = "@hermes_bot hello world"
msg = _message(text=text, entities=[_mention_entity(text)])
assert adapter._message_mentions_bot(msg) is True assert adapter._message_mentions_bot(msg) is True
def test_mention_at_start_of_string(self): def test_mention_mid_sentence(self):
"""'@hermes_bot hello' should be detected."""
adapter = _make_adapter() adapter = _make_adapter()
msg = _group_message("@hermes_bot hello world") text = "hey @hermes_bot, can you help?"
msg = _message(text=text, entities=[_mention_entity(text)])
assert adapter._message_mentions_bot(msg) is True assert adapter._message_mentions_bot(msg) is True
def test_mention_followed_by_punctuation(self): def test_mention_at_end_of_message(self):
"""'@hermes_bot,' should be detected."""
adapter = _make_adapter() adapter = _make_adapter()
msg = _group_message("@hermes_bot, how are you?") text = "thanks for looking @hermes_bot"
msg = _message(text=text, entities=[_mention_entity(text)])
assert adapter._message_mentions_bot(msg) is True assert adapter._message_mentions_bot(msg) is True
def test_mention_in_subdomain_is_not_recognized(self):
"""'foo@hermes_bot.example' should NOT match (bug #12545)."""
adapter = _make_adapter()
msg = _group_message("foo@hermes_bot.example")
assert adapter._message_mentions_bot(msg) is False
def test_mention_in_longer_hostname_is_not_recognized(self):
"""'email me at user@hermes_bot.domain.com' should NOT match."""
adapter = _make_adapter()
msg = _group_message("email me at user@hermes_bot.domain.com")
assert adapter._message_mentions_bot(msg) is False
def test_superstring_username_is_not_recognized(self):
"""'@hermes_botx' should NOT match (different username)."""
adapter = _make_adapter()
msg = _group_message("@hermes_botx hello")
assert adapter._message_mentions_bot(msg) is False
def test_prefixed_superstring_is_not_recognized(self):
"""'foo@hermes_bot_bar' should NOT match."""
adapter = _make_adapter()
msg = _group_message("foo@hermes_bot_bar")
assert adapter._message_mentions_bot(msg) is False
def test_mention_case_insensitive(self):
"""'@HERMES_BOT' should be detected (case-insensitive)."""
adapter = _make_adapter()
msg = _group_message("@HERMES_BOT hello")
assert adapter._message_mentions_bot(msg) is True
def test_mention_mixed_case(self):
"""'@Hermes_Bot' should be detected."""
adapter = _make_adapter()
msg = _group_message("@Hermes_Bot hello")
assert adapter._message_mentions_bot(msg) is True
def test_no_mention_returns_false(self):
"""Plain text with no mention should return False."""
adapter = _make_adapter()
msg = _group_message("just a regular message in the group")
assert adapter._message_mentions_bot(msg) is False
def test_mention_in_caption(self): def test_mention_in_caption(self):
"""Mention in caption should be detected."""
adapter = _make_adapter() adapter = _make_adapter()
msg = _group_message(None, caption="check this out @hermes_bot") caption = "photo for @hermes_bot"
msg = _message(caption=caption, caption_entities=[_mention_entity(caption)])
assert adapter._message_mentions_bot(msg) is True assert adapter._message_mentions_bot(msg) is True
def test_subdomain_mention_in_caption_not_recognized(self): def test_text_mention_entity_targets_bot(self):
"""'foo@hermes_bot.example' in caption should NOT match.""" """TEXT_MENTION is Telegram's entity type for @FirstName -> user without a public handle."""
adapter = _make_adapter() adapter = _make_adapter()
msg = _group_message(None, caption="foo@hermes_bot.example") msg = _message(text="hey you", entities=[_text_mention_entity(4, 3, user_id=999)])
assert adapter._message_mentions_bot(msg) is True
class TestSubstringFalsePositivesAreRejected:
"""Bare `@bot_username` substrings without a MENTION entity must NOT match.
These are all inputs where the OLD substring check returned True incorrectly.
A word-boundary regex would still over-match some of these (code blocks,
URLs). Entity-based detection handles them all correctly because Telegram's
parser does not emit mention entities for non-mention contexts.
"""
def test_email_like_substring(self):
"""bug #12545 exact repro: 'foo@hermes_bot.example'."""
adapter = _make_adapter()
msg = _message(text="email me at foo@hermes_bot.example")
assert adapter._message_mentions_bot(msg) is False assert adapter._message_mentions_bot(msg) is False
def test_hostname_substring(self):
adapter = _make_adapter()
msg = _message(text="contact user@hermes_bot.domain.com")
assert adapter._message_mentions_bot(msg) is False
def test_superstring_username(self):
"""`@hermes_botx` is a different username; Telegram would emit a mention
entity for `@hermes_botx`, not `@hermes_bot`."""
adapter = _make_adapter()
msg = _message(text="@hermes_botx hello")
assert adapter._message_mentions_bot(msg) is False
def test_underscore_suffix_substring(self):
adapter = _make_adapter()
msg = _message(text="see @hermes_bot_admin for help")
assert adapter._message_mentions_bot(msg) is False
def test_substring_inside_url_without_entity(self):
"""@handle inside a URL produces a URL entity, not a MENTION entity."""
adapter = _make_adapter()
msg = _message(text="see https://example.com/@hermes_bot for details")
assert adapter._message_mentions_bot(msg) is False
def test_substring_inside_code_block_without_entity(self):
"""Telegram doesn't emit mention entities inside code/pre entities."""
adapter = _make_adapter()
msg = _message(text="use the string `@hermes_bot` in config")
assert adapter._message_mentions_bot(msg) is False
def test_plain_text_with_no_at_sign(self):
adapter = _make_adapter()
msg = _message(text="just a normal group message")
assert adapter._message_mentions_bot(msg) is False
def test_email_substring_in_caption(self):
adapter = _make_adapter()
msg = _message(caption="foo@hermes_bot.example")
assert adapter._message_mentions_bot(msg) is False
class TestEntityEdgeCases:
"""Malformed or mismatched entities should not crash or over-match."""
def test_mention_entity_for_different_username(self):
adapter = _make_adapter()
text = "@someone_else hi"
msg = _message(text=text, entities=[_mention_entity(text, mention="@someone_else")])
assert adapter._message_mentions_bot(msg) is False
def test_text_mention_entity_for_different_user(self):
adapter = _make_adapter()
msg = _message(text="hi there", entities=[_text_mention_entity(0, 2, user_id=12345)])
assert adapter._message_mentions_bot(msg) is False
def test_malformed_entity_with_negative_offset(self):
adapter = _make_adapter()
msg = _message(text="@hermes_bot hi",
entities=[SimpleNamespace(type="mention", offset=-1, length=11)])
assert adapter._message_mentions_bot(msg) is False
def test_malformed_entity_with_zero_length(self):
adapter = _make_adapter()
msg = _message(text="@hermes_bot hi",
entities=[SimpleNamespace(type="mention", offset=0, length=0)])
assert adapter._message_mentions_bot(msg) is False
class TestCaseInsensitivity:
"""Telegram usernames are case-insensitive; the slice-compare normalizes both sides."""
def test_uppercase_mention(self):
adapter = _make_adapter()
text = "hi @HERMES_BOT"
msg = _message(text=text, entities=[_mention_entity(text, mention="@HERMES_BOT")])
assert adapter._message_mentions_bot(msg) is True
def test_mixed_case_mention(self):
adapter = _make_adapter()
text = "hi @Hermes_Bot"
msg = _message(text=text, entities=[_mention_entity(text, mention="@Hermes_Bot")])
assert adapter._message_mentions_bot(msg) is True