refactor(telegram): use entity-only mention detection

Replaces the word-boundary regex scan with pure MessageEntity-based
detection. Telegram's server emits MENTION entities for real @username
mentions and TEXT_MENTION entities for @FirstName mentions; the text-
scanning fallback was both redundant (entities are always present for
real mentions) and broken (matched raw substrings like email addresses,
URLs, code-block contents, and forwarded literal text).

Entity-only detection:
- Closes bug #12545 ("foo@hermes_bot.example" false positive).
- Also fixes edge cases the regex fix would still miss: @handles inside
  URLs and code blocks, where Telegram does not emit mention entities.

Tests rewritten to exercise realistic Telegram payloads (real mentions
carry entities; substring false positives don't).
This commit is contained in:
Teknium 2026-04-19 22:51:56 -07:00 committed by Teknium
parent 1e18e0503f
commit e330112aa8
2 changed files with 151 additions and 72 deletions

View file

@ -2258,23 +2258,27 @@ class TelegramAdapter(BasePlatformAdapter):
bot_username = (getattr(self._bot, "username", None) or "").lstrip("@").lower()
bot_id = getattr(self._bot, "id", None)
expected = f"@{bot_username}" if bot_username else None
def _iter_sources():
yield getattr(message, "text", None) or "", getattr(message, "entities", None) or []
yield getattr(message, "caption", None) or "", getattr(message, "caption_entities", None) or []
# Telegram parses mentions server-side and emits MessageEntity objects
# (type=mention for @username, type=text_mention for @FirstName targeting
# a user without a public username). Only those entities are authoritative —
# raw substring matches like "foo@hermes_bot.example" are not mentions
# (bug #12545). Entities also correctly handle @handles inside URLs, code
# blocks, and quoted text, where a regex scan would over-match.
for source_text, entities in _iter_sources():
if bot_username:
if re.search(rf'(?<!\w)@{re.escape(bot_username)}(?!\w)', source_text, re.IGNORECASE):
return True
for entity in entities:
entity_type = str(getattr(entity, "type", "")).split(".")[-1].lower()
if entity_type == "mention" and bot_username:
if entity_type == "mention" and expected:
offset = int(getattr(entity, "offset", -1))
length = int(getattr(entity, "length", 0))
if offset < 0 or length <= 0:
continue
if source_text[offset:offset + length].strip().lower() == f"@{bot_username}":
if source_text[offset:offset + length].strip().lower() == expected:
return True
elif entity_type == "text_mention":
user = getattr(entity, "user", None)

View file

@ -1,19 +1,23 @@
"""Tests for Telegram bot mention word-boundary detection (bug #12545).
"""Tests for Telegram bot mention detection (bug #12545).
The old implementation used a naive substring check (`f"@{bot_username}" in text.lower()`),
which incorrectly matched partial substrings like 'foo@hermes_bot.example'.
The old implementation used a naive substring check
(`f"@{bot_username}" in text.lower()`), which incorrectly matched partial
substrings like 'foo@hermes_bot.example'.
These tests verify that the regex-based word-boundary check correctly delimits mentions.
Detection now relies entirely on the MessageEntity objects Telegram's server
emits for real mentions. A bare `@username` substring in message text without
a corresponding `MENTION` entity is NOT a mention — this correctly ignores
@handles that appear inside URLs, code blocks, email-like strings, or quoted
text, because Telegram's parser does not emit mention entities for any of
those contexts.
"""
from types import SimpleNamespace
from gateway.config import Platform, PlatformConfig
from gateway.platforms.telegram import TelegramAdapter
def _make_adapter():
"""Build a minimal TelegramAdapter with a mocked bot."""
from gateway.config import Platform, PlatformConfig
adapter = object.__new__(TelegramAdapter)
adapter.platform = Platform.TELEGRAM
adapter.config = PlatformConfig(enabled=True, token="***", extra={})
@ -21,8 +25,23 @@ def _make_adapter():
return adapter
def _group_message(text, entities=None, caption=None, caption_entities=None):
"""Produce a minimal group-message-like SimpleNamespace."""
def _mention_entity(text, mention="@hermes_bot"):
"""Build a MENTION entity pointing at a literal `@username` in `text`."""
offset = text.index(mention)
return SimpleNamespace(type="mention", offset=offset, length=len(mention))
def _text_mention_entity(offset, length, user_id):
"""Build a TEXT_MENTION entity (used when the target user has no public @handle)."""
return SimpleNamespace(
type="text_mention",
offset=offset,
length=length,
user=SimpleNamespace(id=user_id),
)
def _message(text=None, caption=None, entities=None, caption_entities=None):
return SimpleNamespace(
text=text,
caption=caption,
@ -34,77 +53,133 @@ def _group_message(text, entities=None, caption=None, caption_entities=None):
)
class TestTelegramMentionBoundaries:
"""Test that _message_mentions_bot correctly respects word boundaries."""
class TestRealMentionsAreDetected:
"""A real Telegram mention always comes with a MENTION entity — detect those."""
def test_exact_mention_is_recognized(self):
"""'@hermes_bot' at any position should be detected."""
def test_mention_at_start_of_message(self):
adapter = _make_adapter()
msg = _group_message("hello @hermes_bot")
text = "@hermes_bot hello world"
msg = _message(text=text, entities=[_mention_entity(text)])
assert adapter._message_mentions_bot(msg) is True
def test_mention_at_start_of_string(self):
"""'@hermes_bot hello' should be detected."""
def test_mention_mid_sentence(self):
adapter = _make_adapter()
msg = _group_message("@hermes_bot hello world")
text = "hey @hermes_bot, can you help?"
msg = _message(text=text, entities=[_mention_entity(text)])
assert adapter._message_mentions_bot(msg) is True
def test_mention_followed_by_punctuation(self):
"""'@hermes_bot,' should be detected."""
def test_mention_at_end_of_message(self):
adapter = _make_adapter()
msg = _group_message("@hermes_bot, how are you?")
text = "thanks for looking @hermes_bot"
msg = _message(text=text, entities=[_mention_entity(text)])
assert adapter._message_mentions_bot(msg) is True
def test_mention_in_subdomain_is_not_recognized(self):
"""'foo@hermes_bot.example' should NOT match (bug #12545)."""
adapter = _make_adapter()
msg = _group_message("foo@hermes_bot.example")
assert adapter._message_mentions_bot(msg) is False
def test_mention_in_longer_hostname_is_not_recognized(self):
"""'email me at user@hermes_bot.domain.com' should NOT match."""
adapter = _make_adapter()
msg = _group_message("email me at user@hermes_bot.domain.com")
assert adapter._message_mentions_bot(msg) is False
def test_superstring_username_is_not_recognized(self):
"""'@hermes_botx' should NOT match (different username)."""
adapter = _make_adapter()
msg = _group_message("@hermes_botx hello")
assert adapter._message_mentions_bot(msg) is False
def test_prefixed_superstring_is_not_recognized(self):
"""'foo@hermes_bot_bar' should NOT match."""
adapter = _make_adapter()
msg = _group_message("foo@hermes_bot_bar")
assert adapter._message_mentions_bot(msg) is False
def test_mention_case_insensitive(self):
"""'@HERMES_BOT' should be detected (case-insensitive)."""
adapter = _make_adapter()
msg = _group_message("@HERMES_BOT hello")
assert adapter._message_mentions_bot(msg) is True
def test_mention_mixed_case(self):
"""'@Hermes_Bot' should be detected."""
adapter = _make_adapter()
msg = _group_message("@Hermes_Bot hello")
assert adapter._message_mentions_bot(msg) is True
def test_no_mention_returns_false(self):
"""Plain text with no mention should return False."""
adapter = _make_adapter()
msg = _group_message("just a regular message in the group")
assert adapter._message_mentions_bot(msg) is False
def test_mention_in_caption(self):
"""Mention in caption should be detected."""
adapter = _make_adapter()
msg = _group_message(None, caption="check this out @hermes_bot")
caption = "photo for @hermes_bot"
msg = _message(caption=caption, caption_entities=[_mention_entity(caption)])
assert adapter._message_mentions_bot(msg) is True
def test_subdomain_mention_in_caption_not_recognized(self):
"""'foo@hermes_bot.example' in caption should NOT match."""
def test_text_mention_entity_targets_bot(self):
"""TEXT_MENTION is Telegram's entity type for @FirstName -> user without a public handle."""
adapter = _make_adapter()
msg = _group_message(None, caption="foo@hermes_bot.example")
msg = _message(text="hey you", entities=[_text_mention_entity(4, 3, user_id=999)])
assert adapter._message_mentions_bot(msg) is True
class TestSubstringFalsePositivesAreRejected:
    """Bare `@bot_username` substrings without a matching MENTION entity must NOT match.

    These are all inputs where the OLD substring check returned True incorrectly.
    A word-boundary regex would still over-match some of these (code blocks,
    URLs). Entity-based detection handles them all correctly because Telegram's
    parser does not emit mention entities for non-mention contexts.

    Where Telegram would attach an entity for a *different* span (a longer
    username, a URL), the payload includes that entity. Without it the tests
    would pass trivially (no entities means nothing can match) and would not
    catch a regression that loosens the exact slice comparison.
    """

    def test_email_like_substring(self):
        """bug #12545 exact repro: 'foo@hermes_bot.example'."""
        adapter = _make_adapter()
        msg = _message(text="email me at foo@hermes_bot.example")
        assert adapter._message_mentions_bot(msg) is False

    def test_hostname_substring(self):
        """The bot handle embedded in a longer hostname is not a mention."""
        adapter = _make_adapter()
        msg = _message(text="contact user@hermes_bot.domain.com")
        assert adapter._message_mentions_bot(msg) is False

    def test_superstring_username(self):
        """`@hermes_botx` is a different username; the MENTION entity spans the
        whole `@hermes_botx`, which must not compare equal to `@hermes_bot`."""
        adapter = _make_adapter()
        text = "@hermes_botx hello"
        msg = _message(text=text, entities=[_mention_entity(text, mention="@hermes_botx")])
        assert adapter._message_mentions_bot(msg) is False

    def test_underscore_suffix_substring(self):
        """`@hermes_bot_admin` is a different username that merely starts with
        the bot's handle; its MENTION entity must not be treated as ours."""
        adapter = _make_adapter()
        text = "see @hermes_bot_admin for help"
        msg = _message(text=text, entities=[_mention_entity(text, mention="@hermes_bot_admin")])
        assert adapter._message_mentions_bot(msg) is False

    def test_substring_inside_url_without_entity(self):
        """@handle inside a URL produces a URL entity, not a MENTION entity."""
        adapter = _make_adapter()
        url = "https://example.com/@hermes_bot"
        text = f"see {url} for details"
        # Attach the URL entity so the "ignore other entity types" path is exercised.
        url_entity = SimpleNamespace(type="url", offset=text.index(url), length=len(url))
        msg = _message(text=text, entities=[url_entity])
        assert adapter._message_mentions_bot(msg) is False

    def test_substring_inside_code_block_without_entity(self):
        """Telegram doesn't emit mention entities inside code/pre entities."""
        adapter = _make_adapter()
        msg = _message(text="use the string `@hermes_bot` in config")
        assert adapter._message_mentions_bot(msg) is False

    def test_plain_text_with_no_at_sign(self):
        """A message with no `@` at all is never a mention."""
        adapter = _make_adapter()
        msg = _message(text="just a normal group message")
        assert adapter._message_mentions_bot(msg) is False

    def test_email_substring_in_caption(self):
        """The caption path rejects the bug #12545 string just like the text path."""
        adapter = _make_adapter()
        msg = _message(caption="foo@hermes_bot.example")
        assert adapter._message_mentions_bot(msg) is False
class TestEntityEdgeCases:
    """Entities that are malformed or aimed at someone else must give False, not a crash."""

    def test_mention_entity_for_different_username(self):
        """A well-formed MENTION entity for another handle is not a bot mention."""
        bot_adapter = _make_adapter()
        body = "@someone_else hi"
        other_mention = _mention_entity(body, mention="@someone_else")
        incoming = _message(text=body, entities=[other_mention])
        assert bot_adapter._message_mentions_bot(incoming) is False

    def test_text_mention_entity_for_different_user(self):
        """A TEXT_MENTION entity whose user id is 12345 does not count as a bot mention."""
        bot_adapter = _make_adapter()
        other_user = _text_mention_entity(0, 2, user_id=12345)
        incoming = _message(text="hi there", entities=[other_user])
        assert bot_adapter._message_mentions_bot(incoming) is False

    def test_malformed_entity_with_negative_offset(self):
        """A negative offset is rejected even though the text starts with the handle."""
        bot_adapter = _make_adapter()
        broken = SimpleNamespace(type="mention", offset=-1, length=11)
        incoming = _message(text="@hermes_bot hi", entities=[broken])
        assert bot_adapter._message_mentions_bot(incoming) is False

    def test_malformed_entity_with_zero_length(self):
        """A zero-length entity is rejected even though it points at the handle."""
        bot_adapter = _make_adapter()
        broken = SimpleNamespace(type="mention", offset=0, length=0)
        incoming = _message(text="@hermes_bot hi", entities=[broken])
        assert bot_adapter._message_mentions_bot(incoming) is False
class TestCaseInsensitivity:
    """Handles are matched without regard to case: the entity slice is lowercased before comparing."""

    def test_uppercase_mention(self):
        """An all-caps `@HERMES_BOT` with its MENTION entity is detected."""
        bot_adapter = _make_adapter()
        body = "hi @HERMES_BOT"
        shouted = _mention_entity(body, mention="@HERMES_BOT")
        incoming = _message(text=body, entities=[shouted])
        assert bot_adapter._message_mentions_bot(incoming) is True

    def test_mixed_case_mention(self):
        """A mixed-case `@Hermes_Bot` with its MENTION entity is detected."""
        bot_adapter = _make_adapter()
        body = "hi @Hermes_Bot"
        mixed = _mention_entity(body, mention="@Hermes_Bot")
        incoming = _message(text=body, entities=[mixed])
        assert bot_adapter._message_mentions_bot(incoming) is True