fix(matrix,gateway): Matrix E2EE installs full dep set; plugins respect is_connected

Fixes #31116 — two distinct bugs in fresh-install Matrix gateway:

1. Matrix E2EE setup installed only mautrix[encryption], leaving asyncpg
   / aiosqlite / Markdown / aiohttp-socks uninstalled. The first encrypted
   connect failed with 'No module named asyncpg' deep inside
   MatrixAdapter.connect(). Root cause: the setup wizard hand-rolled a
   pip install of one package instead of using lazy_deps.ensure(
   'platform.matrix'), and check_matrix_requirements() short-circuited the
   runtime installer on 'import mautrix' alone — so the other 4 packages
   were never pulled in.

2. Discord auto-enabled itself on every gateway start, even when the user
   never selected Discord and had no DISCORD_BOT_TOKEN. Root cause:
   gateway/config.py plugin-enablement loop gated enablement on
   entry.check_fn() (just 'is the SDK importable?') and ignored
   entry.is_connected (the 'did the user configure credentials?' probe).
   Same bug class as commit 7849a3d73 fixed for _platform_status in the
   setup wizard; this is the runtime counterpart. Affects Discord, Teams,
   and Google Chat.

Changes:
- hermes_cli/setup.py::_setup_matrix — install via
  lazy_deps.ensure('platform.matrix') to pull the full feature group.
- gateway/platforms/matrix.py::_check_e2ee_deps — verify asyncpg +
  aiosqlite + PgCryptoStore in addition to OlmMachine, so E2EE failures
  surface at startup instead of at first encrypted-room connect.
- gateway/platforms/matrix.py::check_matrix_requirements — use
  feature_missing('platform.matrix') as the install gate instead of a
  single 'import mautrix' check, so partial installs trigger the lazy
  installer correctly.
- gateway/config.py plugin-enablement loop — consult entry.is_connected
  before flipping enabled=True. Explicit YAML enabled=true still wins.

Tests: 3 new in tests/gateway/test_matrix.py (asyncpg-required,
aiosqlite-required, partial-install lazy-runs), 5 new in
tests/gateway/test_platform_registry.py (is_connected=False blocks,
is_connected=True enables, is_connected=None falls back to check_fn,
raising probe doesn't enable, explicit YAML wins).

Validation: 310 tests across affected test modules pass.
This commit is contained in:
Teknium 2026-05-24 04:36:24 -07:00
parent 88834baf50
commit 54e61f9331
5 changed files with 386 additions and 28 deletions

View file

@ -1813,6 +1813,17 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
# need to seed ``PlatformConfig.extra`` from env vars (e.g. Google Chat's
# project_id / subscription_name) can supply ``env_enablement_fn`` on
# their PlatformEntry — called here BEFORE adapter construction.
#
# Enablement gate (#31116): when a plugin registers ``is_connected``
# (the "has the user actually configured credentials for this?" check),
# we MUST consult it before flipping ``enabled = True``. Otherwise
# ``check_fn`` alone — which for adapter plugins typically just
# verifies the SDK is importable / lazy-installs it — silently enables
# platforms the user never opted into, and the gateway then tries to
# connect to Discord / Teams / Google Chat with no token and emits
# noisy retry-forever errors. ``_platform_status`` was already fixed
# for the same bug class in commit 7849a3d73; this is the runtime
# counterpart.
try:
from hermes_cli.plugins import discover_plugins
discover_plugins() # idempotent
@ -1825,6 +1836,29 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
logger.debug("check_fn for %s raised: %s", entry.name, e)
continue
platform = Platform(entry.name)
existing_cfg = config.platforms.get(platform)
# Only consult is_connected for platforms that are NOT already
# explicitly configured in YAML / env (existing_cfg with
# enabled=True means the user wrote it themselves or another
# env-var bridge enabled it — keep that decision).
if existing_cfg is None or not existing_cfg.enabled:
if entry.is_connected is not None:
try:
probe_cfg = existing_cfg or PlatformConfig()
configured = bool(entry.is_connected(probe_cfg))
except Exception as exc:
logger.debug(
"is_connected for %s raised: %s — skipping enablement",
entry.name, exc,
)
configured = False
if not configured:
logger.debug(
"Plugin platform '%s' available but not configured "
"(is_connected returned False) — skipping enable",
entry.name,
)
continue
if platform not in config.platforms:
config.platforms[platform] = PlatformConfig()
config.platforms[platform].enabled = True

View file

@ -138,7 +138,8 @@ _OUTBOUND_MENTION_RE = re.compile(
)
_E2EE_INSTALL_HINT = (
"Install with: pip install 'mautrix[encryption]' (requires libolm C library)"
"Install with: pip install 'mautrix[encryption]' asyncpg aiosqlite "
"(requires libolm C library)"
)
_MATRIX_IMAGE_FILENAME_EXTS = frozenset({
@ -214,9 +215,22 @@ def _create_matrix_session(proxy_url: str | None):
def _check_e2ee_deps() -> bool:
"""Return True if mautrix E2EE dependencies (python-olm) are available."""
"""Return True if mautrix E2EE dependencies are available.
Verifies python-olm (via mautrix.crypto.OlmMachine), the SQLite crypto
store backend (mautrix.crypto.store.asyncpg.PgCryptoStore yes, the
PgCryptoStore class also drives the sqlite backend in mautrix 0.21),
and the database drivers actually used at connect time (``asyncpg`` for
the underlying upgrade_table machinery, ``aiosqlite`` for the
``sqlite:///`` URL we pass to ``Database.create``). Without all four,
encrypted rooms fail at connect time with a confusing
``No module named 'asyncpg'`` (#31116).
"""
try:
from mautrix.crypto import OlmMachine # noqa: F401
from mautrix.crypto.store.asyncpg import PgCryptoStore # noqa: F401
import asyncpg # noqa: F401
import aiosqlite # noqa: F401
return True
except (ImportError, AttributeError):
@ -226,8 +240,13 @@ def _check_e2ee_deps() -> bool:
def check_matrix_requirements() -> bool:
"""Return True if the Matrix adapter can be used.
Lazy-installs mautrix via ``tools.lazy_deps.ensure("platform.matrix")``
on first call if not present. Rebinds all module-level type globals on success.
Lazy-installs the full ``platform.matrix`` feature group via
``tools.lazy_deps.ensure_and_bind`` whenever any of the declared
packages (mautrix, Markdown, aiosqlite, asyncpg, aiohttp-socks) is
missing not just mautrix itself. Previously this short-circuited on
``import mautrix``, which left the other four packages uninstalled
forever and broke E2EE connect with ``No module named 'asyncpg'``
(#31116). Rebinds module-level type globals on success.
"""
token = os.getenv("MATRIX_ACCESS_TOKEN", "")
password = os.getenv("MATRIX_PASSWORD", "")
@ -239,9 +258,20 @@ def check_matrix_requirements() -> bool:
if not homeserver:
logger.warning("Matrix: MATRIX_HOMESERVER not set")
return False
# Check whether any package in the platform.matrix feature group is
# missing. ``feature_missing`` is cheap (per-spec importlib.metadata
# lookups) and correctly handles ``mautrix[encryption]`` by stripping
# the extras marker before checking the bare package.
try:
import mautrix # noqa: F401
except ImportError:
from tools.lazy_deps import feature_missing, ensure_and_bind
missing = feature_missing("platform.matrix")
except Exception as exc: # pragma: no cover — defensive
logger.debug("Matrix: lazy_deps lookup failed: %s", exc)
missing = ()
ensure_and_bind = None # type: ignore[assignment]
if missing or ensure_and_bind is None:
def _import():
from mautrix.types import (
ContentURI, EventID, EventType, PaginationDirection,
@ -261,10 +291,14 @@ def check_matrix_requirements() -> bool:
"UserID": UserID,
}
from tools.lazy_deps import ensure_and_bind
if ensure_and_bind is None:
return False
if not ensure_and_bind("platform.matrix", _import, globals(), prompt=False):
logger.warning(
"Matrix: mautrix not installed. Run: pip install 'mautrix[encryption]'"
"Matrix: required packages not installed (%s). "
"Run: pip install 'mautrix[encryption]' asyncpg aiosqlite "
"Markdown aiohttp-socks",
", ".join(missing) if missing else "platform.matrix",
)
return False

View file

@ -2188,28 +2188,58 @@ def _setup_matrix():
print_success("E2EE enabled")
matrix_pkg = "mautrix[encryption]" if want_e2ee else "mautrix"
# Use the central lazy-deps feature group so we install ALL of
# platform.matrix's dependencies (mautrix, Markdown, aiosqlite,
# asyncpg, aiohttp-socks) — not just mautrix itself. The previous
# hand-rolled ``pip install mautrix[encryption]`` left asyncpg /
# aiosqlite uninstalled and broke E2EE connect with
# ``No module named 'asyncpg'`` on every fresh install (#31116).
try:
__import__("mautrix")
from tools.lazy_deps import ensure as _lazy_ensure, feature_missing
_missing_before = feature_missing("platform.matrix")
if _missing_before:
print_info(
f"Installing {matrix_pkg} (+ {len(_missing_before)} runtime deps)..."
)
try:
_lazy_ensure("platform.matrix", prompt=False)
print_success(f"{matrix_pkg} installed")
except Exception as exc:
print_warning(
f"Install failed — run manually: pip install "
f"'mautrix[encryption]' asyncpg aiosqlite Markdown "
f"aiohttp-socks"
)
print_info(f" Error: {exc}")
except ImportError:
print_info(f"Installing {matrix_pkg}...")
import subprocess
uv_bin = shutil.which("uv")
if uv_bin:
result = subprocess.run(
[uv_bin, "pip", "install", "--python", sys.executable, matrix_pkg],
capture_output=True, text=True,
)
else:
result = subprocess.run(
[sys.executable, "-m", "pip", "install", matrix_pkg],
capture_output=True, text=True,
)
if result.returncode == 0:
print_success(f"{matrix_pkg} installed")
else:
print_warning(f"Install failed — run manually: pip install '{matrix_pkg}'")
if result.stderr:
print_info(f" Error: {result.stderr.strip().splitlines()[-1]}")
# tools.lazy_deps unavailable (extreme edge case — partial
# install). Fall back to the legacy single-package install
# path so the wizard still does *something*.
try:
__import__("mautrix")
except ImportError:
print_info(f"Installing {matrix_pkg}...")
import subprocess
uv_bin = shutil.which("uv")
if uv_bin:
result = subprocess.run(
[uv_bin, "pip", "install", "--python", sys.executable, matrix_pkg],
capture_output=True, text=True,
)
else:
result = subprocess.run(
[sys.executable, "-m", "pip", "install", matrix_pkg],
capture_output=True, text=True,
)
if result.returncode == 0:
print_success(f"{matrix_pkg} installed")
else:
print_warning(
f"Install failed — run manually: pip install "
f"'{matrix_pkg}' asyncpg aiosqlite Markdown aiohttp-socks"
)
if result.stderr:
print_info(f" Error: {result.stderr.strip().splitlines()[-1]}")
print()
print_info("🔒 Security: Restrict who can use your bot")

View file

@ -797,6 +797,79 @@ class TestMatrixRequirements:
with patch("tools.lazy_deps.ensure", side_effect=ImportError("mautrix unavailable")):
assert matrix_mod.check_matrix_requirements() is False
def test_check_e2ee_deps_requires_asyncpg(self, monkeypatch):
"""E2EE deps check must reject when asyncpg is missing — even if olm is present.
Regression for #31116: ``mautrix[encryption]`` extra installs python-olm
but NOT asyncpg/aiosqlite, which are required by mautrix's crypto store
at connect time. ``_check_e2ee_deps`` previously only tested
``OlmMachine`` import and returned True, so the failure manifested as
a confusing ``No module named 'asyncpg'`` deep in
``MatrixAdapter.connect()``.
"""
from gateway.platforms.matrix import _check_e2ee_deps
import builtins
real_import = builtins.__import__
def _blocking_import(name, *args, **kwargs):
if name == "asyncpg" or name.startswith("asyncpg."):
raise ImportError("blocked for test")
return real_import(name, *args, **kwargs)
with patch.object(builtins, "__import__", _blocking_import):
assert _check_e2ee_deps() is False
def test_check_e2ee_deps_requires_aiosqlite(self):
"""E2EE deps check must reject when aiosqlite is missing.
Mautrix's ``Database.create("sqlite:///...")`` driver lookup imports
aiosqlite lazily without it, connect fails at ``crypto_db.start()``.
"""
from gateway.platforms.matrix import _check_e2ee_deps
import builtins
real_import = builtins.__import__
def _blocking_import(name, *args, **kwargs):
if name == "aiosqlite" or name.startswith("aiosqlite."):
raise ImportError("blocked for test")
return real_import(name, *args, **kwargs)
with patch.object(builtins, "__import__", _blocking_import):
assert _check_e2ee_deps() is False
def test_check_requirements_runs_lazy_install_when_partial(self, monkeypatch):
"""When mautrix is installed but asyncpg/aiosqlite are missing,
check_matrix_requirements must still run the lazy installer.
Regression for #31116: the previous ``try: import mautrix`` gate
short-circuited the install of the OTHER 4 platform.matrix packages,
so a partial install (mautrix only) was treated as fully installed.
"""
monkeypatch.setenv("MATRIX_ACCESS_TOKEN", "syt_test")
monkeypatch.setenv("MATRIX_HOMESERVER", "https://matrix.example.org")
monkeypatch.delenv("MATRIX_ENCRYPTION", raising=False)
from gateway.platforms import matrix as matrix_mod
# Simulate "mautrix installed, asyncpg missing" → feature_missing
# returns a non-empty tuple → ensure_and_bind MUST be called.
called = {"ensure_and_bind": False}
def _fake_ensure_and_bind(feature, importer, target_globals, **kwargs):
called["ensure_and_bind"] = True
assert feature == "platform.matrix"
return True # Pretend install succeeded.
with patch("tools.lazy_deps.feature_missing", return_value=("asyncpg==0.31.0",)), \
patch("tools.lazy_deps.ensure_and_bind", side_effect=_fake_ensure_and_bind):
matrix_mod.check_matrix_requirements()
assert called["ensure_and_bind"], (
"check_matrix_requirements must call ensure_and_bind whenever ANY "
"platform.matrix dep is missing, not just when mautrix itself is "
"missing (#31116)"
)
# ---------------------------------------------------------------------------
# Access-token auth / E2EE bootstrap

View file

@ -708,3 +708,190 @@ class TestPluginPlatformSharedKeyBridge:
assert extra.get("allow_from") == ["alice", "bob"]
finally:
_reg.unregister("mysharedplat")
class TestPluginEnablementGate:
"""Plugin platforms must NOT auto-enable on check_fn alone (#31116).
When a plugin registers ``is_connected`` (the "did the user actually
configure credentials" probe), ``load_gateway_config`` must consult it
before flipping ``enabled = True``. Without this gate, ``check_fn``
semantics ("the SDK is importable") get conflated with "the user wants
this platform on", and the gateway tries to connect to e.g. Discord
with no token emitting noisy retry-forever errors on every fresh
install that has the plugin loaded.
"""
def _write_config(self, tmp_path, content: str = ""):
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
(hermes_home / "config.yaml").write_text(content, encoding="utf-8")
return hermes_home
def test_plugin_with_is_connected_false_is_NOT_enabled(
self, tmp_path, monkeypatch
):
"""check_fn=True + is_connected=False must NOT enable the platform.
Reproduces #31116: Discord plugin loads, its check_fn lazy-installs
discord.py and returns True, but the user has no DISCORD_BOT_TOKEN.
Previously this auto-enabled Discord and the gateway spammed
``ERROR ... [Discord] No bot token configured`` on every reconnect.
"""
from gateway.platform_registry import platform_registry as _reg
_reg.register(PlatformEntry(
name="myunconfiguredplat",
label="MyUnconfigured",
adapter_factory=lambda cfg: None,
check_fn=lambda: True, # SDK available
is_connected=lambda cfg: False, # but user hasn't set credentials
source="plugin",
))
try:
home = self._write_config(tmp_path)
monkeypatch.setenv("HERMES_HOME", str(home))
from gateway.config import load_gateway_config, Platform
cfg = load_gateway_config()
plat = Platform("myunconfiguredplat")
# Either absent entirely, or present but explicitly disabled.
if plat in cfg.platforms:
assert cfg.platforms[plat].enabled is False, (
"Plugin with is_connected=False must NOT be auto-enabled"
)
finally:
_reg.unregister("myunconfiguredplat")
def test_plugin_with_is_connected_true_is_enabled(
self, tmp_path, monkeypatch
):
"""check_fn=True + is_connected=True still enables the platform."""
from gateway.platform_registry import platform_registry as _reg
_reg.register(PlatformEntry(
name="myconfiguredplat",
label="MyConfigured",
adapter_factory=lambda cfg: None,
check_fn=lambda: True,
is_connected=lambda cfg: True,
source="plugin",
))
try:
home = self._write_config(tmp_path)
monkeypatch.setenv("HERMES_HOME", str(home))
from gateway.config import load_gateway_config, Platform
cfg = load_gateway_config()
plat = Platform("myconfiguredplat")
assert plat in cfg.platforms
assert cfg.platforms[plat].enabled is True
finally:
_reg.unregister("myconfiguredplat")
def test_plugin_without_is_connected_falls_back_to_check_fn(
self, tmp_path, monkeypatch
):
"""Legacy plugins that don't register is_connected keep working.
For plugins where ``is_connected is None``, gating on ``check_fn``
alone remains the contract that's what callers without a
credential probe have always done.
"""
from gateway.platform_registry import platform_registry as _reg
_reg.register(PlatformEntry(
name="mylegacyplat",
label="MyLegacy",
adapter_factory=lambda cfg: None,
check_fn=lambda: True,
# is_connected intentionally omitted (None)
source="plugin",
))
try:
home = self._write_config(tmp_path)
monkeypatch.setenv("HERMES_HOME", str(home))
from gateway.config import load_gateway_config, Platform
cfg = load_gateway_config()
plat = Platform("mylegacyplat")
assert plat in cfg.platforms
assert cfg.platforms[plat].enabled is True
finally:
_reg.unregister("mylegacyplat")
def test_is_connected_raises_does_not_enable(self, tmp_path, monkeypatch):
"""A buggy is_connected must not silently enable the platform.
Treat a raising is_connected as "configuration unknown" refuse to
enable, log, and move on. Anything else would re-introduce the
#31116 bug for plugins whose probe has a transient failure.
"""
from gateway.platform_registry import platform_registry as _reg
def _bad_probe(cfg):
raise RuntimeError("plugin bug")
_reg.register(PlatformEntry(
name="mybadprobeplat",
label="MyBadProbe",
adapter_factory=lambda cfg: None,
check_fn=lambda: True,
is_connected=_bad_probe,
source="plugin",
))
try:
home = self._write_config(tmp_path)
monkeypatch.setenv("HERMES_HOME", str(home))
from gateway.config import load_gateway_config, Platform
cfg = load_gateway_config()
plat = Platform("mybadprobeplat")
if plat in cfg.platforms:
assert cfg.platforms[plat].enabled is False
finally:
_reg.unregister("mybadprobeplat")
def test_yaml_enabled_true_overrides_is_connected_false(
self, tmp_path, monkeypatch
):
"""Explicit YAML ``enabled: true`` wins over is_connected=False.
If the user wrote ``platforms.X.enabled: true`` themselves, respect
that they may be using a credential mechanism the plugin's
is_connected probe doesn't know about. Don't fight them.
"""
from gateway.platform_registry import platform_registry as _reg
_reg.register(PlatformEntry(
name="myexplicitplat",
label="MyExplicit",
adapter_factory=lambda cfg: None,
check_fn=lambda: True,
is_connected=lambda cfg: False,
source="plugin",
))
try:
home = self._write_config(
tmp_path,
"platforms:\n"
" myexplicitplat:\n"
" enabled: true\n",
)
monkeypatch.setenv("HERMES_HOME", str(home))
from gateway.config import load_gateway_config, Platform
cfg = load_gateway_config()
plat = Platform("myexplicitplat")
assert plat in cfg.platforms
assert cfg.platforms[plat].enabled is True, (
"Explicit YAML enabled: true must win over plugin's "
"is_connected=False — user has the final say"
)
finally:
_reg.unregister("myexplicitplat")