diff --git a/gateway/platforms/matrix.py b/gateway/platforms/matrix.py index 95dc73201c5..50d383f6f22 100644 --- a/gateway/platforms/matrix.py +++ b/gateway/platforms/matrix.py @@ -348,6 +348,17 @@ class MatrixAdapter(BasePlatformAdapter): self._sync_task: Optional[asyncio.Task] = None self._closing = False self._startup_ts: float = 0.0 + # Clock-skew detection: count grace-check drops that happen well + # after startup (i.e. not initial-sync backfill). If the host's + # system clock is set ahead of real time, the startup grace check + # `event_ts < startup_ts - 5` silently drops every live message. + # See #12614 — the symptom is "bot joins rooms but never replies". + # Drops only count when their skew matches the first sampled drop + # (within 60s), so varied-age backfill from freshly-invited rooms + # doesn't trip the heuristic. + self._late_grace_drops: int = 0 + self._late_grace_skew: float = 0.0 + self._clock_skew_warned: bool = False # Cache: room_id → bool (is DM) self._dm_rooms: Dict[str, bool] = {} @@ -842,6 +853,11 @@ class MatrixAdapter(BasePlatformAdapter): # Initial sync to catch up, then start background sync. self._startup_ts = time.time() + # Reset clock-skew detector for each connect cycle so a reconnect + # after the user fixes NTP doesn't inherit stale counters. + self._late_grace_drops = 0 + self._late_grace_skew = 0.0 + self._clock_skew_warned = False self._closing = False try: @@ -1542,6 +1558,49 @@ class MatrixAdapter(BasePlatformAdapter): ) event_ts = raw_ts / 1000.0 if raw_ts else 0.0 if event_ts and event_ts < self._startup_ts - _STARTUP_GRACE_SECONDS: + # If we are well past startup but events are still being dropped + # by the grace check, the host clock is probably set ahead of + # real time — every live event then looks "older than startup". + # Warn once so users can fix NTP instead of chasing a ghost. + # See #12614 (Schnurzel700, April 2026). + # + # Filter out backfill (events legitimately old) by requiring: + # - we are >30s past startup (initial-sync replay window closed) + # - the skew is *consistent* across consecutive drops, which is + # the signature of a constant clock offset rather than a + # variable-age room history. Backfill from a freshly invited + # room can deliver events spanning hours/days — those skews + # will be all over the place and reset the counter. + if not self._clock_skew_warned and ( + time.time() - self._startup_ts > 30 + ): + skew = self._startup_ts - event_ts + # Sanity bound: malformed events with negative or absurd + # timestamps shouldn't count. + if 5 < skew < 86400: + if self._late_grace_drops == 0: + self._late_grace_skew = skew + self._late_grace_drops = 1 + elif abs(skew - self._late_grace_skew) < 60: + # Consistent offset → likely real clock skew. + self._late_grace_drops += 1 + else: + # Varied skew → likely backfill, restart sampling. + self._late_grace_skew = skew + self._late_grace_drops = 1 + if self._late_grace_drops >= 3: + logger.warning( + "Matrix: dropped %d consecutive live events as " + "'too old' more than 30s after startup (skew " + "≈ %.0fs). The host system clock is likely set " + "ahead of real time, which causes the startup " + "grace filter to silently discard every incoming " + "message. Run `timedatectl set-ntp true` (or " + "sync NTP) and restart the bot.", + self._late_grace_drops, + skew, + ) + self._clock_skew_warned = True return # Extract content from the event. diff --git a/tests/gateway/test_matrix.py b/tests/gateway/test_matrix.py index c329441531d..a0fb8f086d8 100644 --- a/tests/gateway/test_matrix.py +++ b/tests/gateway/test_matrix.py @@ -2257,6 +2257,210 @@ class TestMatrixOnRoomMessageFilter: ev = self._mk_event(sender="@alice:example.org", body="hello bot") await self.adapter._on_room_message(ev) self.adapter._handle_text_message.assert_awaited_once() + + +class TestMatrixClockSkewWarning: + """Clock-skew detector for #12614. + + Reporter's host clock was set ~2 hours ahead of real time. The grace + filter `event_ts < startup_ts - 5` then drops every live event because + server timestamps look "older than startup". When this happens well + after startup (>30s), the adapter logs a one-shot WARNING pointing the + user at NTP instead of failing silently. + """ + + def setup_method(self): + self.adapter = _make_adapter() + self.adapter._user_id = "@bot:example.org" + self.adapter._handle_text_message = AsyncMock() + self.adapter._handle_media_message = AsyncMock() + + @staticmethod + def _mk_event(sender, ts_ms, event_id=None): + ev = MagicMock() + ev.room_id = "!room:example.org" + ev.sender = sender + ev.event_id = event_id or f"$evt-{sender}-{ts_ms}" + ev.timestamp = ts_ms + ev.server_timestamp = ts_ms + ev.content = {"msgtype": "m.text", "body": "hi"} + return ev + + @pytest.mark.asyncio + async def test_late_drops_emit_one_shot_clock_skew_warning(self, caplog): + import logging + import time as _t + + # Simulate the reporter's environment: host clock is ~2 hours ahead + # of server time. Startup happened "in the future" relative to the + # real-world events we're now receiving. + now = _t.time() + self.adapter._startup_ts = now - 60 # bot started 60s ago (wall clock) + # Server events are dated 2h before startup_ts (skewed clock). + skewed_event_ts_ms = int((self.adapter._startup_ts - 7200) * 1000) + + with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"): + for i in range(5): + ev = self._mk_event( + sender=f"@alice{i}:example.org", ts_ms=skewed_event_ts_ms + ) + await self.adapter._on_room_message(ev) + + # Handler should never be invoked — all events failed the grace check. + self.adapter._handle_text_message.assert_not_called() + # Exactly one WARNING from THIS logger should be emitted. Filter by + # logger name so unrelated stdlib/library warnings can't satisfy the + # assertion. + skew_warnings = [ + r for r in caplog.records + if r.name == "gateway.platforms.matrix" + and r.levelname == "WARNING" + and "set-ntp" in r.getMessage() + ] + assert len(skew_warnings) == 1, ( + f"expected exactly 1 clock-skew warning, got {len(skew_warnings)}" + ) + msg = skew_warnings[0].getMessage() + assert "7200" in msg, f"skew value missing from message: {msg!r}" + # Pin the counter so a regression in the gating logic (e.g. warning + # at threshold 1 or 5, or not stopping after warn) is caught. + assert self.adapter._late_grace_drops == 3 + assert self.adapter._clock_skew_warned is True + + @pytest.mark.asyncio + async def test_initial_sync_drops_do_not_warn(self, caplog): + """During the first 30s after startup, old events are normal backfill.""" + import logging + import time as _t + + now = _t.time() + # Startup was 1s ago — we're still in the initial-sync window. + self.adapter._startup_ts = now - 1 + old_ts_ms = int((self.adapter._startup_ts - 3600) * 1000) + + with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"): + for i in range(5): + ev = self._mk_event( + sender=f"@alice{i}:example.org", ts_ms=old_ts_ms + ) + await self.adapter._on_room_message(ev) + + # Backfill drops are silent — no clock-skew warning fired. + assert self.adapter._clock_skew_warned is False + skew_warnings = [ + r for r in caplog.records + if r.name == "gateway.platforms.matrix" + and "set-ntp" in r.getMessage() + ] + assert skew_warnings == [] + + @pytest.mark.asyncio + async def test_fewer_than_three_late_drops_do_not_warn(self, caplog): + """A single delayed backfill event after 30s shouldn't trigger NTP advice.""" + import logging + import time as _t + + now = _t.time() + self.adapter._startup_ts = now - 120 # extra slack vs the 30s gate + old_ts_ms = int((self.adapter._startup_ts - 3600) * 1000) + + with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"): + for i in range(2): # only 2 late drops — under the threshold + ev = self._mk_event( + sender=f"@alice{i}:example.org", ts_ms=old_ts_ms + ) + await self.adapter._on_room_message(ev) + + assert self.adapter._late_grace_drops == 2 + assert self.adapter._clock_skew_warned is False + + @pytest.mark.asyncio + async def test_varied_backfill_skews_do_not_warn(self, caplog): + """Backfill from a freshly-invited room delivers events of varied age. + + A genuine clock-skew bug produces drops with a *constant* offset + (every event is ~X seconds older than wall clock). Joining an old + room post-startup delivers events spanning hours-to-days; those + skews vary wildly and must NOT trigger the NTP warning. + """ + import logging + import time as _t + + now = _t.time() + self.adapter._startup_ts = now - 120 + # Each event has a different age, ranging from 1h to 30d ago. + ages_in_hours = [1, 24, 168, 720, 4] # 1h, 1d, 1w, 30d, 4h + with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"): + for i, hrs in enumerate(ages_in_hours): + ts_ms = int((self.adapter._startup_ts - hrs * 3600) * 1000) + ev = self._mk_event( + sender=f"@alice{i}:example.org", ts_ms=ts_ms + ) + await self.adapter._on_room_message(ev) + + # The varied-skew guard should keep the counter from reaching 3. + assert self.adapter._late_grace_drops < 3 + assert self.adapter._clock_skew_warned is False + skew_warnings = [ + r for r in caplog.records + if r.name == "gateway.platforms.matrix" + and "set-ntp" in r.getMessage() + ] + assert skew_warnings == [] + + @pytest.mark.asyncio + async def test_state_reset_allows_warning_to_fire_again(self, caplog): + """After the reset block at top of connect() runs, the warning is rearmed. + + Reconnect lifecycle: the user fixes NTP, restarts the bot, and the + new connect() call resets _late_grace_drops / _clock_skew_warned at + the top. This test exercises the rearm path by: + 1. Tripping the warning once (state: warned=True). + 2. Running the same reset block connect() runs. + 3. Tripping the warning a second time — the second warning should + fire because the state was cleared. + """ + import logging + import time as _t + + now = _t.time() + self.adapter._startup_ts = now - 60 + skewed_ms = int((self.adapter._startup_ts - 7200) * 1000) + + with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"): + for i in range(3): + ev = self._mk_event( + sender=f"@alice{i}:example.org", ts_ms=skewed_ms, + event_id=f"$first-{i}", + ) + await self.adapter._on_room_message(ev) + assert self.adapter._clock_skew_warned is True + + # Mirror the reset block in connect() (matrix.py around line 855). + self.adapter._startup_ts = _t.time() - 60 + self.adapter._late_grace_drops = 0 + self.adapter._late_grace_skew = 0.0 + self.adapter._clock_skew_warned = False + + # Same skewed-clock scenario should warn AGAIN after reset. + skewed_ms2 = int((self.adapter._startup_ts - 7200) * 1000) + for i in range(3): + ev = self._mk_event( + sender=f"@bob{i}:example.org", ts_ms=skewed_ms2, + event_id=f"$second-{i}", + ) + await self.adapter._on_room_message(ev) + + skew_warnings = [ + r for r in caplog.records + if r.name == "gateway.platforms.matrix" + and "set-ntp" in r.getMessage() + ] + assert len(skew_warnings) == 2, ( + f"expected 2 warnings (one per connect cycle), got {len(skew_warnings)}" + ) + + # --------------------------------------------------------------------------- # DM auto-thread # --------------------------------------------------------------------------- diff --git a/website/docs/user-guide/messaging/matrix.md b/website/docs/user-guide/messaging/matrix.md index 255806c01ba..b03f7a655d4 100644 --- a/website/docs/user-guide/messaging/matrix.md +++ b/website/docs/user-guide/messaging/matrix.md @@ -357,6 +357,23 @@ To find a Room ID: in Element, go to the room → **Settings** → **Advanced** **Fix**: Invite the bot to the room — it auto-joins on invite. Verify your User ID is in `MATRIX_ALLOWED_USERS` (use the full `@user:server` format). Restart the gateway. +### Bot joins rooms but silently drops every message (clock skew) + +**Cause**: The host's system clock is set ahead of real time. The Matrix adapter applies a 5-second startup-grace filter (`event_ts < startup_ts - 5`) to ignore events replayed from initial sync. When the wall clock is ahead, every incoming event looks "older than startup" and is dropped before reaching the message handler — the bot appears connected but never replies. See [#12614](https://github.com/NousResearch/hermes-agent/issues/12614). + +**Symptom**: Gateway log shows `Matrix: dropped N live events as 'too old' more than 30s after startup`. + +**Fix**: Sync the host clock with NTP and restart the bot: + +```bash +# Debian/Ubuntu +sudo timedatectl set-ntp true +timedatectl status # confirm "System clock synchronized: yes" + +# macOS +sudo sntp -sS time.apple.com +``` + ### "Failed to authenticate" / "whoami failed" on startup **Cause**: The access token or homeserver URL is incorrect.