diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index f0ee06f8ca..ecce8b8fc0 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -26,7 +26,8 @@ logger = logging.getLogger(__name__) VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080} _DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"} -_DISCORD_COMMAND_SYNC_STATE_FILE = "discord_command_sync_state.json" +_DISCORD_COMMAND_SYNC_STATE_SUBDIR = "gateway" +_DISCORD_COMMAND_SYNC_STATE_FILENAME = "discord_command_sync_state.json" _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5 _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0 @@ -834,7 +835,12 @@ class DiscordAdapter(BasePlatformAdapter): def _command_sync_state_path(self) -> _Path: from hermes_constants import get_hermes_home - return get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_FILE + directory = get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_SUBDIR + try: + directory.mkdir(parents=True, exist_ok=True) + except Exception: + pass + return directory / _DISCORD_COMMAND_SYNC_STATE_FILENAME def _read_command_sync_state(self) -> dict: try: @@ -945,6 +951,40 @@ class DiscordAdapter(BasePlatformAdapter): continue return None + @staticmethod + def _is_discord_rate_limit(exc: BaseException) -> bool: + """True only for exceptions that look like Discord 429 rate limits. + + Narrower than ``hasattr(exc, 'retry_after')``: discord.py's own + ``RateLimited`` exception and any HTTPException with status 429 + qualify. This prevents suppressing unrelated failures that happen + to expose a ``retry_after`` attribute.""" + # discord.py emits RateLimited / HTTPException subclasses for 429s. + # Guard with isinstance-of-class so a mocked ``discord`` module + # (where attrs are MagicMocks, not types) doesn't trip isinstance. + if DISCORD_AVAILABLE and discord is not None: + for attr_name in ("RateLimited", "HTTPException"): + cls = getattr(discord, attr_name, None) + if not isinstance(cls, type): + continue + if isinstance(exc, cls): + if attr_name == "RateLimited": + return True + status = getattr(exc, "status", None) + if status == 429: + return True + # Fallback duck-type: something named like a rate-limit with a + # numeric retry_after. Covers mocked clients in tests and exotic + # transports, without swallowing arbitrary exceptions. + name = type(exc).__name__.lower() + if ("ratelimit" in name or "rate_limit" in name) and getattr(exc, "retry_after", None) is not None: + return True + response = getattr(exc, "response", None) + status = getattr(response, "status", None) or getattr(response, "status_code", None) + if status == 429: + return True + return False + def _command_sync_mutation_interval_seconds(self) -> float: return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS @@ -989,16 +1029,20 @@ class DiscordAdapter(BasePlatformAdapter): # persist Discord's retry-after when it refuses the batch. summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600) except Exception as e: + if not self._is_discord_rate_limit(e): + raise retry_after = self._extract_discord_retry_after(e) - if retry_after is not None: - self._record_command_sync_rate_limit(app_id, fingerprint, retry_after) - logger.warning( - "[%s] Discord rate-limited slash command sync; retrying after %.0fs", - self.name, - retry_after, - ) - return - raise + if retry_after is None: + # Rate-limited but no retry-after signal — back off for a + # conservative default so we don't slam the bucket again. + retry_after = _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS + self._record_command_sync_rate_limit(app_id, fingerprint, retry_after) + logger.warning( + "[%s] Discord rate-limited slash command sync; retrying after %.0fs", + self.name, + retry_after, + ) + return finally: if has_ratelimit_timeout: http.max_ratelimit_timeout = previous_ratelimit_timeout diff --git a/tests/gateway/test_discord_connect.py b/tests/gateway/test_discord_connect.py index 57b3791a05..43f88bcf9d 100644 --- a/tests/gateway/test_discord_connect.py +++ b/tests/gateway/test_discord_connect.py @@ -612,12 +612,59 @@ async def test_post_connect_initialization_respects_discord_retry_after(tmp_path await adapter._run_post_connect_initialization() sync.assert_awaited_once() - state = json.loads((tmp_path / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILE).read_text()) + state_path = ( + tmp_path + / discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR + / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME + ) + state = json.loads(state_path.read_text()) entry = state["999"] assert entry["retry_after"] == 123.0 assert entry["retry_after_until"] > entry["last_attempt_at"] +@pytest.mark.asyncio +async def test_post_connect_initialization_reraises_non_rate_limit_exceptions(tmp_path, monkeypatch): + """Arbitrary failures during sync must surface, not be swallowed as rate-limits.""" + adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token")) + monkeypatch.setattr("hermes_constants.get_hermes_home", lambda: tmp_path) + + class _DesiredCommand: + def to_dict(self, tree): + return {"name": "status", "description": "Show Hermes status", "type": 1, "options": []} + + adapter._client = SimpleNamespace( + tree=SimpleNamespace(get_commands=lambda: [_DesiredCommand()]), + application_id=4242, + user=SimpleNamespace(id=4242), + ) + + # Unrelated failure that happens to expose retry_after. Must NOT be + # caught by the rate-limit handler — it has nothing to do with 429s. + class _UnrelatedError(RuntimeError): + retry_after = 999.0 + + sync = AsyncMock(side_effect=_UnrelatedError("database is down")) + monkeypatch.setattr(adapter, "_safe_sync_slash_commands", sync) + + # The outer _run_post_connect_initialization has a broad except Exception + # that logs defensively — so we assert on state NOT being written. + await adapter._run_post_connect_initialization() + + sync.assert_awaited_once() + state_path = ( + tmp_path + / discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR + / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME + ) + state = json.loads(state_path.read_text()) if state_path.exists() else {} + entry = state.get("4242", {}) + # Attempt was recorded before the sync call, but no rate-limit cooldown + # should have been persisted from the unrelated exception. + assert "retry_after_until" not in entry + assert "retry_after" not in entry + + @pytest.mark.asyncio async def test_safe_sync_slash_commands_paces_mutation_writes(monkeypatch): adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))