fix(discord): narrow rate-limit catch and move sync state under gateway/

Two follow-ups on top of helix4u's slash-command sync hardening:

- Only suppress exceptions that are actually Discord 429 rate limits
  (discord.RateLimited, HTTPException with status 429, or a clearly
  rate-limit-named duck type). Arbitrary failures that happen to expose
  a retry_after attribute now re-raise to the outer handler instead of
  silently swallowing a cooldown.
- Move the sync-state JSON under $HERMES_HOME/gateway/ so the home root
  stops collecting ad-hoc runtime files.

Added a test verifying unrelated exceptions don't get misclassified as
rate limits.
This commit is contained in:
Teknium 2026-05-06 17:29:32 -07:00
parent d797755a1c
commit 5a3cadf6eb
2 changed files with 103 additions and 12 deletions

View file

@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080}
_DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"}
_DISCORD_COMMAND_SYNC_STATE_FILE = "discord_command_sync_state.json"
_DISCORD_COMMAND_SYNC_STATE_SUBDIR = "gateway"
_DISCORD_COMMAND_SYNC_STATE_FILENAME = "discord_command_sync_state.json"
_DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5
_DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0
@ -834,7 +835,12 @@ class DiscordAdapter(BasePlatformAdapter):
def _command_sync_state_path(self) -> _Path:
from hermes_constants import get_hermes_home
return get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_FILE
directory = get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_SUBDIR
try:
directory.mkdir(parents=True, exist_ok=True)
except Exception:
pass
return directory / _DISCORD_COMMAND_SYNC_STATE_FILENAME
def _read_command_sync_state(self) -> dict:
try:
@ -945,6 +951,40 @@ class DiscordAdapter(BasePlatformAdapter):
continue
return None
@staticmethod
def _is_discord_rate_limit(exc: BaseException) -> bool:
"""True only for exceptions that look like Discord 429 rate limits.
Narrower than ``hasattr(exc, 'retry_after')``: discord.py's own
``RateLimited`` exception and any HTTPException with status 429
qualify. This prevents suppressing unrelated failures that happen
to expose a ``retry_after`` attribute."""
# discord.py emits RateLimited / HTTPException subclasses for 429s.
# Guard with isinstance-of-class so a mocked ``discord`` module
# (where attrs are MagicMocks, not types) doesn't trip isinstance.
if DISCORD_AVAILABLE and discord is not None:
for attr_name in ("RateLimited", "HTTPException"):
cls = getattr(discord, attr_name, None)
if not isinstance(cls, type):
continue
if isinstance(exc, cls):
if attr_name == "RateLimited":
return True
status = getattr(exc, "status", None)
if status == 429:
return True
# Fallback duck-type: something named like a rate-limit with a
# numeric retry_after. Covers mocked clients in tests and exotic
# transports, without swallowing arbitrary exceptions.
name = type(exc).__name__.lower()
if ("ratelimit" in name or "rate_limit" in name) and getattr(exc, "retry_after", None) is not None:
return True
response = getattr(exc, "response", None)
status = getattr(response, "status", None) or getattr(response, "status_code", None)
if status == 429:
return True
return False
def _command_sync_mutation_interval_seconds(self) -> float:
return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS
@ -989,16 +1029,20 @@ class DiscordAdapter(BasePlatformAdapter):
# persist Discord's retry-after when it refuses the batch.
summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600)
except Exception as e:
if not self._is_discord_rate_limit(e):
raise
retry_after = self._extract_discord_retry_after(e)
if retry_after is not None:
self._record_command_sync_rate_limit(app_id, fingerprint, retry_after)
logger.warning(
"[%s] Discord rate-limited slash command sync; retrying after %.0fs",
self.name,
retry_after,
)
return
raise
if retry_after is None:
# Rate-limited but no retry-after signal — back off for a
# conservative default so we don't slam the bucket again.
retry_after = _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS
self._record_command_sync_rate_limit(app_id, fingerprint, retry_after)
logger.warning(
"[%s] Discord rate-limited slash command sync; retrying after %.0fs",
self.name,
retry_after,
)
return
finally:
if has_ratelimit_timeout:
http.max_ratelimit_timeout = previous_ratelimit_timeout

View file

@ -612,12 +612,59 @@ async def test_post_connect_initialization_respects_discord_retry_after(tmp_path
await adapter._run_post_connect_initialization()
sync.assert_awaited_once()
state = json.loads((tmp_path / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILE).read_text())
state_path = (
tmp_path
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME
)
state = json.loads(state_path.read_text())
entry = state["999"]
assert entry["retry_after"] == 123.0
assert entry["retry_after_until"] > entry["last_attempt_at"]
@pytest.mark.asyncio
async def test_post_connect_initialization_reraises_non_rate_limit_exceptions(tmp_path, monkeypatch):
"""Arbitrary failures during sync must surface, not be swallowed as rate-limits."""
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))
monkeypatch.setattr("hermes_constants.get_hermes_home", lambda: tmp_path)
class _DesiredCommand:
def to_dict(self, tree):
return {"name": "status", "description": "Show Hermes status", "type": 1, "options": []}
adapter._client = SimpleNamespace(
tree=SimpleNamespace(get_commands=lambda: [_DesiredCommand()]),
application_id=4242,
user=SimpleNamespace(id=4242),
)
# Unrelated failure that happens to expose retry_after. Must NOT be
# caught by the rate-limit handler — it has nothing to do with 429s.
class _UnrelatedError(RuntimeError):
retry_after = 999.0
sync = AsyncMock(side_effect=_UnrelatedError("database is down"))
monkeypatch.setattr(adapter, "_safe_sync_slash_commands", sync)
# The outer _run_post_connect_initialization has a broad except Exception
# that logs defensively — so we assert on state NOT being written.
await adapter._run_post_connect_initialization()
sync.assert_awaited_once()
state_path = (
tmp_path
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME
)
state = json.loads(state_path.read_text()) if state_path.exists() else {}
entry = state.get("4242", {})
# Attempt was recorded before the sync call, but no rate-limit cooldown
# should have been persisted from the unrelated exception.
assert "retry_after_until" not in entry
assert "retry_after" not in entry
@pytest.mark.asyncio
async def test_safe_sync_slash_commands_paces_mutation_writes(monkeypatch):
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))