mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-14 04:02:26 +00:00
fix(discord): narrow rate-limit catch and move sync state under gateway/
Two follow-ups on top of helix4u's slash-command sync hardening: - Only suppress exceptions that are actually Discord 429 rate limits (discord.RateLimited, HTTPException with status 429, or a clearly rate-limit-named duck type). Arbitrary failures that happen to expose a retry_after attribute now re-raise to the outer handler instead of silently swallowing a cooldown. - Move the sync-state JSON under $HERMES_HOME/gateway/ so the home root stops collecting ad-hoc runtime files. Added a test verifying unrelated exceptions don't get misclassified as rate limits.
This commit is contained in:
parent
d797755a1c
commit
5a3cadf6eb
2 changed files with 103 additions and 12 deletions
|
|
@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080}
|
VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080}
|
||||||
_DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"}
|
_DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"}
|
||||||
_DISCORD_COMMAND_SYNC_STATE_FILE = "discord_command_sync_state.json"
|
_DISCORD_COMMAND_SYNC_STATE_SUBDIR = "gateway"
|
||||||
|
_DISCORD_COMMAND_SYNC_STATE_FILENAME = "discord_command_sync_state.json"
|
||||||
_DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5
|
_DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5
|
||||||
_DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0
|
_DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0
|
||||||
|
|
||||||
|
|
@ -834,7 +835,12 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||||
def _command_sync_state_path(self) -> _Path:
|
def _command_sync_state_path(self) -> _Path:
|
||||||
from hermes_constants import get_hermes_home
|
from hermes_constants import get_hermes_home
|
||||||
|
|
||||||
return get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_FILE
|
directory = get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_SUBDIR
|
||||||
|
try:
|
||||||
|
directory.mkdir(parents=True, exist_ok=True)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return directory / _DISCORD_COMMAND_SYNC_STATE_FILENAME
|
||||||
|
|
||||||
def _read_command_sync_state(self) -> dict:
|
def _read_command_sync_state(self) -> dict:
|
||||||
try:
|
try:
|
||||||
|
|
@ -945,6 +951,40 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||||
continue
|
continue
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _is_discord_rate_limit(exc: BaseException) -> bool:
|
||||||
|
"""True only for exceptions that look like Discord 429 rate limits.
|
||||||
|
|
||||||
|
Narrower than ``hasattr(exc, 'retry_after')``: discord.py's own
|
||||||
|
``RateLimited`` exception and any HTTPException with status 429
|
||||||
|
qualify. This prevents suppressing unrelated failures that happen
|
||||||
|
to expose a ``retry_after`` attribute."""
|
||||||
|
# discord.py emits RateLimited / HTTPException subclasses for 429s.
|
||||||
|
# Guard with isinstance-of-class so a mocked ``discord`` module
|
||||||
|
# (where attrs are MagicMocks, not types) doesn't trip isinstance.
|
||||||
|
if DISCORD_AVAILABLE and discord is not None:
|
||||||
|
for attr_name in ("RateLimited", "HTTPException"):
|
||||||
|
cls = getattr(discord, attr_name, None)
|
||||||
|
if not isinstance(cls, type):
|
||||||
|
continue
|
||||||
|
if isinstance(exc, cls):
|
||||||
|
if attr_name == "RateLimited":
|
||||||
|
return True
|
||||||
|
status = getattr(exc, "status", None)
|
||||||
|
if status == 429:
|
||||||
|
return True
|
||||||
|
# Fallback duck-type: something named like a rate-limit with a
|
||||||
|
# numeric retry_after. Covers mocked clients in tests and exotic
|
||||||
|
# transports, without swallowing arbitrary exceptions.
|
||||||
|
name = type(exc).__name__.lower()
|
||||||
|
if ("ratelimit" in name or "rate_limit" in name) and getattr(exc, "retry_after", None) is not None:
|
||||||
|
return True
|
||||||
|
response = getattr(exc, "response", None)
|
||||||
|
status = getattr(response, "status", None) or getattr(response, "status_code", None)
|
||||||
|
if status == 429:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
def _command_sync_mutation_interval_seconds(self) -> float:
|
def _command_sync_mutation_interval_seconds(self) -> float:
|
||||||
return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS
|
return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS
|
||||||
|
|
||||||
|
|
@ -989,16 +1029,20 @@ class DiscordAdapter(BasePlatformAdapter):
|
||||||
# persist Discord's retry-after when it refuses the batch.
|
# persist Discord's retry-after when it refuses the batch.
|
||||||
summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600)
|
summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if not self._is_discord_rate_limit(e):
|
||||||
|
raise
|
||||||
retry_after = self._extract_discord_retry_after(e)
|
retry_after = self._extract_discord_retry_after(e)
|
||||||
if retry_after is not None:
|
if retry_after is None:
|
||||||
self._record_command_sync_rate_limit(app_id, fingerprint, retry_after)
|
# Rate-limited but no retry-after signal — back off for a
|
||||||
logger.warning(
|
# conservative default so we don't slam the bucket again.
|
||||||
"[%s] Discord rate-limited slash command sync; retrying after %.0fs",
|
retry_after = _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS
|
||||||
self.name,
|
self._record_command_sync_rate_limit(app_id, fingerprint, retry_after)
|
||||||
retry_after,
|
logger.warning(
|
||||||
)
|
"[%s] Discord rate-limited slash command sync; retrying after %.0fs",
|
||||||
return
|
self.name,
|
||||||
raise
|
retry_after,
|
||||||
|
)
|
||||||
|
return
|
||||||
finally:
|
finally:
|
||||||
if has_ratelimit_timeout:
|
if has_ratelimit_timeout:
|
||||||
http.max_ratelimit_timeout = previous_ratelimit_timeout
|
http.max_ratelimit_timeout = previous_ratelimit_timeout
|
||||||
|
|
|
||||||
|
|
@ -612,12 +612,59 @@ async def test_post_connect_initialization_respects_discord_retry_after(tmp_path
|
||||||
await adapter._run_post_connect_initialization()
|
await adapter._run_post_connect_initialization()
|
||||||
|
|
||||||
sync.assert_awaited_once()
|
sync.assert_awaited_once()
|
||||||
state = json.loads((tmp_path / discord_platform._DISCORD_COMMAND_SYNC_STATE_FILE).read_text())
|
state_path = (
|
||||||
|
tmp_path
|
||||||
|
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR
|
||||||
|
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME
|
||||||
|
)
|
||||||
|
state = json.loads(state_path.read_text())
|
||||||
entry = state["999"]
|
entry = state["999"]
|
||||||
assert entry["retry_after"] == 123.0
|
assert entry["retry_after"] == 123.0
|
||||||
assert entry["retry_after_until"] > entry["last_attempt_at"]
|
assert entry["retry_after_until"] > entry["last_attempt_at"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_post_connect_initialization_reraises_non_rate_limit_exceptions(tmp_path, monkeypatch):
|
||||||
|
"""Arbitrary failures during sync must surface, not be swallowed as rate-limits."""
|
||||||
|
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))
|
||||||
|
monkeypatch.setattr("hermes_constants.get_hermes_home", lambda: tmp_path)
|
||||||
|
|
||||||
|
class _DesiredCommand:
|
||||||
|
def to_dict(self, tree):
|
||||||
|
return {"name": "status", "description": "Show Hermes status", "type": 1, "options": []}
|
||||||
|
|
||||||
|
adapter._client = SimpleNamespace(
|
||||||
|
tree=SimpleNamespace(get_commands=lambda: [_DesiredCommand()]),
|
||||||
|
application_id=4242,
|
||||||
|
user=SimpleNamespace(id=4242),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Unrelated failure that happens to expose retry_after. Must NOT be
|
||||||
|
# caught by the rate-limit handler — it has nothing to do with 429s.
|
||||||
|
class _UnrelatedError(RuntimeError):
|
||||||
|
retry_after = 999.0
|
||||||
|
|
||||||
|
sync = AsyncMock(side_effect=_UnrelatedError("database is down"))
|
||||||
|
monkeypatch.setattr(adapter, "_safe_sync_slash_commands", sync)
|
||||||
|
|
||||||
|
# The outer _run_post_connect_initialization has a broad except Exception
|
||||||
|
# that logs defensively — so we assert on state NOT being written.
|
||||||
|
await adapter._run_post_connect_initialization()
|
||||||
|
|
||||||
|
sync.assert_awaited_once()
|
||||||
|
state_path = (
|
||||||
|
tmp_path
|
||||||
|
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_SUBDIR
|
||||||
|
/ discord_platform._DISCORD_COMMAND_SYNC_STATE_FILENAME
|
||||||
|
)
|
||||||
|
state = json.loads(state_path.read_text()) if state_path.exists() else {}
|
||||||
|
entry = state.get("4242", {})
|
||||||
|
# Attempt was recorded before the sync call, but no rate-limit cooldown
|
||||||
|
# should have been persisted from the unrelated exception.
|
||||||
|
assert "retry_after_until" not in entry
|
||||||
|
assert "retry_after" not in entry
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_safe_sync_slash_commands_paces_mutation_writes(monkeypatch):
|
async def test_safe_sync_slash_commands_paces_mutation_writes(monkeypatch):
|
||||||
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))
|
adapter = DiscordAdapter(PlatformConfig(enabled=True, token="test-token"))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue