mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix(gateway): wait for systemd restart readiness
This commit is contained in:
parent
3cdbf334d5
commit
d797755a1c
4 changed files with 587 additions and 78 deletions
|
|
@ -10,6 +10,8 @@ Uses discord.py library for:
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import struct
|
||||
|
|
@ -24,6 +26,9 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
VALID_THREAD_AUTO_ARCHIVE_MINUTES = {60, 1440, 4320, 10080}
|
||||
_DISCORD_COMMAND_SYNC_POLICIES = {"safe", "bulk", "off"}
|
||||
_DISCORD_COMMAND_SYNC_STATE_FILE = "discord_command_sync_state.json"
|
||||
_DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS = 4.5
|
||||
_DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS = 30.0
|
||||
|
||||
try:
|
||||
import discord
|
||||
|
|
@ -45,6 +50,7 @@ from gateway.config import Platform, PlatformConfig
|
|||
import re
|
||||
|
||||
from gateway.platforms.helpers import MessageDeduplicator, ThreadParticipationTracker
|
||||
from utils import atomic_json_write
|
||||
from gateway.platforms.base import (
|
||||
BasePlatformAdapter,
|
||||
MessageEvent,
|
||||
|
|
@ -825,6 +831,128 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
|
||||
logger.info("[%s] Disconnected", self.name)
|
||||
|
||||
def _command_sync_state_path(self) -> _Path:
|
||||
from hermes_constants import get_hermes_home
|
||||
|
||||
return get_hermes_home() / _DISCORD_COMMAND_SYNC_STATE_FILE
|
||||
|
||||
def _read_command_sync_state(self) -> dict:
|
||||
try:
|
||||
path = self._command_sync_state_path()
|
||||
if not path.exists():
|
||||
return {}
|
||||
data = json.loads(path.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
return {}
|
||||
return data if isinstance(data, dict) else {}
|
||||
|
||||
def _write_command_sync_state(self, state: dict) -> None:
|
||||
atomic_json_write(
|
||||
self._command_sync_state_path(),
|
||||
state,
|
||||
indent=None,
|
||||
separators=(",", ":"),
|
||||
)
|
||||
|
||||
def _command_sync_state_key(self, app_id: Any) -> str:
|
||||
return str(app_id or "unknown")
|
||||
|
||||
def _desired_command_sync_fingerprint(self) -> str:
|
||||
tree = self._client.tree if self._client else None
|
||||
desired = []
|
||||
if tree is not None:
|
||||
desired = [
|
||||
self._canonicalize_app_command_payload(command.to_dict(tree))
|
||||
for command in tree.get_commands()
|
||||
]
|
||||
desired.sort(key=lambda item: (item.get("type", 1), item.get("name", "")))
|
||||
payload = json.dumps(desired, sort_keys=True, separators=(",", ":"))
|
||||
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
|
||||
|
||||
def _command_sync_skip_reason(self, app_id: Any, fingerprint: str) -> Optional[str]:
|
||||
entry = self._read_command_sync_state().get(self._command_sync_state_key(app_id))
|
||||
if not isinstance(entry, dict):
|
||||
return None
|
||||
now = time.time()
|
||||
retry_after_until = float(entry.get("retry_after_until") or 0)
|
||||
if retry_after_until > now:
|
||||
remaining = max(1, int(retry_after_until - now))
|
||||
return f"Discord asked us to wait before syncing slash commands; retry in {remaining}s"
|
||||
if entry.get("fingerprint") == fingerprint and entry.get("last_success_at"):
|
||||
return "same slash-command fingerprint already synced"
|
||||
return None
|
||||
|
||||
def _record_command_sync_attempt(self, app_id: Any, fingerprint: str) -> None:
|
||||
state = self._read_command_sync_state()
|
||||
state[self._command_sync_state_key(app_id)] = {
|
||||
**(
|
||||
state.get(self._command_sync_state_key(app_id))
|
||||
if isinstance(state.get(self._command_sync_state_key(app_id)), dict)
|
||||
else {}
|
||||
),
|
||||
"fingerprint": fingerprint,
|
||||
"last_attempt_at": time.time(),
|
||||
}
|
||||
self._write_command_sync_state(state)
|
||||
|
||||
def _record_command_sync_rate_limit(self, app_id: Any, fingerprint: str, retry_after: float) -> None:
|
||||
retry_after = max(1.0, float(retry_after))
|
||||
state = self._read_command_sync_state()
|
||||
state[self._command_sync_state_key(app_id)] = {
|
||||
**(
|
||||
state.get(self._command_sync_state_key(app_id))
|
||||
if isinstance(state.get(self._command_sync_state_key(app_id)), dict)
|
||||
else {}
|
||||
),
|
||||
"fingerprint": fingerprint,
|
||||
"last_attempt_at": time.time(),
|
||||
"retry_after_until": time.time() + retry_after,
|
||||
"retry_after": retry_after,
|
||||
}
|
||||
self._write_command_sync_state(state)
|
||||
|
||||
def _record_command_sync_success(self, app_id: Any, fingerprint: str, summary: dict) -> None:
|
||||
state = self._read_command_sync_state()
|
||||
state[self._command_sync_state_key(app_id)] = {
|
||||
"fingerprint": fingerprint,
|
||||
"last_attempt_at": time.time(),
|
||||
"last_success_at": time.time(),
|
||||
"summary": summary,
|
||||
}
|
||||
self._write_command_sync_state(state)
|
||||
|
||||
@staticmethod
|
||||
def _extract_discord_retry_after(exc: BaseException) -> Optional[float]:
|
||||
value = getattr(exc, "retry_after", None)
|
||||
if value is not None:
|
||||
try:
|
||||
return max(1.0, float(value))
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
response = getattr(exc, "response", None)
|
||||
headers = getattr(response, "headers", None)
|
||||
if headers:
|
||||
for key in ("Retry-After", "X-RateLimit-Reset-After"):
|
||||
try:
|
||||
raw = headers.get(key)
|
||||
except Exception:
|
||||
raw = None
|
||||
if raw is None:
|
||||
continue
|
||||
try:
|
||||
return max(1.0, float(raw))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
return None
|
||||
|
||||
def _command_sync_mutation_interval_seconds(self) -> float:
|
||||
return _DISCORD_COMMAND_SYNC_MUTATION_INTERVAL_SECONDS
|
||||
|
||||
async def _sleep_between_command_sync_mutations(self) -> None:
|
||||
interval = self._command_sync_mutation_interval_seconds()
|
||||
if interval > 0:
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
async def _run_post_connect_initialization(self) -> None:
|
||||
"""Finish non-critical startup work after Discord is connected."""
|
||||
if not self._client:
|
||||
|
|
@ -840,14 +968,42 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
logger.info("[%s] Synced %d slash command(s) via bulk tree sync", self.name, len(synced))
|
||||
return
|
||||
|
||||
# Discord's per-app command-management bucket is ~5 writes / 20 s,
|
||||
# so a mass-prune-plus-upsert reconcile (e.g. 77 orphans + 30
|
||||
# desired = 107 writes) takes several minutes of forced waits.
|
||||
# A flat 30 s budget blew up reliably under bucket pressure and
|
||||
# left slash commands broken for ~60 min until the bucket fully
|
||||
# recovered. Use a wide ceiling; the cap still guards against a
|
||||
# true hang. (#16713)
|
||||
summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600)
|
||||
app_id = getattr(self._client, "application_id", None) or getattr(getattr(self._client, "user", None), "id", None)
|
||||
fingerprint = self._desired_command_sync_fingerprint()
|
||||
skip_reason = self._command_sync_skip_reason(app_id, fingerprint)
|
||||
if skip_reason:
|
||||
logger.info("[%s] Skipping Discord slash command sync: %s", self.name, skip_reason)
|
||||
return
|
||||
self._record_command_sync_attempt(app_id, fingerprint)
|
||||
|
||||
http = getattr(self._client, "http", None)
|
||||
has_ratelimit_timeout = http is not None and hasattr(http, "max_ratelimit_timeout")
|
||||
previous_ratelimit_timeout = getattr(http, "max_ratelimit_timeout", None) if has_ratelimit_timeout else None
|
||||
if has_ratelimit_timeout:
|
||||
http.max_ratelimit_timeout = _DISCORD_COMMAND_SYNC_MAX_RATE_LIMIT_SLEEP_SECONDS
|
||||
|
||||
try:
|
||||
# Discord's per-app command-management bucket is small, and
|
||||
# discord.py can otherwise sit inside one long retry sleep
|
||||
# before surfacing the 429. Keep the whole sync bounded and
|
||||
# persist Discord's retry-after when it refuses the batch.
|
||||
summary = await asyncio.wait_for(self._safe_sync_slash_commands(), timeout=600)
|
||||
except Exception as e:
|
||||
retry_after = self._extract_discord_retry_after(e)
|
||||
if retry_after is not None:
|
||||
self._record_command_sync_rate_limit(app_id, fingerprint, retry_after)
|
||||
logger.warning(
|
||||
"[%s] Discord rate-limited slash command sync; retrying after %.0fs",
|
||||
self.name,
|
||||
retry_after,
|
||||
)
|
||||
return
|
||||
raise
|
||||
finally:
|
||||
if has_ratelimit_timeout:
|
||||
http.max_ratelimit_timeout = previous_ratelimit_timeout
|
||||
|
||||
self._record_command_sync_success(app_id, fingerprint, summary)
|
||||
logger.info(
|
||||
"[%s] Safely reconciled %d slash command(s): unchanged=%d updated=%d recreated=%d created=%d deleted=%d",
|
||||
self.name,
|
||||
|
|
@ -1009,11 +1165,20 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
created = 0
|
||||
deleted = 0
|
||||
http = self._client.http
|
||||
mutation_count = 0
|
||||
|
||||
async def mutate(call, *args):
|
||||
nonlocal mutation_count
|
||||
if mutation_count:
|
||||
await self._sleep_between_command_sync_mutations()
|
||||
result = await call(*args)
|
||||
mutation_count += 1
|
||||
return result
|
||||
|
||||
for key, desired in desired_by_key.items():
|
||||
current = existing_by_key.pop(key, None)
|
||||
if current is None:
|
||||
await http.upsert_global_command(app_id, desired)
|
||||
await mutate(http.upsert_global_command, app_id, desired)
|
||||
created += 1
|
||||
continue
|
||||
|
||||
|
|
@ -1025,16 +1190,16 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
continue
|
||||
|
||||
if self._patchable_app_command_payload(current_existing_payload) == self._patchable_app_command_payload(desired):
|
||||
await http.delete_global_command(app_id, current.id)
|
||||
await http.upsert_global_command(app_id, desired)
|
||||
await mutate(http.delete_global_command, app_id, current.id)
|
||||
await mutate(http.upsert_global_command, app_id, desired)
|
||||
recreated += 1
|
||||
continue
|
||||
|
||||
await http.edit_global_command(app_id, current.id, desired)
|
||||
await mutate(http.edit_global_command, app_id, current.id, desired)
|
||||
updated += 1
|
||||
|
||||
for current in existing_by_key.values():
|
||||
await http.delete_global_command(app_id, current.id)
|
||||
await mutate(http.delete_global_command, app_id, current.id)
|
||||
deleted += 1
|
||||
|
||||
return {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue