fix: dedupe kanban notifier delivery claims

This commit is contained in:
Mike Nguyen 2026-05-09 12:41:31 +00:00 committed by Teknium
parent 373c4d6647
commit 861ce7c0b6
5 changed files with 411 additions and 7 deletions

View file

@ -4119,18 +4119,42 @@ class GatewayRunner:
try:
def _collect():
deliveries: list[dict] = []
# Enumerate every board on disk. Cheap: a few
# directory stat calls per tick. Missing/empty
# boards are silently skipped.
active_platforms = {
getattr(platform, "value", str(platform)).lower()
for platform in self.adapters.keys()
}
if not active_platforms:
logger.debug("kanban notifier: no connected adapters; skipping tick")
return deliveries
# Enumerate every board on disk, but poll each resolved DB
# path once. Multiple slugs can point at the same DB when
# HERMES_KANBAN_DB pins the board path; without this guard
# one gateway could collect the same subscription/event
# more than once before advancing the cursor.
try:
boards = _kb.list_boards(include_archived=False)
except Exception:
boards = [_kb.read_board_metadata(_kb.DEFAULT_BOARD)]
seen_db_paths: set[str] = set()
for board_meta in boards:
slug = board_meta.get("slug") or _kb.DEFAULT_BOARD
db_path = board_meta.get("db_path")
try:
resolved_db_path = str(Path(db_path).expanduser().resolve()) if db_path else str(_kb.kanban_db_path(slug).resolve())
except Exception:
resolved_db_path = f"slug:{slug}"
if resolved_db_path in seen_db_paths:
logger.debug(
"kanban notifier: skipping duplicate board slug %s for DB %s",
slug, resolved_db_path,
)
continue
seen_db_paths.add(resolved_db_path)
try:
conn = _kb.connect(board=slug)
except Exception:
except Exception as exc:
logger.debug("kanban notifier: cannot open board %s: %s", slug, exc)
continue
try:
# `connect()` runs the schema + idempotent migration
@ -4146,8 +4170,17 @@ class GatewayRunner:
# tolerates that race, but we still skip the
# redundant call to avoid the wasted work.
subs = _kb.list_notify_subs(conn)
if not subs:
logger.debug("kanban notifier: board %s has no subscriptions", slug)
for sub in subs:
cursor, events = _kb.unseen_events_for_sub(
platform = (sub.get("platform") or "").lower()
if platform not in active_platforms:
logger.debug(
"kanban notifier: subscription for %s on %s skipped; adapter not connected",
sub.get("task_id"), platform or "<missing>",
)
continue
old_cursor, cursor, events = _kb.claim_unseen_events_for_sub(
conn,
task_id=sub["task_id"],
platform=sub["platform"],
@ -4158,8 +4191,13 @@ class GatewayRunner:
if not events:
continue
task = _kb.get_task(conn, sub["task_id"])
logger.debug(
"kanban notifier: claimed %d event(s) for %s on board %s cursor %s%s",
len(events), sub["task_id"], slug, old_cursor, cursor,
)
deliveries.append({
"sub": sub,
"old_cursor": old_cursor,
"cursor": cursor,
"events": events,
"task": task,
@ -4186,7 +4224,18 @@ class GatewayRunner:
continue
adapter = self.adapters.get(plat)
if adapter is None:
continue # platform not currently connected
logger.debug(
"kanban notifier: adapter %s disconnected before delivery for %s; rewinding claim",
platform_str, sub["task_id"],
)
await asyncio.to_thread(
self._kanban_rewind,
sub,
d["cursor"],
d.get("old_cursor", 0),
board_slug,
)
continue
title = (task.title if task else sub["task_id"])[:120]
for ev in d["events"]:
kind = ev.kind
@ -4254,6 +4303,10 @@ class GatewayRunner:
await adapter.send(
sub["chat_id"], msg, metadata=metadata,
)
logger.info(
"kanban notifier: delivered %s event for %s to %s/%s on board %s",
kind, sub["task_id"], platform_str, sub["chat_id"], board_slug,
)
# Reset the failure counter on success.
sub_fail_counts.pop(sub_key, None)
except Exception as exc:
@ -4273,7 +4326,17 @@ class GatewayRunner:
)
await asyncio.to_thread(self._kanban_unsub, sub, board_slug)
sub_fail_counts.pop(sub_key, None)
# Don't advance cursor on send failure — retry next tick.
else:
await asyncio.to_thread(
self._kanban_rewind,
sub,
d["cursor"],
d.get("old_cursor", 0),
board_slug,
)
# Rewind the pre-send claim on transient failure so
# a later tick can retry. After too many failures,
# dropping the subscription is the terminal action.
break
else:
# All events delivered; advance cursor + maybe unsub.
@ -4336,6 +4399,29 @@ class GatewayRunner:
finally:
conn.close()
def _kanban_rewind(
    self,
    sub: dict,
    claimed_cursor: int,
    old_cursor: int,
    board: Optional[str] = None,
) -> None:
    """Sync helper: hand a claimed notification cursor back for a later retry.

    The notifier calls this (via ``asyncio.to_thread``) when a claim made
    before sending could not be delivered: either the send failed or the
    adapter was no longer connected at delivery time.

    Args:
        sub: Subscription row; must contain ``task_id``, ``platform`` and
            ``chat_id``. A missing or falsy ``thread_id`` is sent as ``""``.
        claimed_cursor: Cursor value recorded by the claim being undone.
        old_cursor: Cursor value from before the claim.
        board: Board slug passed to ``kanban_db.connect``.
    """
    from hermes_cli import kanban_db as _kb

    db = _kb.connect(board=board)
    try:
        rewind = _kb.rewind_notify_cursor
        # Identify the subscription by its full key; the cursor pair is
        # forwarded unchanged to the DB helper.
        subscription_key = {
            "task_id": sub["task_id"],
            "platform": sub["platform"],
            "chat_id": sub["chat_id"],
            "thread_id": sub.get("thread_id") or "",
        }
        rewind(
            db,
            **subscription_key,
            claimed_cursor=claimed_cursor,
            old_cursor=old_cursor,
        )
    finally:
        # Close the connection whether or not the rewind succeeded.
        db.close()
async def _kanban_dispatcher_watcher(self) -> None:
"""Embedded kanban dispatcher — one tick every `dispatch_interval_seconds`.