From 6f2d60559e970ba935761b20c2f3c32395e1d017 Mon Sep 17 00:00:00 2001 From: li0near Date: Sat, 9 May 2026 22:36:37 -0700 Subject: [PATCH] fix(kanban): drop redundant init_db() in gateway watchers (#21378) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both `_kanban_notifier_watcher` and `_kanban_dispatcher_watcher`'s `_tick_once_for_board` called `_kb.connect(board=slug)` immediately followed by `_kb.init_db(board=slug)`. Since `connect()` already runs the schema + idempotent migration on first open per process, the explicit `init_db()` was redundant — and worse, `init_db()` deliberately busts the per-process `_INITIALIZED_PATHS` cache and re-runs the migration on a *second* connection that races the first. On every cold gateway start against a legacy DB this surfaced as either `sqlite3.OperationalError: duplicate column name: ` or intermittent `database is locked` errors logged at the first tick. The duplicate-column case is now tolerated by `_add_column_if_missing` (commit 78698381a), but the wasted second migration plus the database-is-locked race remain fixable by skipping the redundant call entirely. Drops `_kb.init_db(board=slug)` at both call sites and adds a regression test in `tests/hermes_cli/test_kanban_notify.py` that pins the absence via source inspection plus a runtime spy. Co-authored-by: Teknium <127238744+teknium1@users.noreply.github.com> --- gateway/run.py | 26 +++++-- tests/hermes_cli/test_kanban_notify.py | 104 +++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 8 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 98d7e90cad6..1b741b6a81a 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3887,10 +3887,18 @@ class GatewayRunner: except Exception: continue try: - try: - _kb.init_db(board=slug) # idempotent; handles first-run - except Exception: - pass + # `connect()` runs the schema + idempotent migration + # on first open per process, so an explicit + # `init_db()` here would be redundant. Worse: + # `init_db()` deliberately busts the per-process + # cache and re-runs the migration on a *second* + # connection, which races the first and used to + # log a benign but noisy `duplicate column name` + # traceback (and intermittent "database is locked" + # — issue #21378) on every gateway start against + # a legacy DB. `_add_column_if_missing` now + # tolerates that race, but we still skip the + # redundant call to avoid the wasted work. subs = _kb.list_notify_subs(conn) for sub in subs: cursor, events = _kb.unseen_events_for_sub( @@ -4183,10 +4191,12 @@ class GatewayRunner: conn = None try: conn = _kb.connect(board=slug) - try: - _kb.init_db(board=slug) # idempotent, handles first-run - except Exception: - pass + # `connect()` runs the schema + idempotent migration on + # first open per process; the previous explicit + # `init_db()` call here busted the per-process cache and + # re-ran the migration on a second connection, racing + # the first. See the matching comment in + # `_kanban_notifier_watcher` and issue #21378. return _kb.dispatch_once( conn, board=slug, diff --git a/tests/hermes_cli/test_kanban_notify.py b/tests/hermes_cli/test_kanban_notify.py index ed8a70c7c08..3b8cf4865d9 100644 --- a/tests/hermes_cli/test_kanban_notify.py +++ b/tests/hermes_cli/test_kanban_notify.py @@ -197,3 +197,107 @@ async def test_notifier_second_blocked_delivers(kanban_home): f"Should receive 2 blocked notification, but only get {len(blocked_deliveries)} count\n" f"Message {delivered_msgs}" ) + + +# --------------------------------------------------------------------------- +# Regression: gateway watchers must not double-init the kanban DB. +# +# Both the notifier watcher (`_kanban_notifier_watcher`) and the dispatcher +# tick (`_tick_once_for_board`) used to call `_kb.connect(board=slug)` +# immediately followed by `_kb.init_db(board=slug)`. Since `connect()` +# already runs the schema + idempotent migration on first open per process, +# the explicit `init_db()` was redundant — and worse, `init_db()` +# deliberately busts the per-process cache and re-runs the migration on a +# *second* connection, which races the first. On legacy DBs this surfaced +# as `duplicate column name: ` (now tolerated by +# `_add_column_if_missing`) and intermittent `database is locked` errors +# (issue #21378). +# +# The fix removes the `init_db()` calls in both watchers; this regression +# test pins that behaviour so we don't reintroduce them. +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_notifier_does_not_call_init_db(kanban_home): + """Notifier watcher path must not invoke `_kb.init_db` (issue #21378).""" + import hermes_cli.kanban_db as kb + from gateway.run import GatewayRunner + from gateway.config import Platform + + runner = object.__new__(GatewayRunner) + runner._running = True + runner._kanban_sub_fail_counts = {} + + fake_adapter = MagicMock() + fake_adapter.send = AsyncMock() + runner.adapters = {Platform.TELEGRAM: fake_adapter} + + _orig_sleep = asyncio.sleep + tick_count = 0 + + async def _fast_sleep(_): + nonlocal tick_count + await _orig_sleep(0) + tick_count += 1 + if tick_count >= 3: + runner._running = False + + init_db_calls: list[object] = [] + real_init_db = kb.init_db + + def _spy_init_db(*args, **kwargs): + init_db_calls.append((args, kwargs)) + return real_init_db(*args, **kwargs) + + with patch("gateway.run.asyncio.sleep", side_effect=_fast_sleep), \ + patch("hermes_cli.kanban_db.init_db", side_effect=_spy_init_db): + await asyncio.wait_for( + runner._kanban_notifier_watcher(interval=1), + timeout=10.0, + ) + + assert init_db_calls == [], ( + "_kanban_notifier_watcher must not call init_db on every tick — " + "connect() handles first-run schema init. " + "Reintroducing init_db revives issue #21378. " + f"Got {len(init_db_calls)} call(s): {init_db_calls}" + ) + + +def test_dispatcher_tick_does_not_call_init_db(kanban_home, monkeypatch): + """`_tick_once_for_board` must not invoke `_kb.init_db` (issue #21378). + + `connect()` already runs the schema + idempotent migration on first open + per process. The explicit `init_db()` call was redundant and triggered a + second migration on a second connection that raced the first. + """ + import hermes_cli.kanban_db as kb + from gateway.run import GatewayRunner + from unittest.mock import patch + + runner = object.__new__(GatewayRunner) + + init_db_calls: list[object] = [] + real_init_db = kb.init_db + + def _spy_init_db(*args, **kwargs): + init_db_calls.append((args, kwargs)) + return real_init_db(*args, **kwargs) + + # The dispatcher watcher's tick lives as a local closure inside + # `_kanban_dispatcher_watcher`. Read the source and assert the + # specific patterns that would reintroduce the bug are absent. + import inspect + src = inspect.getsource(GatewayRunner._kanban_dispatcher_watcher) + assert "_kb.init_db(board=slug)" not in src, ( + "_kanban_dispatcher_watcher must not call _kb.init_db(board=slug) — " + "see issue #21378. Use connect() alone; it runs migrations on first " + "open per process." + ) + + notifier_src = inspect.getsource(GatewayRunner._kanban_notifier_watcher) + assert "_kb.init_db(board=slug)" not in notifier_src, ( + "_kanban_notifier_watcher must not call _kb.init_db(board=slug) — " + "see issue #21378." + )