fix(kanban): close kanban.db FD after every connect() in long-lived processes

`sqlite3.Connection.__exit__` commits/rollbacks but does NOT close the
underlying FD. `with kb.connect() as conn:` in long-lived processes
(gateway `run_slash`, dashboard `decompose_task_endpoint`) therefore
leaks one FD to `kanban.db` per call. After enough operations the
gateway dies with `[Errno 24] Too many open files` (~4 days uptime
in the production report — #33159).

Fix: add a `connect_closing()` context manager in `hermes_cli/kanban_db`
that wraps `connect()` with a real `try/finally: conn.close()`. Switch
the 42 leak-prone call sites in `hermes_cli/kanban.py` (35),
`hermes_cli/kanban_decompose.py` (4), and `hermes_cli/kanban_specify.py`
(3) over to it.

`kanban.py` matters because `run_slash` (called from the gateway for
every `/kanban` slash command) parses argparse and dispatches to those
`_cmd_*` functions in-process — each one was leaking one FD per
invocation.

Tests inside `tests/` are untouched: short-lived processes where OS
cleanup masks the leak. Regression tests added in
`test_kanban_db.py` cover both happy-path and exception-path closure,
plus an explicit assertion that bare `with kb.connect()` still does
NOT close (documenting the upstream sqlite3 behaviour we're working
around).

Closes #33159.
This commit is contained in:
teknium1 2026-05-27 18:06:44 -07:00 committed by Teknium
parent 6d947e4d78
commit ebe04c66cd
5 changed files with 140 additions and 42 deletions

View file

@ -3805,3 +3805,66 @@ def test_dispatch_once_still_reaps_via_extracted_fn(kanban_home):
pids = kb.reap_worker_zombies()
assert pids == [99999]
# ---------------------------------------------------------------------------
# connect_closing(): context manager that actually closes the FD
# Regression coverage for #33159 (kanban.db FD leak — gateway crashes after
# ~4 days). sqlite3.Connection's built-in __exit__ commits/rollbacks but
# does NOT close, so `with kb.connect() as conn:` leaks the FD in
# long-lived processes (gateway run_slash, dashboard decompose handler).
# `connect_closing()` is the leak-safe replacement.
# ---------------------------------------------------------------------------
def test_connect_closing_closes_connection_on_exit(tmp_path):
"""The new context manager MUST actually close the underlying FD."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect_closing(db_path=db_path) as conn:
conn.execute("SELECT 1").fetchone()
# After exit, the connection MUST be closed — subsequent execute
# should raise ProgrammingError.
with pytest.raises(sqlite3.ProgrammingError):
conn.execute("SELECT 1")
def test_connect_closing_closes_on_exception(tmp_path):
"""Connection closed even when the body raises."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
captured = []
with pytest.raises(RuntimeError, match="boom"):
with kb.connect_closing(db_path=db_path) as conn:
captured.append(conn)
raise RuntimeError("boom")
with pytest.raises(sqlite3.ProgrammingError):
captured[0].execute("SELECT 1")
def test_connect_closing_yields_usable_connection(tmp_path):
"""Smoke test: schema is initialized and basic ops work."""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect_closing(db_path=db_path) as conn:
tid = kb.create_task(conn, title="closing-cm test")
task = kb.get_task(conn, tid)
assert task is not None
assert task.title == "closing-cm test"
def test_bare_connect_does_not_close_on_context_exit(tmp_path):
"""Document the leak that connect_closing exists to prevent.
sqlite3.Connection's __exit__ commits/rollbacks but doesn't close.
This is the upstream behaviour we cannot change; the regression
guard is to make sure connect_closing() does the right thing.
"""
db_path = tmp_path / "kanban.db"
kb._INITIALIZED_PATHS.discard(str(db_path.resolve()))
with kb.connect(db_path=db_path) as conn:
pass
# Still usable after with-block exit (the leak).
conn.execute("SELECT 1").fetchone()
conn.close() # explicit close to avoid leaking THIS test