fix(kanban): promote blocked tasks when parent dependencies complete

recompute_ready only scanned 'todo' tasks for promotion, ignoring
'blocked' tasks entirely. When a task was blocked (e.g. by the circuit
breaker) and its parent dependencies later completed, the task stayed
stuck in 'blocked' forever unless manually unblocked.

Now recompute_ready also scans 'blocked' tasks. When all parents are
done/archived, the blocked task is promoted to 'ready' with failure
counters reset — equivalent to an automatic unblock.

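Includes a regression test for the blocked-parent-done promotion path.
Also adds an `_error_fingerprint` helper that normalizes error messages
(PIDs, long numeric timestamps) for grouping identical failures.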
This commit is contained in:
bradhallett 2026-05-18 20:15:49 -07:00 committed by Teknium
parent bc961c13f3
commit 40c1decb3b
2 changed files with 56 additions and 6 deletions

View file

@ -1849,21 +1849,32 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
promoted = 0
with write_txn(conn):
todo_rows = conn.execute(
"SELECT id FROM tasks WHERE status = 'todo'"
"SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')"
).fetchall()
for row in todo_rows:
task_id = row["id"]
cur_status = row["status"]
parents = conn.execute(
"SELECT t.status FROM tasks t "
"JOIN task_links l ON l.parent_id = t.id "
"WHERE l.child_id = ?",
(task_id,),
).fetchall()
if all(p["status"] in {"done", "archived"} for p in parents):
conn.execute(
"UPDATE tasks SET status = 'ready' WHERE id = ? AND status = 'todo'",
(task_id,),
)
if all(p["status"] in ("done", "archived") for p in parents):
# Blocked tasks also get their failure counters reset —
# this is effectively an auto-unblock.
if cur_status == "blocked":
conn.execute(
"UPDATE tasks SET status = 'ready', "
"consecutive_failures = 0, last_failure_error = NULL "
"WHERE id = ? AND status = 'blocked'",
(task_id,),
)
else:
conn.execute(
"UPDATE tasks SET status = 'ready' WHERE id = ? AND status = 'todo'",
(task_id,),
)
_append_event(conn, task_id, "promoted", None)
promoted += 1
return promoted
@ -3554,6 +3565,17 @@ def set_max_runtime(
return cur.rowcount == 1
def _error_fingerprint(error_text: str) -> str:
"""Normalize an error message for grouping identical failures.
Strips host-specific details (PIDs, timestamps) so that errors
with the same root cause produce the same fingerprint.
"""
fp = re.sub(r'\bpid \d+\b', 'pid N', error_text[:80])
fp = re.sub(r'\b\d{10,}\b', '<TS>', fp)
return fp.lower().strip()
def detect_crashed_workers(conn: sqlite3.Connection) -> list[str]:
"""Reclaim ``running`` tasks whose worker PID is no longer alive.

View file

@ -134,6 +134,34 @@ def test_recompute_ready_cascades_through_chain(kanban_home):
assert kb.get_task(conn, c).status == "ready"
def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home):
    """A blocked task whose parents are all done is promoted to ready.

    Also checks that the promotion clears ``consecutive_failures`` and
    ``last_failure_error`` on the promoted task.
    """
    with kb.connect() as conn:
        parent_id = kb.create_task(conn, title="parent", assignee="a")
        child_id = kb.create_task(
            conn,
            title="child",
            assignee="a",
            parents=[parent_id],
        )

        # Finish the parent so the child's only dependency is satisfied.
        kb.claim_task(conn, parent_id)
        kb.complete_task(conn, parent_id, result="ok")

        # Put the child into 'blocked' directly, with failure state set,
        # to simulate a worker that failed after the parent finished.
        conn.execute(
            "UPDATE tasks SET status='blocked', consecutive_failures=5, "
            "last_failure_error='persistent error' WHERE id=?",
            (child_id,),
        )
        conn.commit()
        assert kb.get_task(conn, child_id).status == "blocked"

        # Exactly one task (the child) should be promoted.
        assert kb.recompute_ready(conn) == 1

        # The child is now ready and its failure state is reset.
        refreshed = kb.get_task(conn, child_id)
        assert refreshed.status == "ready"
        assert refreshed.consecutive_failures == 0
        assert refreshed.last_failure_error is None
def test_recompute_ready_fan_in_waits_for_all_parents(kanban_home):
with kb.connect() as conn:
a = kb.create_task(conn, title="a")