From 40c1decb3bde450ee4d1eb80248ac8f7db0d9879 Mon Sep 17 00:00:00 2001 From: bradhallett <53977268+bradhallett@users.noreply.github.com> Date: Mon, 18 May 2026 20:15:49 -0700 Subject: [PATCH] fix(kanban): promote blocked tasks when parent dependencies complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit recompute_ready only scanned 'todo' tasks for promotion, ignoring 'blocked' tasks entirely. When a task was blocked (e.g. by the circuit breaker) and its parent dependencies later completed, the task stayed stuck in 'blocked' forever unless manually unblocked. Now recompute_ready also scans 'blocked' tasks. When all parents are done/archived, the blocked task is promoted to 'ready' with failure counters reset — equivalent to an automatic unblock. Includes a regression test for the blocked-parent-done promotion path. --- hermes_cli/kanban_db.py | 34 ++++++++++++++++++++++++------ tests/hermes_cli/test_kanban_db.py | 28 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 7d7241a440c..00f8e14dbf3 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -1849,21 +1849,32 @@ def recompute_ready(conn: sqlite3.Connection) -> int: promoted = 0 with write_txn(conn): todo_rows = conn.execute( - "SELECT id FROM tasks WHERE status = 'todo'" + "SELECT id, status FROM tasks WHERE status IN ('todo', 'blocked')" ).fetchall() for row in todo_rows: task_id = row["id"] + cur_status = row["status"] parents = conn.execute( "SELECT t.status FROM tasks t " "JOIN task_links l ON l.parent_id = t.id " "WHERE l.child_id = ?", (task_id,), ).fetchall() - if all(p["status"] in {"done", "archived"} for p in parents): - conn.execute( - "UPDATE tasks SET status = 'ready' WHERE id = ? AND status = 'todo'", - (task_id,), - ) + if all(p["status"] in ("done", "archived") for p in parents): + # Blocked tasks also get their failure counters reset — + # this is effectively an auto-unblock. + if cur_status == "blocked": + conn.execute( + "UPDATE tasks SET status = 'ready', " + "consecutive_failures = 0, last_failure_error = NULL " + "WHERE id = ? AND status = 'blocked'", + (task_id,), + ) + else: + conn.execute( + "UPDATE tasks SET status = 'ready' WHERE id = ? AND status = 'todo'", + (task_id,), + ) _append_event(conn, task_id, "promoted", None) promoted += 1 return promoted @@ -3554,6 +3565,17 @@ def set_max_runtime( return cur.rowcount == 1 +def _error_fingerprint(error_text: str) -> str: + """Normalize an error message for grouping identical failures. + + Strips host-specific details (PIDs, timestamps) so that errors + with the same root cause produce the same fingerprint. + """ + fp = re.sub(r'\bpid \d+\b', 'pid N', error_text[:80]) + fp = re.sub(r'\b\d{10,}\b', '', fp) + return fp.lower().strip() + + def detect_crashed_workers(conn: sqlite3.Connection) -> list[str]: """Reclaim ``running`` tasks whose worker PID is no longer alive. diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index aa1cb726928..919ac4ce276 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -134,6 +134,34 @@ def test_recompute_ready_cascades_through_chain(kanban_home): assert kb.get_task(conn, c).status == "ready" +def test_recompute_ready_promotes_blocked_with_done_parents(kanban_home): + """blocked tasks with all parents done should be promoted to ready.""" + with kb.connect() as conn: + parent = kb.create_task(conn, title="parent", assignee="a") + child = kb.create_task( + conn, title="child", assignee="a", parents=[parent], + ) + # Complete the parent + kb.claim_task(conn, parent) + kb.complete_task(conn, parent, result="ok") + # Manually block the child (simulates a worker that failed + # after the parent finished) + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=5, " + "last_failure_error='persistent error' WHERE id=?", + (child,), + ) + conn.commit() + assert kb.get_task(conn, child).status == "blocked" + # recompute_ready should promote blocked → ready and reset failures + promoted = kb.recompute_ready(conn) + assert promoted == 1 + task = kb.get_task(conn, child) + assert task.status == "ready" + assert task.consecutive_failures == 0 + assert task.last_failure_error is None + + def test_recompute_ready_fan_in_waits_for_all_parents(kanban_home): with kb.connect() as conn: a = kb.create_task(conn, title="a")