From 34120a0ae20ae3fc23eeb304efd750e2d2c83c87 Mon Sep 17 00:00:00 2001 From: xxxigm Date: Tue, 19 May 2026 19:45:10 +0700 Subject: [PATCH] fix(kanban): worker-initiated block must not be auto-promoted (#28712) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a worker calls ``kanban_block(reason="review-required: ...")`` to hand a task off for human review, the dispatcher's ``recompute_ready`` was treating the resulting ``blocked`` status as eligible for auto-promotion — exactly the same as a circuit-breaker block. On the next tick the task flipped back to ``ready``, a fresh worker spawned, found nothing to do (work already applied, review-required comment already posted), exited cleanly, got recorded as ``protocol_violation`` → ``gave_up`` → ``blocked``, and the dispatcher promoted again. Infinite loop until manual ``hermes kanban reclaim`` + ``kanban block``. Add ``_has_sticky_block`` which distinguishes the two block sources using the cheapest available signal: the most recent ``"blocked"``/``"unblocked"`` event in ``task_events``. * Worker / operator ``kanban_block`` emits ``"blocked"`` → ``_has_sticky_block`` returns True → ``recompute_ready`` skips the task entirely. ``unblock_task`` emits ``"unblocked"`` which flips the predicate back, so the only legitimate exit is the documented human-in-the-loop path. * Circuit-breaker ``_record_task_failure`` emits ``"gave_up"`` (not ``"blocked"``) → predicate stays False → original parent-completion-recovery semantics from #40c1decb3 are preserved. * Tasks blocked purely by direct DB manipulation also recover, since they have no ``"blocked"`` event row at all — matches the existing ``test_recompute_ready_promotes_blocked_with_done_parents`` fixture behaviour. --- hermes_cli/kanban_db.py | 56 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 9cbc010ac64..d557354238c 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2002,11 +2002,58 @@ def _synthesize_ended_run( # Dependency resolution (todo -> ready) # --------------------------------------------------------------------------- +def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool: + """Return True when ``task_id`` is sticky-blocked by an explicit + worker/operator ``kanban_block`` call (#28712). + + A ``blocked`` status can come from two very different sources: + + * **Worker- or operator-initiated** — a worker called + ``kanban_block(reason="review-required: ...")`` (or somebody ran + ``hermes kanban block ``). This is a deliberate handoff that + should stay blocked until an operator unblocks it. The block tool + emits a ``"blocked"`` event row in ``task_events``. + + * **Circuit-breaker** — ``_record_task_failure`` tripped after + repeated crashes / spawn failures / timeouts. This emits + ``"gave_up"``, *not* ``"blocked"``, and is meant to recover + automatically once the underlying conditions change (e.g. parents + finish, transient infra error clears). + + The cheapest signal that distinguishes the two is the most recent + ``"blocked"`` / ``"unblocked"`` event for the task. If the most + recent one is ``"blocked"`` (or there is a ``"blocked"`` event and + no ``"unblocked"`` event has fired since), the task is sticky and + ``recompute_ready`` must *not* auto-promote it. + + Returns ``False`` when there is no such event at all (e.g. the task + was set to ``status='blocked'`` by the circuit breaker or by direct + DB manipulation) — preserves the pre-#28712 auto-recover semantics + for that path. + """ + row = conn.execute( + "SELECT kind FROM task_events " + "WHERE task_id = ? AND kind IN ('blocked', 'unblocked') " + "ORDER BY id DESC LIMIT 1", + (task_id,), + ).fetchone() + return bool(row) and row["kind"] == "blocked" + + def recompute_ready(conn: sqlite3.Connection) -> int: """Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``. Returns the number of tasks promoted. Safe to call inside or outside an existing transaction; it opens its own IMMEDIATE txn. + + ``blocked`` tasks are also considered for promotion (so a task + blocked purely by a parent dependency unblocks itself when the + parent completes), *except* when the most recent block event was a + worker-initiated ``kanban_block`` — those stay blocked until an + explicit ``kanban_unblock`` (#28712). Without that guard, a + ``review-required`` handoff would auto-respawn, the fresh worker + would find nothing to do, exit cleanly, get recorded as a protocol + violation, and the cycle would repeat indefinitely. """ promoted = 0 with write_txn(conn): @@ -2016,6 +2063,12 @@ def recompute_ready(conn: sqlite3.Connection) -> int: for row in todo_rows: task_id = row["id"] cur_status = row["status"] + if cur_status == "blocked" and _has_sticky_block(conn, task_id): + # Worker / operator asked for human review — do not + # silently auto-recover. ``unblock_task`` is the only + # legitimate exit (it emits ``"unblocked"`` which flips + # this predicate back). + continue parents = conn.execute( "SELECT t.status FROM tasks t " "JOIN task_links l ON l.parent_id = t.id " @@ -2024,7 +2077,8 @@ def recompute_ready(conn: sqlite3.Connection) -> int: ).fetchall() if all(p["status"] in ("done", "archived") for p in parents): # Blocked tasks also get their failure counters reset — - # this is effectively an auto-unblock. + # this is effectively an auto-unblock (circuit-breaker + # recovery; worker-initiated blocks are skipped above). if cur_status == "blocked": conn.execute( "UPDATE tasks SET status = 'ready', "