diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 9cbc010ac64..d557354238c 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2002,11 +2002,58 @@ def _synthesize_ended_run( # Dependency resolution (todo -> ready) # --------------------------------------------------------------------------- +def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool: + """Return True when ``task_id`` is sticky-blocked by an explicit + worker/operator ``kanban_block`` call (#28712). + + A ``blocked`` status can come from two very different sources: + + * **Worker- or operator-initiated** — a worker called + ``kanban_block(reason="review-required: ...")`` (or somebody ran + ``hermes kanban block ``). This is a deliberate handoff that + should stay blocked until an operator unblocks it. The block tool + emits a ``"blocked"`` event row in ``task_events``. + + * **Circuit-breaker** — ``_record_task_failure`` tripped after + repeated crashes / spawn failures / timeouts. This emits + ``"gave_up"``, *not* ``"blocked"``, and is meant to recover + automatically once the underlying conditions change (e.g. parents + finish, transient infra error clears). + + The cheapest signal that distinguishes the two is the most recent + ``"blocked"`` / ``"unblocked"`` event for the task. If the most + recent one is ``"blocked"`` (or there is a ``"blocked"`` event and + no ``"unblocked"`` event has fired since), the task is sticky and + ``recompute_ready`` must *not* auto-promote it. + + Returns ``False`` when there is no such event at all (e.g. the task + was set to ``status='blocked'`` by the circuit breaker or by direct + DB manipulation) — preserves the pre-#28712 auto-recover semantics + for that path. + """ + row = conn.execute( + "SELECT kind FROM task_events " + "WHERE task_id = ? AND kind IN ('blocked', 'unblocked') " + "ORDER BY id DESC LIMIT 1", + (task_id,), + ).fetchone() + return bool(row) and row["kind"] == "blocked" + + def recompute_ready(conn: sqlite3.Connection) -> int: """Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``. Returns the number of tasks promoted. Safe to call inside or outside an existing transaction; it opens its own IMMEDIATE txn. + + ``blocked`` tasks are also considered for promotion (so a task + blocked purely by a parent dependency unblocks itself when the + parent completes), *except* when the most recent block event was a + worker-initiated ``kanban_block`` — those stay blocked until an + explicit ``kanban_unblock`` (#28712). Without that guard, a + ``review-required`` handoff would auto-respawn, the fresh worker + would find nothing to do, exit cleanly, get recorded as a protocol + violation, and the cycle would repeat indefinitely. """ promoted = 0 with write_txn(conn): @@ -2016,6 +2063,12 @@ def recompute_ready(conn: sqlite3.Connection) -> int: for row in todo_rows: task_id = row["id"] cur_status = row["status"] + if cur_status == "blocked" and _has_sticky_block(conn, task_id): + # Worker / operator asked for human review — do not + # silently auto-recover. ``unblock_task`` is the only + # legitimate exit (it emits ``"unblocked"`` which flips + # this predicate back). + continue parents = conn.execute( "SELECT t.status FROM tasks t " "JOIN task_links l ON l.parent_id = t.id " @@ -2024,7 +2077,8 @@ def recompute_ready(conn: sqlite3.Connection) -> int: ).fetchall() if all(p["status"] in ("done", "archived") for p in parents): # Blocked tasks also get their failure counters reset — - # this is effectively an auto-unblock. + # this is effectively an auto-unblock (circuit-breaker + # recovery; worker-initiated blocks are skipped above). if cur_status == "blocked": conn.execute( "UPDATE tasks SET status = 'ready', "