fix: auto-block repeated kanban retries

2026-05-08 03:01:47 +00:00 · 2026-05-06 15:30:03 -04:00 · 2026-05-06 15:30:03 -04:00 · 411cfa26e3
commit 411cfa26e3
parent 595e906698
5 changed files with 119 additions and 20 deletions
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -1230,6 +1230,10 @@ DEFAULT_CONFIG = {
        # Seconds between dispatcher ticks (idle or not). Lower = snappier
        # pickup of newly-ready tasks; higher = less SQL pressure.
        "dispatch_interval_seconds": 60,
+        # Auto-block after this many consecutive non-success attempts for the
+        # same task/profile (spawn_failed, timed_out, or crashed). Reassignment
+        # resets the streak for the new profile.
+        "failure_limit": 2,
    },

    # execute_code settings — controls the tool used for programmatic tool calls.
--- a/hermes_cli/kanban.py
+++ b/hermes_cli/kanban.py
@ -443,8 +443,8 @@ def build_parser(parent_subparsers: argparse._SubParsersAction) -> argparse.Argu
                        help="Cap number of spawns this pass")
    p_disp.add_argument("--failure-limit", type=int,
                        default=kb.DEFAULT_SPAWN_FAILURE_LIMIT,
-                        help=f"Auto-block a task after this many consecutive spawn failures "
-                             f"(default: {kb.DEFAULT_SPAWN_FAILURE_LIMIT})")
+                        help=f"Auto-block a task after this many consecutive non-success attempts "
+                             f"(spawn_failed, timed_out, or crashed; default: {kb.DEFAULT_SPAWN_FAILURE_LIMIT})")
    p_disp.add_argument("--json", action="store_true")

    # --- daemon (deprecated) ---
@ -1657,6 +1657,7 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
            "    kanban:\n"
            "      dispatch_in_gateway: true      # default\n"
            "      dispatch_interval_seconds: 60\n"
+            "      failure_limit: 2              # consecutive non-success attempts before auto-block\n"
            "\n"
            "Running both the gateway AND this standalone daemon will\n"
            "race for claims. If you truly need the old standalone\n"
--- a/hermes_cli/kanban_db.py
+++ b/hermes_cli/kanban_db.py
@ -1380,7 +1380,7 @@ def assign_task(conn: sqlite3.Connection, task_id: str, profile: Optional[str])
    profile = _canonical_assignee(profile)
    with write_txn(conn):
        row = conn.execute(
-            "SELECT status, claim_lock FROM tasks WHERE id = ?", (task_id,)
+            "SELECT status, claim_lock, assignee FROM tasks WHERE id = ?", (task_id,)
        ).fetchone()
        if not row:
            return False
@ -1389,7 +1389,17 @@ def assign_task(conn: sqlite3.Connection, task_id: str, profile: Optional[str])
                f"cannot reassign {task_id}: currently running (claimed). "
                "Wait for completion or reclaim the stale lock first."
            )
-        conn.execute("UPDATE tasks SET assignee = ? WHERE id = ?", (profile, task_id))
+        if row["assignee"] != profile:
+            # The retry guard is scoped to the task/profile combination. A
+            # human reassigning the task is an explicit recovery action, so the
+            # new profile should not inherit the previous profile's streak.
+            conn.execute(
+                "UPDATE tasks SET assignee = ?, consecutive_failures = 0, "
+                "last_failure_error = NULL WHERE id = ?",
+                (profile, task_id),
+            )
+        else:
+            conn.execute("UPDATE tasks SET assignee = ? WHERE id = ?", (profile, task_id))
        _append_event(conn, task_id, "assigned", {"assignee": profile})
        return True

@ -2569,11 +2579,11 @@ def set_workspace_path(
 # Dispatcher (one-shot pass)
 # ---------------------------------------------------------------------------

-# After this many consecutive `spawn_failed` events on a task, the dispatcher
-# stops retrying and parks the task in ``blocked`` with a reason so a human
-# can investigate. Prevents the dispatcher from thrashing forever on a task
-# whose profile doesn't exist, whose workspace is unmountable, etc.
-DEFAULT_FAILURE_LIMIT = 5
+# After this many consecutive non-success attempts on a task/profile, the
+# dispatcher stops retrying and parks the task in ``blocked`` with a reason so
+# a human can investigate. Prevents retry storms when a worker repeatedly times
+# out, crashes, or cannot spawn.
+DEFAULT_FAILURE_LIMIT = 2
 # Legacy alias — callers / tests still reference the old name.
 DEFAULT_SPAWN_FAILURE_LIMIT = DEFAULT_FAILURE_LIMIT