From 8e5a6854c3bf081c46df2600775f14bbbde9cc2d Mon Sep 17 00:00:00 2001
From: teknium1 <127238744+teknium1@users.noreply.github.com>
Date: Fri, 29 May 2026 21:31:55 -0700
Subject: [PATCH] fix(kanban): align recompute_ready guard with breaker's
 configured failure_limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to the budget-exhaustion recovery fix. recompute_ready's
new circuit-breaker guard resolved its effective limit from per-task
max_retries -> DEFAULT_FAILURE_LIMIT, skipping the dispatcher's
configured kanban.failure_limit. _record_task_failure resolves
max_retries -> failure_limit(config) -> DEFAULT, so the two disagreed
whenever an operator set kanban.failure_limit != 2:

- config > 2: a task could get stuck at DEFAULT(2) before reaching its
  allowed retry count.
- config < 2: a task the breaker already blocked could be auto-recovered
  back to ready, defeating the stricter limit.

Thread the dispatcher's failure_limit through dispatch_once into
recompute_ready so the guard and the breaker share one resolution order.
Updated test_circuit_breaker_block_still_auto_promotes (it asserted a
failures=5 block auto-recovers and resets the counter — that's the
pre-#35072 behavior the loop fix removes); it now exercises a below-limit
transient block, with the at-limit case covered in test_kanban_db.py.
Added two tests for the config-tier and per-task override resolution.
---
 hermes_cli/kanban_db.py                       | 28 +++++---
 .../hermes_cli/test_kanban_blocked_sticky.py  | 31 ++++++---
 tests/hermes_cli/test_kanban_db.py            | 67 +++++++++++++++++++
 3 files changed, 108 insertions(+), 18 deletions(-)

diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py
index 95ba0d55c00..17fe7476dfe 100644
--- a/hermes_cli/kanban_db.py
+++ b/hermes_cli/kanban_db.py
@@ -2591,7 +2591,9 @@ def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool:
     return bool(row) and row["kind"] == "blocked"
 
 
-def recompute_ready(conn: sqlite3.Connection) -> int:
+def recompute_ready(
+    conn: sqlite3.Connection, failure_limit: int = None,
+) -> int:
     """Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``.
 
     Returns the number of tasks promoted.  Safe to call inside or outside
@@ -2606,12 +2608,22 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
        ``kanban_unblock`` (#28712).
 
     2. The task's ``consecutive_failures`` has reached the effective
-       failure limit (per-task ``max_retries`` or
-       ``DEFAULT_FAILURE_LIMIT``).  This prevents infinite retry
-       loops when a task repeatedly exhausts its iteration budget:
-       without this guard the counter would reset on every recovery
-       cycle and the circuit breaker could never trip.
+       failure limit.  This prevents infinite retry loops when a task
+       repeatedly exhausts its iteration budget: without this guard the
+       counter would reset on every recovery cycle and the circuit
+       breaker could never trip (#35072).
+
+    The effective failure limit resolves in the same order as the
+    circuit breaker in ``_record_task_failure`` so the two never
+    disagree about when a task is permanently blocked:
+
+      1. per-task ``max_retries`` if set
+      2. caller-supplied ``failure_limit`` (the dispatcher passes the
+         ``kanban.failure_limit`` config value through ``dispatch_once``)
+      3. ``DEFAULT_FAILURE_LIMIT``
     """
+    if failure_limit is None:
+        failure_limit = DEFAULT_FAILURE_LIMIT
     promoted = 0
     with write_txn(conn):
         todo_rows = conn.execute(
@@ -2647,7 +2659,7 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
                     task_limit = row["max_retries"]
                     effective_limit = (
                         int(task_limit) if task_limit is not None
-                        else DEFAULT_FAILURE_LIMIT
+                        else int(failure_limit)
                     )
                     if failures >= effective_limit:
                         continue
@@ -5577,7 +5589,7 @@ def dispatch_once(
     if _crash_auto_blocked:
         result.auto_blocked.extend(_crash_auto_blocked)
     result.timed_out = enforce_max_runtime(conn)
-    result.promoted = recompute_ready(conn)
+    result.promoted = recompute_ready(conn, failure_limit=failure_limit)
 
     # Count tasks already running so max_spawn enforces concurrency rather
     # than a per-tick spawn budget. See the docstring above for the full
diff --git a/tests/hermes_cli/test_kanban_blocked_sticky.py b/tests/hermes_cli/test_kanban_blocked_sticky.py
index e6bd093d938..2d7cafef826 100644
--- a/tests/hermes_cli/test_kanban_blocked_sticky.py
+++ b/tests/hermes_cli/test_kanban_blocked_sticky.py
@@ -106,20 +106,30 @@ def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Pa
 
 def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
     """A child that was put into ``blocked`` *without* a worker-issued
-    ``kanban_block`` (e.g. circuit-breaker after repeated spawn
-    failures, manual DB triage) must still get auto-promoted when its
-    parents complete — preserves the pre-#28712 recovery semantics."""
+    ``kanban_block`` (e.g. a transient crash, manual DB triage) and whose
+    ``consecutive_failures`` is still *below* the circuit-breaker limit
+    must get auto-promoted when its parents complete — preserves the
+    pre-#28712 recovery semantics for genuinely transient failures.
+
+    The complementary case — a block whose failure count has *reached*
+    the limit must stay blocked — is covered by
+    ``test_kanban_db.py::test_recompute_ready_skips_tasks_at_failure_limit``
+    (#35072).  Together they pin the contract: ``recompute_ready`` defers
+    the give-up decision to the same effective limit the breaker uses, so
+    the two never disagree.
+    """
     with kb.connect() as conn:
         parent = kb.create_task(conn, title="parent")
         child = kb.create_task(conn, title="child", parents=[parent])
         kb.complete_task(conn, parent, result="ok")
 
-        # Simulate a circuit-breaker / direct triage that flips status
-        # without emitting a ``blocked`` event — exactly what
-        # ``_record_task_failure`` does after a ``gave_up``.
+        # Simulate a transient circuit-breaker / direct triage that flips
+        # status without emitting a ``blocked`` event — exactly what
+        # ``_record_task_failure`` does below the limit.  One failure is
+        # under the default limit (2), so recovery is still correct.
         conn.execute(
-            "UPDATE tasks SET status='blocked', consecutive_failures=5, "
-            "last_failure_error='persistent error' WHERE id=?",
+            "UPDATE tasks SET status='blocked', consecutive_failures=1, "
+            "last_failure_error='transient error' WHERE id=?",
             (child,),
         )
         conn.commit()
@@ -128,8 +138,9 @@ def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
         assert promoted == 1
         task = kb.get_task(conn, child)
         assert task.status == "ready"
-        assert task.consecutive_failures == 0
-        assert task.last_failure_error is None
+        # Counter is preserved across recovery (not reset) so the breaker
+        # can still accumulate if the task keeps failing (#35072).
+        assert task.consecutive_failures == 1
 
 
 def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None:
diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py
index 8e333189177..b2510855ea2 100644
--- a/tests/hermes_cli/test_kanban_db.py
+++ b/tests/hermes_cli/test_kanban_db.py
@@ -892,6 +892,73 @@ def test_recompute_ready_recovers_below_limit(kanban_home):
         assert task.consecutive_failures == 1
 
 
+def test_recompute_ready_honours_dispatcher_failure_limit(kanban_home):
+    """The guard's effective limit must follow the same resolution order
+    as the circuit breaker (#35072): per-task max_retries → dispatcher
+    failure_limit → DEFAULT_FAILURE_LIMIT.
+
+    Without threading the dispatcher's ``kanban.failure_limit`` through,
+    the guard falls back to DEFAULT_FAILURE_LIMIT and disagrees with the
+    breaker — sticking a task prematurely (config limit > default) or
+    letting a tripped task escape (config limit < default).
+    """
+    with kb.connect() as conn:
+        # Config allows MORE retries than the default. A task blocked
+        # with failures below the configured limit must still recover.
+        t = kb.create_task(conn, title="lenient", assignee="a")
+        conn.execute(
+            "UPDATE tasks SET status='blocked', consecutive_failures=? "
+            "WHERE id=?",
+            (kb.DEFAULT_FAILURE_LIMIT, t),
+        )
+        conn.commit()
+        # Default-limit call would stick it (failures >= default).
+        assert kb.recompute_ready(conn) == 0
+        assert kb.get_task(conn, t).status == "blocked"
+        # Dispatcher configured a higher limit → recover, preserve counter.
+        promoted = kb.recompute_ready(
+            conn, failure_limit=kb.DEFAULT_FAILURE_LIMIT + 2
+        )
+        assert promoted == 1
+        task = kb.get_task(conn, t)
+        assert task.status == "ready"
+        assert task.consecutive_failures == kb.DEFAULT_FAILURE_LIMIT
+
+        # Config allows FEWER retries than the default. A task at the
+        # stricter limit must stay blocked even though it's below default.
+        t2 = kb.create_task(conn, title="strict", assignee="a")
+        conn.execute(
+            "UPDATE tasks SET status='blocked', consecutive_failures=1 "
+            "WHERE id=?",
+            (t2,),
+        )
+        conn.commit()
+        # Default-limit (2) would recover it (1 < 2).
+        # Stricter config limit (1) must keep it blocked (1 >= 1).
+        assert kb.recompute_ready(conn, failure_limit=1) == 0
+        assert kb.get_task(conn, t2).status == "blocked"
+
+
+def test_recompute_ready_per_task_max_retries_overrides_dispatcher(kanban_home):
+    """A per-task ``max_retries`` wins over the dispatcher failure_limit,
+    matching ``_record_task_failure``'s resolution order."""
+    with kb.connect() as conn:
+        t = kb.create_task(conn, title="per-task", assignee="a")
+        # Per-task allows 4 retries; dispatcher config says 2.
+        conn.execute(
+            "UPDATE tasks SET status='blocked', consecutive_failures=2, "
+            "max_retries=4 WHERE id=?",
+            (t,),
+        )
+        conn.commit()
+        # failures(2) < per-task limit(4) → recover, despite dispatcher=2.
+        promoted = kb.recompute_ready(conn, failure_limit=2)
+        assert promoted == 1
+        task = kb.get_task(conn, t)
+        assert task.status == "ready"
+        assert task.consecutive_failures == 2
+
+
 # ---------------------------------------------------------------------------
 # Parent-completion invariant at the claim gate (RCA t_a6acd07d)
 # ---------------------------------------------------------------------------