fix(kanban): align recompute_ready guard with breaker's configured failure_limit

Follow-up to the budget-exhaustion recovery fix. recompute_ready's
new circuit-breaker guard resolved its effective limit from per-task
max_retries -> DEFAULT_FAILURE_LIMIT, skipping the dispatcher's
configured kanban.failure_limit. _record_task_failure resolves
max_retries -> failure_limit(config) -> DEFAULT, so the two disagreed
whenever an operator set kanban.failure_limit != 2:

- config > 2: a task could get stuck at DEFAULT(2) before reaching its
  allowed retry count.
- config < 2: a task the breaker already blocked could be auto-recovered
  back to ready, defeating the stricter limit.

Thread the dispatcher's failure_limit through dispatch_once into
recompute_ready so the guard and the breaker share one resolution order.
Updated test_circuit_breaker_block_still_auto_promotes (it asserted a
failures=5 block auto-recovers and resets the counter — that's the
pre-#35072 behavior the loop fix removes); it now exercises a below-limit
transient block, with the at-limit case covered in test_kanban_db.py.
Added two tests for the config-tier and per-task override resolution.
This commit is contained in:
teknium1 2026-05-29 21:31:55 -07:00 committed by Teknium
parent 6ab71d3bb4
commit 8e5a6854c3
3 changed files with 108 additions and 18 deletions

View file

@ -2591,7 +2591,9 @@ def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool:
return bool(row) and row["kind"] == "blocked"
def recompute_ready(conn: sqlite3.Connection) -> int:
def recompute_ready(
conn: sqlite3.Connection, failure_limit: int = None,
) -> int:
"""Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``.
Returns the number of tasks promoted. Safe to call inside or outside
@ -2606,12 +2608,22 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
``kanban_unblock`` (#28712).
2. The task's ``consecutive_failures`` has reached the effective
failure limit (per-task ``max_retries`` or
``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry
loops when a task repeatedly exhausts its iteration budget:
without this guard the counter would reset on every recovery
cycle and the circuit breaker could never trip.
failure limit. This prevents infinite retry loops when a task
repeatedly exhausts its iteration budget: without this guard the
counter would reset on every recovery cycle and the circuit
breaker could never trip (#35072).
The effective failure limit resolves in the same order as the
circuit breaker in ``_record_task_failure`` so the two never
disagree about when a task is permanently blocked:
1. per-task ``max_retries`` if set
2. caller-supplied ``failure_limit`` (the dispatcher passes the
``kanban.failure_limit`` config value through ``dispatch_once``)
3. ``DEFAULT_FAILURE_LIMIT``
"""
if failure_limit is None:
failure_limit = DEFAULT_FAILURE_LIMIT
promoted = 0
with write_txn(conn):
todo_rows = conn.execute(
@ -2647,7 +2659,7 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
task_limit = row["max_retries"]
effective_limit = (
int(task_limit) if task_limit is not None
else DEFAULT_FAILURE_LIMIT
else int(failure_limit)
)
if failures >= effective_limit:
continue
@ -5577,7 +5589,7 @@ def dispatch_once(
if _crash_auto_blocked:
result.auto_blocked.extend(_crash_auto_blocked)
result.timed_out = enforce_max_runtime(conn)
result.promoted = recompute_ready(conn)
result.promoted = recompute_ready(conn, failure_limit=failure_limit)
# Count tasks already running so max_spawn enforces concurrency rather
# than a per-tick spawn budget. See the docstring above for the full

View file

@ -106,20 +106,30 @@ def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Pa
def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
"""A child that was put into ``blocked`` *without* a worker-issued
``kanban_block`` (e.g. circuit-breaker after repeated spawn
failures, manual DB triage) must still get auto-promoted when its
parents complete preserves the pre-#28712 recovery semantics."""
``kanban_block`` (e.g. a transient crash, manual DB triage) and whose
``consecutive_failures`` is still *below* the circuit-breaker limit
must get auto-promoted when its parents complete preserves the
pre-#28712 recovery semantics for genuinely transient failures.
The complementary case a block whose failure count has *reached*
the limit must stay blocked is covered by
``test_kanban_db.py::test_recompute_ready_skips_tasks_at_failure_limit``
(#35072). Together they pin the contract: ``recompute_ready`` defers
the give-up decision to the same effective limit the breaker uses, so
the two never disagree.
"""
with kb.connect() as conn:
parent = kb.create_task(conn, title="parent")
child = kb.create_task(conn, title="child", parents=[parent])
kb.complete_task(conn, parent, result="ok")
# Simulate a circuit-breaker / direct triage that flips status
# without emitting a ``blocked`` event — exactly what
# ``_record_task_failure`` does after a ``gave_up``.
# Simulate a transient circuit-breaker / direct triage that flips
# status without emitting a ``blocked`` event — exactly what
# ``_record_task_failure`` does below the limit. One failure is
# under the default limit (2), so recovery is still correct.
conn.execute(
"UPDATE tasks SET status='blocked', consecutive_failures=5, "
"last_failure_error='persistent error' WHERE id=?",
"UPDATE tasks SET status='blocked', consecutive_failures=1, "
"last_failure_error='transient error' WHERE id=?",
(child,),
)
conn.commit()
@ -128,8 +138,9 @@ def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
assert promoted == 1
task = kb.get_task(conn, child)
assert task.status == "ready"
assert task.consecutive_failures == 0
assert task.last_failure_error is None
# Counter is preserved across recovery (not reset) so the breaker
# can still accumulate if the task keeps failing (#35072).
assert task.consecutive_failures == 1
def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None:

View file

@ -892,6 +892,73 @@ def test_recompute_ready_recovers_below_limit(kanban_home):
assert task.consecutive_failures == 1
def test_recompute_ready_honours_dispatcher_failure_limit(kanban_home):
"""The guard's effective limit must follow the same resolution order
as the circuit breaker (#35072): per-task max_retries → dispatcher
failure_limit DEFAULT_FAILURE_LIMIT.
Without threading the dispatcher's ``kanban.failure_limit`` through,
the guard falls back to DEFAULT_FAILURE_LIMIT and disagrees with the
breaker sticking a task prematurely (config limit > default) or
letting a tripped task escape (config limit < default).
"""
with kb.connect() as conn:
# Config allows MORE retries than the default. A task blocked
# with failures below the configured limit must still recover.
t = kb.create_task(conn, title="lenient", assignee="a")
conn.execute(
"UPDATE tasks SET status='blocked', consecutive_failures=? "
"WHERE id=?",
(kb.DEFAULT_FAILURE_LIMIT, t),
)
conn.commit()
# Default-limit call would stick it (failures >= default).
assert kb.recompute_ready(conn) == 0
assert kb.get_task(conn, t).status == "blocked"
# Dispatcher configured a higher limit → recover, preserve counter.
promoted = kb.recompute_ready(
conn, failure_limit=kb.DEFAULT_FAILURE_LIMIT + 2
)
assert promoted == 1
task = kb.get_task(conn, t)
assert task.status == "ready"
assert task.consecutive_failures == kb.DEFAULT_FAILURE_LIMIT
# Config allows FEWER retries than the default. A task at the
# stricter limit must stay blocked even though it's below default.
t2 = kb.create_task(conn, title="strict", assignee="a")
conn.execute(
"UPDATE tasks SET status='blocked', consecutive_failures=1 "
"WHERE id=?",
(t2,),
)
conn.commit()
# Default-limit (2) would recover it (1 < 2).
# Stricter config limit (1) must keep it blocked (1 >= 1).
assert kb.recompute_ready(conn, failure_limit=1) == 0
assert kb.get_task(conn, t2).status == "blocked"
def test_recompute_ready_per_task_max_retries_overrides_dispatcher(kanban_home):
"""A per-task ``max_retries`` wins over the dispatcher failure_limit,
matching ``_record_task_failure``'s resolution order."""
with kb.connect() as conn:
t = kb.create_task(conn, title="per-task", assignee="a")
# Per-task allows 4 retries; dispatcher config says 2.
conn.execute(
"UPDATE tasks SET status='blocked', consecutive_failures=2, "
"max_retries=4 WHERE id=?",
(t,),
)
conn.commit()
# failures(2) < per-task limit(4) → recover, despite dispatcher=2.
promoted = kb.recompute_ready(conn, failure_limit=2)
assert promoted == 1
task = kb.get_task(conn, t)
assert task.status == "ready"
assert task.consecutive_failures == 2
# ---------------------------------------------------------------------------
# Parent-completion invariant at the claim gate (RCA t_a6acd07d)
# ---------------------------------------------------------------------------