From 8e5a6854c3bf081c46df2600775f14bbbde9cc2d Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Fri, 29 May 2026 21:31:55 -0700 Subject: [PATCH] fix(kanban): align recompute_ready guard with breaker's configured failure_limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the budget-exhaustion recovery fix. recompute_ready's new circuit-breaker guard resolved its effective limit from per-task max_retries -> DEFAULT_FAILURE_LIMIT, skipping the dispatcher's configured kanban.failure_limit. _record_task_failure resolves max_retries -> failure_limit(config) -> DEFAULT, so the two disagreed whenever an operator set kanban.failure_limit != 2: - config > 2: a task could get stuck at DEFAULT(2) before reaching its allowed retry count. - config < 2: a task the breaker already blocked could be auto-recovered back to ready, defeating the stricter limit. Thread the dispatcher's failure_limit through dispatch_once into recompute_ready so the guard and the breaker share one resolution order. Updated test_circuit_breaker_block_still_auto_promotes (it asserted a failures=5 block auto-recovers and resets the counter — that's the pre-#35072 behavior the loop fix removes); it now exercises a below-limit transient block, with the at-limit case covered in test_kanban_db.py. Added two tests for the config-tier and per-task override resolution. --- hermes_cli/kanban_db.py | 28 +++++--- .../hermes_cli/test_kanban_blocked_sticky.py | 31 ++++++--- tests/hermes_cli/test_kanban_db.py | 67 +++++++++++++++++++ 3 files changed, 108 insertions(+), 18 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 95ba0d55c00..17fe7476dfe 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2591,7 +2591,9 @@ def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool: return bool(row) and row["kind"] == "blocked" -def recompute_ready(conn: sqlite3.Connection) -> int: +def recompute_ready( + conn: sqlite3.Connection, failure_limit: int = None, +) -> int: """Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``. Returns the number of tasks promoted. Safe to call inside or outside @@ -2606,12 +2608,22 @@ def recompute_ready(conn: sqlite3.Connection) -> int: ``kanban_unblock`` (#28712). 2. The task's ``consecutive_failures`` has reached the effective - failure limit (per-task ``max_retries`` or - ``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry - loops when a task repeatedly exhausts its iteration budget: - without this guard the counter would reset on every recovery - cycle and the circuit breaker could never trip. + failure limit. This prevents infinite retry loops when a task + repeatedly exhausts its iteration budget: without this guard the + counter would reset on every recovery cycle and the circuit + breaker could never trip (#35072). + + The effective failure limit resolves in the same order as the + circuit breaker in ``_record_task_failure`` so the two never + disagree about when a task is permanently blocked: + + 1. per-task ``max_retries`` if set + 2. caller-supplied ``failure_limit`` (the dispatcher passes the + ``kanban.failure_limit`` config value through ``dispatch_once``) + 3. ``DEFAULT_FAILURE_LIMIT`` """ + if failure_limit is None: + failure_limit = DEFAULT_FAILURE_LIMIT promoted = 0 with write_txn(conn): todo_rows = conn.execute( @@ -2647,7 +2659,7 @@ def recompute_ready(conn: sqlite3.Connection) -> int: task_limit = row["max_retries"] effective_limit = ( int(task_limit) if task_limit is not None - else DEFAULT_FAILURE_LIMIT + else int(failure_limit) ) if failures >= effective_limit: continue @@ -5577,7 +5589,7 @@ def dispatch_once( if _crash_auto_blocked: result.auto_blocked.extend(_crash_auto_blocked) result.timed_out = enforce_max_runtime(conn) - result.promoted = recompute_ready(conn) + result.promoted = recompute_ready(conn, failure_limit=failure_limit) # Count tasks already running so max_spawn enforces concurrency rather # than a per-tick spawn budget. See the docstring above for the full diff --git a/tests/hermes_cli/test_kanban_blocked_sticky.py b/tests/hermes_cli/test_kanban_blocked_sticky.py index e6bd093d938..2d7cafef826 100644 --- a/tests/hermes_cli/test_kanban_blocked_sticky.py +++ b/tests/hermes_cli/test_kanban_blocked_sticky.py @@ -106,20 +106,30 @@ def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Pa def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None: """A child that was put into ``blocked`` *without* a worker-issued - ``kanban_block`` (e.g. circuit-breaker after repeated spawn - failures, manual DB triage) must still get auto-promoted when its - parents complete — preserves the pre-#28712 recovery semantics.""" + ``kanban_block`` (e.g. a transient crash, manual DB triage) and whose + ``consecutive_failures`` is still *below* the circuit-breaker limit + must get auto-promoted when its parents complete — preserves the + pre-#28712 recovery semantics for genuinely transient failures. + + The complementary case — a block whose failure count has *reached* + the limit must stay blocked — is covered by + ``test_kanban_db.py::test_recompute_ready_skips_tasks_at_failure_limit`` + (#35072). Together they pin the contract: ``recompute_ready`` defers + the give-up decision to the same effective limit the breaker uses, so + the two never disagree. + """ with kb.connect() as conn: parent = kb.create_task(conn, title="parent") child = kb.create_task(conn, title="child", parents=[parent]) kb.complete_task(conn, parent, result="ok") - # Simulate a circuit-breaker / direct triage that flips status - # without emitting a ``blocked`` event — exactly what - # ``_record_task_failure`` does after a ``gave_up``. + # Simulate a transient circuit-breaker / direct triage that flips + # status without emitting a ``blocked`` event — exactly what + # ``_record_task_failure`` does below the limit. One failure is + # under the default limit (2), so recovery is still correct. conn.execute( - "UPDATE tasks SET status='blocked', consecutive_failures=5, " - "last_failure_error='persistent error' WHERE id=?", + "UPDATE tasks SET status='blocked', consecutive_failures=1, " + "last_failure_error='transient error' WHERE id=?", (child,), ) conn.commit() @@ -128,8 +138,9 @@ def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None: assert promoted == 1 task = kb.get_task(conn, child) assert task.status == "ready" - assert task.consecutive_failures == 0 - assert task.last_failure_error is None + # Counter is preserved across recovery (not reset) so the breaker + # can still accumulate if the task keeps failing (#35072). + assert task.consecutive_failures == 1 def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None: diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 8e333189177..b2510855ea2 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -892,6 +892,73 @@ def test_recompute_ready_recovers_below_limit(kanban_home): assert task.consecutive_failures == 1 +def test_recompute_ready_honours_dispatcher_failure_limit(kanban_home): + """The guard's effective limit must follow the same resolution order + as the circuit breaker (#35072): per-task max_retries → dispatcher + failure_limit → DEFAULT_FAILURE_LIMIT. + + Without threading the dispatcher's ``kanban.failure_limit`` through, + the guard falls back to DEFAULT_FAILURE_LIMIT and disagrees with the + breaker — sticking a task prematurely (config limit > default) or + letting a tripped task escape (config limit < default). + """ + with kb.connect() as conn: + # Config allows MORE retries than the default. A task blocked + # with failures below the configured limit must still recover. + t = kb.create_task(conn, title="lenient", assignee="a") + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=? " + "WHERE id=?", + (kb.DEFAULT_FAILURE_LIMIT, t), + ) + conn.commit() + # Default-limit call would stick it (failures >= default). + assert kb.recompute_ready(conn) == 0 + assert kb.get_task(conn, t).status == "blocked" + # Dispatcher configured a higher limit → recover, preserve counter. + promoted = kb.recompute_ready( + conn, failure_limit=kb.DEFAULT_FAILURE_LIMIT + 2 + ) + assert promoted == 1 + task = kb.get_task(conn, t) + assert task.status == "ready" + assert task.consecutive_failures == kb.DEFAULT_FAILURE_LIMIT + + # Config allows FEWER retries than the default. A task at the + # stricter limit must stay blocked even though it's below default. + t2 = kb.create_task(conn, title="strict", assignee="a") + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=1 " + "WHERE id=?", + (t2,), + ) + conn.commit() + # Default-limit (2) would recover it (1 < 2). + # Stricter config limit (1) must keep it blocked (1 >= 1). + assert kb.recompute_ready(conn, failure_limit=1) == 0 + assert kb.get_task(conn, t2).status == "blocked" + + +def test_recompute_ready_per_task_max_retries_overrides_dispatcher(kanban_home): + """A per-task ``max_retries`` wins over the dispatcher failure_limit, + matching ``_record_task_failure``'s resolution order.""" + with kb.connect() as conn: + t = kb.create_task(conn, title="per-task", assignee="a") + # Per-task allows 4 retries; dispatcher config says 2. + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=2, " + "max_retries=4 WHERE id=?", + (t,), + ) + conn.commit() + # failures(2) < per-task limit(4) → recover, despite dispatcher=2. + promoted = kb.recompute_ready(conn, failure_limit=2) + assert promoted == 1 + task = kb.get_task(conn, t) + assert task.status == "ready" + assert task.consecutive_failures == 2 + + # --------------------------------------------------------------------------- # Parent-completion invariant at the claim gate (RCA t_a6acd07d) # ---------------------------------------------------------------------------