mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(kanban): align recompute_ready guard with breaker's configured failure_limit
Follow-up to the budget-exhaustion recovery fix. recompute_ready's new circuit-breaker guard resolved its effective limit from per-task max_retries -> DEFAULT_FAILURE_LIMIT, skipping the dispatcher's configured kanban.failure_limit. _record_task_failure resolves max_retries -> failure_limit(config) -> DEFAULT, so the two disagreed whenever an operator set kanban.failure_limit != 2: - config > 2: a task could get stuck at DEFAULT(2) before reaching its allowed retry count. - config < 2: a task the breaker already blocked could be auto-recovered back to ready, defeating the stricter limit. Thread the dispatcher's failure_limit through dispatch_once into recompute_ready so the guard and the breaker share one resolution order. Updated test_circuit_breaker_block_still_auto_promotes (it asserted a failures=5 block auto-recovers and resets the counter — that's the pre-#35072 behavior the loop fix removes); it now exercises a below-limit transient block, with the at-limit case covered in test_kanban_db.py. Added two tests for the config-tier and per-task override resolution.
This commit is contained in:
parent
6ab71d3bb4
commit
8e5a6854c3
3 changed files with 108 additions and 18 deletions
|
|
@ -2591,7 +2591,9 @@ def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool:
|
|||
return bool(row) and row["kind"] == "blocked"
|
||||
|
||||
|
||||
def recompute_ready(conn: sqlite3.Connection) -> int:
|
||||
def recompute_ready(
|
||||
conn: sqlite3.Connection, failure_limit: int = None,
|
||||
) -> int:
|
||||
"""Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``.
|
||||
|
||||
Returns the number of tasks promoted. Safe to call inside or outside
|
||||
|
|
@ -2606,12 +2608,22 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
|
|||
``kanban_unblock`` (#28712).
|
||||
|
||||
2. The task's ``consecutive_failures`` has reached the effective
|
||||
failure limit (per-task ``max_retries`` or
|
||||
``DEFAULT_FAILURE_LIMIT``). This prevents infinite retry
|
||||
loops when a task repeatedly exhausts its iteration budget:
|
||||
without this guard the counter would reset on every recovery
|
||||
cycle and the circuit breaker could never trip.
|
||||
failure limit. This prevents infinite retry loops when a task
|
||||
repeatedly exhausts its iteration budget: without this guard the
|
||||
counter would reset on every recovery cycle and the circuit
|
||||
breaker could never trip (#35072).
|
||||
|
||||
The effective failure limit resolves in the same order as the
|
||||
circuit breaker in ``_record_task_failure`` so the two never
|
||||
disagree about when a task is permanently blocked:
|
||||
|
||||
1. per-task ``max_retries`` if set
|
||||
2. caller-supplied ``failure_limit`` (the dispatcher passes the
|
||||
``kanban.failure_limit`` config value through ``dispatch_once``)
|
||||
3. ``DEFAULT_FAILURE_LIMIT``
|
||||
"""
|
||||
if failure_limit is None:
|
||||
failure_limit = DEFAULT_FAILURE_LIMIT
|
||||
promoted = 0
|
||||
with write_txn(conn):
|
||||
todo_rows = conn.execute(
|
||||
|
|
@ -2647,7 +2659,7 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
|
|||
task_limit = row["max_retries"]
|
||||
effective_limit = (
|
||||
int(task_limit) if task_limit is not None
|
||||
else DEFAULT_FAILURE_LIMIT
|
||||
else int(failure_limit)
|
||||
)
|
||||
if failures >= effective_limit:
|
||||
continue
|
||||
|
|
@ -5577,7 +5589,7 @@ def dispatch_once(
|
|||
if _crash_auto_blocked:
|
||||
result.auto_blocked.extend(_crash_auto_blocked)
|
||||
result.timed_out = enforce_max_runtime(conn)
|
||||
result.promoted = recompute_ready(conn)
|
||||
result.promoted = recompute_ready(conn, failure_limit=failure_limit)
|
||||
|
||||
# Count tasks already running so max_spawn enforces concurrency rather
|
||||
# than a per-tick spawn budget. See the docstring above for the full
|
||||
|
|
|
|||
|
|
@ -106,20 +106,30 @@ def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Pa
|
|||
|
||||
def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
|
||||
"""A child that was put into ``blocked`` *without* a worker-issued
|
||||
``kanban_block`` (e.g. circuit-breaker after repeated spawn
|
||||
failures, manual DB triage) must still get auto-promoted when its
|
||||
parents complete — preserves the pre-#28712 recovery semantics."""
|
||||
``kanban_block`` (e.g. a transient crash, manual DB triage) and whose
|
||||
``consecutive_failures`` is still *below* the circuit-breaker limit
|
||||
must get auto-promoted when its parents complete — preserves the
|
||||
pre-#28712 recovery semantics for genuinely transient failures.
|
||||
|
||||
The complementary case — a block whose failure count has *reached*
|
||||
the limit must stay blocked — is covered by
|
||||
``test_kanban_db.py::test_recompute_ready_skips_tasks_at_failure_limit``
|
||||
(#35072). Together they pin the contract: ``recompute_ready`` defers
|
||||
the give-up decision to the same effective limit the breaker uses, so
|
||||
the two never disagree.
|
||||
"""
|
||||
with kb.connect() as conn:
|
||||
parent = kb.create_task(conn, title="parent")
|
||||
child = kb.create_task(conn, title="child", parents=[parent])
|
||||
kb.complete_task(conn, parent, result="ok")
|
||||
|
||||
# Simulate a circuit-breaker / direct triage that flips status
|
||||
# without emitting a ``blocked`` event — exactly what
|
||||
# ``_record_task_failure`` does after a ``gave_up``.
|
||||
# Simulate a transient circuit-breaker / direct triage that flips
|
||||
# status without emitting a ``blocked`` event — exactly what
|
||||
# ``_record_task_failure`` does below the limit. One failure is
|
||||
# under the default limit (2), so recovery is still correct.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=5, "
|
||||
"last_failure_error='persistent error' WHERE id=?",
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=1, "
|
||||
"last_failure_error='transient error' WHERE id=?",
|
||||
(child,),
|
||||
)
|
||||
conn.commit()
|
||||
|
|
@ -128,8 +138,9 @@ def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None:
|
|||
assert promoted == 1
|
||||
task = kb.get_task(conn, child)
|
||||
assert task.status == "ready"
|
||||
assert task.consecutive_failures == 0
|
||||
assert task.last_failure_error is None
|
||||
# Counter is preserved across recovery (not reset) so the breaker
|
||||
# can still accumulate if the task keeps failing (#35072).
|
||||
assert task.consecutive_failures == 1
|
||||
|
||||
|
||||
def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None:
|
||||
|
|
|
|||
|
|
@ -892,6 +892,73 @@ def test_recompute_ready_recovers_below_limit(kanban_home):
|
|||
assert task.consecutive_failures == 1
|
||||
|
||||
|
||||
def test_recompute_ready_honours_dispatcher_failure_limit(kanban_home):
|
||||
"""The guard's effective limit must follow the same resolution order
|
||||
as the circuit breaker (#35072): per-task max_retries → dispatcher
|
||||
failure_limit → DEFAULT_FAILURE_LIMIT.
|
||||
|
||||
Without threading the dispatcher's ``kanban.failure_limit`` through,
|
||||
the guard falls back to DEFAULT_FAILURE_LIMIT and disagrees with the
|
||||
breaker — sticking a task prematurely (config limit > default) or
|
||||
letting a tripped task escape (config limit < default).
|
||||
"""
|
||||
with kb.connect() as conn:
|
||||
# Config allows MORE retries than the default. A task blocked
|
||||
# with failures below the configured limit must still recover.
|
||||
t = kb.create_task(conn, title="lenient", assignee="a")
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=? "
|
||||
"WHERE id=?",
|
||||
(kb.DEFAULT_FAILURE_LIMIT, t),
|
||||
)
|
||||
conn.commit()
|
||||
# Default-limit call would stick it (failures >= default).
|
||||
assert kb.recompute_ready(conn) == 0
|
||||
assert kb.get_task(conn, t).status == "blocked"
|
||||
# Dispatcher configured a higher limit → recover, preserve counter.
|
||||
promoted = kb.recompute_ready(
|
||||
conn, failure_limit=kb.DEFAULT_FAILURE_LIMIT + 2
|
||||
)
|
||||
assert promoted == 1
|
||||
task = kb.get_task(conn, t)
|
||||
assert task.status == "ready"
|
||||
assert task.consecutive_failures == kb.DEFAULT_FAILURE_LIMIT
|
||||
|
||||
# Config allows FEWER retries than the default. A task at the
|
||||
# stricter limit must stay blocked even though it's below default.
|
||||
t2 = kb.create_task(conn, title="strict", assignee="a")
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=1 "
|
||||
"WHERE id=?",
|
||||
(t2,),
|
||||
)
|
||||
conn.commit()
|
||||
# Default-limit (2) would recover it (1 < 2).
|
||||
# Stricter config limit (1) must keep it blocked (1 >= 1).
|
||||
assert kb.recompute_ready(conn, failure_limit=1) == 0
|
||||
assert kb.get_task(conn, t2).status == "blocked"
|
||||
|
||||
|
||||
def test_recompute_ready_per_task_max_retries_overrides_dispatcher(kanban_home):
|
||||
"""A per-task ``max_retries`` wins over the dispatcher failure_limit,
|
||||
matching ``_record_task_failure``'s resolution order."""
|
||||
with kb.connect() as conn:
|
||||
t = kb.create_task(conn, title="per-task", assignee="a")
|
||||
# Per-task allows 4 retries; dispatcher config says 2.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='blocked', consecutive_failures=2, "
|
||||
"max_retries=4 WHERE id=?",
|
||||
(t,),
|
||||
)
|
||||
conn.commit()
|
||||
# failures(2) < per-task limit(4) → recover, despite dispatcher=2.
|
||||
promoted = kb.recompute_ready(conn, failure_limit=2)
|
||||
assert promoted == 1
|
||||
task = kb.get_task(conn, t)
|
||||
assert task.status == "ready"
|
||||
assert task.consecutive_failures == 2
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parent-completion invariant at the claim gate (RCA t_a6acd07d)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue