mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
fix(kanban): worker-initiated block must not be auto-promoted (#28712)
When a worker calls ``kanban_block(reason="review-required: ...")`` to
hand a task off for human review, the dispatcher's ``recompute_ready``
was treating the resulting ``blocked`` status as eligible for
auto-promotion — exactly the same as a circuit-breaker block. On the
next tick the task flipped back to ``ready``, a fresh worker spawned,
found nothing to do (work already applied, review-required comment
already posted), exited cleanly, got recorded as ``protocol_violation``
→ ``gave_up`` → ``blocked``, and the dispatcher promoted again.
Infinite loop until manual ``hermes kanban reclaim`` + ``kanban block``.
Add ``_has_sticky_block`` which distinguishes the two block sources
using the cheapest available signal: the most recent
``"blocked"``/``"unblocked"`` event in ``task_events``.
* Worker / operator ``kanban_block`` emits ``"blocked"`` →
``_has_sticky_block`` returns True → ``recompute_ready`` skips the
task entirely. ``unblock_task`` emits ``"unblocked"`` which flips
the predicate back, so the only legitimate exit is the documented
human-in-the-loop path.
* Circuit-breaker ``_record_task_failure`` emits ``"gave_up"`` (not
``"blocked"``) → predicate stays False → original
parent-completion-recovery semantics from #40c1decb3 are preserved.
* Tasks blocked purely by direct DB manipulation also recover, since
they have no ``"blocked"`` event row at all — matches the existing
``test_recompute_ready_promotes_blocked_with_done_parents`` fixture
behaviour.
This commit is contained in:
parent
64a9a199bb
commit
34120a0ae2
1 changed files with 55 additions and 1 deletions
|
|
@ -2002,11 +2002,58 @@ def _synthesize_ended_run(
|
|||
# Dependency resolution (todo -> ready)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _has_sticky_block(conn: sqlite3.Connection, task_id: str) -> bool:
|
||||
"""Return True when ``task_id`` is sticky-blocked by an explicit
|
||||
worker/operator ``kanban_block`` call (#28712).
|
||||
|
||||
A ``blocked`` status can come from two very different sources:
|
||||
|
||||
* **Worker- or operator-initiated** — a worker called
|
||||
``kanban_block(reason="review-required: ...")`` (or somebody ran
|
||||
``hermes kanban block <id>``). This is a deliberate handoff that
|
||||
should stay blocked until an operator unblocks it. The block tool
|
||||
emits a ``"blocked"`` event row in ``task_events``.
|
||||
|
||||
* **Circuit-breaker** — ``_record_task_failure`` tripped after
|
||||
repeated crashes / spawn failures / timeouts. This emits
|
||||
``"gave_up"``, *not* ``"blocked"``, and is meant to recover
|
||||
automatically once the underlying conditions change (e.g. parents
|
||||
finish, transient infra error clears).
|
||||
|
||||
The cheapest signal that distinguishes the two is the most recent
|
||||
``"blocked"`` / ``"unblocked"`` event for the task. If the most
|
||||
recent one is ``"blocked"`` (or there is a ``"blocked"`` event and
|
||||
no ``"unblocked"`` event has fired since), the task is sticky and
|
||||
``recompute_ready`` must *not* auto-promote it.
|
||||
|
||||
Returns ``False`` when there is no such event at all (e.g. the task
|
||||
was set to ``status='blocked'`` by the circuit breaker or by direct
|
||||
DB manipulation) — preserves the pre-#28712 auto-recover semantics
|
||||
for that path.
|
||||
"""
|
||||
row = conn.execute(
|
||||
"SELECT kind FROM task_events "
|
||||
"WHERE task_id = ? AND kind IN ('blocked', 'unblocked') "
|
||||
"ORDER BY id DESC LIMIT 1",
|
||||
(task_id,),
|
||||
).fetchone()
|
||||
return bool(row) and row["kind"] == "blocked"
|
||||
|
||||
|
||||
def recompute_ready(conn: sqlite3.Connection) -> int:
|
||||
"""Promote ``todo`` tasks to ``ready`` when all parents are ``done`` or ``archived``.
|
||||
|
||||
Returns the number of tasks promoted. Safe to call inside or outside
|
||||
an existing transaction; it opens its own IMMEDIATE txn.
|
||||
|
||||
``blocked`` tasks are also considered for promotion (so a task
|
||||
blocked purely by a parent dependency unblocks itself when the
|
||||
parent completes), *except* when the most recent block event was a
|
||||
worker-initiated ``kanban_block`` — those stay blocked until an
|
||||
explicit ``kanban_unblock`` (#28712). Without that guard, a
|
||||
``review-required`` handoff would auto-respawn, the fresh worker
|
||||
would find nothing to do, exit cleanly, get recorded as a protocol
|
||||
violation, and the cycle would repeat indefinitely.
|
||||
"""
|
||||
promoted = 0
|
||||
with write_txn(conn):
|
||||
|
|
@ -2016,6 +2063,12 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
|
|||
for row in todo_rows:
|
||||
task_id = row["id"]
|
||||
cur_status = row["status"]
|
||||
if cur_status == "blocked" and _has_sticky_block(conn, task_id):
|
||||
# Worker / operator asked for human review — do not
|
||||
# silently auto-recover. ``unblock_task`` is the only
|
||||
# legitimate exit (it emits ``"unblocked"`` which flips
|
||||
# this predicate back).
|
||||
continue
|
||||
parents = conn.execute(
|
||||
"SELECT t.status FROM tasks t "
|
||||
"JOIN task_links l ON l.parent_id = t.id "
|
||||
|
|
@ -2024,7 +2077,8 @@ def recompute_ready(conn: sqlite3.Connection) -> int:
|
|||
).fetchall()
|
||||
if all(p["status"] in ("done", "archived") for p in parents):
|
||||
# Blocked tasks also get their failure counters reset —
|
||||
# this is effectively an auto-unblock.
|
||||
# this is effectively an auto-unblock (circuit-breaker
|
||||
# recovery; worker-initiated blocks are skipped above).
|
||||
if cur_status == "blocked":
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status = 'ready', "
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue