mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-22 05:22:09 +00:00
fix(kanban): gate claim + unblock on parent completion
Enforce the parent-completion invariant at claim_task (the single ready->running chokepoint) and re-gate unblock_task so that blocked->ready only fires when all parents are done. This prevents child tasks from running ahead of in-progress parents under the create-then-link race.

Also adds a stress test that races concurrent create+link against a hammered claim_task and asserts that no child runs while any parent is undone.

Ref: kanban/boards/cookai/workspaces/t_a6acd07d/root-cause.md
Refs: t_8d6af9d6
This commit is contained in:
parent
79694018f8
commit
cda20eec0c
3 changed files with 344 additions and 4 deletions
|
|
@ -1804,6 +1804,31 @@ def claim_task(
|
|||
lock = claimer or _claimer_id()
|
||||
expires = now + int(ttl_seconds)
|
||||
with write_txn(conn):
|
||||
# Structural invariant: never transition ready -> running while any
|
||||
# parent is not yet 'done'. This is the single enforcement point
|
||||
# regardless of which writer (create_task, link_tasks, unblock_task,
|
||||
# release_stale_claims, manual SQL) set status='ready'. If a racy
|
||||
# writer promoted a task with undone parents, demote it back to
|
||||
# 'todo' here — recompute_ready will re-promote when the parents
|
||||
# actually finish. See RCA at
|
||||
# kanban/boards/cookai/workspaces/t_a6acd07d/root-cause.md.
|
||||
undone = conn.execute(
|
||||
"SELECT 1 FROM task_links l "
|
||||
"JOIN tasks p ON p.id = l.parent_id "
|
||||
"WHERE l.child_id = ? AND p.status != 'done' LIMIT 1",
|
||||
(task_id,),
|
||||
).fetchone()
|
||||
if undone:
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status = 'todo' "
|
||||
"WHERE id = ? AND status = 'ready'",
|
||||
(task_id,),
|
||||
)
|
||||
_append_event(
|
||||
conn, task_id, "claim_rejected",
|
||||
{"reason": "parents_not_done"},
|
||||
)
|
||||
return None
|
||||
# Defensive: if a prior run somehow leaked (invariant violation from
|
||||
# an unknown code path), close it as 'reclaimed' so we don't strand
|
||||
# it when the CAS resets the pointer below. No-op when the invariant
|
||||
|
|
@ -2503,14 +2528,30 @@ def unblock_task(conn: sqlite3.Connection, task_id: str) -> bool:
|
|||
""",
|
||||
(now, int(stale["current_run_id"])),
|
||||
)
|
||||
cur = conn.execute(
|
||||
"UPDATE tasks SET status = 'ready', current_run_id = NULL "
|
||||
"WHERE id = ? AND status = 'blocked'",
|
||||
# Re-gate on parent completion before flipping 'blocked' back to
|
||||
# 'ready'. Unconditionally setting status='ready' here bypasses the
|
||||
# parent-completion invariant (the dispatcher trusts that column);
|
||||
# if parents are still in progress the task must wait in 'todo'
|
||||
# until recompute_ready picks it up. RCA: Bug 2 at
|
||||
# kanban/boards/cookai/workspaces/t_a6acd07d/root-cause.md.
|
||||
undone_parents = conn.execute(
|
||||
"SELECT 1 FROM task_links l "
|
||||
"JOIN tasks p ON p.id = l.parent_id "
|
||||
"WHERE l.child_id = ? AND p.status != 'done' LIMIT 1",
|
||||
(task_id,),
|
||||
).fetchone()
|
||||
new_status = "todo" if undone_parents else "ready"
|
||||
cur = conn.execute(
|
||||
"UPDATE tasks SET status = ?, current_run_id = NULL "
|
||||
"WHERE id = ? AND status = 'blocked'",
|
||||
(new_status, task_id),
|
||||
)
|
||||
if cur.rowcount != 1:
|
||||
return False
|
||||
_append_event(conn, task_id, "unblocked", None)
|
||||
_append_event(
|
||||
conn, task_id, "unblocked",
|
||||
{"status": new_status} if new_status != "ready" else None,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue