diff --git a/tests/hermes_cli/test_kanban_blocked_sticky.py b/tests/hermes_cli/test_kanban_blocked_sticky.py new file mode 100644 index 00000000000..e6bd093d938 --- /dev/null +++ b/tests/hermes_cli/test_kanban_blocked_sticky.py @@ -0,0 +1,268 @@ +"""Regression tests for #28712 — kanban dispatcher must not auto-promote +worker-initiated ``kanban_block`` (sticky blocks), but must keep +auto-recovering circuit-breaker blocks. + +The bug: when a worker called ``kanban_block(reason="review-required: +...")`` to hand off to a human, the dispatcher's ``recompute_ready`` +would promote the task back to ``ready`` on the next tick. The fresh +worker found nothing to do (work already applied), exited cleanly, and +got recorded as a ``protocol_violation`` → ``gave_up`` → promote → loop +until manual intervention. + +These tests pin down: + +* Worker / operator-initiated blocks are sticky and survive + ``recompute_ready``. +* Circuit-breaker blocks (``gave_up`` event, status flipped via + ``_record_task_failure``) still auto-recover — the original intent + of #40c1decb3 is preserved. +* An explicit ``kanban_unblock`` clears the sticky state. +* The full block → promote → crash → ``gave_up`` loop is broken after + this fix: subsequent ticks leave the task blocked. + +The tangentially related schema-init ordering bug originally reported +in #28712 (``init_db`` crashing on legacy DBs that pre-dated the +``session_id`` migration) is covered separately by +``test_kanban_db.py::test_connect_migrates_legacy_db_before_optional_column_indexes``, +landed via #28754 / #28781 ahead of this fix. +""" + +from __future__ import annotations + +import time +from pathlib import Path + +import pytest + +from hermes_cli import kanban_db as kb + + +@pytest.fixture +def kanban_home(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Isolated HERMES_HOME with an empty kanban DB.""" + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + monkeypatch.setattr(Path, "home", lambda: tmp_path) + kb.init_db() + return home + + +# --------------------------------------------------------------------------- +# Worker-initiated kanban_block must be sticky +# --------------------------------------------------------------------------- + + +def test_worker_block_is_not_auto_promoted_by_recompute_ready(kanban_home: Path) -> None: + """A standalone task that a worker explicitly blocks for review + must stay blocked across an arbitrary number of dispatcher ticks. + Before #28712's fix, ``recompute_ready`` would silently flip it + back to ``ready`` on the very next tick.""" + with kb.connect() as conn: + tid = kb.create_task(conn, title="needs human review") + kb.claim_task(conn, tid) + assert kb.block_task( + conn, tid, + reason="review-required: please verify ACL change", + expected_run_id=kb.get_task(conn, tid).current_run_id, + ) + assert kb.get_task(conn, tid).status == "blocked" + + # Hammer the promotion code — exactly the dispatcher loop's + # behaviour, just compressed in time. + for _ in range(5): + promoted = kb.recompute_ready(conn) + assert promoted == 0, "worker-blocked task must not auto-promote" + assert kb.get_task(conn, tid).status == "blocked" + + +def test_worker_block_on_child_with_done_parents_is_still_sticky(kanban_home: Path) -> None: + """The parent-completion path is the one ``recompute_ready`` was + designed for, so it's the most dangerous false-positive: even when + every parent is done, a worker-initiated block on the child must + stay blocked.""" + with kb.connect() as conn: + parent = kb.create_task(conn, title="parent") + child = kb.create_task(conn, title="child", parents=[parent]) + kb.complete_task(conn, parent, result="parent ok") + + kb.claim_task(conn, child) + kb.block_task( + conn, child, + reason="review-required: child needs sign-off", + expected_run_id=kb.get_task(conn, child).current_run_id, + ) + assert kb.get_task(conn, child).status == "blocked" + + promoted = kb.recompute_ready(conn) + assert promoted == 0 + assert kb.get_task(conn, child).status == "blocked" + + +# --------------------------------------------------------------------------- +# Circuit-breaker blocks still auto-recover (preserve #40c1decb3 intent) +# --------------------------------------------------------------------------- + + +def test_circuit_breaker_block_still_auto_promotes(kanban_home: Path) -> None: + """A child that was put into ``blocked`` *without* a worker-issued + ``kanban_block`` (e.g. circuit-breaker after repeated spawn + failures, manual DB triage) must still get auto-promoted when its + parents complete — preserves the pre-#28712 recovery semantics.""" + with kb.connect() as conn: + parent = kb.create_task(conn, title="parent") + child = kb.create_task(conn, title="child", parents=[parent]) + kb.complete_task(conn, parent, result="ok") + + # Simulate a circuit-breaker / direct triage that flips status + # without emitting a ``blocked`` event — exactly what + # ``_record_task_failure`` does after a ``gave_up``. + conn.execute( + "UPDATE tasks SET status='blocked', consecutive_failures=5, " + "last_failure_error='persistent error' WHERE id=?", + (child,), + ) + conn.commit() + + promoted = kb.recompute_ready(conn) + assert promoted == 1 + task = kb.get_task(conn, child) + assert task.status == "ready" + assert task.consecutive_failures == 0 + assert task.last_failure_error is None + + +def test_gave_up_event_alone_does_not_make_block_sticky(kanban_home: Path) -> None: + """The circuit-breaker emits ``gave_up`` (not ``blocked``). Make + sure ``_has_sticky_block`` doesn't accidentally treat ``gave_up`` + as sticky — otherwise we'd regress the safety net for genuinely + transient crashes.""" + with kb.connect() as conn: + parent = kb.create_task(conn, title="parent") + child = kb.create_task(conn, title="child", parents=[parent]) + kb.complete_task(conn, parent, result="ok") + + # Status + event match what _record_task_failure writes when + # the breaker trips. + conn.execute( + "UPDATE tasks SET status='blocked' WHERE id=?", (child,), + ) + conn.execute( + "INSERT INTO task_events (task_id, kind, payload, created_at) " + "VALUES (?, 'gave_up', NULL, ?)", + (child, int(time.time())), + ) + conn.commit() + + promoted = kb.recompute_ready(conn) + assert promoted == 1 + assert kb.get_task(conn, child).status == "ready" + + +# --------------------------------------------------------------------------- +# unblock_task clears the sticky state +# --------------------------------------------------------------------------- + + +def test_unblock_clears_sticky_state_and_lets_block_recover(kanban_home: Path) -> None: + """``hermes kanban unblock`` (or the ``kanban_unblock`` tool) is + the only legitimate way out of a worker-initiated block. After + unblock, a *subsequent* circuit-breaker block on the same task + must again be eligible for auto-recovery.""" + with kb.connect() as conn: + tid = kb.create_task(conn, title="t") + kb.claim_task(conn, tid) + kb.block_task( + conn, tid, + reason="review-required: ...", + expected_run_id=kb.get_task(conn, tid).current_run_id, + ) + assert kb.unblock_task(conn, tid) + # After unblock the task is no longer blocked at all. + assert kb.get_task(conn, tid).status == "ready" + + # Now simulate a *later* circuit-breaker block (no new + # ``blocked`` event, just status flip). The most recent + # block/unblock event is ``unblocked`` → guard does not fire + # → recompute can recover. + conn.execute( + "UPDATE tasks SET status='blocked' WHERE id=?", (tid,), + ) + conn.commit() + + promoted = kb.recompute_ready(conn) + assert promoted == 1 + assert kb.get_task(conn, tid).status == "ready" + + +# --------------------------------------------------------------------------- +# Full bug-shaped loop: block → promote → crash → gave_up → next tick +# --------------------------------------------------------------------------- + + +def test_protocol_violation_loop_is_broken(kanban_home: Path) -> None: + """Reproduces the exact #28712 loop and asserts the dispatcher + leaves the task blocked instead of cycling. + + Loop shape from the issue: + + 1. Worker calls ``kanban_block`` → status='blocked', + ``task_runs.outcome='blocked'``, ``blocked`` event. + 2. (Bug) Dispatcher promotes back to ``ready``. + 3. Fresh worker exits cleanly without terminal tool call → + ``protocol_violation`` event. + 4. ``_record_task_failure(failure_limit=1)`` → ``gave_up`` event, + status='blocked' again. + 5. (Bug) Dispatcher promotes again → infinite loop. + + With the fix in place, step 2 never happens — the test simulates + one would-be loop cycle by faking the crash-then-gave_up entries + that *would* have been written and asserts the *next* tick still + leaves the task blocked. + """ + with kb.connect() as conn: + tid = kb.create_task(conn, title="loop reproducer") + kb.claim_task(conn, tid) + kb.block_task( + conn, tid, + reason="review-required: human eyes please", + expected_run_id=kb.get_task(conn, tid).current_run_id, + ) + assert kb.get_task(conn, tid).status == "blocked" + + # First dispatcher tick — must NOT promote. + assert kb.recompute_ready(conn) == 0 + assert kb.get_task(conn, tid).status == "blocked" + + # Simulate the (hypothetical) protocol_violation + gave_up + # entries that the dispatcher would have written if the bug + # were still present. Even with those event rows in place, + # the worker-initiated ``blocked`` event is the most recent + # of the ``{blocked, unblocked}`` pair, so the sticky guard + # still fires. + now = int(time.time()) + conn.execute( + "INSERT INTO task_events (task_id, kind, payload, created_at) " + "VALUES (?, 'protocol_violation', NULL, ?)", + (tid, now), + ) + conn.execute( + "INSERT INTO task_events (task_id, kind, payload, created_at) " + "VALUES (?, 'gave_up', NULL, ?)", + (tid, now + 1), + ) + conn.commit() + + # Subsequent ticks must still leave it blocked. + for _ in range(3): + promoted = kb.recompute_ready(conn) + assert promoted == 0 + assert kb.get_task(conn, tid).status == "blocked" + + +# --------------------------------------------------------------------------- +# Schema-init recovery on legacy DBs is covered by +# tests/hermes_cli/test_kanban_db.py::test_connect_migrates_legacy_db_before_optional_column_indexes +# (landed via #28754 / #28781). The original PR shipped a duplicate test +# here; dropped during salvage to avoid two assertions of the same contract. +# ---------------------------------------------------------------------------