fix(kanban): stale reclaim must not tick failure counter (#28680)

Follow-up to #28452. detect_stale_running() was calling _record_task_failure() on every reclaim, which ticked the consecutive_failures counter. With the default failure_limit=2, two legitimately long-running tasks (>4 h without an explicit heartbeat) would auto-block via the spawn-failure circuit breaker, even though no worker actually failed.

Stale reclaim is dispatcher-side absence-of-heartbeat detection, not a worker fault. This change removes the _record_task_failure() call; the 'stale' event in task_events remains the audit surface, and the failure counter is now reserved for spawn_failed / timed_out / crashed (real failures).

It also documents the heartbeat requirement:

- KANBAN_GUIDANCE in agent/prompt_builder.py now states the rule ('call kanban_heartbeat at least once an hour for tasks running longer than 1 hour') so workers learn the contract.
- kanban.md adds the stale event row to the events table and flags the heartbeat requirement in the worker lifecycle list.

New regression test: test_detect_stale_does_not_tick_failure_counter locks in the new behaviour.
2026-06-08 08:11:38 +00:00 · 2026-05-19 03:15:18 -07:00 · 2026-05-19 03:15:18 -07:00 · 88ee58f7d2
commit 88ee58f7d2
parent 7f253f5557
4 changed files with 79 additions and 23 deletions
--- a/tests/hermes_cli/test_kanban_db.py
+++ b/tests/hermes_cli/test_kanban_db.py
@@ -2793,3 +2793,65 @@ def test_detect_stale_skips_blocked_tasks(kanban_home, monkeypatch):
        )
        assert stale == [], "Blocked task should not be reclaimed by stale detection"
        assert kb.get_task(conn, t).status == "blocked"
+
+
+def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch):
+    """Stale reclaim must NOT tick consecutive_failures.
+
+    Stale detection is dispatcher-side absence-of-heartbeat detection,
+    not a worker failure. Counting it as a failure would let two
+    legitimately-long-running tasks (>4h without explicit heartbeat) trip
+    the circuit breaker and auto-block at the default failure_limit=2,
+    even though no worker actually failed. The 'stale' event in
+    task_events is the right audit surface; the consecutive_failures
+    counter is reserved for spawn_failed / timed_out / crashed.
+    """
+    import hermes_cli.kanban_db as _kb  # module handle used for the _pid_alive patch below
+
+    with kb.connect() as conn:
+        t = kb.create_task(conn, title="stale-no-counter-tick", assignee="worker")
+        kb.claim_task(conn, t)
+        kb._set_worker_pid(conn, t, os.getpid())  # record this process's PID as the worker
+
+        five_hours_ago = int(time.time()) - (5 * 3600)  # older than the 4 h timeout used below
+        with kb.write_txn(conn):
+            conn.execute(
+                "UPDATE tasks SET started_at = ? WHERE id = ?", (five_hours_ago, t)
+            )
+            conn.execute(
+                "UPDATE task_runs SET started_at = ? "
+                "WHERE id = (SELECT current_run_id FROM tasks WHERE id = ?)",
+                (five_hours_ago, t),
+            )
+            # Baseline: the counter must be 0 (or NULL) before detection runs.
+            row = conn.execute(
+                "SELECT consecutive_failures FROM tasks WHERE id = ?", (t,)
+            ).fetchone()
+            assert row["consecutive_failures"] in (0, None)
+
+        monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False)  # report every PID as dead
+        stale = kb.detect_stale_running(
+            conn, stale_timeout_seconds=14400, signal_fn=lambda p, s: None,  # 4 h; no-op signal
+        )
+        assert t in stale, "Task should be reclaimed by stale detection"
+
+        # Critical assertion: the failure counter MUST NOT have ticked; a stale
+        # reclaim is not a worker failure, so it carries no penalty.
+        row = conn.execute(
+            "SELECT consecutive_failures FROM tasks WHERE id = ?", (t,)
+        ).fetchone()
+        assert row["consecutive_failures"] in (0, None), (
+            f"Stale reclaim ticked consecutive_failures to "
+            f"{row['consecutive_failures']!r}; should remain 0/NULL."
+        )
+
+        # And the audit trail still records the stale event so operators
+        # can see what happened.
+        events = conn.execute(
+            "SELECT kind FROM task_events WHERE task_id = ? ORDER BY id",
+            (t,),
+        ).fetchall()
+        kinds = [e["kind"] for e in events]
+        assert "stale" in kinds, (
+            f"Expected 'stale' event in task_events; got {kinds!r}"
+        )