fix(kanban): stale reclaim must not tick failure counter (#28680)

Follow-up to #28452. detect_stale_running() was calling
_record_task_failure() on every reclaim, which ticked the
consecutive_failures counter. With the default failure_limit=2,
two legitimately long-running tasks (>4 h without explicit
heartbeat) would auto-block via the spawn-failure circuit
breaker — even though no worker actually failed.

Stale reclaim is dispatcher-side absence-of-heartbeat detection,
not a worker fault. Removed the _record_task_failure() call;
the 'stale' event in task_events is still the audit surface,
but the failure counter is now reserved for spawn_failed /
timed_out / crashed (real failures).

Also documents the heartbeat requirement:
- KANBAN_GUIDANCE in agent/prompt_builder.py now states the
  rule ('call kanban_heartbeat at least once an hour for tasks
  running longer than 1 hour') so workers learn the contract.
- kanban.md adds the stale event row to the events table and
  flags the heartbeat requirement in the worker lifecycle list.

New regression test: test_detect_stale_does_not_tick_failure_counter
locks in the new behaviour.
This commit is contained in:
Teknium 2026-05-19 03:15:18 -07:00 committed by GitHub
parent 7f253f5557
commit 88ee58f7d2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 79 additions and 23 deletions

View file

@ -2793,3 +2793,65 @@ def test_detect_stale_skips_blocked_tasks(kanban_home, monkeypatch):
)
assert stale == [], "Blocked task should not be reclaimed by stale detection"
assert kb.get_task(conn, t).status == "blocked"
def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch):
    """Reclaiming a stale task leaves ``consecutive_failures`` untouched.

    A stale reclaim means the dispatcher saw no heartbeat for too long;
    it does not mean a worker failed. If each reclaim counted as a
    failure, long-running work (>4h with no explicit heartbeat) could
    trip the circuit breaker and auto-block at the default
    failure_limit=2 even though nothing actually went wrong. The
    ``consecutive_failures`` counter is reserved for spawn_failed /
    timed_out / crashed; the 'stale' row in ``task_events`` is the audit
    trail for reclaims.
    """
    import hermes_cli.kanban_db as _kb

    counter_sql = "SELECT consecutive_failures FROM tasks WHERE id = ?"

    with kb.connect() as conn:
        # Put a task into the claimed state with a worker pid attached.
        task_id = kb.create_task(
            conn, title="stale-no-counter-tick", assignee="worker"
        )
        kb.claim_task(conn, task_id)
        kb._set_worker_pid(conn, task_id, os.getpid())

        # Back-date both the task and its current run by five hours so
        # they fall outside the four-hour (14400 s) window used below.
        backdated_start = int(time.time()) - (5 * 3600)
        with kb.write_txn(conn):
            conn.execute(
                "UPDATE tasks SET started_at = ? WHERE id = ?",
                (backdated_start, task_id),
            )
            conn.execute(
                "UPDATE task_runs SET started_at = ? "
                "WHERE id = (SELECT current_run_id FROM tasks WHERE id = ?)",
                (backdated_start, task_id),
            )

        def read_failure_count():
            # Fetch the task's current consecutive_failures value.
            record = conn.execute(counter_sql, (task_id,)).fetchone()
            return record["consecutive_failures"]

        # Baseline: nothing has failed yet (0 or NULL are both accepted).
        assert read_failure_count() in (0, None)

        # Make the pid-liveness probe report the worker as dead, then run
        # stale detection with a no-op signal function.
        monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False)
        reclaimed = kb.detect_stale_running(
            conn,
            stale_timeout_seconds=14400,
            signal_fn=lambda p, s: None,
        )
        assert task_id in reclaimed, "Task should be reclaimed by stale detection"

        # The key check: the reclaim must not have bumped the counter.
        failures_after = read_failure_count()
        assert failures_after in (0, None), (
            f"Stale reclaim ticked consecutive_failures to "
            f"{failures_after!r}; should remain 0/NULL."
        )

        # The reclaim must still be visible to operators as a 'stale'
        # entry in the task's event log.
        event_rows = conn.execute(
            "SELECT kind FROM task_events WHERE task_id = ? ORDER BY id",
            (task_id,),
        ).fetchall()
        kinds = [event["kind"] for event in event_rows]
        assert "stale" in kinds, (
            f"Expected 'stale' event in task_events; got {kinds!r}"
        )