diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py index 603b44ff9ba..9c36d205ac5 100644 --- a/agent/prompt_builder.py +++ b/agent/prompt_builder.py @@ -206,7 +206,12 @@ KANBAN_GUIDANCE = ( "files outside it unless the task explicitly asks.\n" "3. **Heartbeat on long operations.** Call `kanban_heartbeat(note=...)` " "every few minutes during long subprocesses (training, encoding, crawling). " - "Skip heartbeats for short tasks.\n" + "Skip heartbeats for short tasks. **If your task may run longer than 1 hour, " + "you MUST call `kanban_heartbeat` at least once an hour** — the dispatcher " + "reclaims tasks running past `kanban.dispatch_stale_timeout_seconds` " + "(default 4 hours) when no heartbeat has arrived in the last hour. A " + "reclaim re-queues the task as `ready` without penalty (no failure counter " + "tick), but you lose your current run's progress.\n" "4. **Block on genuine ambiguity.** If you need a human decision you cannot " "infer (missing credentials, UX choice, paywalled source, peer output you " "need first), call `kanban_block(reason=\"...\")` and stop. Don't guess. " diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index b0a076480ac..80fb1153f58 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -4066,27 +4066,15 @@ def detect_stale_running( ) reclaimed.append(tid) - # Increment failure counter. The task is already ``ready`` and the - # run is already closed; this just ticks the counter and may trip - # the circuit breaker. - _record_task_failure( - conn, tid, - error=( - f"no heartbeat for {int(hb_age)}s " - if hb_age is not None - else "no heartbeat ever" - ) + f" after {int(elapsed)}s running", - outcome="stale", - release_claim=False, - end_run=False, - event_payload_extra={ - "elapsed_seconds": int(elapsed), - "heartbeat_age_seconds": ( - int(hb_age) if hb_age is not None else None - ), - "timeout_seconds": stale_timeout_seconds, - }, - ) + # Intentionally NOT calling _record_task_failure here. Stale reclaim + # is dispatcher-side detection of an absent heartbeat; the task is + # going straight back to ``ready`` for re-dispatch. Counting it as + # a worker failure would let two legitimately-long-running tasks + # (>4h without explicit heartbeat) trip the circuit breaker and + # auto-block, even though no worker actually failed. The 'stale' + # event already lives in task_events for auditability; that's the + # right surface for "this happened" without conflating with the + # spawn_failed / timed_out / crashed counters. return reclaimed diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 6e338ffa879..c5b5a97a12b 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -2793,3 +2793,65 @@ def test_detect_stale_skips_blocked_tasks(kanban_home, monkeypatch): ) assert stale == [], "Blocked task should not be reclaimed by stale detection" assert kb.get_task(conn, t).status == "blocked" + + +def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch): + """Stale reclaim must NOT tick consecutive_failures. + + Stale detection is dispatcher-side absence-of-heartbeat detection, + not a worker failure. Counting it as a failure would let two + legitimately-long-running tasks (>4h without explicit heartbeat) trip + the circuit breaker and auto-block at the default failure_limit=2, + even though no worker actually failed. The 'stale' event in + task_events is the right audit surface; the consecutive_failures + counter is reserved for spawn_failed / timed_out / crashed. + """ + import hermes_cli.kanban_db as _kb + + with kb.connect() as conn: + t = kb.create_task(conn, title="stale-no-counter-tick", assignee="worker") + kb.claim_task(conn, t) + kb._set_worker_pid(conn, t, os.getpid()) + + five_hours_ago = int(time.time()) - (5 * 3600) + with kb.write_txn(conn): + conn.execute( + "UPDATE tasks SET started_at = ? WHERE id = ?", (five_hours_ago, t) + ) + conn.execute( + "UPDATE task_runs SET started_at = ? " + "WHERE id = (SELECT current_run_id FROM tasks WHERE id = ?)", + (five_hours_ago, t), + ) + # Counter starts at 0; assert that's our baseline. + row = conn.execute( + "SELECT consecutive_failures FROM tasks WHERE id = ?", (t,) + ).fetchone() + assert row["consecutive_failures"] in (0, None) + + monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False) + stale = kb.detect_stale_running( + conn, stale_timeout_seconds=14400, signal_fn=lambda p, s: None, + ) + assert t in stale, "Task should be reclaimed by stale detection" + + # Critical assertion: the failure counter MUST NOT have ticked. + # Stale reclaim resets to ready for re-dispatch without penalty. + row = conn.execute( + "SELECT consecutive_failures FROM tasks WHERE id = ?", (t,) + ).fetchone() + assert row["consecutive_failures"] in (0, None), ( + f"Stale reclaim ticked consecutive_failures to " + f"{row['consecutive_failures']!r}; should remain 0/NULL." + ) + + # And the audit trail still records the stale event so operators + # can see what happened. + events = conn.execute( + "SELECT kind FROM task_events WHERE task_id = ? ORDER BY id", + (t,), + ).fetchall() + kinds = [e["kind"] for e in events] + assert "stale" in kinds, ( + f"Expected 'stale' event in task_events; got {kinds!r}" + ) diff --git a/website/docs/user-guide/features/kanban.md b/website/docs/user-guide/features/kanban.md index 084e096529e..275195ef921 100644 --- a/website/docs/user-guide/features/kanban.md +++ b/website/docs/user-guide/features/kanban.md @@ -334,7 +334,7 @@ Any profile that should be able to work kanban tasks must load the `kanban-worke 1. On spawn, call `kanban_show()` to read title + body + parent handoffs + prior attempts + full comment thread. 2. `cd $HERMES_KANBAN_WORKSPACE` (via the terminal tool) and do the work there. -3. Call `kanban_heartbeat(note="...")` every few minutes during long operations. +3. Call `kanban_heartbeat(note="...")` every few minutes during long operations. **If your work may run longer than 1 hour, call `kanban_heartbeat` at least once an hour** — the dispatcher reclaims tasks that have been running past `kanban.dispatch_stale_timeout_seconds` (default 4 h) with no heartbeat in the last hour, on the assumption the worker crashed without cleanup. A reclaim is benign (the task goes back to `ready` for re-dispatch without a failure-counter tick) but you lose your current run's progress. 4. Complete with `kanban_complete(summary="...", metadata={...})`, or `kanban_block(reason="...")` if stuck. That final `kanban_complete` / `kanban_block` call is part of the worker @@ -833,6 +833,7 @@ Every transition appends a row to `task_events`. Each row carries an optional `r | `reclaimed` | `{stale_lock}` | Claim TTL expired without a completion; task goes back to `ready`. | | `crashed` | `{pid, claimer}` | Worker PID no longer alive but TTL hadn't expired yet. | | `timed_out` | `{pid, elapsed_seconds, limit_seconds, sigkill}` | `max_runtime_seconds` exceeded; dispatcher SIGTERM'd (then SIGKILL'd after 5 s grace) and re-queued. | +| `stale` | `{elapsed_seconds, last_heartbeat_at, heartbeat_age_seconds, timeout_seconds, pid, terminated}` | Task ran longer than `kanban.dispatch_stale_timeout_seconds` (default 4 h) AND no `kanban_heartbeat` arrived in the last hour. Dispatcher SIGTERM'd the host-local worker (if any), reset the task to `ready` for re-dispatch. Does NOT tick the failure counter (stale is dispatcher-side absence detection, not a worker fault). Workers running long operations should call `kanban_heartbeat` at least once an hour to avoid this. | | `spawn_failed` | `{error, failures}` | One spawn attempt failed (missing PATH, workspace unmountable, …). Counter increments; task returns to `ready` for retry. | | `protocol_violation` | `{pid, claimer, exit_code}` | Worker exited successfully while the task was still `running`, usually because it answered without calling `kanban_complete` or `kanban_block`. The dispatcher also emits `gave_up` and auto-blocks immediately instead of retrying. | | `gave_up` | `{failures, effective_limit, limit_source, error}` | Circuit breaker fired after N consecutive non-successful attempts. Task auto-blocks with the last error. The effective limit resolves as task `max_retries`, then dispatcher `failure_limit` / `kanban.failure_limit`, then the built-in default. |