mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(kanban): stale reclaim must not tick failure counter (#28680)
Follow-up to #28452. detect_stale_running() was calling _record_task_failure() on every reclaim, which ticked the consecutive_failures counter. With the default failure_limit=2, two legitimately long-running tasks (>4 h without explicit heartbeat) would auto-block via the spawn-failure circuit breaker — even though no worker actually failed. Stale reclaim is dispatcher-side absence-of-heartbeat detection, not a worker fault. Removed the _record_task_failure() call; the 'stale' event in task_events is still the audit surface, but the failure counter is now reserved for spawn_failed / timed_out / crashed (real failures). Also documents the heartbeat requirement: - KANBAN_GUIDANCE in agent/prompt_builder.py now states the rule ('call kanban_heartbeat at least once an hour for tasks running longer than 1 hour') so workers learn the contract. - kanban.md adds the stale event row to the events table and flags the heartbeat requirement in the worker lifecycle list. New regression test: test_detect_stale_does_not_tick_failure_counter locks in the new behaviour.
This commit is contained in:
parent
7f253f5557
commit
88ee58f7d2
4 changed files with 79 additions and 23 deletions
|
|
@ -206,7 +206,12 @@ KANBAN_GUIDANCE = (
|
|||
"files outside it unless the task explicitly asks.\n"
|
||||
"3. **Heartbeat on long operations.** Call `kanban_heartbeat(note=...)` "
|
||||
"every few minutes during long subprocesses (training, encoding, crawling). "
|
||||
"Skip heartbeats for short tasks.\n"
|
||||
"Skip heartbeats for short tasks. **If your task may run longer than 1 hour, "
|
||||
"you MUST call `kanban_heartbeat` at least once an hour** — the dispatcher "
|
||||
"reclaims tasks running past `kanban.dispatch_stale_timeout_seconds` "
|
||||
"(default 4 hours) when no heartbeat has arrived in the last hour. A "
|
||||
"reclaim re-queues the task as `ready` without penalty (no failure counter "
|
||||
"tick), but you lose your current run's progress.\n"
|
||||
"4. **Block on genuine ambiguity.** If you need a human decision you cannot "
|
||||
"infer (missing credentials, UX choice, paywalled source, peer output you "
|
||||
"need first), call `kanban_block(reason=\"...\")` and stop. Don't guess. "
|
||||
|
|
|
|||
|
|
@ -4066,27 +4066,15 @@ def detect_stale_running(
|
|||
)
|
||||
reclaimed.append(tid)
|
||||
|
||||
# Increment failure counter. The task is already ``ready`` and the
|
||||
# run is already closed; this just ticks the counter and may trip
|
||||
# the circuit breaker.
|
||||
_record_task_failure(
|
||||
conn, tid,
|
||||
error=(
|
||||
f"no heartbeat for {int(hb_age)}s "
|
||||
if hb_age is not None
|
||||
else "no heartbeat ever"
|
||||
) + f" after {int(elapsed)}s running",
|
||||
outcome="stale",
|
||||
release_claim=False,
|
||||
end_run=False,
|
||||
event_payload_extra={
|
||||
"elapsed_seconds": int(elapsed),
|
||||
"heartbeat_age_seconds": (
|
||||
int(hb_age) if hb_age is not None else None
|
||||
),
|
||||
"timeout_seconds": stale_timeout_seconds,
|
||||
},
|
||||
)
|
||||
# Intentionally NOT calling _record_task_failure here. Stale reclaim
|
||||
# is dispatcher-side detection of an absent heartbeat; the task is
|
||||
# going straight back to ``ready`` for re-dispatch. Counting it as
|
||||
# a worker failure would let two legitimately-long-running tasks
|
||||
# (>4h without explicit heartbeat) trip the circuit breaker and
|
||||
# auto-block, even though no worker actually failed. The 'stale'
|
||||
# event already lives in task_events for auditability; that's the
|
||||
# right surface for "this happened" without conflating with the
|
||||
# spawn_failed / timed_out / crashed counters.
|
||||
|
||||
return reclaimed
|
||||
|
||||
|
|
|
|||
|
|
@ -2793,3 +2793,65 @@ def test_detect_stale_skips_blocked_tasks(kanban_home, monkeypatch):
|
|||
)
|
||||
assert stale == [], "Blocked task should not be reclaimed by stale detection"
|
||||
assert kb.get_task(conn, t).status == "blocked"
|
||||
|
||||
|
||||
def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch):
|
||||
"""Stale reclaim must NOT tick consecutive_failures.
|
||||
|
||||
Stale detection is dispatcher-side absence-of-heartbeat detection,
|
||||
not a worker failure. Counting it as a failure would let two
|
||||
legitimately-long-running tasks (>4h without explicit heartbeat) trip
|
||||
the circuit breaker and auto-block at the default failure_limit=2,
|
||||
even though no worker actually failed. The 'stale' event in
|
||||
task_events is the right audit surface; the consecutive_failures
|
||||
counter is reserved for spawn_failed / timed_out / crashed.
|
||||
"""
|
||||
import hermes_cli.kanban_db as _kb
|
||||
|
||||
with kb.connect() as conn:
|
||||
t = kb.create_task(conn, title="stale-no-counter-tick", assignee="worker")
|
||||
kb.claim_task(conn, t)
|
||||
kb._set_worker_pid(conn, t, os.getpid())
|
||||
|
||||
five_hours_ago = int(time.time()) - (5 * 3600)
|
||||
with kb.write_txn(conn):
|
||||
conn.execute(
|
||||
"UPDATE tasks SET started_at = ? WHERE id = ?", (five_hours_ago, t)
|
||||
)
|
||||
conn.execute(
|
||||
"UPDATE task_runs SET started_at = ? "
|
||||
"WHERE id = (SELECT current_run_id FROM tasks WHERE id = ?)",
|
||||
(five_hours_ago, t),
|
||||
)
|
||||
# Counter starts at 0; assert that's our baseline.
|
||||
row = conn.execute(
|
||||
"SELECT consecutive_failures FROM tasks WHERE id = ?", (t,)
|
||||
).fetchone()
|
||||
assert row["consecutive_failures"] in (0, None)
|
||||
|
||||
monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False)
|
||||
stale = kb.detect_stale_running(
|
||||
conn, stale_timeout_seconds=14400, signal_fn=lambda p, s: None,
|
||||
)
|
||||
assert t in stale, "Task should be reclaimed by stale detection"
|
||||
|
||||
# Critical assertion: the failure counter MUST NOT have ticked.
|
||||
# Stale reclaim resets to ready for re-dispatch without penalty.
|
||||
row = conn.execute(
|
||||
"SELECT consecutive_failures FROM tasks WHERE id = ?", (t,)
|
||||
).fetchone()
|
||||
assert row["consecutive_failures"] in (0, None), (
|
||||
f"Stale reclaim ticked consecutive_failures to "
|
||||
f"{row['consecutive_failures']!r}; should remain 0/NULL."
|
||||
)
|
||||
|
||||
# And the audit trail still records the stale event so operators
|
||||
# can see what happened.
|
||||
events = conn.execute(
|
||||
"SELECT kind FROM task_events WHERE task_id = ? ORDER BY id",
|
||||
(t,),
|
||||
).fetchall()
|
||||
kinds = [e["kind"] for e in events]
|
||||
assert "stale" in kinds, (
|
||||
f"Expected 'stale' event in task_events; got {kinds!r}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -334,7 +334,7 @@ Any profile that should be able to work kanban tasks must load the `kanban-worke
|
|||
|
||||
1. On spawn, call `kanban_show()` to read title + body + parent handoffs + prior attempts + full comment thread.
|
||||
2. `cd $HERMES_KANBAN_WORKSPACE` (via the terminal tool) and do the work there.
|
||||
3. Call `kanban_heartbeat(note="...")` every few minutes during long operations.
|
||||
3. Call `kanban_heartbeat(note="...")` every few minutes during long operations. **If your work may run longer than 1 hour, call `kanban_heartbeat` at least once an hour** — the dispatcher reclaims tasks that have been running past `kanban.dispatch_stale_timeout_seconds` (default 4 h) with no heartbeat in the last hour, on the assumption the worker crashed without cleanup. A reclaim is benign (the task goes back to `ready` for re-dispatch without a failure-counter tick) but you lose your current run's progress.
|
||||
4. Complete with `kanban_complete(summary="...", metadata={...})`, or `kanban_block(reason="...")` if stuck.
|
||||
|
||||
That final `kanban_complete` / `kanban_block` call is part of the worker
|
||||
|
|
@ -833,6 +833,7 @@ Every transition appends a row to `task_events`. Each row carries an optional `r
|
|||
| `reclaimed` | `{stale_lock}` | Claim TTL expired without a completion; task goes back to `ready`. |
|
||||
| `crashed` | `{pid, claimer}` | Worker PID no longer alive but TTL hadn't expired yet. |
|
||||
| `timed_out` | `{pid, elapsed_seconds, limit_seconds, sigkill}` | `max_runtime_seconds` exceeded; dispatcher SIGTERM'd (then SIGKILL'd after 5 s grace) and re-queued. |
|
||||
| `stale` | `{elapsed_seconds, last_heartbeat_at, heartbeat_age_seconds, timeout_seconds, pid, terminated}` | Task ran longer than `kanban.dispatch_stale_timeout_seconds` (default 4 h) AND no `kanban_heartbeat` arrived in the last hour. Dispatcher SIGTERM'd the host-local worker (if any), reset the task to `ready` for re-dispatch. Does NOT tick the failure counter (stale is dispatcher-side absence detection, not a worker fault). Workers running long operations should call `kanban_heartbeat` at least once an hour to avoid this. |
|
||||
| `spawn_failed` | `{error, failures}` | One spawn attempt failed (missing PATH, workspace unmountable, …). Counter increments; task returns to `ready` for retry. |
|
||||
| `protocol_violation` | `{pid, claimer, exit_code}` | Worker exited successfully while the task was still `running`, usually because it answered without calling `kanban_complete` or `kanban_block`. The dispatcher also emits `gave_up` and auto-blocks immediately instead of retrying. |
|
||||
| `gave_up` | `{failures, effective_limit, limit_source, error}` | Circuit breaker fired after N consecutive non-successful attempts. Task auto-blocks with the last error. The effective limit resolves as task `max_retries`, then dispatcher `failure_limit` / `kanban.failure_limit`, then the built-in default. |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue