mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-08 08:11:38 +00:00
fix(kanban): stale reclaim must not tick failure counter (#28680)
Follow-up to #28452. detect_stale_running() was calling _record_task_failure() on every reclaim, which ticked the consecutive_failures counter. With the default failure_limit=2, two legitimately long-running tasks (>4 h without explicit heartbeat) would auto-block via the spawn-failure circuit breaker — even though no worker actually failed. Stale reclaim is dispatcher-side absence-of-heartbeat detection, not a worker fault. Removed the _record_task_failure() call; the 'stale' event in task_events is still the audit surface, but the failure counter is now reserved for spawn_failed / timed_out / crashed (real failures). Also documents the heartbeat requirement: - KANBAN_GUIDANCE in agent/prompt_builder.py now states the rule ('call kanban_heartbeat at least once an hour for tasks running longer than 1 hour') so workers learn the contract. - kanban.md adds the stale event row to the events table and flags the heartbeat requirement in the worker lifecycle list. New regression test: test_detect_stale_does_not_tick_failure_counter locks in the new behaviour.
This commit is contained in:
parent
7f253f5557
commit
88ee58f7d2
4 changed files with 79 additions and 23 deletions
|
|
@ -2793,3 +2793,65 @@ def test_detect_stale_skips_blocked_tasks(kanban_home, monkeypatch):
|
|||
)
|
||||
assert stale == [], "Blocked task should not be reclaimed by stale detection"
|
||||
assert kb.get_task(conn, t).status == "blocked"
|
||||
|
||||
|
||||
def test_detect_stale_does_not_tick_failure_counter(kanban_home, monkeypatch):
|
||||
"""Stale reclaim must NOT tick consecutive_failures.
|
||||
|
||||
Stale detection is dispatcher-side absence-of-heartbeat detection,
|
||||
not a worker failure. Counting it as a failure would let two
|
||||
legitimately-long-running tasks (>4h without explicit heartbeat) trip
|
||||
the circuit breaker and auto-block at the default failure_limit=2,
|
||||
even though no worker actually failed. The 'stale' event in
|
||||
task_events is the right audit surface; the consecutive_failures
|
||||
counter is reserved for spawn_failed / timed_out / crashed.
|
||||
"""
|
||||
import hermes_cli.kanban_db as _kb
|
||||
|
||||
with kb.connect() as conn:
|
||||
t = kb.create_task(conn, title="stale-no-counter-tick", assignee="worker")
|
||||
kb.claim_task(conn, t)
|
||||
kb._set_worker_pid(conn, t, os.getpid())
|
||||
|
||||
five_hours_ago = int(time.time()) - (5 * 3600)
|
||||
with kb.write_txn(conn):
|
||||
conn.execute(
|
||||
"UPDATE tasks SET started_at = ? WHERE id = ?", (five_hours_ago, t)
|
||||
)
|
||||
conn.execute(
|
||||
"UPDATE task_runs SET started_at = ? "
|
||||
"WHERE id = (SELECT current_run_id FROM tasks WHERE id = ?)",
|
||||
(five_hours_ago, t),
|
||||
)
|
||||
# Counter starts at 0; assert that's our baseline.
|
||||
row = conn.execute(
|
||||
"SELECT consecutive_failures FROM tasks WHERE id = ?", (t,)
|
||||
).fetchone()
|
||||
assert row["consecutive_failures"] in (0, None)
|
||||
|
||||
monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False)
|
||||
stale = kb.detect_stale_running(
|
||||
conn, stale_timeout_seconds=14400, signal_fn=lambda p, s: None,
|
||||
)
|
||||
assert t in stale, "Task should be reclaimed by stale detection"
|
||||
|
||||
# Critical assertion: the failure counter MUST NOT have ticked.
|
||||
# Stale reclaim resets to ready for re-dispatch without penalty.
|
||||
row = conn.execute(
|
||||
"SELECT consecutive_failures FROM tasks WHERE id = ?", (t,)
|
||||
).fetchone()
|
||||
assert row["consecutive_failures"] in (0, None), (
|
||||
f"Stale reclaim ticked consecutive_failures to "
|
||||
f"{row['consecutive_failures']!r}; should remain 0/NULL."
|
||||
)
|
||||
|
||||
# And the audit trail still records the stale event so operators
|
||||
# can see what happened.
|
||||
events = conn.execute(
|
||||
"SELECT kind FROM task_events WHERE task_id = ? ORDER BY id",
|
||||
(t,),
|
||||
).fetchall()
|
||||
kinds = [e["kind"] for e in events]
|
||||
assert "stale" in kinds, (
|
||||
f"Expected 'stale' event in task_events; got {kinds!r}"
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue