fix(kanban): unify failure counter across spawn/timeout/crash outcomes (#20410)

The dispatcher's circuit breaker only protected against spawn-side
failures (profile missing, workspace mount error, exec failure).
Workers that successfully spawned but then timed out or crashed
re-queued to ``ready`` with no counter increment, so the next tick
re-spawned them — looping forever until someone noticed. Reported
externally on Twitter (Forbidden Seeds) and confirmed by walking the
kernel: ``enforce_max_runtime`` flipped the task back to ready, emitted
a ``timed_out`` event, and never touched ``spawn_failures``; same for
``detect_crashed_workers``.

Fix: unify the counter across all non-success outcomes.

Schema
------
* ``tasks.spawn_failures`` → ``tasks.consecutive_failures``
* ``tasks.last_spawn_error`` → ``tasks.last_failure_error``
* Migration renames the columns in-place on existing DBs (``ALTER
  TABLE RENAME COLUMN`` — SQLite >= 3.25) so historical counter
  values are preserved. Row mappers fall through to the legacy names
  if both column renames and a migration somehow got out of sync.

Counter lifecycle
-----------------
New helper ``_record_task_failure(conn, task_id, error, *, outcome,
release_claim, end_run, event_payload_extra)`` is the single point
every non-success outcome funnels through:

* ``spawn_failed``  → ``_record_spawn_failure`` (kept as alias)
  calls it with ``release_claim=True, end_run=True`` — transitions
  running→ready, clears claim, closes run.
* ``timed_out`` → ``enforce_max_runtime`` already does the status
  transition + run close + event emission, then calls
  ``_record_task_failure`` with ``release_claim=False, end_run=False``
  just to bump the counter (and trip the breaker if needed).
* ``crashed`` → ``detect_crashed_workers`` same pattern, but the
  counter increment runs after the main write_txn closes (SQLite
  doesn't nest write transactions).

If the counter hits the breaker threshold (``DEFAULT_FAILURE_LIMIT=5``,
same as before), the task transitions to ``blocked`` with a ``gave_up``
event on top of whatever outcome-specific event was already emitted.

Reset semantics changed: the counter now clears only on successful
``complete_task`` (and operator ``reclaim_task`` — an explicit "I've
looked at this, try again with a fresh budget"). Previously
``_clear_spawn_failures`` ran on every successful spawn, which would
have wiped the counter before a timeout could accumulate past threshold
— exactly the loop this fix prevents.

Diagnostics
-----------
* ``_rule_repeated_spawn_failures`` → ``_rule_repeated_failures``. Now
  fires regardless of which outcome is at fault. Classifies the most
  recent failure (spawn_failed / timed_out / crashed) from the run
  history so the title ("Agent timeout x3", "Agent crash x4", "Agent
  spawn x5") and suggested action (``doctor`` for spawn, ``log`` for
  timeout/crash) stay outcome-specific without N duplicate rules.
* ``_rule_repeated_crashes`` kept as a narrower early-warning at
  threshold 2 (vs 3 for the unified rule), but now suppresses itself
  when the unified rule would also fire — avoids double-flagging.
* Diagnostic ``data`` payload now carries
  ``{consecutive_failures, most_recent_outcome, last_error}`` instead
  of spawn-specific keys.

CLI
---
* ``Task.consecutive_failures`` / ``Task.last_failure_error`` are the
  public fields now. Existing callers that referenced the old names
  get migrated (tests updated in this commit).
* Backward-compat: ``DEFAULT_SPAWN_FAILURE_LIMIT``,
  ``_clear_spawn_failures``, ``_record_spawn_failure`` stay as aliases.

Tests
-----
* 6 new kernel tests: timeout increments counter, 3 consecutive
  timeouts trip the breaker (was the reported gap), crash increments
  counter, reclaim clears counter, completion clears counter, spawn
  success does NOT clear counter.
* Diagnostic tests: updated ``repeated_spawn_failures`` cases to use
  the new kind name and add a timeout-loop test.
* Dashboard API test: spawn_failures column update → consecutive_failures.

389/389 kanban-suite tests pass.

Live verification
-----------------
Seeded 4 tasks in an isolated HERMES_HOME: 3 timeouts, 4 crashes,
2-spawn-failed + 2-timed-out, and a task that had prior failures but
completed successfully. Board correctly shows "!! 3 tasks need
attention" (the successful one has no badge because the counter
reset). Drawer for the timeout-loop task renders "Agent timeout x3"
with most_recent_outcome=timed_out and the "Check logs" suggested
action (not the spawn-flavoured "Verify profile"). The successful
task has zero diagnostics.

Closes the Forbidden-Seeds-reported gap.
This commit is contained in:
Teknium 2026-05-05 13:55:37 -07:00 committed by GitHub
parent 587ef55f2c
commit 1fc8733a69
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 630 additions and 125 deletions

View file

@ -96,7 +96,7 @@ def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawna
assert tid not in res.auto_blocked
task = kb.get_task(conn, tid)
assert task.status == "ready"
assert task.spawn_failures == 3
assert task.consecutive_failures == 3
# Two more ticks → fifth failure exceeds the limit.
res1 = kb.dispatch_once(conn, spawn_fn=_bad_spawn, failure_limit=5)
@ -105,15 +105,20 @@ def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawna
assert tid in res2.auto_blocked
task = kb.get_task(conn, tid)
assert task.status == "blocked"
assert task.spawn_failures >= 5
assert task.last_spawn_error and "no PATH" in task.last_spawn_error
assert task.consecutive_failures >= 5
assert task.last_failure_error and "no PATH" in task.last_failure_error
finally:
conn.close()
def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spawnable):
"""A successful spawn clears the counter so past failures don't count
against future retries of the same task."""
def test_successful_spawn_does_not_reset_failure_counter(kanban_home, all_assignees_spawnable):
"""Under unified consecutive-failure counting, a successful spawn
does NOT reset the counter past failures stay on the books until
a successful completion. This is by design: it prevents a task
that keeps timing out after spawn from looping forever.
(Pre-unification behaviour was to reset on spawn success; see the
complete_task reset for the replacement point.)
"""
calls = [0]
def _flaky_spawn(task, ws):
calls[0] += 1
@ -128,11 +133,12 @@ def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spaw
kb.dispatch_once(conn, spawn_fn=_flaky_spawn, failure_limit=5)
kb.dispatch_once(conn, spawn_fn=_flaky_spawn, failure_limit=5)
task = kb.get_task(conn, tid)
assert task.spawn_failures == 2
assert task.consecutive_failures == 2
kb.dispatch_once(conn, spawn_fn=_flaky_spawn, failure_limit=5)
task = kb.get_task(conn, tid)
assert task.spawn_failures == 0
assert task.last_spawn_error is None
# Counter STAYS at 2 — spawn succeeded but run isn't complete yet.
assert task.consecutive_failures == 2
assert task.last_failure_error is not None
# Task is now running with a pid.
assert task.status == "running"
assert task.worker_pid == 99999
@ -140,6 +146,30 @@ def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spaw
conn.close()
def test_successful_completion_resets_failure_counter(kanban_home, all_assignees_spawnable):
    """A successful kb.complete_task wipes the counter — the task+profile
    combination proved it can succeed, so past failures are history.

    This is the designated reset point under unified failure counting
    (spawn success deliberately does NOT reset; see the sibling test).
    """
    conn = kb.connect()
    try:
        tid = kb.create_task(conn, title="x", assignee="worker")
        # Simulate 2 prior failures on the record.
        # (Removed a stray `kb.write_txn_ctx = kb.write_txn` here — it
        # mutated the imported module and was never read anywhere.)
        with kb.write_txn(conn):
            conn.execute(
                "UPDATE tasks SET consecutive_failures = 2, "
                "last_failure_error = 'old failure' WHERE id = ?",
                (tid,),
            )
        # Complete the task.
        ok = kb.complete_task(conn, tid, summary="done")
        assert ok
        task = kb.get_task(conn, tid)
        # Both the counter and the sticky last-error are wiped on success.
        assert task.consecutive_failures == 0
        assert task.last_failure_error is None
    finally:
        conn.close()
def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spawnable):
"""`dir:` workspace with no path should fail workspace resolution AND
count against the failure budget not just crash the tick."""
@ -158,9 +188,9 @@ def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spa
)
res = kb.dispatch_once(conn, failure_limit=3)
task = kb.get_task(conn, tid)
assert task.spawn_failures == 1
assert task.consecutive_failures == 1
assert task.status == "ready"
assert task.last_spawn_error and "workspace" in task.last_spawn_error
assert task.last_failure_error and "workspace" in task.last_failure_error
# Run twice more → auto-blocked.
kb.dispatch_once(conn, failure_limit=3)
res = kb.dispatch_once(conn, failure_limit=3)
@ -3052,3 +3082,184 @@ def test_reassign_task_with_reclaim_first_switches_profile(kanban_home):
assert row["status"] == "ready"
finally:
conn.close()
# ---------------------------------------------------------------------------
# Unified failure counter — timeout + crash paths increment the same counter
# as spawn failures, and the circuit breaker trips after N consecutive
# failures regardless of which outcome caused them.
# ---------------------------------------------------------------------------
def test_enforce_max_runtime_increments_consecutive_failures(kanban_home, monkeypatch):
    """One timeout bumps ``consecutive_failures`` from 0 to 1.

    Before counter unification this path incremented nothing, which is
    exactly the infinite-respawn gap the fix closes.
    """
    import hermes_cli.kanban_db as _kb
    import signal as _sig

    # Fake process liveness: the "worker" dies once SIGTERM is delivered.
    term_seen = {"flag": False}

    def _fake_alive(pid):
        return not term_seen["flag"]

    def _fake_signal(pid, sig):
        if sig == _sig.SIGTERM:
            term_seen["flag"] = True

    monkeypatch.setattr(_kb, "_pid_alive", _fake_alive)
    conn = kb.connect()
    try:
        tid = kb.create_task(
            conn, title="overrun", assignee="worker",
            max_runtime_seconds=1,
        )
        kb.claim_task(conn, tid)
        kb._set_worker_pid(conn, tid, os.getpid())
        # Backdate started_at so the 1-second budget is already blown.
        with kb.write_txn(conn):
            conn.execute(
                "UPDATE tasks SET started_at = ? WHERE id = ?",
                (int(time.time()) - 30, tid),
            )
        assert kb.get_task(conn, tid).consecutive_failures == 0
        kb.enforce_max_runtime(conn, signal_fn=_fake_signal)
        after = kb.get_task(conn, tid)
        assert after.consecutive_failures == 1
        assert "elapsed" in (after.last_failure_error or "")
        # Not past the breaker threshold yet, so the task re-queues.
        assert after.status == "ready"
    finally:
        conn.close()
def test_repeated_timeouts_trip_the_circuit_breaker(kanban_home, monkeypatch):
    """N consecutive timeouts with the unified counter should eventually
    hit the failure_limit threshold and auto-block the task. This closes
    the Forbidden-Seeds-reported gap where timeout loops never capped.
    """
    import hermes_cli.kanban_db as _kb
    # Fake liveness: the "worker" is alive until SIGTERM has been sent.
    state = {"sent_term": False}

    def _alive(pid):
        return not state["sent_term"]

    def _signal(pid, sig):
        import signal as _sig
        if sig == _sig.SIGTERM:
            state["sent_term"] = True

    monkeypatch.setattr(_kb, "_pid_alive", _alive)
    conn = kb.connect()
    try:
        tid = kb.create_task(
            conn, title="loop forever", assignee="slow-worker",
            max_runtime_seconds=1,
        )
        # Drop the failure_limit to 3 so we don't need 5 timeouts.
        # This uses the module-level DEFAULT; we simulate by calling
        # _record_task_failure directly with a tight limit.
        for _ in range(3):
            # Fresh claim + "started long ago" each iteration.
            with kb.write_txn(conn):
                conn.execute(
                    "UPDATE tasks SET status='running', claim_lock=?, "
                    "claim_expires=?, worker_pid=?, started_at=? "
                    "WHERE id=?",
                    (
                        f"{_kb._claimer_id().split(':', 1)[0]}:lock",
                        int(time.time()) + 3600,
                        os.getpid(),
                        int(time.time()) - 30,
                        tid,
                    ),
                )
                # A matching task_runs row so the run-history side of the
                # kernel sees a running run to close on timeout.
                conn.execute(
                    "INSERT INTO task_runs (task_id, status, claim_lock, "
                    "claim_expires, worker_pid, started_at) "
                    "VALUES (?, 'running', ?, ?, ?, ?)",
                    (
                        tid,
                        f"{_kb._claimer_id().split(':', 1)[0]}:lock",
                        int(time.time()) + 3600,
                        os.getpid(),
                        int(time.time()) - 30,
                    ),
                )
                rid = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
                conn.execute(
                    "UPDATE tasks SET current_run_id=? WHERE id=?",
                    (rid, tid),
                )
            state["sent_term"] = False
            # Lower the threshold by monkeypatching the default.
            monkeypatch.setattr(_kb, "DEFAULT_FAILURE_LIMIT", 3)
            kb.enforce_max_runtime(conn, signal_fn=_signal)
        final = kb.get_task(conn, tid)
        # After 3 consecutive timeouts with failure_limit=3, task should
        # be auto-blocked, not looping forever as ``ready``.
        assert final.status == "blocked", \
            f"expected blocked after 3 timeouts, got {final.status}"
        assert final.consecutive_failures >= 3
        # ``gave_up`` event emitted (plus 3 ``timed_out`` events).
        kinds = [
            r["kind"] for r in conn.execute(
                "SELECT kind FROM task_events WHERE task_id=? ORDER BY id",
                (tid,),
            )
        ]
        assert kinds.count("timed_out") >= 3
        assert "gave_up" in kinds
    finally:
        conn.close()
def test_detect_crashed_workers_increments_counter(kanban_home):
    """One detected crash bumps ``consecutive_failures`` to exactly 1
    and re-queues the task as ready."""
    conn = kb.connect()
    try:
        task_id = kb.create_task(conn, title="crashy", assignee="worker")
        kb.claim_task(conn, task_id)
        # Point the claim at a fake pid that is not a live worker.
        kb._set_worker_pid(conn, task_id, 99999)
        kb.detect_crashed_workers(conn)
        refreshed = kb.get_task(conn, task_id)
        assert refreshed.status == "ready"
        assert refreshed.consecutive_failures == 1
    finally:
        conn.close()
def test_reclaim_task_clears_failure_counter(kanban_home):
    """An operator ``reclaim_task`` resets the failure counter: the next
    retry starts with a fresh budget."""
    import secrets

    conn = kb.connect()
    try:
        task_id = kb.create_task(conn, title="stuck", assignee="worker")
        claim = secrets.token_hex(4)
        now = int(time.time())
        # Hand-craft a running task that already has 4 failures on record.
        with kb.write_txn(conn):
            conn.execute(
                "UPDATE tasks SET status='running', claim_lock=?, "
                "claim_expires=?, worker_pid=?, consecutive_failures=4, "
                "last_failure_error='prior issue' WHERE id=?",
                (claim, now + 3600, 12345, task_id),
            )
            conn.execute(
                "INSERT INTO task_runs (task_id, status, claim_lock, "
                "claim_expires, worker_pid, started_at) "
                "VALUES (?, 'running', ?, ?, ?, ?)",
                (task_id, claim, now + 3600, 12345, now),
            )
            run_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
            conn.execute(
                "UPDATE tasks SET current_run_id=? WHERE id=?",
                (run_id, task_id),
            )
        assert kb.reclaim_task(conn, task_id, reason="operator fixed config")
        reclaimed = kb.get_task(conn, task_id)
        assert reclaimed.status == "ready"
        assert reclaimed.consecutive_failures == 0
        assert reclaimed.last_failure_error is None
    finally:
        conn.close()

View file

@ -39,8 +39,8 @@ def _task(**overrides):
"title": "demo task",
"assignee": "demo",
"status": "ready",
"spawn_failures": 0,
"last_spawn_error": None,
"consecutive_failures": 0,
"last_failure_error": None,
}
base.update(overrides)
return base
@ -126,27 +126,55 @@ def test_prose_phantom_refs_clears_on_later_clean_edit():
assert diags == []
def test_repeated_spawn_failures_fires_at_threshold():
task = _task(status="blocked", spawn_failures=3,
last_spawn_error="Profile 'debugger' does not exist")
diags = kd.compute_task_diagnostics(task, [], [])
def test_repeated_failures_fires_at_threshold_on_spawn():
    """A task with multiple spawn_failed runs gets a spawn-flavoured
    diagnostic (title mentions 'spawn', suggested action is ``doctor``).
    """
    failing = _task(status="ready", consecutive_failures=3,
                    last_failure_error="Profile 'debugger' does not exist")
    # Three consecutive spawn_failed runs in the history.
    history = [_run(outcome="spawn_failed", run_id=n) for n in (1, 2, 3)]
    diags = kd.compute_task_diagnostics(failing, [], history)
    assert len(diags) == 1
    diag = diags[0]
    assert diag.kind == "repeated_failures"
    assert diag.severity == "error"
    # CLI hints are what operators actually need here.
    hints = [action.label for action in diag.actions if action.suggested]
    assert any("doctor" in hint for hint in hints)
def test_repeated_spawn_failures_escalates_to_critical():
task = _task(spawn_failures=6, last_spawn_error="boom")
def test_repeated_failures_fires_on_timeout_loop():
    """The rule surfaces for timeout loops too — that's the point of
    unifying the counter. Suggested action is 'check logs', not
    'fix profile'."""
    looping = _task(status="ready", consecutive_failures=3,
                    last_failure_error="elapsed 600s > limit 300s")
    # Three consecutive timed_out runs in the history.
    history = [_run(outcome="timed_out", run_id=n) for n in (1, 2, 3)]
    diags = kd.compute_task_diagnostics(looping, [], history)
    assert len(diags) == 1
    diag = diags[0]
    assert diag.kind == "repeated_failures"
    assert diag.data["most_recent_outcome"] == "timed_out"
    hints = [action.label for action in diag.actions if action.suggested]
    assert any("log" in hint.lower() for hint in hints)
def test_repeated_failures_escalates_to_critical():
    """Well past the threshold the diagnostic severity escalates."""
    hopeless = _task(consecutive_failures=6, last_failure_error="boom")
    result = kd.compute_task_diagnostics(hopeless, [], [])
    assert result[0].severity == "critical"
def test_repeated_spawn_failures_below_threshold_silent():
task = _task(spawn_failures=2)
def test_repeated_failures_below_threshold_silent():
    """Two failures is under the threshold — no diagnostic fires."""
    quiet = _task(consecutive_failures=2)
    assert kd.compute_task_diagnostics(quiet, [], []) == []
@ -243,9 +271,9 @@ def test_repeated_crashes_no_error_fallback_title():
assert "no error recorded" in diags[0].title
def test_repeated_spawn_failures_surfaces_actual_error_in_title():
task = _task(spawn_failures=5,
last_spawn_error="insufficient_quota: billing limit reached")
def test_repeated_failures_surfaces_actual_error_in_title():
task = _task(consecutive_failures=5,
last_failure_error="insufficient_quota: billing limit reached")
diags = kd.compute_task_diagnostics(task, [], [])
assert len(diags) == 1
d = diags[0]
@ -280,8 +308,8 @@ def test_repeated_crashes_truncates_huge_tracebacks():
def test_diagnostics_sorted_critical_first():
"""A task with both a critical (many spawn failures) and a warning
(prose phantoms) diagnostic should list the critical one first."""
task = _task(status="done", spawn_failures=10,
last_spawn_error="nope")
task = _task(status="done", consecutive_failures=10,
last_failure_error="nope")
events = [
_event("completed", ts=100, summary="referenced t_missing"),
_event("suspected_hallucinated_references", ts=101,
@ -289,7 +317,7 @@ def test_diagnostics_sorted_critical_first():
]
diags = kd.compute_task_diagnostics(task, events, [])
kinds = [d.kind for d in diags]
assert kinds[0] == "repeated_spawn_failures" # critical
assert kinds[0] == "repeated_failures" # critical
assert "prose_phantom_refs" in kinds
@ -346,8 +374,8 @@ def test_broken_rule_is_isolated(monkeypatch):
# rules should still run and produce their diagnostics.
monkeypatch.setattr(kd, "_RULES", [_bad_rule] + kd._RULES)
task = _task(spawn_failures=5, last_spawn_error="e")
task = _task(consecutive_failures=5, last_failure_error="e")
diags = kd.compute_task_diagnostics(task, [], [])
# The broken rule silently drops, the real one still fires.
kinds = [d.kind for d in diags]
assert "repeated_spawn_failures" in kinds
assert "repeated_failures" in kinds

View file

@ -1395,7 +1395,7 @@ def test_diagnostics_endpoint_severity_filter(client):
# An error-severity diagnostic (spawn failures) on another
p2 = kb.create_task(conn, title="spawn", assignee="b")
conn.execute(
"UPDATE tasks SET spawn_failures=5, last_spawn_error='x' WHERE id=?",
"UPDATE tasks SET consecutive_failures=5, last_failure_error='x' WHERE id=?",
(p2,),
)
conn.commit()