fix(kanban): unify failure counter across spawn/timeout/crash outcomes (#20410)
The dispatcher's circuit breaker only protected against spawn-side
failures (profile missing, workspace mount error, exec failure).
Workers that successfully spawned but then timed out or crashed
re-queued to ``ready`` with no counter increment, so the next tick
re-spawned them, looping forever until someone noticed. Reported
externally on Twitter (Forbidden Seeds) and confirmed by walking the
kernel: ``enforce_max_runtime`` flipped the task back to ready, emitted
a ``timed_out`` event, and never touched ``spawn_failures``; same for
``detect_crashed_workers``.
Fix: unify the counter across all non-success outcomes.
Schema
------
* ``tasks.spawn_failures`` → ``tasks.consecutive_failures``
* ``tasks.last_spawn_error`` → ``tasks.last_failure_error``
* Migration renames the columns in-place on existing DBs (``ALTER
TABLE RENAME COLUMN`` — SQLite >= 3.25) so historical counter
values are preserved. Row mappers fall through to the legacy names
in case the rename and the running code somehow get out of sync (see
the sketch below).
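
A minimal sketch of the rename-plus-fallback pattern; the helper names
and mapper shape here are illustrative, not the actual code::

    def migrate_failure_columns(conn):
        # In-place rename preserves historical counter values
        # (requires SQLite >= 3.25).
        cols = {row[1] for row in conn.execute("PRAGMA table_info(tasks)")}
        if "spawn_failures" in cols:
            conn.execute("ALTER TABLE tasks RENAME COLUMN"
                         " spawn_failures TO consecutive_failures")
        if "last_spawn_error" in cols:
            conn.execute("ALTER TABLE tasks RENAME COLUMN"
                         " last_spawn_error TO last_failure_error")

    def _failures_from_row(row):
        # Fall through to the legacy column if the rename and this
        # code somehow got out of sync.
        keys = row.keys()  # sqlite3.Row
        if "consecutive_failures" in keys:
            return row["consecutive_failures"]
        return row["spawn_failures"]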
Counter lifecycle
-----------------
New helper ``_record_task_failure(conn, task_id, error, *, outcome,
release_claim, end_run, event_payload_extra)`` is the single point
that every non-success outcome funnels through:
* ``spawn_failed`` → ``_record_spawn_failure`` (kept as alias)
calls it with ``release_claim=True, end_run=True`` — transitions
running→ready, clears claim, closes run.
* ``timed_out`` → ``enforce_max_runtime`` already does the status
transition + run close + event emission, then calls
``_record_task_failure`` with ``release_claim=False, end_run=False``
just to bump the counter (and trip the breaker if needed).
* ``crashed`` → ``detect_crashed_workers`` same pattern, but the
counter increment runs after the main write_txn closes (SQLite
doesn't nest write transactions).
If the counter hits the breaker threshold (``DEFAULT_FAILURE_LIMIT=5``,
same as before), the task transitions to ``blocked`` with a ``gave_up``
event on top of whatever outcome-specific event was already emitted.
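
A condensed sketch of that funnel; the table and column names follow
the schema section, but the claim-release and run-close statements and
the event helper are assumptions about kernel internals, not the
actual code::

    DEFAULT_FAILURE_LIMIT = 5  # breaker threshold, unchanged


    def _emit_event(conn, task_id, kind, **payload):
        # Stand-in for the kernel's event emission (real schema not shown).
        conn.execute("INSERT INTO events (task_id, kind) VALUES (?, ?)",
                     (task_id, kind))


    def _record_task_failure(conn, task_id, error, *, outcome,
                             release_claim, end_run,
                             event_payload_extra=None):
        """Single funnel point for spawn_failed / timed_out / crashed."""
        if release_claim:
            # spawn_failed path: running -> ready, drop the worker claim.
            conn.execute("UPDATE tasks SET status = 'ready',"
                         " claimed_by = NULL WHERE id = ?", (task_id,))
        if end_run:
            # spawn_failed path: close the still-open run row.
            conn.execute("UPDATE runs SET outcome = ? WHERE task_id = ?"
                         " AND outcome IS NULL", (outcome, task_id))
        conn.execute("UPDATE tasks SET consecutive_failures ="
                     " consecutive_failures + 1, last_failure_error = ?"
                     " WHERE id = ?", (error, task_id))
        (count,) = conn.execute("SELECT consecutive_failures FROM tasks"
                                " WHERE id = ?", (task_id,)).fetchone()
        if count >= DEFAULT_FAILURE_LIMIT:
            # Breaker: block the task; gave_up lands on top of whatever
            # outcome-specific event the caller already emitted.
            conn.execute("UPDATE tasks SET status = 'blocked'"
                         " WHERE id = ?", (task_id,))
            _emit_event(conn, task_id, "gave_up",
                        **(event_payload_extra or {}))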
Reset semantics changed: the counter now clears only on successful
``complete_task`` (and operator ``reclaim_task`` — an explicit "I've
looked at this, try again with a fresh budget"). Previously
``_clear_spawn_failures`` ran on every successful spawn, which would
have wiped the counter before repeated timeouts could push it past
the threshold: exactly the loop this fix prevents.
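
The reset path is then a single helper called from exactly those two
places; the new helper name below is a guess (only the legacy
``_clear_spawn_failures`` alias is named in this commit)::

    def _clear_task_failures(conn, task_id):
        # Called from complete_task and reclaim_task only, NOT on spawn
        # success, so a timeout loop can still accumulate past threshold.
        conn.execute(
            "UPDATE tasks SET consecutive_failures = 0,"
            " last_failure_error = NULL WHERE id = ?",
            (task_id,),
        )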
Diagnostics
-----------
* ``_rule_repeated_spawn_failures`` → ``_rule_repeated_failures``. Now
fires regardless of which outcome is at fault. Classifies the most
recent failure (spawn_failed / timed_out / crashed) from the run
history so the title ("Agent timeout x3", "Agent crash x4", "Agent
spawn x5") and suggested action (``doctor`` for spawn, ``log`` for
timeout/crash) stay outcome-specific without N duplicate rules (see
the sketch after this list).
* ``_rule_repeated_crashes`` kept as a narrower early-warning at
threshold 2 (vs 3 for the unified rule), but now suppresses itself
when the unified rule would also fire — avoids double-flagging.
* Diagnostic ``data`` payload now carries
``{consecutive_failures, most_recent_outcome, last_error}`` instead
of spawn-specific keys.
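
A sketch of the classification shape; the ``Diagnostic`` fields are
inferred from the tests in this commit, while the run ordering and the
severity escalation point are assumptions::

    from dataclasses import dataclass, field


    @dataclass
    class Diagnostic:  # field shape inferred from the tests below
        kind: str
        severity: str
        title: str
        data: dict
        actions: list = field(default_factory=list)


    # outcome -> (title label, suggested CLI action)
    _OUTCOME_HINTS = {
        "spawn_failed": ("spawn", "doctor"),
        "timed_out": ("timeout", "log"),
        "crashed": ("crash", "log"),
    }


    def _rule_repeated_failures(task, events, runs):
        n = task["consecutive_failures"]
        if n < 3:  # unified-rule threshold per the bullet above
            return None
        # Classify from the most recent failed run so title and action
        # stay outcome-specific without N duplicate rules.
        recent = next((r["outcome"] for r in reversed(runs)
                       if r["outcome"] in _OUTCOME_HINTS), "spawn_failed")
        label, _action = _OUTCOME_HINTS[recent]
        return Diagnostic(
            kind="repeated_failures",
            severity="critical" if n > 5 else "error",  # cutoff assumed
            title=f"Agent {label} x{n}",
            data={"consecutive_failures": n,
                  "most_recent_outcome": recent,
                  "last_error": task["last_failure_error"]},
        )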
CLI
---
* ``Task.consecutive_failures`` / ``Task.last_failure_error`` are the
public fields now. Existing callers that referenced the old names
get migrated (tests updated in this commit).
* Backward-compat: ``DEFAULT_SPAWN_FAILURE_LIMIT``,
``_clear_spawn_failures``, ``_record_spawn_failure`` stay as aliases.
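
In code the compat surface is just thin shims over the sketches above
(again illustrative, not the actual module)::

    # Backward-compat aliases; legacy importers keep working.
    DEFAULT_SPAWN_FAILURE_LIMIT = DEFAULT_FAILURE_LIMIT
    _clear_spawn_failures = _clear_task_failures


    def _record_spawn_failure(conn, task_id, error):
        # Spawn-side failures also release the claim and close the run.
        _record_task_failure(conn, task_id, error, outcome="spawn_failed",
                             release_claim=True, end_run=True)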
Tests
-----
* 6 new kernel tests: timeout increments counter, 3 consecutive
timeouts trip the breaker (was the reported gap), crash increments
counter, reclaim clears counter, completion clears counter, spawn
success does NOT clear counter.
* Diagnostic tests: updated ``repeated_spawn_failures`` cases to use
the new kind name and add a timeout-loop test.
* Dashboard API test: spawn_failures column update → consecutive_failures.
389/389 kanban-suite tests pass.
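
Roughly what the headline kernel test looks like; every API name below
is a guess at the surface these tests drive, not the real module::

    from kanban import kernel  # module path assumed


    def test_consecutive_timeouts_trip_breaker(tmp_path):
        # Before the fix: each timeout requeued to ready with no counter
        # bump, so this loop ran forever instead of tripping the breaker.
        conn = kernel.open_db(tmp_path / "kanban.db")
        task_id = kernel.create_task(conn, title="t", assignee="demo")
        for _ in range(kernel.DEFAULT_FAILURE_LIMIT):
            kernel.claim_task(conn, task_id, worker="w")  # spawn succeeds
            kernel.enforce_max_runtime(conn, now=10**12)  # ...then times out
        task = kernel.get_task(conn, task_id)
        assert task["status"] == "blocked"
        assert task["consecutive_failures"] == kernel.DEFAULT_FAILURE_LIMIT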
Live verification
-----------------
Seeded 4 tasks in an isolated HERMES_HOME: one with 3 timeouts, one
with 4 crashes, one with 2 spawn failures + 2 timeouts, and one that
had prior failures but
completed successfully. Board correctly shows "!! 3 tasks need
attention" (the successful one has no badge because the counter
reset). Drawer for the timeout-loop task renders "Agent timeout x3"
with most_recent_outcome=timed_out and the "Check logs" suggested
action (not the spawn-flavoured "Verify profile"). The successful
task has zero diagnostics.
Closes the Forbidden-Seeds-reported gap.
parent 587ef55f2c
commit 1fc8733a69
5 changed files with 630 additions and 125 deletions

@@ -39,8 +39,8 @@ def _task(**overrides):
         "title": "demo task",
         "assignee": "demo",
         "status": "ready",
-        "spawn_failures": 0,
-        "last_spawn_error": None,
+        "consecutive_failures": 0,
+        "last_failure_error": None,
     }
     base.update(overrides)
     return base
@@ -126,27 +126,55 @@ def test_prose_phantom_refs_clears_on_later_clean_edit():
     assert diags == []


-def test_repeated_spawn_failures_fires_at_threshold():
-    task = _task(status="blocked", spawn_failures=3,
-                 last_spawn_error="Profile 'debugger' does not exist")
-    diags = kd.compute_task_diagnostics(task, [], [])
+def test_repeated_failures_fires_at_threshold_on_spawn():
+    """A task with multiple spawn_failed runs gets a spawn-flavoured
+    diagnostic (title mentions 'spawn', suggested action is ``doctor``).
+    """
+    task = _task(status="ready", consecutive_failures=3,
+                 last_failure_error="Profile 'debugger' does not exist")
+    runs = [
+        _run(outcome="spawn_failed", run_id=1),
+        _run(outcome="spawn_failed", run_id=2),
+        _run(outcome="spawn_failed", run_id=3),
+    ]
+    diags = kd.compute_task_diagnostics(task, [], runs)
     assert len(diags) == 1
     d = diags[0]
-    assert d.kind == "repeated_spawn_failures"
+    assert d.kind == "repeated_failures"
     assert d.severity == "error"
     # CLI hints are what operators actually need here.
     suggested = [a.label for a in d.actions if a.suggested]
     assert any("doctor" in s for s in suggested)


-def test_repeated_spawn_failures_escalates_to_critical():
-    task = _task(spawn_failures=6, last_spawn_error="boom")
+def test_repeated_failures_fires_on_timeout_loop():
+    """The rule surfaces for timeout loops too — that's the point of
+    unifying the counter. Suggested action is 'check logs', not
+    'fix profile'."""
+    task = _task(status="ready", consecutive_failures=3,
+                 last_failure_error="elapsed 600s > limit 300s")
+    runs = [
+        _run(outcome="timed_out", run_id=1),
+        _run(outcome="timed_out", run_id=2),
+        _run(outcome="timed_out", run_id=3),
+    ]
+    diags = kd.compute_task_diagnostics(task, [], runs)
+    assert len(diags) == 1
+    d = diags[0]
+    assert d.kind == "repeated_failures"
+    assert d.data["most_recent_outcome"] == "timed_out"
+    suggested = [a.label for a in d.actions if a.suggested]
+    assert any("log" in s.lower() for s in suggested)
+
+
+def test_repeated_failures_escalates_to_critical():
+    task = _task(consecutive_failures=6, last_failure_error="boom")
     diags = kd.compute_task_diagnostics(task, [], [])
     assert diags[0].severity == "critical"


-def test_repeated_spawn_failures_below_threshold_silent():
-    task = _task(spawn_failures=2)
+def test_repeated_failures_below_threshold_silent():
+    task = _task(consecutive_failures=2)
     assert kd.compute_task_diagnostics(task, [], []) == []


@@ -243,9 +271,9 @@ def test_repeated_crashes_no_error_fallback_title():
     assert "no error recorded" in diags[0].title


-def test_repeated_spawn_failures_surfaces_actual_error_in_title():
-    task = _task(spawn_failures=5,
-                 last_spawn_error="insufficient_quota: billing limit reached")
+def test_repeated_failures_surfaces_actual_error_in_title():
+    task = _task(consecutive_failures=5,
+                 last_failure_error="insufficient_quota: billing limit reached")
     diags = kd.compute_task_diagnostics(task, [], [])
     assert len(diags) == 1
     d = diags[0]
@@ -280,8 +308,8 @@ def test_repeated_crashes_truncates_huge_tracebacks():
 def test_diagnostics_sorted_critical_first():
     """A task with both a critical (many spawn failures) and a warning
     (prose phantoms) diagnostic should list the critical one first."""
-    task = _task(status="done", spawn_failures=10,
-                 last_spawn_error="nope")
+    task = _task(status="done", consecutive_failures=10,
+                 last_failure_error="nope")
     events = [
         _event("completed", ts=100, summary="referenced t_missing"),
         _event("suspected_hallucinated_references", ts=101,
@@ -289,7 +317,7 @@ def test_diagnostics_sorted_critical_first():
     ]
     diags = kd.compute_task_diagnostics(task, events, [])
     kinds = [d.kind for d in diags]
-    assert kinds[0] == "repeated_spawn_failures"  # critical
+    assert kinds[0] == "repeated_failures"  # critical
     assert "prose_phantom_refs" in kinds


@@ -346,8 +374,8 @@ def test_broken_rule_is_isolated(monkeypatch):
     # rules should still run and produce their diagnostics.
     monkeypatch.setattr(kd, "_RULES", [_bad_rule] + kd._RULES)

-    task = _task(spawn_failures=5, last_spawn_error="e")
+    task = _task(consecutive_failures=5, last_failure_error="e")
     diags = kd.compute_task_diagnostics(task, [], [])
     # The broken rule silently drops, the real one still fires.
     kinds = [d.kind for d in diags]
-    assert "repeated_spawn_failures" in kinds
+    assert "repeated_failures" in kinds