mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-08 03:01:47 +00:00
fix: auto-block repeated kanban retries
This commit is contained in:
parent
595e906698
commit
411cfa26e3
5 changed files with 119 additions and 20 deletions
|
|
@ -1230,6 +1230,10 @@ DEFAULT_CONFIG = {
|
|||
# Seconds between dispatcher ticks (idle or not). Lower = snappier
|
||||
# pickup of newly-ready tasks; higher = less SQL pressure.
|
||||
"dispatch_interval_seconds": 60,
|
||||
# Auto-block after this many consecutive non-success attempts for the
|
||||
# same task/profile (spawn_failed, timed_out, or crashed). Reassignment
|
||||
# resets the streak for the new profile.
|
||||
"failure_limit": 2,
|
||||
},
|
||||
|
||||
# execute_code settings — controls the tool used for programmatic tool calls.
|
||||
|
|
|
|||
|
|
@ -443,8 +443,8 @@ def build_parser(parent_subparsers: argparse._SubParsersAction) -> argparse.Argu
|
|||
help="Cap number of spawns this pass")
|
||||
p_disp.add_argument("--failure-limit", type=int,
|
||||
default=kb.DEFAULT_SPAWN_FAILURE_LIMIT,
|
||||
help=f"Auto-block a task after this many consecutive spawn failures "
|
||||
f"(default: {kb.DEFAULT_SPAWN_FAILURE_LIMIT})")
|
||||
help=f"Auto-block a task after this many consecutive non-success attempts "
|
||||
f"(spawn_failed, timed_out, or crashed; default: {kb.DEFAULT_SPAWN_FAILURE_LIMIT})")
|
||||
p_disp.add_argument("--json", action="store_true")
|
||||
|
||||
# --- daemon (deprecated) ---
|
||||
|
|
@ -1657,6 +1657,7 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
|
|||
" kanban:\n"
|
||||
" dispatch_in_gateway: true # default\n"
|
||||
" dispatch_interval_seconds: 60\n"
|
||||
" failure_limit: 2 # consecutive non-success attempts before auto-block\n"
|
||||
"\n"
|
||||
"Running both the gateway AND this standalone daemon will\n"
|
||||
"race for claims. If you truly need the old standalone\n"
|
||||
|
|
|
|||
|
|
@ -1380,7 +1380,7 @@ def assign_task(conn: sqlite3.Connection, task_id: str, profile: Optional[str])
|
|||
profile = _canonical_assignee(profile)
|
||||
with write_txn(conn):
|
||||
row = conn.execute(
|
||||
"SELECT status, claim_lock FROM tasks WHERE id = ?", (task_id,)
|
||||
"SELECT status, claim_lock, assignee FROM tasks WHERE id = ?", (task_id,)
|
||||
).fetchone()
|
||||
if not row:
|
||||
return False
|
||||
|
|
@ -1389,7 +1389,17 @@ def assign_task(conn: sqlite3.Connection, task_id: str, profile: Optional[str])
|
|||
f"cannot reassign {task_id}: currently running (claimed). "
|
||||
"Wait for completion or reclaim the stale lock first."
|
||||
)
|
||||
conn.execute("UPDATE tasks SET assignee = ? WHERE id = ?", (profile, task_id))
|
||||
if row["assignee"] != profile:
|
||||
# The retry guard is scoped to the task/profile combination. A
|
||||
# human reassigning the task is an explicit recovery action, so the
|
||||
# new profile should not inherit the previous profile's streak.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET assignee = ?, consecutive_failures = 0, "
|
||||
"last_failure_error = NULL WHERE id = ?",
|
||||
(profile, task_id),
|
||||
)
|
||||
else:
|
||||
conn.execute("UPDATE tasks SET assignee = ? WHERE id = ?", (profile, task_id))
|
||||
_append_event(conn, task_id, "assigned", {"assignee": profile})
|
||||
return True
|
||||
|
||||
|
|
@ -2569,11 +2579,11 @@ def set_workspace_path(
|
|||
# Dispatcher (one-shot pass)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# After this many consecutive `spawn_failed` events on a task, the dispatcher
|
||||
# stops retrying and parks the task in ``blocked`` with a reason so a human
|
||||
# can investigate. Prevents the dispatcher from thrashing forever on a task
|
||||
# whose profile doesn't exist, whose workspace is unmountable, etc.
|
||||
DEFAULT_FAILURE_LIMIT = 5
|
||||
# After this many consecutive non-success attempts on a task/profile, the
|
||||
# dispatcher stops retrying and parks the task in ``blocked`` with a reason so
|
||||
# a human can investigate. Prevents retry storms when a worker repeatedly times
|
||||
# out, crashes, or cannot spawn.
|
||||
DEFAULT_FAILURE_LIMIT = 2
|
||||
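# Legacy alias — callers / tests still reference the old name. The limit now
# covers timeouts and crashes as well as spawn failures, so prefer
# DEFAULT_FAILURE_LIMIT in new code.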
|
||||
DEFAULT_SPAWN_FAILURE_LIMIT = DEFAULT_FAILURE_LIMIT
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue