From 7bcdced6c1d87a342f5d29e28821d40832d25066 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 19 May 2026 03:27:45 -0700 Subject: [PATCH] fix(kanban): respawn guard defers blocker_auth instead of auto-blocking (#28683) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #28455. The respawn guard's blocker_auth rule (last error matched a quota/auth/429 pattern) was auto-blocking the task on first occurrence. That's too aggressive: transient rate limits typically clear in seconds to minutes, but the auto-block puts the task in 'blocked' status which requires manual unblock. Now treats blocker_auth the same as recent_success and active_pr: defer the spawn this tick, leave the task in 'ready', let the next tick try again. If the auth error genuinely persists, the existing consecutive_failures counter trips the auto-block circuit breaker after failure_limit failures via the normal path — so a persistent 401/403/quota-exhausted still ends up blocked, just not on first hit. Also documents the respawn_guarded event in kanban.md's events table with the three guard reasons. Updated test_dispatch_respawn_guard_auto_blocks_auth_error → renamed to test_dispatch_respawn_guard_defers_auth_error_without_auto_block; asserts task stays in 'ready' and the guard reason is recorded. --- hermes_cli/kanban_db.py | 49 ++++++++++++---------- tests/hermes_cli/test_kanban_db.py | 30 +++++++++++-- website/docs/user-guide/features/kanban.md | 1 + 3 files changed, 53 insertions(+), 27 deletions(-) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index 80fb1153f58..edeae51707b 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -4462,12 +4462,20 @@ def check_respawn_guard(conn: sqlite3.Connection, task_id: str) -> Optional[str] """Return a guard reason if ``task_id`` should NOT be re-spawned, else None. Called per ready task in ``dispatch_once`` before any claim attempt. + Returning a reason defers the spawn this tick; the task stays in + ``ready`` and gets another chance on the next dispatcher tick. + Checks in priority order: ``"blocker_auth"`` The task's last failure error matches a quota / authentication - pattern. Retrying immediately will not help; the dispatcher - should auto-block the task to stop the respawn cycle. + pattern. Retrying immediately is unlikely to help (rate limits + reset on a timer; auth needs human action), so we defer to the + next tick. The existing ``consecutive_failures`` counter still + trips the auto-block circuit breaker after ``failure_limit`` + consecutive failures, so a persistent auth error eventually + blocks via the normal path — but a transient 429 gets a few + ticks of recovery first. ``"recent_success"`` A completed run exists within ``_RESPAWN_GUARD_SUCCESS_WINDOW`` @@ -4732,29 +4740,24 @@ def dispatch_once( continue # Respawn guard: refuse to re-spawn when useful work is already # in-flight/recent, or when the last failure is a deterministic - # blocker (quota / auth) that retrying won't resolve. + # blocker (quota / auth). The guard defers the spawn this tick so + # the task gets a chance to clear (rate limits often reset in + # seconds-to-minutes); the existing consecutive_failures counter + # still trips the auto-block circuit breaker after failure_limit + # consecutive failures, so a persistent auth error eventually + # blocks via the normal path rather than on first occurrence. guard_reason = check_respawn_guard(conn, row["id"]) if guard_reason is not None: - if guard_reason == "blocker_auth" and not dry_run: - # Auto-block to stop the cycle — quota/auth errors are - # deterministic and retrying immediately wastes quota. - # block_task emits its own "blocked" event, so no - # additional respawn_guarded event is needed here. - if block_task(conn, row["id"], reason=f"respawn_guard: {guard_reason}"): - result.auto_blocked.append(row["id"]) - else: - result.respawn_guarded.append((row["id"], guard_reason)) - else: - result.respawn_guarded.append((row["id"], guard_reason)) - # Emit an event so operators can see why the task was - # skipped when reading `hermes kanban tail` — without - # this the task appears stuck in ready with no diagnosis. - if not dry_run: - with write_txn(conn): - _append_event( - conn, row["id"], "respawn_guarded", - {"reason": guard_reason}, - ) + result.respawn_guarded.append((row["id"], guard_reason)) + # Emit an event so operators can see why the task was + # skipped when reading `hermes kanban tail` — without + # this the task appears stuck in ready with no diagnosis. + if not dry_run: + with write_txn(conn): + _append_event( + conn, row["id"], "respawn_guarded", + {"reason": guard_reason}, + ) continue if dry_run: result.spawned.append((row["id"], row["assignee"], "")) diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index c5b5a97a12b..a33becb4e0f 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -1174,10 +1174,20 @@ def test_respawn_guard_old_pr_comment_not_guarded(kanban_home): assert reason is None -def test_dispatch_respawn_guard_auto_blocks_auth_error( +def test_dispatch_respawn_guard_defers_auth_error_without_auto_block( kanban_home, all_assignees_spawnable ): - """dispatch_once auto-blocks a ready task whose last error is a blocker_auth.""" + """dispatch_once defers (does NOT auto-block) a ready task whose last + error is a blocker_auth. + + The old behaviour auto-blocked on first occurrence, which was too + aggressive: a transient 429 rate-limit (which typically clears in + seconds to minutes) would end up requiring manual unblock. The new + behaviour defers the spawn this tick; the task stays in ``ready`` + and gets another chance next tick. If the auth error genuinely + persists, the existing ``consecutive_failures`` circuit breaker + will auto-block via the normal failure-limit path. + """ spawned_ids = [] def fake_spawn(task, workspace): @@ -1191,10 +1201,22 @@ def test_dispatch_respawn_guard_auto_blocks_auth_error( ) res = kb.dispatch_once(conn, spawn_fn=fake_spawn) - assert t in res.auto_blocked + # Critical: task is NOT auto-blocked on first occurrence. + assert t not in res.auto_blocked, ( + f"blocker_auth should defer, not auto-block on first occurrence; " + f"got auto_blocked={res.auto_blocked!r}" + ) + # It IS recorded as respawn_guarded with the reason. + assert (t, "blocker_auth") in res.respawn_guarded, ( + f"expected (task_id, 'blocker_auth') in respawn_guarded; " + f"got {res.respawn_guarded!r}" + ) + # And it's NOT spawned this tick. assert t not in spawned_ids + # Status stays ``ready`` so a future tick (or operator action) can + # retry without manual unblock. with kb.connect() as conn: - assert kb.get_task(conn, t).status == "blocked" + assert kb.get_task(conn, t).status == "ready" def test_dispatch_respawn_guard_skips_recent_success( diff --git a/website/docs/user-guide/features/kanban.md b/website/docs/user-guide/features/kanban.md index 275195ef921..f251dff0c3b 100644 --- a/website/docs/user-guide/features/kanban.md +++ b/website/docs/user-guide/features/kanban.md @@ -834,6 +834,7 @@ Every transition appends a row to `task_events`. Each row carries an optional `r | `crashed` | `{pid, claimer}` | Worker PID no longer alive but TTL hadn't expired yet. | | `timed_out` | `{pid, elapsed_seconds, limit_seconds, sigkill}` | `max_runtime_seconds` exceeded; dispatcher SIGTERM'd (then SIGKILL'd after 5 s grace) and re-queued. | | `stale` | `{elapsed_seconds, last_heartbeat_at, heartbeat_age_seconds, timeout_seconds, pid, terminated}` | Task ran longer than `kanban.dispatch_stale_timeout_seconds` (default 4 h) AND no `kanban_heartbeat` arrived in the last hour. Dispatcher SIGTERM'd the host-local worker (if any), reset the task to `ready` for re-dispatch. Does NOT tick the failure counter (stale is dispatcher-side absence detection, not a worker fault). Workers running long operations should call `kanban_heartbeat` at least once an hour to avoid this. | +| `respawn_guarded` | `{reason}` | Dispatcher refused to re-spawn this ready task this tick. Reasons: `blocker_auth` (last failure was a quota/auth/429 error — wait for the rate window to reset), `recent_success` (a completed run happened in the last hour — wait for review before re-running), `active_pr` (a GitHub PR URL appears in a recent comment — a prior worker already opened a PR). The task stays in `ready`; the next tick gets another chance to spawn. If the underlying condition persists, the normal `consecutive_failures` circuit breaker will auto-block via `gave_up` after `failure_limit` failures. | | `spawn_failed` | `{error, failures}` | One spawn attempt failed (missing PATH, workspace unmountable, …). Counter increments; task returns to `ready` for retry. | | `protocol_violation` | `{pid, claimer, exit_code}` | Worker exited successfully while the task was still `running`, usually because it answered without calling `kanban_complete` or `kanban_block`. The dispatcher also emits `gave_up` and auto-blocks immediately instead of retrying. | | `gave_up` | `{failures, effective_limit, limit_source, error}` | Circuit breaker fired after N consecutive non-successful attempts. Task auto-blocks with the last error. The effective limit resolves as task `max_retries`, then dispatcher `failure_limit` / `kanban.failure_limit`, then the built-in default. |