mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(kanban): respawn guard defers blocker_auth instead of auto-blocking (#28683)
Follow-up to #28455. The respawn guard's blocker_auth rule (last error matched a quota/auth/429 pattern) was auto-blocking the task on first occurrence. That's too aggressive: transient rate limits typically clear in seconds to minutes, but the auto-block puts the task in 'blocked' status, which requires a manual unblock. The guard now treats blocker_auth the same as recent_success and active_pr: it defers the spawn this tick, leaves the task in 'ready', and lets the next tick try again. If the auth error genuinely persists, the existing consecutive_failures counter trips the auto-block circuit breaker after failure_limit failures via the normal path — so a persistent 401/403/quota-exhausted error still ends up blocked, just not on the first hit. Also documents the respawn_guarded event in kanban.md's events table with the three guard reasons. Renamed test_dispatch_respawn_guard_auto_blocks_auth_error to test_dispatch_respawn_guard_defers_auth_error_without_auto_block; it now asserts that the task stays in 'ready' and that the guard reason is recorded.
This commit is contained in:
parent
b10b783208
commit
7bcdced6c1
3 changed files with 53 additions and 27 deletions
|
|
@ -4462,12 +4462,20 @@ def check_respawn_guard(conn: sqlite3.Connection, task_id: str) -> Optional[str]
|
|||
"""Return a guard reason if ``task_id`` should NOT be re-spawned, else None.
|
||||
|
||||
Called per ready task in ``dispatch_once`` before any claim attempt.
|
||||
Returning a reason defers the spawn this tick; the task stays in
|
||||
``ready`` and gets another chance on the next dispatcher tick.
|
||||
|
||||
Checks in priority order:
|
||||
|
||||
``"blocker_auth"``
|
||||
The task's last failure error matches a quota / authentication
|
||||
pattern. Retrying immediately will not help; the dispatcher
|
||||
should auto-block the task to stop the respawn cycle.
|
||||
pattern. Retrying immediately is unlikely to help (rate limits
|
||||
reset on a timer; auth needs human action), so we defer to the
|
||||
next tick. The existing ``consecutive_failures`` counter still
|
||||
trips the auto-block circuit breaker after ``failure_limit``
|
||||
consecutive failures, so a persistent auth error eventually
|
||||
blocks via the normal path — but a transient 429 gets a few
|
||||
ticks of recovery first.
|
||||
|
||||
``"recent_success"``
|
||||
A completed run exists within ``_RESPAWN_GUARD_SUCCESS_WINDOW``
|
||||
|
|
@ -4732,29 +4740,24 @@ def dispatch_once(
|
|||
continue
|
||||
# Respawn guard: refuse to re-spawn when useful work is already
|
||||
# in-flight/recent, or when the last failure is a deterministic
|
||||
# blocker (quota / auth) that retrying won't resolve.
|
||||
# blocker (quota / auth). The guard defers the spawn this tick so
|
||||
# the task gets a chance to clear (rate limits often reset in
|
||||
# seconds-to-minutes); the existing consecutive_failures counter
|
||||
# still trips the auto-block circuit breaker after failure_limit
|
||||
# consecutive failures, so a persistent auth error eventually
|
||||
# blocks via the normal path rather than on first occurrence.
|
||||
guard_reason = check_respawn_guard(conn, row["id"])
|
||||
if guard_reason is not None:
|
||||
if guard_reason == "blocker_auth" and not dry_run:
|
||||
# Auto-block to stop the cycle — quota/auth errors are
|
||||
# deterministic and retrying immediately wastes quota.
|
||||
# block_task emits its own "blocked" event, so no
|
||||
# additional respawn_guarded event is needed here.
|
||||
if block_task(conn, row["id"], reason=f"respawn_guard: {guard_reason}"):
|
||||
result.auto_blocked.append(row["id"])
|
||||
else:
|
||||
result.respawn_guarded.append((row["id"], guard_reason))
|
||||
else:
|
||||
result.respawn_guarded.append((row["id"], guard_reason))
|
||||
# Emit an event so operators can see why the task was
|
||||
# skipped when reading `hermes kanban tail` — without
|
||||
# this the task appears stuck in ready with no diagnosis.
|
||||
if not dry_run:
|
||||
with write_txn(conn):
|
||||
_append_event(
|
||||
conn, row["id"], "respawn_guarded",
|
||||
{"reason": guard_reason},
|
||||
)
|
||||
result.respawn_guarded.append((row["id"], guard_reason))
|
||||
# Emit an event so operators can see why the task was
|
||||
# skipped when reading `hermes kanban tail` — without
|
||||
# this the task appears stuck in ready with no diagnosis.
|
||||
if not dry_run:
|
||||
with write_txn(conn):
|
||||
_append_event(
|
||||
conn, row["id"], "respawn_guarded",
|
||||
{"reason": guard_reason},
|
||||
)
|
||||
continue
|
||||
if dry_run:
|
||||
result.spawned.append((row["id"], row["assignee"], ""))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue