fix(kanban): auto-block workers that exit without completing (#20894) (#21214)

When a kanban worker subprocess exits rc=0 but its task is still in
status='running', the agent almost certainly answered the task
conversationally without calling kanban_complete or kanban_block. The
dispatcher used to classify this as a generic crash and respawn, which
loops forever on small local models (gemma4-e2b q4 etc.) that keep
returning clean but unproductive output.

Dispatcher changes:
- The waitpid reap loop at the top of dispatch_once now records each
  reaped child's raw exit status in a bounded module registry
  (_recent_worker_exits, TTL 600s, size cap 4096).
- _classify_worker_exit distinguishes clean_exit / nonzero_exit /
  signaled / unknown using os.WIFEXITED / WIFSIGNALED.
- detect_crashed_workers consults the classification when a worker
  is found dead. clean_exit → protocol_violation event + immediate
  circuit-breaker trip (failure_limit=1). Everything else keeps the
  existing crashed-event + counter behavior.
- DispatchResult.auto_blocked now includes protocol-violation trips.

Gateway fix (Bug A in #20894):
- gateway.run._notify_active_sessions_of_shutdown snapshots
  self.adapters with list(...) before iterating. adapter.send() can
  hit a fatal-error path that pops the adapter from the dict, which
  was raising 'RuntimeError: dictionary changed size during iteration'
  during shutdown.

Regression tests:
- test_detect_crashed_workers_protocol_violation_auto_blocks verifies
  rc=0 + still-running → status=blocked on first occurrence with
  protocol_violation + gave_up events and NO crashed event.
- test_detect_crashed_workers_nonzero_exit_uses_default_limit verifies
  non-zero exits keep the existing 2-strike behavior.

Closes #20894.
This commit is contained in:
Teknium 2026-05-07 05:24:16 -07:00 committed by GitHub
parent 699c770e5c
commit fdb9e0f6a6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 255 additions and 14 deletions

View file

@ -3636,6 +3636,100 @@ def test_detect_crashed_workers_increments_counter(kanban_home):
conn.close()
def test_detect_crashed_workers_protocol_violation_auto_blocks(kanban_home):
    """A clean exit (rc=0) with the task still ``running`` blocks at once.

    Such a worker answered conversationally and never called
    kanban_complete / kanban_block, which is a protocol violation.
    Respawning it would only repeat the same outcome, so the task must be
    auto-blocked on the first occurrence rather than after
    ``DEFAULT_FAILURE_LIMIT`` failures.

    Regression test for the respawn-loop-after-completion bug reported
    against small local models (gemma4-e2b q4), where the model writes its
    answer as plain text and the CLI then exits cleanly with rc=0.
    """
    import hermes_cli.kanban_db as _kb

    conn = kb.connect()
    try:
        task_id = kb.create_task(conn, title="quiet", assignee="worker")

        # Claim under this host's prefix with a mock claimer suffix.
        this_host = _kb._claimer_id().partition(":")[0]
        kb.claim_task(conn, task_id, claimer=this_host + ":mock")

        dead_pid = 999998
        kb._set_worker_pid(conn, task_id, dead_pid)

        # Pretend the reap loop already stored this pid's exit status.
        # A raw wait status of 0 encodes "exited with code 0, no signal"
        # on POSIX.
        _kb._record_worker_exit(dead_pid, 0)

        # Swap in a liveness probe that reports every pid as dead, and
        # always restore the real one afterwards.
        saved_pid_alive = _kb._pid_alive
        _kb._pid_alive = lambda p: False
        try:
            crashed_ids = kb.detect_crashed_workers(conn)
        finally:
            _kb._pid_alive = saved_pid_alive

        assert task_id in crashed_ids, "should be detected as crashed"

        task = kb.get_task(conn, task_id)
        assert task.status == "blocked", (
            f"protocol violation should auto-block on first occurrence, "
            f"got status={task.status}"
        )
        assert "kanban_complete" in (task.last_failure_error or ""), (
            f"expected protocol-violation message, got {task.last_failure_error!r}"
        )

        kinds = [event.kind for event in kb.list_events(conn, task_id)]
        assert "protocol_violation" in kinds, (
            f"expected 'protocol_violation' event, got {kinds}"
        )
        # A ``crashed`` event would mislead here: the worker returned 0,
        # it did not crash.
        assert "crashed" not in kinds, (
            f"should NOT emit 'crashed' event on clean exit, got {kinds}"
        )
        assert "gave_up" in kinds, (
            f"breaker should trip, expected 'gave_up' event, got {kinds}"
        )
    finally:
        conn.close()
def test_detect_crashed_workers_nonzero_exit_uses_default_limit(kanban_home):
    """A non-zero exit (real error / crash) stays on the normal counter path.

    One such failure must not trip the breaker: the task returns to
    ``ready`` with ``consecutive_failures == 1`` and a ``crashed`` event,
    and no ``protocol_violation`` event is recorded.
    """
    import hermes_cli.kanban_db as _kb

    conn = kb.connect()
    try:
        task_id = kb.create_task(conn, title="crashy", assignee="worker")

        # Claim under this host's prefix with a mock claimer suffix.
        this_host = _kb._claimer_id().partition(":")[0]
        kb.claim_task(conn, task_id, claimer=this_host + ":mock")

        dead_pid = 999997
        kb._set_worker_pid(conn, task_id, dead_pid)

        # Raw wait status 256 (1 << 8): WIFEXITED is true and
        # WEXITSTATUS is 1, i.e. a normal exit with code 1.
        _kb._record_worker_exit(dead_pid, 256)

        # Swap in a liveness probe that reports every pid as dead, and
        # always restore the real one afterwards.
        saved_pid_alive = _kb._pid_alive
        _kb._pid_alive = lambda p: False
        try:
            kb.detect_crashed_workers(conn)
        finally:
            _kb._pid_alive = saved_pid_alive

        task = kb.get_task(conn, task_id)
        assert task.status == "ready", (
            f"single non-zero crash shouldn't auto-block, got {task.status}"
        )
        assert task.consecutive_failures == 1

        kinds = [event.kind for event in kb.list_events(conn, task_id)]
        assert "crashed" in kinds
        assert "protocol_violation" not in kinds
    finally:
        conn.close()
def test_reclaim_task_clears_failure_counter(kanban_home):
"""Operator reclaim wipes the counter so the next retry gets a fresh
budget."""