fix(kanban): auto-block workers that exit without completing (#20894) (#21214)

When a kanban worker subprocess exits rc=0 but its task is still in status='running', the agent almost certainly answered the task conversationally without calling kanban_complete or kanban_block. The dispatcher used to classify this as a generic crash and respawn, which loops forever on small local models (gemma4-e2b q4 etc.) that keep returning clean but unproductive output. Dispatcher changes: - The waitpid reap loop at the top of dispatch_once now records each reaped child's raw exit status in a bounded module registry (_recent_worker_exits, TTL 600s, size cap 4096). - _classify_worker_exit distinguishes clean_exit / nonzero_exit / signaled / unknown using os.WIFEXITED / WIFSIGNALED. - detect_crashed_workers consults the classification when a worker is found dead. clean_exit → protocol_violation event + immediate circuit-breaker trip (failure_limit=1). Everything else keeps the existing crashed-event + counter behavior. - DispatchResult.auto_blocked now includes protocol-violation trips. Gateway fix (Bug A in #20894): - gateway.run._notify_active_sessions_of_shutdown snapshots self.adapters with list(...) before iterating. adapter.send() can hit a fatal-error path that pops the adapter from the dict, which was raising 'RuntimeError: dictionary changed size during iteration' during shutdown. Regression tests: - test_detect_crashed_workers_protocol_violation_auto_blocks verifies rc=0 + still-running → status=blocked on first occurrence with protocol_violation + gave_up events and NO crashed event. - test_detect_crashed_workers_nonzero_exit_uses_default_limit verifies non-zero exits keep the existing 2-strike behavior. Closes #20894.
2026-05-11 03:31:55 +00:00 · 2026-05-07 05:24:16 -07:00 · 2026-05-07 05:24:16 -07:00 · fdb9e0f6a6
commit fdb9e0f6a6
parent 699c770e5c
3 changed files with 255 additions and 14 deletions
--- a/tests/hermes_cli/test_kanban_core_functionality.py
+++ b/tests/hermes_cli/test_kanban_core_functionality.py
@ -3636,6 +3636,100 @@ def test_detect_crashed_workers_increments_counter(kanban_home):
        conn.close()


+def test_detect_crashed_workers_protocol_violation_auto_blocks(kanban_home):
+    """A worker that exited rc=0 while its task was still ``running``
+    is a protocol violation (agent answered conversationally without
+    calling kanban_complete / kanban_block). Retrying will just loop,
+    so auto-block immediately instead of waiting for the breaker to
+    trip at ``DEFAULT_FAILURE_LIMIT``.
+
+    Regression test for the respawn-loop-after-completion bug reported
+    against small local models (gemma4-e2b q4) where the model writes
+    the answer as plain text and the CLI exits rc=0 cleanly.
+    """
+    import hermes_cli.kanban_db as _kb
+    conn = kb.connect()
+    try:
+        tid = kb.create_task(conn, title="quiet", assignee="worker")
+        host_prefix = _kb._claimer_id().split(":", 1)[0]
+        lock = f"{host_prefix}:mock"
+        kb.claim_task(conn, tid, claimer=lock)
+        fake_pid = 999998
+        kb._set_worker_pid(conn, tid, fake_pid)
+
+        # Simulate the reap loop having recorded a clean exit for this pid.
+        # os.W_EXITCODE(status=0, signal=0) == 0 on POSIX.
+        _kb._record_worker_exit(fake_pid, 0)
+        # Force liveness check to say "dead" for the fake pid.
+        original_alive = _kb._pid_alive
+        _kb._pid_alive = lambda p: False
+        try:
+            result_crashed = kb.detect_crashed_workers(conn)
+        finally:
+            _kb._pid_alive = original_alive
+
+        assert tid in result_crashed, "should be detected as crashed"
+        task = kb.get_task(conn, tid)
+        assert task.status == "blocked", (
+            f"protocol violation should auto-block on first occurrence, "
+            f"got status={task.status}"
+        )
+        assert "kanban_complete" in (task.last_failure_error or ""), (
+            f"expected protocol-violation message, got {task.last_failure_error!r}"
+        )
+
+        events = kb.list_events(conn, tid)
+        kinds = [e.kind for e in events]
+        assert "protocol_violation" in kinds, (
+            f"expected 'protocol_violation' event, got {kinds}"
+        )
+        # The ``crashed`` event would be misleading here — the worker
+        # didn't crash, it returned 0.
+        assert "crashed" not in kinds, (
+            f"should NOT emit 'crashed' event on clean exit, got {kinds}"
+        )
+        assert "gave_up" in kinds, (
+            f"breaker should trip, expected 'gave_up' event, got {kinds}"
+        )
+    finally:
+        conn.close()
+
+
+def test_detect_crashed_workers_nonzero_exit_uses_default_limit(kanban_home):
+    """A worker that exited non-zero (real error / crash) uses the
+    normal counter path — one failure doesn't trip the breaker.
+    """
+    import hermes_cli.kanban_db as _kb
+    conn = kb.connect()
+    try:
+        tid = kb.create_task(conn, title="crashy", assignee="worker")
+        host_prefix = _kb._claimer_id().split(":", 1)[0]
+        kb.claim_task(conn, tid, claimer=f"{host_prefix}:mock")
+        fake_pid = 999997
+        kb._set_worker_pid(conn, tid, fake_pid)
+
+        # W_EXITCODE(1, 0) == 256 — WIFEXITED True, WEXITSTATUS == 1.
+        _kb._record_worker_exit(fake_pid, 256)
+        original_alive = _kb._pid_alive
+        _kb._pid_alive = lambda p: False
+        try:
+            kb.detect_crashed_workers(conn)
+        finally:
+            _kb._pid_alive = original_alive
+
+        task = kb.get_task(conn, tid)
+        assert task.status == "ready", (
+            f"single non-zero crash shouldn't auto-block, got {task.status}"
+        )
+        assert task.consecutive_failures == 1
+        events = kb.list_events(conn, tid)
+        kinds = [e.kind for e in events]
+        assert "crashed" in kinds
+        assert "protocol_violation" not in kinds
+    finally:
+        conn.close()
+
+
 def test_reclaim_task_clears_failure_counter(kanban_home):
    """Operator reclaim wipes the counter so the next retry gets a fresh
    budget."""