fix(kanban): suppress dispatcher stuck-warn when ready queue holds only non-spawnable assignees

After PR #20105 (dispatcher skips ready tasks whose assignee fails
``profile_exists()`` to prevent the orion-cc/orion-research crash
loop), the gateway and CLI emit a spurious "kanban dispatcher stuck:
ready queue non-empty for N consecutive ticks but 0 workers spawned"
warning every 5 minutes on multi-lane setups where the queue is
steadily full of human-pulled work assigned to terminal lanes.

The warning is intended to catch real failure modes (broken PATH,
missing venv, credential loss for a real Hermes profile). On a
multi-lane host it fires forever even though everything is healthy:
the dispatcher correctly chose not to spawn, and there is nothing
for the operator to fix.

Changes:

* ``DispatchResult`` gains a ``skipped_nonspawnable`` field
  (separate from ``skipped_unassigned``) so callers can distinguish
  "task missing an owner — operator should route it" from "task
  owned by a control-plane lane — terminal will pull it".
* ``dispatch_once`` routes the ``not profile_exists(assignee)`` skip
  into the new bucket (was lumped into ``skipped_unassigned``).
* New helper ``has_spawnable_ready(conn)`` returns True iff at least
  one ready+assigned+unclaimed task in the DB has an assignee that
  maps to a real Hermes profile. Falls back to legacy "any
  ready+assigned" when ``profile_exists`` is unimportable so degraded
  installs still surface the original warn.
* The gateway dispatcher (``gateway/run.py``) and the CLI standalone
  daemon (``hermes_cli/kanban.py``) both swap their cheap
  ``ready_nonempty`` probe to use ``has_spawnable_ready``. Stuck-warn
  now fires only when there is genuine spawnable work the dispatcher
  failed to start.
* CLI dispatch output prints ``Skipped (non-spawnable assignee —
  terminal lane, OK)`` for visibility without alarm.

Tests:

* New ``has_spawnable_ready`` cases (empty queue, terminal-lane
  only, mixed real+terminal).
* New ``test_dispatch_skips_nonspawnable_into_separate_bucket``
  verifies the bucketing change.
* Updated ``test_dispatch_skips_unassigned`` to assert no
  cross-leak.
* Added ``all_assignees_spawnable`` fixture in
  ``tests/hermes_cli/conftest.py`` and threaded it through dispatcher
  tests that use synthetic assignees ("alice", "bob"). PR #20105
  (the parent commit) silently broke 8 such tests by skipping tasks
  with those assignees (at that point counted under
  ``skipped_unassigned``) instead of spawning them; this PR repairs
  them as part of the same code area.

Verified locally: 246/246 kanban-suite tests pass.

Stacks on top of fix/kanban-dispatcher-skip-missing-profile-2026-05-05
(PR #20105). Reviewer: this PR is meant to merge AFTER #20105.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Brecht-H 2026-05-05 09:43:06 +00:00 committed by Teknium
parent ca5595fe7b
commit f25d3ec917
6 changed files with 152 additions and 25 deletions

View file

@ -0,0 +1,19 @@
"""Fixtures shared across hermes_cli kanban tests."""
from __future__ import annotations
import pytest
@pytest.fixture
def all_assignees_spawnable(monkeypatch):
    """Make ``profiles.profile_exists`` answer True for every assignee.

    Dispatcher tests mostly use made-up assignees ("alice", "bob") that
    have no profile directory on disk. Unless the lookup is patched, the
    dispatcher's profile-exists guard (PR #20105) files those tasks under
    ``skipped_nonspawnable`` rather than spawning them, which would break
    any test asserting spawn behavior.
    """
    from hermes_cli import profiles

    def _always_exists(name):
        # Every assignee name is treated as a real profile.
        return True

    monkeypatch.setattr(profiles, "profile_exists", _always_exists)

View file

@ -80,7 +80,7 @@ def test_no_idempotency_key_never_collides(kanban_home):
# Spawn-failure circuit breaker
# ---------------------------------------------------------------------------
def test_spawn_failure_auto_blocks_after_limit(kanban_home):
def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawnable):
"""N consecutive spawn failures on the same task → auto_blocked."""
def _bad_spawn(task, ws):
raise RuntimeError("no PATH")
@ -109,7 +109,7 @@ def test_spawn_failure_auto_blocks_after_limit(kanban_home):
conn.close()
def test_successful_spawn_resets_failure_counter(kanban_home):
def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spawnable):
"""A successful spawn clears the counter so past failures don't count
against future retries of the same task."""
calls = [0]
@ -138,7 +138,7 @@ def test_successful_spawn_resets_failure_counter(kanban_home):
conn.close()
def test_workspace_resolution_failure_also_counts(kanban_home):
def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spawnable):
"""`dir:` workspace with no path should fail workspace resolution AND
count against the failure budget not just crash the tick."""
conn = kb.connect()
@ -824,7 +824,7 @@ def test_recompute_ready_emits_promoted_not_ready(kanban_home):
conn.close()
def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home):
def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home, all_assignees_spawnable):
def _bad(task, ws):
raise RuntimeError("nope")
conn = kb.connect()
@ -840,7 +840,7 @@ def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home):
conn.close()
def test_spawned_event_emitted_with_pid(kanban_home):
def test_spawned_event_emitted_with_pid(kanban_home, all_assignees_spawnable):
"""Successful spawn must append a ``spawned`` event with the pid in
the payload so humans tailing events see pid tracking."""
def _spawn_returns_pid(task, ws):
@ -1154,7 +1154,7 @@ def test_run_on_block_with_reason(kanban_home):
conn.close()
def test_run_on_spawn_failure_records_failed_runs(kanban_home):
def test_run_on_spawn_failure_records_failed_runs(kanban_home, all_assignees_spawnable):
"""Each spawn_failed event closes a run with outcome='spawn_failed',
and the Nth failure closes a run with outcome='gave_up'."""
def _bad(task, ws):

View file

@ -327,7 +327,7 @@ def test_worker_context_includes_parent_results_and_comments(kanban_home):
# Dispatcher
# ---------------------------------------------------------------------------
def test_dispatch_dry_run_does_not_claim(kanban_home):
def test_dispatch_dry_run_does_not_claim(kanban_home, all_assignees_spawnable):
with kb.connect() as conn:
t1 = kb.create_task(conn, title="a", assignee="alice")
t2 = kb.create_task(conn, title="b", assignee="bob")
@ -344,10 +344,58 @@ def test_dispatch_skips_unassigned(kanban_home):
t = kb.create_task(conn, title="floater")
res = kb.dispatch_once(conn, dry_run=True)
assert t in res.skipped_unassigned
assert t not in res.skipped_nonspawnable
assert not res.spawned
def test_dispatch_promotes_ready_and_spawns(kanban_home):
def test_dispatch_skips_nonspawnable_into_separate_bucket(kanban_home, monkeypatch):
    """A task whose assignee fails ``profile_exists()`` is skipped as non-spawnable.

    Such a task must stay out of ``skipped_unassigned`` (the
    operator-actionable bucket) and be reported in the dedicated
    ``skipped_nonspawnable`` bucket instead, so health telemetry can
    suppress false-positive "stuck" warnings.
    """
    from hermes_cli import profiles

    def _no_profile(name):
        # Simulate a host where no assignee maps to a profile.
        return False

    monkeypatch.setattr(profiles, "profile_exists", _no_profile)

    with kb.connect() as conn:
        task_id = kb.create_task(conn, title="for-terminal", assignee="orion-cc")
        result = kb.dispatch_once(conn, dry_run=True)

        assert task_id in result.skipped_nonspawnable
        assert task_id not in result.skipped_unassigned
        assert not result.spawned
def test_has_spawnable_ready_false_when_only_terminal_lanes(kanban_home, monkeypatch):
    """``has_spawnable_ready`` is False when no ready task's assignee has a profile.

    Every task here is assigned to a control-plane lane. The gateway and
    CLI dispatchers rely on this result to silence the stuck-warn while
    terminals still have queued work.
    """
    from hermes_cli import profiles

    def _no_profile(name):
        # No assignee resolves to a profile in this scenario.
        return False

    monkeypatch.setattr(profiles, "profile_exists", _no_profile)

    terminal_tasks = (("t1", "orion-cc"), ("t2", "orion-research"))
    with kb.connect() as conn:
        for title, lane in terminal_tasks:
            kb.create_task(conn, title=title, assignee=lane)

        assert kb.has_spawnable_ready(conn) is False
def test_has_spawnable_ready_true_when_real_profile_present(kanban_home, monkeypatch):
    """``has_spawnable_ready`` is True once ANY ready task has a profile-backed assignee.

    A queue mixing a terminal-lane task with a task for the "daily"
    profile must still report spawnable work, preserving the real
    "stuck" signal when a daily/agent task is queued.
    """
    from hermes_cli import profiles

    def _only_daily_exists(name):
        # "daily" is the single assignee treated as a real profile.
        return name == "daily"

    monkeypatch.setattr(profiles, "profile_exists", _only_daily_exists)

    with kb.connect() as conn:
        kb.create_task(conn, title="terminal-task", assignee="orion-cc")
        kb.create_task(conn, title="hermes-task", assignee="daily")

        assert kb.has_spawnable_ready(conn) is True
def test_has_spawnable_ready_false_on_empty_queue(kanban_home):
    """With no tasks created at all, ``has_spawnable_ready`` is trivially False."""
    with kb.connect() as conn:
        spawnable = kb.has_spawnable_ready(conn)
        assert spawnable is False
def test_dispatch_promotes_ready_and_spawns(kanban_home, all_assignees_spawnable):
spawns = []
def fake_spawn(task, workspace):
@ -368,7 +416,7 @@ def test_dispatch_promotes_ready_and_spawns(kanban_home):
assert kb.get_task(conn, c).status == "running"
def test_dispatch_spawn_failure_releases_claim(kanban_home):
def test_dispatch_spawn_failure_releases_claim(kanban_home, all_assignees_spawnable):
def boom(task, workspace):
raise RuntimeError("spawn failed")