fix(kanban): suppress dispatcher stuck-warn when ready queue holds only non-spawnable assignees

After PR #20105 (dispatcher skips ready tasks whose assignee fails
``profile_exists()`` to prevent the orion-cc/orion-research crash
loop), the gateway and CLI emit a spurious "kanban dispatcher stuck:
ready queue non-empty for N consecutive ticks but 0 workers spawned"
warning every 5 minutes on multi-lane setups where the queue is
steadily full of human-pulled work assigned to terminal lanes.

The warn is intended to catch real failure modes (broken PATH,
missing venv, credential loss for a real Hermes profile). On a
multi-lane host it fires forever even though everything is healthy:
the dispatcher correctly chose not to spawn, and there is nothing
for the operator to fix.

Changes:

* ``DispatchResult`` gains a ``skipped_nonspawnable`` field
  (separate from ``skipped_unassigned``) so callers can distinguish
  "task missing an owner — operator should route it" from "task
  owned by a control-plane lane — terminal will pull it".
* ``dispatch_once`` routes the ``not profile_exists(assignee)`` skip
  into the new bucket (was lumped into ``skipped_unassigned``).
* New helper ``has_spawnable_ready(conn)`` returns True iff at least
  one ready+assigned+unclaimed task in the DB has an assignee that
  maps to a real Hermes profile. Falls back to legacy "any
  ready+assigned" when ``profile_exists`` is unimportable so degraded
  installs still surface the original warn.
* The gateway dispatcher (``gateway/run.py``) and the CLI standalone
  daemon (``hermes_cli/kanban.py``) both swap their cheap
  ``ready_nonempty`` probe to use ``has_spawnable_ready``. Stuck-warn
  now fires only when there is genuine spawnable work the dispatcher
  failed to start.
* CLI dispatch output prints ``Skipped (non-spawnable assignee —
  terminal lane, OK)`` for visibility without alarm.

Tests:

* New ``has_spawnable_ready`` cases (empty queue, terminal-lane
  only, mixed real+terminal).
* New ``test_dispatch_skips_nonspawnable_into_separate_bucket``
  verifies the bucketing change.
* Updated ``test_dispatch_skips_unassigned`` to assert that the
  unassigned task does not leak into ``skipped_nonspawnable``.
* Added ``all_assignees_spawnable`` fixture in
  ``tests/hermes_cli/conftest.py`` and threaded it through dispatcher
  tests that use synthetic assignees ("alice", "bob"). PR #20105
  (the parent commit) silently broke 8 such tests by skipping those
  assignees (lumped into ``skipped_unassigned`` at the time) instead
  of spawning; this PR repairs them as part of the same code area.

Verified locally: 246/246 kanban-suite tests pass.

Stacks on top of fix/kanban-dispatcher-skip-missing-profile-2026-05-05
(PR #20105). Reviewer: this PR is meant to merge AFTER #20105.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Brecht-H 2026-05-05 09:43:06 +00:00 committed by Teknium
parent ca5595fe7b
commit f25d3ec917
6 changed files with 152 additions and 25 deletions

View file

@ -3906,7 +3906,17 @@ class GatewayRunner:
return out return out
def _ready_nonempty() -> bool: def _ready_nonempty() -> bool:
"""Cheap probe: is there a ready+assigned+unclaimed task on ANY board?""" """Cheap probe: is there at least one ready+assigned+unclaimed
task on ANY board whose assignee maps to a real Hermes profile
(i.e. one the dispatcher would actually spawn for)?
Tasks assigned to control-plane lanes (e.g. ``orion-cc``,
``orion-research``) are pulled by terminals via
``claim_task`` directly and never spawnable, so a queue full
of those is "correctly idle", not "stuck". Filtering them out
here makes the stuck-warn fire only on real failures (broken
PATH, missing venv, credential loss for a real Hermes profile).
"""
try: try:
boards = _kb.list_boards(include_archived=False) boards = _kb.list_boards(include_archived=False)
except Exception: except Exception:
@ -3916,12 +3926,7 @@ class GatewayRunner:
conn = None conn = None
try: try:
conn = _kb.connect(board=slug) conn = _kb.connect(board=slug)
row = conn.execute( if _kb.has_spawnable_ready(conn):
"SELECT 1 FROM tasks "
"WHERE status = 'ready' AND assignee IS NOT NULL "
" AND claim_lock IS NULL LIMIT 1"
).fetchone()
if row is not None:
return True return True
except Exception: except Exception:
continue continue

View file

@ -1274,6 +1274,7 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
for (tid, who, ws) in res.spawned for (tid, who, ws) in res.spawned
], ],
"skipped_unassigned": res.skipped_unassigned, "skipped_unassigned": res.skipped_unassigned,
"skipped_nonspawnable": res.skipped_nonspawnable,
}, indent=2)) }, indent=2))
return 0 return 0
print(f"Reclaimed: {res.reclaimed}") print(f"Reclaimed: {res.reclaimed}")
@ -1293,6 +1294,11 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
print(f" - {tid} -> {who} @ {ws or '-'}{tag}") print(f" - {tid} -> {who} @ {ws or '-'}{tag}")
if res.skipped_unassigned: if res.skipped_unassigned:
print(f"Skipped (unassigned): {', '.join(res.skipped_unassigned)}") print(f"Skipped (unassigned): {', '.join(res.skipped_unassigned)}")
if res.skipped_nonspawnable:
print(
f"Skipped (non-spawnable assignee — terminal lane, OK): "
f"{', '.join(res.skipped_nonspawnable)}"
)
return 0 return 0
@ -1404,16 +1410,18 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
) )
def _ready_queue_nonempty() -> bool: def _ready_queue_nonempty() -> bool:
"""Cheap SELECT — just asks whether there's at least one ready """Cheap probe — is there at least one ready+assigned+unclaimed
task with an assignee that the dispatcher could have picked up.""" task whose assignee maps to a real Hermes profile (i.e. one the
dispatcher would actually try to spawn for)?
Filters out tasks assigned to control-plane lanes
(e.g. ``orion-cc``, ``orion-research``) that are pulled by
terminals via ``claim_task`` directly — those are correctly idle
from the dispatcher's perspective, not stuck.
"""
try: try:
with kb.connect() as conn: with kb.connect() as conn:
row = conn.execute( return kb.has_spawnable_ready(conn)
"SELECT 1 FROM tasks "
"WHERE status = 'ready' AND assignee IS NOT NULL "
" AND claim_lock IS NULL LIMIT 1"
).fetchone()
return row is not None
except Exception: except Exception:
return False return False

View file

@ -2118,6 +2118,15 @@ class DispatchResult:
spawned: list[tuple[str, str, str]] = field(default_factory=list) spawned: list[tuple[str, str, str]] = field(default_factory=list)
"""List of ``(task_id, assignee, workspace_path)`` triples.""" """List of ``(task_id, assignee, workspace_path)`` triples."""
skipped_unassigned: list[str] = field(default_factory=list) skipped_unassigned: list[str] = field(default_factory=list)
"""Ready task ids skipped because they have no assignee at all.
Operator-actionable — usually a misfiled task waiting for routing."""
skipped_nonspawnable: list[str] = field(default_factory=list)
"""Ready task ids skipped because their assignee names a control-plane
lane (a Claude Code terminal like ``orion-cc``) rather than a Hermes
profile. Expected steady-state on multi-lane setups; NOT an
operator-actionable failure. Tracked separately so health telemetry
can distinguish "real stuck" (nothing spawned but spawnable work
available) from "correctly idle" (nothing spawnable in the queue)."""
crashed: list[str] = field(default_factory=list) crashed: list[str] = field(default_factory=list)
"""Task ids reclaimed because their worker PID disappeared.""" """Task ids reclaimed because their worker PID disappeared."""
auto_blocked: list[str] = field(default_factory=list) auto_blocked: list[str] = field(default_factory=list)
@ -2459,6 +2468,38 @@ def _clear_spawn_failures(conn: sqlite3.Connection, task_id: str) -> None:
) )
def has_spawnable_ready(conn: sqlite3.Connection) -> bool:
    """Return True iff there is at least one ready+assigned+unclaimed task
    whose assignee maps to a real Hermes profile.

    Used by the gateway- and CLI-embedded dispatchers' health telemetry to
    decide whether ``0 spawned`` is a "stuck" condition (real spawnable
    work waiting) or a "correctly idle" condition (only control-plane
    lanes like ``orion-cc`` / ``orion-research`` waiting on terminals
    that pull tasks via ``claim_task`` directly).

    Falls back to "any ready+assigned" if ``profile_exists`` is not
    importable (e.g. partial install) — this preserves the old behavior
    so the warning still fires in degraded environments.

    ``conn`` must yield rows addressable by column name (e.g.
    ``sqlite3.Row``), because the assignee is read as ``row["assignee"]``.
    """
    # DISTINCT keeps the number of profile_exists() calls bounded by the
    # number of lanes rather than by the number of queued tasks.
    rows = conn.execute(
        "SELECT DISTINCT assignee FROM tasks "
        "WHERE status = 'ready' AND assignee IS NOT NULL "
        "  AND claim_lock IS NULL"
    ).fetchall()
    if not rows:
        return False
    try:
        from hermes_cli.profiles import profile_exists  # local import: avoids cycle
    except Exception:
        # Can't introspect — assume spawnable, preserve legacy behavior.
        return True
    # Short-circuits on the first assignee that is a real profile.
    return any(profile_exists(row["assignee"]) for row in rows)
def dispatch_once( def dispatch_once(
conn: sqlite3.Connection, conn: sqlite3.Connection,
*, *,
@ -2521,7 +2562,13 @@ def dispatch_once(
except Exception: except Exception:
profile_exists = None # type: ignore[assignment] profile_exists = None # type: ignore[assignment]
if profile_exists is not None and not profile_exists(row["assignee"]): if profile_exists is not None and not profile_exists(row["assignee"]):
result.skipped_unassigned.append(row["id"]) # Bucket separately from skipped_unassigned: the operator
# cannot fix this by assigning a profile (the assignee IS the
# intended owner — a terminal lane). Health telemetry uses
# this distinction to suppress spurious "stuck" warnings on
# multi-lane setups where the ready queue is steadily full
# of human-pulled work.
result.skipped_nonspawnable.append(row["id"])
continue continue
if dry_run: if dry_run:
result.spawned.append((row["id"], row["assignee"], "")) result.spawned.append((row["id"], row["assignee"], ""))

View file

@ -0,0 +1,19 @@
"""Fixtures shared across hermes_cli kanban tests."""
from __future__ import annotations
import pytest
@pytest.fixture
def all_assignees_spawnable(monkeypatch):
    """Make ``profile_exists`` report every assignee as a real profile.

    Dispatcher tests mostly use made-up assignees ("alice", "bob") with no
    profile directory on disk. With the real ``profile_exists`` in place
    the dispatcher's profile-exists guard (PR #20105) would file those
    tasks under ``skipped_nonspawnable`` rather than spawning them, so
    tests asserting spawn behavior request this fixture to bypass the
    guard.
    """
    from hermes_cli import profiles

    def _always_exists(name):
        # Every name is treated as an existing profile for the test's duration.
        return True

    monkeypatch.setattr(profiles, "profile_exists", _always_exists)

View file

@ -80,7 +80,7 @@ def test_no_idempotency_key_never_collides(kanban_home):
# Spawn-failure circuit breaker # Spawn-failure circuit breaker
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_spawn_failure_auto_blocks_after_limit(kanban_home): def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawnable):
"""N consecutive spawn failures on the same task → auto_blocked.""" """N consecutive spawn failures on the same task → auto_blocked."""
def _bad_spawn(task, ws): def _bad_spawn(task, ws):
raise RuntimeError("no PATH") raise RuntimeError("no PATH")
@ -109,7 +109,7 @@ def test_spawn_failure_auto_blocks_after_limit(kanban_home):
conn.close() conn.close()
def test_successful_spawn_resets_failure_counter(kanban_home): def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spawnable):
"""A successful spawn clears the counter so past failures don't count """A successful spawn clears the counter so past failures don't count
against future retries of the same task.""" against future retries of the same task."""
calls = [0] calls = [0]
@ -138,7 +138,7 @@ def test_successful_spawn_resets_failure_counter(kanban_home):
conn.close() conn.close()
def test_workspace_resolution_failure_also_counts(kanban_home): def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spawnable):
"""`dir:` workspace with no path should fail workspace resolution AND """`dir:` workspace with no path should fail workspace resolution AND
count against the failure budget not just crash the tick.""" count against the failure budget not just crash the tick."""
conn = kb.connect() conn = kb.connect()
@ -824,7 +824,7 @@ def test_recompute_ready_emits_promoted_not_ready(kanban_home):
conn.close() conn.close()
def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home): def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home, all_assignees_spawnable):
def _bad(task, ws): def _bad(task, ws):
raise RuntimeError("nope") raise RuntimeError("nope")
conn = kb.connect() conn = kb.connect()
@ -840,7 +840,7 @@ def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home):
conn.close() conn.close()
def test_spawned_event_emitted_with_pid(kanban_home): def test_spawned_event_emitted_with_pid(kanban_home, all_assignees_spawnable):
"""Successful spawn must append a ``spawned`` event with the pid in """Successful spawn must append a ``spawned`` event with the pid in
the payload so humans tailing events see pid tracking.""" the payload so humans tailing events see pid tracking."""
def _spawn_returns_pid(task, ws): def _spawn_returns_pid(task, ws):
@ -1154,7 +1154,7 @@ def test_run_on_block_with_reason(kanban_home):
conn.close() conn.close()
def test_run_on_spawn_failure_records_failed_runs(kanban_home): def test_run_on_spawn_failure_records_failed_runs(kanban_home, all_assignees_spawnable):
"""Each spawn_failed event closes a run with outcome='spawn_failed', """Each spawn_failed event closes a run with outcome='spawn_failed',
and the Nth failure closes a run with outcome='gave_up'.""" and the Nth failure closes a run with outcome='gave_up'."""
def _bad(task, ws): def _bad(task, ws):

View file

@ -327,7 +327,7 @@ def test_worker_context_includes_parent_results_and_comments(kanban_home):
# Dispatcher # Dispatcher
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def test_dispatch_dry_run_does_not_claim(kanban_home): def test_dispatch_dry_run_does_not_claim(kanban_home, all_assignees_spawnable):
with kb.connect() as conn: with kb.connect() as conn:
t1 = kb.create_task(conn, title="a", assignee="alice") t1 = kb.create_task(conn, title="a", assignee="alice")
t2 = kb.create_task(conn, title="b", assignee="bob") t2 = kb.create_task(conn, title="b", assignee="bob")
@ -344,10 +344,58 @@ def test_dispatch_skips_unassigned(kanban_home):
t = kb.create_task(conn, title="floater") t = kb.create_task(conn, title="floater")
res = kb.dispatch_once(conn, dry_run=True) res = kb.dispatch_once(conn, dry_run=True)
assert t in res.skipped_unassigned assert t in res.skipped_unassigned
assert t not in res.skipped_nonspawnable
assert not res.spawned assert not res.spawned
def test_dispatch_promotes_ready_and_spawns(kanban_home): def test_dispatch_skips_nonspawnable_into_separate_bucket(kanban_home, monkeypatch):
"""Tasks whose assignee fails profile_exists() must NOT land in
``skipped_unassigned`` (which is operator-actionable) — they go in
the dedicated ``skipped_nonspawnable`` bucket so health telemetry
can suppress false-positive "stuck" warnings."""
from hermes_cli import profiles
monkeypatch.setattr(profiles, "profile_exists", lambda name: False)
with kb.connect() as conn:
t = kb.create_task(conn, title="for-terminal", assignee="orion-cc")
res = kb.dispatch_once(conn, dry_run=True)
assert t in res.skipped_nonspawnable
assert t not in res.skipped_unassigned
assert not res.spawned
def test_has_spawnable_ready_false_when_only_terminal_lanes(kanban_home, monkeypatch):
    """A queue holding only control-plane-lane tasks is not spawnable.

    ``has_spawnable_ready`` must return False in that case — the gateway
    and CLI dispatchers rely on this to silence the stuck-warn while
    terminals still have queued work.
    """
    from hermes_cli import profiles

    def _no_profile(name):
        # No assignee resolves to a Hermes profile in this scenario.
        return False

    monkeypatch.setattr(profiles, "profile_exists", _no_profile)
    with kb.connect() as conn:
        for title, lane in (("t1", "orion-cc"), ("t2", "orion-research")):
            kb.create_task(conn, title=title, assignee=lane)
        assert kb.has_spawnable_ready(conn) is False
def test_has_spawnable_ready_true_when_real_profile_present(kanban_home, monkeypatch):
    """One spawnable assignee is enough to make the probe return True.

    With a terminal-lane task and a task for a real profile both queued,
    ``has_spawnable_ready`` must still report True — this preserves the
    genuine "stuck" signal when a daily/agent task is waiting.
    """
    from hermes_cli import profiles

    def _only_daily(name):
        # "daily" is the single assignee treated as a real Hermes profile.
        return name == "daily"

    monkeypatch.setattr(profiles, "profile_exists", _only_daily)
    with kb.connect() as conn:
        kb.create_task(conn, title="terminal-task", assignee="orion-cc")
        kb.create_task(conn, title="hermes-task", assignee="daily")
        spawnable = kb.has_spawnable_ready(conn)
        assert spawnable is True
def test_has_spawnable_ready_false_on_empty_queue(kanban_home):
    """With no tasks created at all, the probe must report False."""
    with kb.connect() as conn:
        spawnable = kb.has_spawnable_ready(conn)
        assert spawnable is False
def test_dispatch_promotes_ready_and_spawns(kanban_home, all_assignees_spawnable):
spawns = [] spawns = []
def fake_spawn(task, workspace): def fake_spawn(task, workspace):
@ -368,7 +416,7 @@ def test_dispatch_promotes_ready_and_spawns(kanban_home):
assert kb.get_task(conn, c).status == "running" assert kb.get_task(conn, c).status == "running"
def test_dispatch_spawn_failure_releases_claim(kanban_home): def test_dispatch_spawn_failure_releases_claim(kanban_home, all_assignees_spawnable):
def boom(task, workspace): def boom(task, workspace):
raise RuntimeError("spawn failed") raise RuntimeError("spawn failed")