fix(kanban): suppress dispatcher stuck-warn when ready queue holds only non-spawnable assignees

After PR #20105 (dispatcher skips ready tasks whose assignee fails ``profile_exists()`` to prevent the orion-cc/orion-research crash loop), the gateway and CLI emit a spurious "kanban dispatcher stuck: ready queue non-empty for N consecutive ticks but 0 workers spawned" warning every 5 minutes on multi-lane setups where the queue is steadily full of human-pulled work assigned to terminal lanes. The warn is intended to catch real failure modes (broken PATH, missing venv, credential loss for a real Hermes profile). On a multi-lane host it fires forever even though everything is healthy: the dispatcher correctly chose not to spawn, and there is nothing for the operator to fix. Changes: * ``DispatchResult`` gains a ``skipped_nonspawnable`` field (separate from ``skipped_unassigned``) so callers can distinguish "task missing an owner — operator should route it" from "task owned by a control-plane lane — terminal will pull it". * ``dispatch_once`` routes the ``not profile_exists(assignee)`` skip into the new bucket (was lumped into ``skipped_unassigned``). * New helper ``has_spawnable_ready(conn)`` returns True iff at least one ready+assigned+unclaimed task in the DB has an assignee that maps to a real Hermes profile. Falls back to legacy "any ready+assigned" when ``profile_exists`` is unimportable so degraded installs still surface the original warn. * The gateway dispatcher (``gateway/run.py``) and the CLI standalone daemon (``hermes_cli/kanban.py``) both swap their cheap ``ready_nonempty`` probe to use ``has_spawnable_ready``. Stuck-warn now fires only when there is genuine spawnable work the dispatcher failed to start. * CLI dispatch output prints ``Skipped (non-spawnable assignee — terminal lane, OK)`` for visibility without alarm. Tests: * New ``has_spawnable_ready`` cases (empty queue, terminal-lane only, mixed real+terminal). * New ``test_dispatch_skips_nonspawnable_into_separate_bucket`` verifies the bucketing change. * Updated ``test_dispatch_skips_unassigned`` to assert no cross-leak. * Added ``all_assignees_spawnable`` fixture in ``tests/hermes_cli/conftest.py`` and threaded it through dispatcher tests that use synthetic assignees ("alice", "bob"). PR #20105 (the parent commit) silently broke 8 such tests by routing those assignees into ``skipped_nonspawnable`` instead of spawning; this PR repairs them as part of the same code area. Verified locally: 246/246 kanban-suite tests pass. Stacks on top of fix/kanban-dispatcher-skip-missing-profile-2026-05-05 (PR #20105). Reviewer: this PR is meant to merge AFTER #20105. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 03:01:47 +00:00 · 2026-05-05 09:43:06 +00:00 · 2026-05-05 09:43:06 +00:00 · f25d3ec917
commit f25d3ec917
parent ca5595fe7b
6 changed files with 152 additions and 25 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -3906,7 +3906,17 @@ class GatewayRunner:
            return out

        def _ready_nonempty() -> bool:
-            """Cheap probe: is there a ready+assigned+unclaimed task on ANY board?"""
+            """Cheap probe: is there at least one ready+assigned+unclaimed
+            task on ANY board whose assignee maps to a real Hermes profile
+            (i.e. one the dispatcher would actually spawn for)?
+
+            Tasks assigned to control-plane lanes (e.g. ``orion-cc``,
+            ``orion-research``) are pulled by terminals via
+            ``claim_task`` directly and never spawnable, so a queue full
+            of those is "correctly idle", not "stuck". Filtering them out
+            here keeps the stuck-warn fire only on real failures (broken
+            PATH, missing venv, credential loss for a real Hermes profile).
+            """
            try:
                boards = _kb.list_boards(include_archived=False)
            except Exception:
@ -3916,12 +3926,7 @@ class GatewayRunner:
                conn = None
                try:
                    conn = _kb.connect(board=slug)
-                    row = conn.execute(
-                        "SELECT 1 FROM tasks "
-                        "WHERE status = 'ready' AND assignee IS NOT NULL "
-                        "    AND claim_lock IS NULL LIMIT 1"
-                    ).fetchone()
-                    if row is not None:
+                    if _kb.has_spawnable_ready(conn):
                        return True
                except Exception:
                    continue
--- a/hermes_cli/kanban.py
+++ b/hermes_cli/kanban.py
@ -1274,6 +1274,7 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
                for (tid, who, ws) in res.spawned
            ],
            "skipped_unassigned": res.skipped_unassigned,
+            "skipped_nonspawnable": res.skipped_nonspawnable,
        }, indent=2))
        return 0
    print(f"Reclaimed:    {res.reclaimed}")
@ -1293,6 +1294,11 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
        print(f"  - {tid}  ->  {who}  @ {ws or '-'}{tag}")
    if res.skipped_unassigned:
        print(f"Skipped (unassigned): {', '.join(res.skipped_unassigned)}")
+    if res.skipped_nonspawnable:
+        print(
+            f"Skipped (non-spawnable assignee — terminal lane, OK): "
+            f"{', '.join(res.skipped_nonspawnable)}"
+        )
    return 0


@ -1404,16 +1410,18 @@ def _cmd_daemon(args: argparse.Namespace) -> int:
            )

    def _ready_queue_nonempty() -> bool:
-        """Cheap SELECT — just asks whether there's at least one ready
-        task with an assignee that the dispatcher could have picked up."""
+        """Cheap probe — is there at least one ready+assigned+unclaimed
+        task whose assignee maps to a real Hermes profile (i.e. one the
+        dispatcher would actually try to spawn for)?
+
+        Filters out tasks assigned to control-plane lanes
+        (e.g. ``orion-cc``, ``orion-research``) that are pulled by
+        terminals via ``claim_task`` directly — those are correctly idle
+        from the dispatcher's perspective, not stuck.
+        """
        try:
            with kb.connect() as conn:
-                row = conn.execute(
-                    "SELECT 1 FROM tasks "
-                    "WHERE status = 'ready' AND assignee IS NOT NULL "
-                    "    AND claim_lock IS NULL LIMIT 1"
-                ).fetchone()
-                return row is not None
+                return kb.has_spawnable_ready(conn)
        except Exception:
            return False

--- a/hermes_cli/kanban_db.py
+++ b/hermes_cli/kanban_db.py
@ -2118,6 +2118,15 @@ class DispatchResult:
    spawned: list[tuple[str, str, str]] = field(default_factory=list)
    """List of ``(task_id, assignee, workspace_path)`` triples."""
    skipped_unassigned: list[str] = field(default_factory=list)
+    """Ready task ids skipped because they have no assignee at all.
+    Operator-actionable — usually a misfiled task waiting for routing."""
+    skipped_nonspawnable: list[str] = field(default_factory=list)
+    """Ready task ids skipped because their assignee names a control-plane
+    lane (a Claude Code terminal like ``orion-cc``) rather than a Hermes
+    profile. Expected steady-state on multi-lane setups; NOT an
+    operator-actionable failure. Tracked separately so health telemetry
+    can distinguish "real stuck" (nothing spawned but spawnable work
+    available) from "correctly idle" (nothing spawnable in the queue)."""
    crashed: list[str] = field(default_factory=list)
    """Task ids reclaimed because their worker PID disappeared."""
    auto_blocked: list[str] = field(default_factory=list)
@ -2459,6 +2468,38 @@ def _clear_spawn_failures(conn: sqlite3.Connection, task_id: str) -> None:
        )


+def has_spawnable_ready(conn: sqlite3.Connection) -> bool:
+    """Return True iff there is at least one ready+assigned+unclaimed task
+    whose assignee maps to a real Hermes profile.
+
+    Used by the gateway- and CLI-embedded dispatchers' health telemetry to
+    decide whether ``0 spawned`` is a "stuck" condition (real spawnable
+    work waiting) or a "correctly idle" condition (only control-plane
+    lanes like ``orion-cc`` / ``orion-research`` waiting on terminals
+    that pull tasks via ``claim_task`` directly).
+
+    Falls back to "any ready+assigned" if ``profile_exists`` is not
+    importable (e.g. partial install) — preserves the old behavior so
+    the warning still fires in degraded environments.
+    """
+    rows = conn.execute(
+        "SELECT DISTINCT assignee FROM tasks "
+        "WHERE status = 'ready' AND assignee IS NOT NULL "
+        "    AND claim_lock IS NULL"
+    ).fetchall()
+    if not rows:
+        return False
+    try:
+        from hermes_cli.profiles import profile_exists  # local import: avoids cycle
+    except Exception:
+        # Can't introspect — assume spawnable, preserve legacy behavior.
+        return True
+    for row in rows:
+        if profile_exists(row["assignee"]):
+            return True
+    return False
+
+
 def dispatch_once(
    conn: sqlite3.Connection,
    *,
@ -2521,7 +2562,13 @@ def dispatch_once(
        except Exception:
            profile_exists = None  # type: ignore[assignment]
        if profile_exists is not None and not profile_exists(row["assignee"]):
-            result.skipped_unassigned.append(row["id"])
+            # Bucket separately from skipped_unassigned: the operator
+            # cannot fix this by assigning a profile (the assignee IS the
+            # intended owner — a terminal lane). Health telemetry uses
+            # this distinction to suppress spurious "stuck" warnings on
+            # multi-lane setups where the ready queue is steadily full
+            # of human-pulled work.
+            result.skipped_nonspawnable.append(row["id"])
            continue
        if dry_run:
            result.spawned.append((row["id"], row["assignee"], ""))
--- a/tests/hermes_cli/conftest.py
+++ b/tests/hermes_cli/conftest.py
@ -0,0 +1,19 @@
+"""Fixtures shared across hermes_cli kanban tests."""
+
+from __future__ import annotations
+
+import pytest
+
+
+@pytest.fixture
+def all_assignees_spawnable(monkeypatch):
+    """Pretend every assignee maps to a real Hermes profile.
+
+    Most dispatcher tests use synthetic assignees ("alice", "bob") that
+    don't correspond to actual profile directories on disk. Without this
+    patch, the dispatcher's profile-exists guard (PR #20105) routes
+    those tasks into ``skipped_nonspawnable`` instead of spawning, which
+    would break tests that assert spawn behavior.
+    """
+    from hermes_cli import profiles
+    monkeypatch.setattr(profiles, "profile_exists", lambda name: True)
--- a/tests/hermes_cli/test_kanban_core_functionality.py
+++ b/tests/hermes_cli/test_kanban_core_functionality.py
@ -80,7 +80,7 @@ def test_no_idempotency_key_never_collides(kanban_home):
 # Spawn-failure circuit breaker
 # ---------------------------------------------------------------------------

-def test_spawn_failure_auto_blocks_after_limit(kanban_home):
+def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawnable):
    """N consecutive spawn failures on the same task → auto_blocked."""
    def _bad_spawn(task, ws):
        raise RuntimeError("no PATH")
@ -109,7 +109,7 @@ def test_spawn_failure_auto_blocks_after_limit(kanban_home):
        conn.close()


-def test_successful_spawn_resets_failure_counter(kanban_home):
+def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spawnable):
    """A successful spawn clears the counter so past failures don't count
    against future retries of the same task."""
    calls = [0]
@ -138,7 +138,7 @@ def test_successful_spawn_resets_failure_counter(kanban_home):
        conn.close()


-def test_workspace_resolution_failure_also_counts(kanban_home):
+def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spawnable):
    """`dir:` workspace with no path should fail workspace resolution AND
    count against the failure budget — not just crash the tick."""
    conn = kb.connect()
@ -824,7 +824,7 @@ def test_recompute_ready_emits_promoted_not_ready(kanban_home):
        conn.close()


-def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home):
+def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home, all_assignees_spawnable):
    def _bad(task, ws):
        raise RuntimeError("nope")
    conn = kb.connect()
@ -840,7 +840,7 @@ def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home):
        conn.close()


-def test_spawned_event_emitted_with_pid(kanban_home):
+def test_spawned_event_emitted_with_pid(kanban_home, all_assignees_spawnable):
    """Successful spawn must append a ``spawned`` event with the pid in
    the payload so humans tailing events see pid tracking."""
    def _spawn_returns_pid(task, ws):
@ -1154,7 +1154,7 @@ def test_run_on_block_with_reason(kanban_home):
        conn.close()


-def test_run_on_spawn_failure_records_failed_runs(kanban_home):
+def test_run_on_spawn_failure_records_failed_runs(kanban_home, all_assignees_spawnable):
    """Each spawn_failed event closes a run with outcome='spawn_failed',
    and the Nth failure closes a run with outcome='gave_up'."""
    def _bad(task, ws):
--- a/tests/hermes_cli/test_kanban_db.py
+++ b/tests/hermes_cli/test_kanban_db.py
@ -327,7 +327,7 @@ def test_worker_context_includes_parent_results_and_comments(kanban_home):
 # Dispatcher
 # ---------------------------------------------------------------------------

-def test_dispatch_dry_run_does_not_claim(kanban_home):
+def test_dispatch_dry_run_does_not_claim(kanban_home, all_assignees_spawnable):
    with kb.connect() as conn:
        t1 = kb.create_task(conn, title="a", assignee="alice")
        t2 = kb.create_task(conn, title="b", assignee="bob")
@ -344,10 +344,58 @@ def test_dispatch_skips_unassigned(kanban_home):
        t = kb.create_task(conn, title="floater")
        res = kb.dispatch_once(conn, dry_run=True)
    assert t in res.skipped_unassigned
+    assert t not in res.skipped_nonspawnable
    assert not res.spawned


-def test_dispatch_promotes_ready_and_spawns(kanban_home):
+def test_dispatch_skips_nonspawnable_into_separate_bucket(kanban_home, monkeypatch):
+    """Tasks whose assignee fails profile_exists() must NOT land in
+    ``skipped_unassigned`` (which is operator-actionable) — they go in
+    the dedicated ``skipped_nonspawnable`` bucket so health telemetry
+    can suppress false-positive "stuck" warnings."""
+    from hermes_cli import profiles
+    monkeypatch.setattr(profiles, "profile_exists", lambda name: False)
+    with kb.connect() as conn:
+        t = kb.create_task(conn, title="for-terminal", assignee="orion-cc")
+        res = kb.dispatch_once(conn, dry_run=True)
+    assert t in res.skipped_nonspawnable
+    assert t not in res.skipped_unassigned
+    assert not res.spawned
+
+
+def test_has_spawnable_ready_false_when_only_terminal_lanes(kanban_home, monkeypatch):
+    """``has_spawnable_ready`` returns False when every ready task is
+    assigned to a control-plane lane — used by gateway/CLI dispatchers
+    to silence the stuck-warn while terminals still have queued work."""
+    from hermes_cli import profiles
+    monkeypatch.setattr(profiles, "profile_exists", lambda name: False)
+    with kb.connect() as conn:
+        kb.create_task(conn, title="t1", assignee="orion-cc")
+        kb.create_task(conn, title="t2", assignee="orion-research")
+        assert kb.has_spawnable_ready(conn) is False
+
+
+def test_has_spawnable_ready_true_when_real_profile_present(kanban_home, monkeypatch):
+    """``has_spawnable_ready`` returns True as soon as ANY ready task
+    has an assignee that maps to a real Hermes profile — preserves the
+    real "stuck" signal when a daily/agent task is queued."""
+    from hermes_cli import profiles
+    monkeypatch.setattr(
+        profiles, "profile_exists", lambda name: name == "daily"
+    )
+    with kb.connect() as conn:
+        kb.create_task(conn, title="terminal-task", assignee="orion-cc")
+        kb.create_task(conn, title="hermes-task", assignee="daily")
+        assert kb.has_spawnable_ready(conn) is True
+
+
+def test_has_spawnable_ready_false_on_empty_queue(kanban_home):
+    """Empty queue is the trivial false case — no ready tasks at all."""
+    with kb.connect() as conn:
+        assert kb.has_spawnable_ready(conn) is False
+
+
+def test_dispatch_promotes_ready_and_spawns(kanban_home, all_assignees_spawnable):
    spawns = []

    def fake_spawn(task, workspace):
@ -368,7 +416,7 @@ def test_dispatch_promotes_ready_and_spawns(kanban_home):
        assert kb.get_task(conn, c).status == "running"


-def test_dispatch_spawn_failure_releases_claim(kanban_home):
+def test_dispatch_spawn_failure_releases_claim(kanban_home, all_assignees_spawnable):
    def boom(task, workspace):
        raise RuntimeError("spawn failed")