From f25d3ec9176bc15d9be59ab32dbd4537606d0254 Mon Sep 17 00:00:00 2001 From: Brecht-H <73849650+Brecht-H@users.noreply.github.com> Date: Tue, 5 May 2026 09:43:06 +0000 Subject: [PATCH] fix(kanban): suppress dispatcher stuck-warn when ready queue holds only non-spawnable assignees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After PR #20105 (dispatcher skips ready tasks whose assignee fails ``profile_exists()`` to prevent the orion-cc/orion-research crash loop), the gateway and CLI emit a spurious "kanban dispatcher stuck: ready queue non-empty for N consecutive ticks but 0 workers spawned" warning every 5 minutes on multi-lane setups where the queue is steadily full of human-pulled work assigned to terminal lanes. The warn is intended to catch real failure modes (broken PATH, missing venv, credential loss for a real Hermes profile). On a multi-lane host it fires forever even though everything is healthy: the dispatcher correctly chose not to spawn, and there is nothing for the operator to fix. Changes: * ``DispatchResult`` gains a ``skipped_nonspawnable`` field (separate from ``skipped_unassigned``) so callers can distinguish "task missing an owner — operator should route it" from "task owned by a control-plane lane — terminal will pull it". * ``dispatch_once`` routes the ``not profile_exists(assignee)`` skip into the new bucket (was lumped into ``skipped_unassigned``). * New helper ``has_spawnable_ready(conn)`` returns True iff at least one ready+assigned+unclaimed task in the DB has an assignee that maps to a real Hermes profile. Falls back to legacy "any ready+assigned" when ``profile_exists`` is unimportable so degraded installs still surface the original warn. * The gateway dispatcher (``gateway/run.py``) and the CLI standalone daemon (``hermes_cli/kanban.py``) both swap their cheap ``ready_nonempty`` probe to use ``has_spawnable_ready``. Stuck-warn now fires only when there is genuine spawnable work the dispatcher failed to start. * CLI dispatch output prints ``Skipped (non-spawnable assignee — terminal lane, OK)`` for visibility without alarm. Tests: * New ``has_spawnable_ready`` cases (empty queue, terminal-lane only, mixed real+terminal). * New ``test_dispatch_skips_nonspawnable_into_separate_bucket`` verifies the bucketing change. * Updated ``test_dispatch_skips_unassigned`` to assert no cross-leak. * Added ``all_assignees_spawnable`` fixture in ``tests/hermes_cli/conftest.py`` and threaded it through dispatcher tests that use synthetic assignees ("alice", "bob"). PR #20105 (the parent commit) silently broke 8 such tests by routing those assignees into ``skipped_nonspawnable`` instead of spawning; this PR repairs them as part of the same code area. Verified locally: 246/246 kanban-suite tests pass. Stacks on top of fix/kanban-dispatcher-skip-missing-profile-2026-05-05 (PR #20105). Reviewer: this PR is meant to merge AFTER #20105. Co-Authored-By: Claude Opus 4.7 (1M context) --- gateway/run.py | 19 ++++--- hermes_cli/kanban.py | 24 ++++++--- hermes_cli/kanban_db.py | 49 ++++++++++++++++- tests/hermes_cli/conftest.py | 19 +++++++ .../test_kanban_core_functionality.py | 12 ++--- tests/hermes_cli/test_kanban_db.py | 54 +++++++++++++++++-- 6 files changed, 152 insertions(+), 25 deletions(-) create mode 100644 tests/hermes_cli/conftest.py diff --git a/gateway/run.py b/gateway/run.py index 5e2163e830..5e10303dec 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3906,7 +3906,17 @@ class GatewayRunner: return out def _ready_nonempty() -> bool: - """Cheap probe: is there a ready+assigned+unclaimed task on ANY board?""" + """Cheap probe: is there at least one ready+assigned+unclaimed + task on ANY board whose assignee maps to a real Hermes profile + (i.e. one the dispatcher would actually spawn for)? + + Tasks assigned to control-plane lanes (e.g. ``orion-cc``, + ``orion-research``) are pulled by terminals via + ``claim_task`` directly and never spawnable, so a queue full + of those is "correctly idle", not "stuck". Filtering them out + here keeps the stuck-warn fire only on real failures (broken + PATH, missing venv, credential loss for a real Hermes profile). + """ try: boards = _kb.list_boards(include_archived=False) except Exception: @@ -3916,12 +3926,7 @@ class GatewayRunner: conn = None try: conn = _kb.connect(board=slug) - row = conn.execute( - "SELECT 1 FROM tasks " - "WHERE status = 'ready' AND assignee IS NOT NULL " - " AND claim_lock IS NULL LIMIT 1" - ).fetchone() - if row is not None: + if _kb.has_spawnable_ready(conn): return True except Exception: continue diff --git a/hermes_cli/kanban.py b/hermes_cli/kanban.py index 4befd64fa4..4d738eaff0 100644 --- a/hermes_cli/kanban.py +++ b/hermes_cli/kanban.py @@ -1274,6 +1274,7 @@ def _cmd_dispatch(args: argparse.Namespace) -> int: for (tid, who, ws) in res.spawned ], "skipped_unassigned": res.skipped_unassigned, + "skipped_nonspawnable": res.skipped_nonspawnable, }, indent=2)) return 0 print(f"Reclaimed: {res.reclaimed}") @@ -1293,6 +1294,11 @@ def _cmd_dispatch(args: argparse.Namespace) -> int: print(f" - {tid} -> {who} @ {ws or '-'}{tag}") if res.skipped_unassigned: print(f"Skipped (unassigned): {', '.join(res.skipped_unassigned)}") + if res.skipped_nonspawnable: + print( + f"Skipped (non-spawnable assignee — terminal lane, OK): " + f"{', '.join(res.skipped_nonspawnable)}" + ) return 0 @@ -1404,16 +1410,18 @@ def _cmd_daemon(args: argparse.Namespace) -> int: ) def _ready_queue_nonempty() -> bool: - """Cheap SELECT — just asks whether there's at least one ready - task with an assignee that the dispatcher could have picked up.""" + """Cheap probe — is there at least one ready+assigned+unclaimed + task whose assignee maps to a real Hermes profile (i.e. one the + dispatcher would actually try to spawn for)? + + Filters out tasks assigned to control-plane lanes + (e.g. ``orion-cc``, ``orion-research``) that are pulled by + terminals via ``claim_task`` directly — those are correctly idle + from the dispatcher's perspective, not stuck. + """ try: with kb.connect() as conn: - row = conn.execute( - "SELECT 1 FROM tasks " - "WHERE status = 'ready' AND assignee IS NOT NULL " - " AND claim_lock IS NULL LIMIT 1" - ).fetchone() - return row is not None + return kb.has_spawnable_ready(conn) except Exception: return False diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index fb20278392..9ed9f512b1 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -2118,6 +2118,15 @@ class DispatchResult: spawned: list[tuple[str, str, str]] = field(default_factory=list) """List of ``(task_id, assignee, workspace_path)`` triples.""" skipped_unassigned: list[str] = field(default_factory=list) + """Ready task ids skipped because they have no assignee at all. + Operator-actionable — usually a misfiled task waiting for routing.""" + skipped_nonspawnable: list[str] = field(default_factory=list) + """Ready task ids skipped because their assignee names a control-plane + lane (a Claude Code terminal like ``orion-cc``) rather than a Hermes + profile. Expected steady-state on multi-lane setups; NOT an + operator-actionable failure. Tracked separately so health telemetry + can distinguish "real stuck" (nothing spawned but spawnable work + available) from "correctly idle" (nothing spawnable in the queue).""" crashed: list[str] = field(default_factory=list) """Task ids reclaimed because their worker PID disappeared.""" auto_blocked: list[str] = field(default_factory=list) @@ -2459,6 +2468,38 @@ def _clear_spawn_failures(conn: sqlite3.Connection, task_id: str) -> None: ) +def has_spawnable_ready(conn: sqlite3.Connection) -> bool: + """Return True iff there is at least one ready+assigned+unclaimed task + whose assignee maps to a real Hermes profile. + + Used by the gateway- and CLI-embedded dispatchers' health telemetry to + decide whether ``0 spawned`` is a "stuck" condition (real spawnable + work waiting) or a "correctly idle" condition (only control-plane + lanes like ``orion-cc`` / ``orion-research`` waiting on terminals + that pull tasks via ``claim_task`` directly). + + Falls back to "any ready+assigned" if ``profile_exists`` is not + importable (e.g. partial install) — preserves the old behavior so + the warning still fires in degraded environments. + """ + rows = conn.execute( + "SELECT DISTINCT assignee FROM tasks " + "WHERE status = 'ready' AND assignee IS NOT NULL " + " AND claim_lock IS NULL" + ).fetchall() + if not rows: + return False + try: + from hermes_cli.profiles import profile_exists # local import: avoids cycle + except Exception: + # Can't introspect — assume spawnable, preserve legacy behavior. + return True + for row in rows: + if profile_exists(row["assignee"]): + return True + return False + + def dispatch_once( conn: sqlite3.Connection, *, @@ -2521,7 +2562,13 @@ def dispatch_once( except Exception: profile_exists = None # type: ignore[assignment] if profile_exists is not None and not profile_exists(row["assignee"]): - result.skipped_unassigned.append(row["id"]) + # Bucket separately from skipped_unassigned: the operator + # cannot fix this by assigning a profile (the assignee IS the + # intended owner — a terminal lane). Health telemetry uses + # this distinction to suppress spurious "stuck" warnings on + # multi-lane setups where the ready queue is steadily full + # of human-pulled work. + result.skipped_nonspawnable.append(row["id"]) continue if dry_run: result.spawned.append((row["id"], row["assignee"], "")) diff --git a/tests/hermes_cli/conftest.py b/tests/hermes_cli/conftest.py new file mode 100644 index 0000000000..531f033e7e --- /dev/null +++ b/tests/hermes_cli/conftest.py @@ -0,0 +1,19 @@ +"""Fixtures shared across hermes_cli kanban tests.""" + +from __future__ import annotations + +import pytest + + +@pytest.fixture +def all_assignees_spawnable(monkeypatch): + """Pretend every assignee maps to a real Hermes profile. + + Most dispatcher tests use synthetic assignees ("alice", "bob") that + don't correspond to actual profile directories on disk. Without this + patch, the dispatcher's profile-exists guard (PR #20105) routes + those tasks into ``skipped_nonspawnable`` instead of spawning, which + would break tests that assert spawn behavior. + """ + from hermes_cli import profiles + monkeypatch.setattr(profiles, "profile_exists", lambda name: True) diff --git a/tests/hermes_cli/test_kanban_core_functionality.py b/tests/hermes_cli/test_kanban_core_functionality.py index a7896bf940..3c5454b35c 100644 --- a/tests/hermes_cli/test_kanban_core_functionality.py +++ b/tests/hermes_cli/test_kanban_core_functionality.py @@ -80,7 +80,7 @@ def test_no_idempotency_key_never_collides(kanban_home): # Spawn-failure circuit breaker # --------------------------------------------------------------------------- -def test_spawn_failure_auto_blocks_after_limit(kanban_home): +def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawnable): """N consecutive spawn failures on the same task → auto_blocked.""" def _bad_spawn(task, ws): raise RuntimeError("no PATH") @@ -109,7 +109,7 @@ def test_spawn_failure_auto_blocks_after_limit(kanban_home): conn.close() -def test_successful_spawn_resets_failure_counter(kanban_home): +def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spawnable): """A successful spawn clears the counter so past failures don't count against future retries of the same task.""" calls = [0] @@ -138,7 +138,7 @@ def test_successful_spawn_resets_failure_counter(kanban_home): conn.close() -def test_workspace_resolution_failure_also_counts(kanban_home): +def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spawnable): """`dir:` workspace with no path should fail workspace resolution AND count against the failure budget — not just crash the tick.""" conn = kb.connect() @@ -824,7 +824,7 @@ def test_recompute_ready_emits_promoted_not_ready(kanban_home): conn.close() -def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home): +def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home, all_assignees_spawnable): def _bad(task, ws): raise RuntimeError("nope") conn = kb.connect() @@ -840,7 +840,7 @@ def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home): conn.close() -def test_spawned_event_emitted_with_pid(kanban_home): +def test_spawned_event_emitted_with_pid(kanban_home, all_assignees_spawnable): """Successful spawn must append a ``spawned`` event with the pid in the payload so humans tailing events see pid tracking.""" def _spawn_returns_pid(task, ws): @@ -1154,7 +1154,7 @@ def test_run_on_block_with_reason(kanban_home): conn.close() -def test_run_on_spawn_failure_records_failed_runs(kanban_home): +def test_run_on_spawn_failure_records_failed_runs(kanban_home, all_assignees_spawnable): """Each spawn_failed event closes a run with outcome='spawn_failed', and the Nth failure closes a run with outcome='gave_up'.""" def _bad(task, ws): diff --git a/tests/hermes_cli/test_kanban_db.py b/tests/hermes_cli/test_kanban_db.py index 1907938b42..e6d25a3d84 100644 --- a/tests/hermes_cli/test_kanban_db.py +++ b/tests/hermes_cli/test_kanban_db.py @@ -327,7 +327,7 @@ def test_worker_context_includes_parent_results_and_comments(kanban_home): # Dispatcher # --------------------------------------------------------------------------- -def test_dispatch_dry_run_does_not_claim(kanban_home): +def test_dispatch_dry_run_does_not_claim(kanban_home, all_assignees_spawnable): with kb.connect() as conn: t1 = kb.create_task(conn, title="a", assignee="alice") t2 = kb.create_task(conn, title="b", assignee="bob") @@ -344,10 +344,58 @@ def test_dispatch_skips_unassigned(kanban_home): t = kb.create_task(conn, title="floater") res = kb.dispatch_once(conn, dry_run=True) assert t in res.skipped_unassigned + assert t not in res.skipped_nonspawnable assert not res.spawned -def test_dispatch_promotes_ready_and_spawns(kanban_home): +def test_dispatch_skips_nonspawnable_into_separate_bucket(kanban_home, monkeypatch): + """Tasks whose assignee fails profile_exists() must NOT land in + ``skipped_unassigned`` (which is operator-actionable) — they go in + the dedicated ``skipped_nonspawnable`` bucket so health telemetry + can suppress false-positive "stuck" warnings.""" + from hermes_cli import profiles + monkeypatch.setattr(profiles, "profile_exists", lambda name: False) + with kb.connect() as conn: + t = kb.create_task(conn, title="for-terminal", assignee="orion-cc") + res = kb.dispatch_once(conn, dry_run=True) + assert t in res.skipped_nonspawnable + assert t not in res.skipped_unassigned + assert not res.spawned + + +def test_has_spawnable_ready_false_when_only_terminal_lanes(kanban_home, monkeypatch): + """``has_spawnable_ready`` returns False when every ready task is + assigned to a control-plane lane — used by gateway/CLI dispatchers + to silence the stuck-warn while terminals still have queued work.""" + from hermes_cli import profiles + monkeypatch.setattr(profiles, "profile_exists", lambda name: False) + with kb.connect() as conn: + kb.create_task(conn, title="t1", assignee="orion-cc") + kb.create_task(conn, title="t2", assignee="orion-research") + assert kb.has_spawnable_ready(conn) is False + + +def test_has_spawnable_ready_true_when_real_profile_present(kanban_home, monkeypatch): + """``has_spawnable_ready`` returns True as soon as ANY ready task + has an assignee that maps to a real Hermes profile — preserves the + real "stuck" signal when a daily/agent task is queued.""" + from hermes_cli import profiles + monkeypatch.setattr( + profiles, "profile_exists", lambda name: name == "daily" + ) + with kb.connect() as conn: + kb.create_task(conn, title="terminal-task", assignee="orion-cc") + kb.create_task(conn, title="hermes-task", assignee="daily") + assert kb.has_spawnable_ready(conn) is True + + +def test_has_spawnable_ready_false_on_empty_queue(kanban_home): + """Empty queue is the trivial false case — no ready tasks at all.""" + with kb.connect() as conn: + assert kb.has_spawnable_ready(conn) is False + + +def test_dispatch_promotes_ready_and_spawns(kanban_home, all_assignees_spawnable): spawns = [] def fake_spawn(task, workspace): @@ -368,7 +416,7 @@ def test_dispatch_promotes_ready_and_spawns(kanban_home): assert kb.get_task(conn, c).status == "running" -def test_dispatch_spawn_failure_releases_claim(kanban_home): +def test_dispatch_spawn_failure_releases_claim(kanban_home, all_assignees_spawnable): def boom(task, workspace): raise RuntimeError("spawn failed")