feat(kanban): worker visibility endpoints (workers/active, runs/{id}, inspect)

Adds three read-only endpoints to the kanban dashboard plugin so the SwitchUI workspace (and any other dashboard consumer) can track workers across tasks without N+1 round-trips through /tasks/{task_id}. - GET /workers/active Single SQL JOIN of task_runs + tasks where ended_at IS NULL, worker_pid IS NOT NULL, status='running'. Returns {workers: [...], count, checked_at}. - GET /runs/{run_id} Direct lookup of any task_run row by id. Reuses existing kanban_db.get_run() helper and _run_dict() serialiser. 404 when not found. Mirrors GET /tasks/{task_id} 404 shape. - GET /runs/{run_id}/inspect Live PID stats via psutil.Process.as_dict() — cpu_percent, memory_rss_bytes, memory_vms_bytes, num_threads, num_fds, status, create_time, cmdline. Short-circuits with alive:false when run has ended, has no worker_pid, the pid is gone, or psutil is unavailable. AccessDenied surfaces as alive:true with error rather than a 500. 11 new tests in tests/plugins/test_kanban_worker_runs.py cover the empty-board case, running-task case, ended-run filtering, missing-pid filtering, 404 paths, already-ended inspect, no-pid inspect, dead-pid inspect, and live-pid inspect (psutil mocked). All pass. Companion termination endpoint (POST /runs/{run_id}/terminate) is intentionally out of scope here — opening a separate issue first since the RBAC and dispatcher-mediated soft-cancel design needs maintainer input before code. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-04 07:31:58 +00:00 · 2026-05-18 21:01:42 -07:00 · 2026-05-18 21:01:42 -07:00 · 02efad704f
commit 02efad704f
parent b65dfbb453
2 changed files with 463 additions and 0 deletions
--- a/plugins/kanban/dashboard/plugin_api.py
+++ b/plugins/kanban/dashboard/plugin_api.py
@ -1050,6 +1050,168 @@ def list_diagnostics(
        conn.close()


+
+# ---------------------------------------------------------------------------
+# Worker visibility — cross-task active-worker list and per-run inspection
+# ---------------------------------------------------------------------------
+
+try:
+    import psutil as _psutil
+except ImportError:
+    _psutil = None  # type: ignore[assignment]
+
+
+@router.get("/workers/active")
+def list_active_workers(
+    board: Optional[str] = Query(None, description="Kanban board slug (omit for current)"),
+):
+    """Return every currently-running worker on the board.
+
+    A worker is a ``task_runs`` row whose ``ended_at`` is NULL and whose
+    ``worker_pid`` is non-NULL, belonging to a task with ``status='running'``.
+
+    Returns ``{workers: [...], count: N, checked_at: <epoch>}``.  Each
+    worker entry carries enough context for the dashboard to link back to
+    its task without a second round-trip.
+    """
+    board = _resolve_board(board)
+    conn = _conn(board=board)
+    try:
+        rows = conn.execute(
+            """
+            SELECT
+                r.id          AS run_id,
+                r.task_id,
+                t.title       AS task_title,
+                t.status      AS task_status,
+                t.assignee    AS task_assignee,
+                r.profile,
+                r.worker_pid,
+                r.started_at,
+                r.claim_lock,
+                r.claim_expires,
+                r.last_heartbeat_at,
+                r.max_runtime_seconds
+            FROM task_runs r
+            JOIN tasks t ON t.id = r.task_id
+            WHERE r.ended_at IS NULL
+              AND r.worker_pid IS NOT NULL
+              AND t.status = 'running'
+            ORDER BY r.started_at ASC
+            """,
+        ).fetchall()
+        workers = [
+            {
+                "run_id": row["run_id"],
+                "task_id": row["task_id"],
+                "task_title": row["task_title"],
+                "task_status": row["task_status"],
+                "task_assignee": row["task_assignee"],
+                "profile": row["profile"],
+                "worker_pid": row["worker_pid"],
+                "started_at": row["started_at"],
+                "claim_lock": row["claim_lock"],
+                "claim_expires": row["claim_expires"],
+                "last_heartbeat_at": row["last_heartbeat_at"],
+                "max_runtime_seconds": row["max_runtime_seconds"],
+            }
+            for row in rows
+        ]
+        return {"workers": workers, "count": len(workers), "checked_at": int(time.time())}
+    finally:
+        conn.close()
+
+
+@router.get("/runs/{run_id}")
+def get_run_endpoint(
+    run_id: int,
+    board: Optional[str] = Query(None, description="Kanban board slug (omit for current)"),
+):
+    """Direct lookup of a ``task_runs`` row by its integer id.
+
+    Returns ``{run: {...}}`` using the same serialisation as the
+    per-task run history embedded in ``GET /tasks/{task_id}``.
+    404 when no such run exists.
+    """
+    board = _resolve_board(board)
+    conn = _conn(board=board)
+    try:
+        r = kanban_db.get_run(conn, run_id)
+        if r is None:
+            raise HTTPException(status_code=404, detail=f"run {run_id} not found")
+        return {"run": _run_dict(r)}
+    finally:
+        conn.close()
+
+
+@router.get("/runs/{run_id}/inspect")
+def inspect_run_endpoint(
+    run_id: int,
+    board: Optional[str] = Query(None, description="Kanban board slug (omit for current)"),
+):
+    """Live PID stats for a run's worker process via psutil.
+
+    If the run has already ended, or has no recorded ``worker_pid``,
+    returns ``{alive: false}`` with a human-readable ``reason``.
+
+    When the process is live, returns CPU, memory, thread count, fd count,
+    status, create_time, and cmdline.  ``access_denied`` is set when the
+    OS refuses inspection rather than raising a 500.
+
+    psutil availability: if psutil is not installed the endpoint still
+    works but ``alive`` is always returned as ``false`` with
+    ``reason="psutil not available"``.
+    """
+    board = _resolve_board(board)
+    conn = _conn(board=board)
+    try:
+        r = kanban_db.get_run(conn, run_id)
+        if r is None:
+            raise HTTPException(status_code=404, detail=f"run {run_id} not found")
+    finally:
+        conn.close()
+
+    if r.ended_at is not None:
+        return {"run_id": run_id, "alive": False, "reason": "run already ended"}
+    if r.worker_pid is None:
+        return {"run_id": run_id, "alive": False, "reason": "no worker_pid recorded"}
+
+    pid = r.worker_pid
+
+    if _psutil is None:
+        return {"run_id": run_id, "alive": False, "pid": pid, "reason": "psutil not available"}
+
+    try:
+        proc = _psutil.Process(pid)
+        info = proc.as_dict(attrs=[
+            "cpu_percent", "memory_info", "num_threads",
+            "status", "create_time", "cmdline",
+        ])
+        # num_fds is POSIX-only; skip gracefully on Windows.
+        try:
+            num_fds = proc.num_fds()
+        except AttributeError:
+            num_fds = None
+        mem = info.get("memory_info")
+        return {
+            "run_id": run_id,
+            "alive": True,
+            "pid": pid,
+            "cpu_percent": info.get("cpu_percent"),
+            "memory_rss_bytes": mem.rss if mem else None,
+            "memory_vms_bytes": mem.vms if mem else None,
+            "num_threads": info.get("num_threads"),
+            "num_fds": num_fds,
+            "status": info.get("status"),
+            "create_time": info.get("create_time"),
+            "cmdline": info.get("cmdline"),
+        }
+    except _psutil.NoSuchProcess:
+        return {"run_id": run_id, "alive": False, "pid": pid, "reason": "process not found"}
+    except _psutil.AccessDenied:
+        return {"run_id": run_id, "alive": True, "pid": pid, "error": "access denied"}
+
+
 # ---------------------------------------------------------------------------
 # Recovery actions — reclaim a running claim, reassign to a new profile
 # ---------------------------------------------------------------------------