feat(kanban): worker visibility endpoints (workers/active, runs/{id}, inspect)

Adds three read-only endpoints to the kanban dashboard plugin so the
SwitchUI workspace (and any other dashboard consumer) can track
workers across tasks without N+1 round-trips through /tasks/{task_id}.

- GET /workers/active
  Single SQL JOIN of task_runs + tasks where ended_at IS NULL,
  worker_pid IS NOT NULL, status='running'. Returns
  {workers: [...], count, checked_at}.

- GET /runs/{run_id}
  Direct lookup of any task_run row by id. Reuses existing
  kanban_db.get_run() helper and _run_dict() serialiser. 404 when
  not found. Mirrors GET /tasks/{task_id} 404 shape.

- GET /runs/{run_id}/inspect
  Live PID stats via psutil.Process.as_dict() — cpu_percent,
  memory_rss_bytes, memory_vms_bytes, num_threads, num_fds, status,
  create_time, cmdline. Short-circuits with alive:false when run
  has ended, has no worker_pid, the pid is gone, or psutil is
  unavailable. AccessDenied surfaces as alive:true with error
  rather than a 500.

11 new tests in tests/plugins/test_kanban_worker_runs.py cover the
empty-board case, running-task case, ended-run filtering,
missing-pid filtering, 404 paths, already-ended inspect, no-pid
inspect, dead-pid inspect, and live-pid inspect (psutil mocked).
All pass.

Companion termination endpoint (POST /runs/{run_id}/terminate) is
intentionally out of scope here — opening a separate issue first
since the RBAC and dispatcher-mediated soft-cancel design needs
maintainer input before code.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Interstellar-code 2026-05-18 21:01:42 -07:00 committed by Teknium
parent b65dfbb453
commit 02efad704f
2 changed files with 463 additions and 0 deletions

View file

@ -1050,6 +1050,168 @@ def list_diagnostics(
conn.close()
# ---------------------------------------------------------------------------
# Worker visibility — cross-task active-worker list and per-run inspection
# ---------------------------------------------------------------------------
try:
import psutil as _psutil
except ImportError:
_psutil = None # type: ignore[assignment]
@router.get("/workers/active")
def list_active_workers(
board: Optional[str] = Query(None, description="Kanban board slug (omit for current)"),
):
"""Return every currently-running worker on the board.
A worker is a ``task_runs`` row whose ``ended_at`` is NULL and whose
``worker_pid`` is non-NULL, belonging to a task with ``status='running'``.
Returns ``{workers: [...], count: N, checked_at: <epoch>}``. Each
worker entry carries enough context for the dashboard to link back to
its task without a second round-trip.
"""
board = _resolve_board(board)
conn = _conn(board=board)
try:
rows = conn.execute(
"""
SELECT
r.id AS run_id,
r.task_id,
t.title AS task_title,
t.status AS task_status,
t.assignee AS task_assignee,
r.profile,
r.worker_pid,
r.started_at,
r.claim_lock,
r.claim_expires,
r.last_heartbeat_at,
r.max_runtime_seconds
FROM task_runs r
JOIN tasks t ON t.id = r.task_id
WHERE r.ended_at IS NULL
AND r.worker_pid IS NOT NULL
AND t.status = 'running'
ORDER BY r.started_at ASC
""",
).fetchall()
workers = [
{
"run_id": row["run_id"],
"task_id": row["task_id"],
"task_title": row["task_title"],
"task_status": row["task_status"],
"task_assignee": row["task_assignee"],
"profile": row["profile"],
"worker_pid": row["worker_pid"],
"started_at": row["started_at"],
"claim_lock": row["claim_lock"],
"claim_expires": row["claim_expires"],
"last_heartbeat_at": row["last_heartbeat_at"],
"max_runtime_seconds": row["max_runtime_seconds"],
}
for row in rows
]
return {"workers": workers, "count": len(workers), "checked_at": int(time.time())}
finally:
conn.close()
@router.get("/runs/{run_id}")
def get_run_endpoint(
run_id: int,
board: Optional[str] = Query(None, description="Kanban board slug (omit for current)"),
):
"""Direct lookup of a ``task_runs`` row by its integer id.
Returns ``{run: {...}}`` using the same serialisation as the
per-task run history embedded in ``GET /tasks/{task_id}``.
404 when no such run exists.
"""
board = _resolve_board(board)
conn = _conn(board=board)
try:
r = kanban_db.get_run(conn, run_id)
if r is None:
raise HTTPException(status_code=404, detail=f"run {run_id} not found")
return {"run": _run_dict(r)}
finally:
conn.close()
@router.get("/runs/{run_id}/inspect")
def inspect_run_endpoint(
run_id: int,
board: Optional[str] = Query(None, description="Kanban board slug (omit for current)"),
):
"""Live PID stats for a run's worker process via psutil.
If the run has already ended, or has no recorded ``worker_pid``,
returns ``{alive: false}`` with a human-readable ``reason``.
When the process is live, returns CPU, memory, thread count, fd count,
status, create_time, and cmdline. ``access_denied`` is set when the
OS refuses inspection rather than raising a 500.
psutil availability: if psutil is not installed the endpoint still
works but ``alive`` is always returned as ``false`` with
``reason="psutil not available"``.
"""
board = _resolve_board(board)
conn = _conn(board=board)
try:
r = kanban_db.get_run(conn, run_id)
if r is None:
raise HTTPException(status_code=404, detail=f"run {run_id} not found")
finally:
conn.close()
if r.ended_at is not None:
return {"run_id": run_id, "alive": False, "reason": "run already ended"}
if r.worker_pid is None:
return {"run_id": run_id, "alive": False, "reason": "no worker_pid recorded"}
pid = r.worker_pid
if _psutil is None:
return {"run_id": run_id, "alive": False, "pid": pid, "reason": "psutil not available"}
try:
proc = _psutil.Process(pid)
info = proc.as_dict(attrs=[
"cpu_percent", "memory_info", "num_threads",
"status", "create_time", "cmdline",
])
# num_fds is POSIX-only; skip gracefully on Windows.
try:
num_fds = proc.num_fds()
except AttributeError:
num_fds = None
mem = info.get("memory_info")
return {
"run_id": run_id,
"alive": True,
"pid": pid,
"cpu_percent": info.get("cpu_percent"),
"memory_rss_bytes": mem.rss if mem else None,
"memory_vms_bytes": mem.vms if mem else None,
"num_threads": info.get("num_threads"),
"num_fds": num_fds,
"status": info.get("status"),
"create_time": info.get("create_time"),
"cmdline": info.get("cmdline"),
}
except _psutil.NoSuchProcess:
return {"run_id": run_id, "alive": False, "pid": pid, "reason": "process not found"}
except _psutil.AccessDenied:
return {"run_id": run_id, "alive": True, "pid": pid, "error": "access denied"}
# ---------------------------------------------------------------------------
# Recovery actions — reclaim a running claim, reassign to a new profile
# ---------------------------------------------------------------------------