From 9f008bcd5c795a685bde96db0d66758d456bd83a Mon Sep 17 00:00:00 2001 From: LeonJS <48821084+LeonJS@users.noreply.github.com> Date: Mon, 18 May 2026 20:45:24 -0700 Subject: [PATCH] fix(kanban): release scratch workspace and tmux session on task completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Salvages #27369 by @LeonJS. complete_task() now calls _cleanup_workspace() and _cleanup_worker_tmux() after marking a task complete. Scratch workspaces (used by swarm agents) accumulate on disk — hundreds of MB per task, never released. Stale tmux sessions from completed agents also persist indefinitely. Both gates are safe: - workspace_kind == 'scratch' gate preserves user worktree/dir workspaces - tmux #{pane_dead} == 1 gate only kills sessions where the worker has already exited - best-effort: cleanup failures never block task completion --- hermes_cli/kanban_db.py | 66 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index e72dae65d0f..acdac80c786 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -79,6 +79,7 @@ import sqlite3 import subprocess import sys import threading +import logging import time from dataclasses import dataclass, field from pathlib import Path @@ -86,6 +87,8 @@ from typing import Any, Iterable, Optional from toolsets import get_toolset_names +_log = logging.getLogger(__name__) + # --------------------------------------------------------------------------- # Constants @@ -2622,9 +2625,72 @@ def complete_task( _clear_failure_counter(conn, task_id) # Recompute ready status for dependents (separate txn so children see done). recompute_ready(conn) + # Clean up the scratch workspace and any stale tmux session for the worker. 
+    _cleanup_workspace(conn, task_id)
     return True
 
 
+# ---------------------------------------------------------------------------
+# Workspace / tmux cleanup
+# ---------------------------------------------------------------------------
+
+def _cleanup_workspace(conn: sqlite3.Connection, task_id: str) -> None:
+    """Remove a task's scratch workspace dir and kill its stale tmux session.
+
+    Called from :func:`complete_task` after the task has been marked done.
+    Best-effort: every failure is logged at debug level and swallowed, so
+    cleanup never blocks or fails task completion.
+
+    Only ``scratch`` workspaces are removed; ``worktree`` and ``dir``
+    workspaces are intentionally preserved. The tmux cleanup is attempted
+    only for tasks that pass that same gate (``scratch`` kind with a
+    recorded workspace path).
+
+    Args:
+        conn: Open database connection whose rows support lookup by column
+            name (``row["workspace_kind"]``).
+        task_id: ID of the task that was just completed.
+    """
+    try:
+        row = conn.execute(
+            "SELECT workspace_kind, workspace_path FROM tasks WHERE id = ?",
+            (task_id,),
+        ).fetchone()
+        if not row:
+            return
+        kind: Optional[str] = row["workspace_kind"]
+        path: Optional[str] = row["workspace_path"]
+        # NOTE(review): this early return also skips the tmux cleanup for
+        # non-scratch tasks -- confirm that coupling is intended.
+        if kind != "scratch" or not path:
+            return
+    except Exception:
+        # Best-effort: record why we gave up, but never block completion.
+        _log.debug(
+            "Workspace lookup failed for task %s", task_id, exc_info=True
+        )
+        return
+
+    try:
+        import shutil
+
+        wp = Path(path)
+        if wp.is_dir():
+            shutil.rmtree(wp, ignore_errors=True)
+            # rmtree(ignore_errors=True) never raises, so check the outcome
+            # instead of assuming the directory is gone.
+            if wp.exists():
+                _log.debug(
+                    "Could not fully remove scratch workspace: %s", wp
+                )
+            else:
+                _log.debug("Removed scratch workspace: %s", wp)
+    except Exception:
+        _log.debug(
+            "Scratch workspace removal failed for task %s",
+            task_id,
+            exc_info=True,
+        )
+
+    # Kept outside the removal ``try`` so a filesystem problem does not also
+    # skip the tmux cleanup. _cleanup_worker_tmux swallows its own errors.
+    _cleanup_worker_tmux(conn, task_id)
+
+
+def _cleanup_worker_tmux(conn: sqlite3.Connection, task_id: str) -> None:
+    """Kill the tmux session of a task's assignee once all its panes are dead.
+
+    Best-effort: any failure (database error, tmux not installed, timeout,
+    tmux returning an error) is logged at debug level and swallowed.
+
+    Args:
+        conn: Open database connection whose rows support lookup by column
+            name (``row["assignee"]``).
+        task_id: ID of the task whose assignee's session should be checked.
+    """
+    try:
+        row = conn.execute(
+            "SELECT assignee FROM tasks WHERE id = ?", (task_id,)
+        ).fetchone()
+        if not row or not row["assignee"]:
+            return
+        assignee: str = row["assignee"]
+        # Workers named swarm1-12 use tmux sessions named swarm-swarm1 etc.
+        # NOTE(review): tmux falls back to prefix/fnmatch matching for -t
+        # when no exact name matches, so "swarm-swarm1" may resolve to
+        # "swarm-swarm10" -- consider tmux's "=" exact-match prefix; confirm
+        # against the tmux version in use.
+        session = f"swarm-{assignee}"
+        # -s lists the panes of every window in the session, one
+        # "#{pane_dead}" value per line.
+        out = subprocess.run(
+            ["tmux", "list-panes", "-s", "-t", session, "-F", "#{pane_dead}"],
+            capture_output=True, text=True, timeout=5,
+        )
+        pane_states = out.stdout.split()
+        # kill-session takes down the whole session, so require that every
+        # pane has exited rather than only the first one reported.
+        all_dead = bool(pane_states) and all(s == "1" for s in pane_states)
+        if out.returncode != 0 or not all_dead:
+            return
+        killed = subprocess.run(
+            ["tmux", "kill-session", "-t", session],
+            capture_output=True, timeout=5,
+        )
+        if killed.returncode == 0:
+            _log.debug("Killed stale tmux session: %s", session)
+        else:
+            _log.debug(
+                "tmux kill-session failed for %s (exit %s)",
+                session,
+                killed.returncode,
+            )
+    except Exception:
+        # Best-effort: record the failure, but never block completion.
+        _log.debug("tmux cleanup failed for task %s", task_id, exc_info=True)
+
+
 def edit_completed_task_result(
     conn: sqlite3.Connection,
     task_id: str,