From 9405cd0812e578ed311aa8003fa03720dcd482ce Mon Sep 17 00:00:00 2001 From: annguyenNous Date: Thu, 28 May 2026 21:27:55 +0700 Subject: [PATCH] fix: defer scratch workspace cleanup when task has active children (#33774) When a Kanban task with workspace_kind=scratch completes, the _cleanup_workspace() function immediately deletes the workspace directory. If the task has children linked via task_links, those children find the workspace deleted when they start. This fix adds two checks: 1. Before deleting, check if any children are still active (todo/ready/running). If so, defer cleanup. 2. After a child completes, check if parent workspace can now be cleaned up (all children terminal). Fixes NousResearch/hermes-agent#33774 --- hermes_cli/kanban_db.py | 61 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/hermes_cli/kanban_db.py b/hermes_cli/kanban_db.py index c8c53dba7ec..81b17c4f923 100644 --- a/hermes_cli/kanban_db.py +++ b/hermes_cli/kanban_db.py @@ -3839,6 +3839,23 @@ def _cleanup_workspace(conn: sqlite3.Connection, task_id: str) -> None: path: Optional[str] = row["workspace_path"] if kind != "scratch" or not path: return + # Check if this task has children that still need the workspace. + # If any child is not yet done/archived, defer cleanup so the + # child can read handoff artifacts from the scratch dir (#33774). + _active_children = conn.execute( + "SELECT 1 FROM task_links l " + "JOIN tasks t ON t.id = l.child_id " + "WHERE l.parent_id = ? AND t.status NOT IN ('done', 'archived', 'failed', 'cancelled') " + "LIMIT 1", + (task_id,), + ).fetchone() + if _active_children: + _log.debug( + "Deferring scratch workspace cleanup for task %s: " + "active children still need workspace at %s", + task_id, path, + ) + return import shutil wp = Path(path) if wp.is_dir(): @@ -3860,10 +3877,54 @@ def _cleanup_workspace(conn: sqlite3.Connection, task_id: str) -> None: # Also kill the tmux session for the worker that owned this task, # if the tmux session is now dead (worker process exited). _cleanup_worker_tmux(conn, task_id) + # After cleaning up this task's workspace, check if any parent + # tasks now have all children done — their deferred cleanup can + # proceed (#33774). + _try_cleanup_parent_workspaces(conn, task_id) except Exception: pass # best-effort — never block completion +def _try_cleanup_parent_workspaces(conn: sqlite3.Connection, task_id: str) -> None: + """Clean up parent scratch workspaces now that *task_id* completed. + + When a parent task's cleanup was deferred because it had active children, + this function is called after each child completes. If all children of a + parent are now done/archived/failed/cancelled, the parent's scratch + workspace is removed (#33774). + """ + try: + parents = conn.execute( + "SELECT parent_id FROM task_links WHERE child_id = ?", + (task_id,), + ).fetchall() + for (parent_id,) in parents: + row = conn.execute( + "SELECT workspace_kind, workspace_path FROM tasks WHERE id = ?", + (parent_id,), + ).fetchone() + if not row or row["workspace_kind"] != "scratch" or not row["workspace_path"]: + continue + # Check if ALL children of this parent are terminal + active = conn.execute( + "SELECT 1 FROM task_links l " + "JOIN tasks t ON t.id = l.child_id " + "WHERE l.parent_id = ? AND t.status NOT IN ('done', 'archived', 'failed', 'cancelled') " + "LIMIT 1", + (parent_id,), + ).fetchone() + if active: + continue # still has active children + # All children done — safe to clean up parent workspace + import shutil + wp = Path(row["workspace_path"]) + if wp.is_dir() and _is_managed_scratch_path(wp): + shutil.rmtree(wp, ignore_errors=True) + _log.debug("Deferred cleanup: removed parent %s scratch workspace: %s", parent_id, wp) + except Exception: + pass # best-effort + + def _cleanup_worker_tmux(conn: sqlite3.Connection, task_id: str) -> None: """Kill the tmux session associated with a task's assignee, if dead.""" try: