fix: defer scratch workspace cleanup when task has active children (#33774)

When a Kanban task with workspace_kind=scratch completes, the
_cleanup_workspace() function immediately deletes the workspace
directory. If the task has children linked via task_links, those
children find the workspace deleted when they start.

This fix adds two checks:
1. Before deleting, check if any children are still active (any
   status other than done/archived/failed/cancelled, e.g.
   todo/ready/running). If so, defer cleanup.
2. After a child completes, check if parent workspace can now
   be cleaned up (all children terminal).

Fixes NousResearch/hermes-agent#33774
This commit is contained in:
annguyenNous 2026-05-28 21:27:55 +07:00 committed by Teknium
parent cb3e41e2fd
commit 9405cd0812

View file

@ -3839,6 +3839,23 @@ def _cleanup_workspace(conn: sqlite3.Connection, task_id: str) -> None:
path: Optional[str] = row["workspace_path"]
if kind != "scratch" or not path:
return
# Check if this task has children that still need the workspace.
# If any child is not yet done/archived, defer cleanup so the
# child can read handoff artifacts from the scratch dir (#33774).
_active_children = conn.execute(
"SELECT 1 FROM task_links l "
"JOIN tasks t ON t.id = l.child_id "
"WHERE l.parent_id = ? AND t.status NOT IN ('done', 'archived', 'failed', 'cancelled') "
"LIMIT 1",
(task_id,),
).fetchone()
if _active_children:
_log.debug(
"Deferring scratch workspace cleanup for task %s: "
"active children still need workspace at %s",
task_id, path,
)
return
import shutil
wp = Path(path)
if wp.is_dir():
@ -3860,10 +3877,54 @@ def _cleanup_workspace(conn: sqlite3.Connection, task_id: str) -> None:
# Also kill the tmux session for the worker that owned this task,
# if the tmux session is now dead (worker process exited).
_cleanup_worker_tmux(conn, task_id)
# After cleaning up this task's workspace, check if any parent
# tasks now have all children done — their deferred cleanup can
# proceed (#33774).
_try_cleanup_parent_workspaces(conn, task_id)
except Exception:
pass # best-effort — never block completion
def _try_cleanup_parent_workspaces(conn: sqlite3.Connection, task_id: str) -> None:
    """Remove parent scratch workspaces whose cleanup was deferred (#33774).

    Intended to run after *task_id* has finished. For every parent linked to
    *task_id* through ``task_links``, the parent's scratch workspace directory
    is removed when all of the following hold:

    * the parent has ``workspace_kind == "scratch"`` and a non-empty
      ``workspace_path``;
    * the parent itself is in a terminal status (done/archived/failed/
      cancelled) -- a parent that is still active keeps its workspace even if
      every child already finished or was cancelled;
    * none of the parent's children is in a non-terminal status;
    * the path is an existing directory accepted by
      ``_is_managed_scratch_path``.

    The function is best-effort: failures are logged at debug level and never
    propagate, and a failure on one parent does not stop the others from
    being processed.

    Args:
        conn: Open SQLite connection to the database holding ``tasks`` and
            ``task_links``.
        task_id: ID of the child task that just finished.
    """
    import shutil

    # Single source of truth for "this task no longer needs a workspace".
    terminal_statuses = ("done", "archived", "failed", "cancelled")
    placeholders = ", ".join("?" for _ in terminal_statuses)

    try:
        parents = conn.execute(
            "SELECT parent_id FROM task_links WHERE child_id = ?",
            (task_id,),
        ).fetchall()
    except Exception:
        _log.debug(
            "Deferred cleanup: could not look up parents of task %s",
            task_id, exc_info=True,
        )
        return

    for (parent_id,) in parents:
        try:
            row = conn.execute(
                "SELECT workspace_kind, workspace_path, status "
                "FROM tasks WHERE id = ?",
                (parent_id,),
            ).fetchone()
            if not row:
                continue
            # Positional unpacking works for both tuple rows and sqlite3.Row.
            kind, path, status = row
            if kind != "scratch" or not path:
                continue
            # Never delete the workspace of a parent that is still active:
            # its children may all be terminal (e.g. cancelled) while the
            # parent is still using the directory.
            if status not in terminal_statuses:
                continue
            # Any child outside the terminal set still needs the workspace.
            active = conn.execute(
                "SELECT 1 FROM task_links l "
                "JOIN tasks t ON t.id = l.child_id "
                "WHERE l.parent_id = ? "
                f"AND t.status NOT IN ({placeholders}) "
                "LIMIT 1",
                (parent_id, *terminal_statuses),
            ).fetchone()
            if active:
                continue
            wp = Path(path)
            if wp.is_dir() and _is_managed_scratch_path(wp):
                shutil.rmtree(wp, ignore_errors=True)
                # rmtree errors are ignored, so verify before claiming success.
                if wp.exists():
                    _log.debug(
                        "Deferred cleanup: could not fully remove parent %s "
                        "scratch workspace: %s",
                        parent_id, wp,
                    )
                else:
                    _log.debug(
                        "Deferred cleanup: removed parent %s scratch workspace: %s",
                        parent_id, wp,
                    )
        except Exception:
            # Best-effort: log and move on to the next parent.
            _log.debug(
                "Deferred cleanup failed for parent %s of task %s",
                parent_id, task_id, exc_info=True,
            )
def _cleanup_worker_tmux(conn: sqlite3.Connection, task_id: str) -> None:
"""Kill the tmux session associated with a task's assignee, if dead."""
try: