mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(tui): session.create build thread must clean up if session.close races (#12555)
When a user hits /new or /resume before the previous session finishes initializing, session.close runs while the previous session.create's _build thread is still constructing the agent. session.close pops _sessions[sid] and closes whatever slash_worker it finds (None at that point — _build hasn't installed it yet), then returns. _build keeps running in the background, installs the slash_worker subprocess and registers an approval-notify callback on a session dict that's now unreachable via _sessions. The subprocess leaks until process exit; the notify callback lingers in the global registry. Fix: _build now tracks what it allocates (worker, notify_registered) and checks in its finally block whether _sessions[sid] still points to the session it's building for. If not, the build was orphaned by a racing close, so clean up the subprocess and unregister the notify ourselves. tui_gateway/server.py: - _build reads _sessions.get(sid) safely (returns early if already gone) - tracks allocated worker + notify registration - finally checks orphan status and cleans up Tests (tests/test_tui_gateway_server.py): 2 new cases. - test_session_create_close_race_does_not_orphan_worker: slow _make_agent, close mid-build, verify worker.close() and unregister_gateway_notify both fire from the build thread's cleanup path. - test_session_create_no_race_keeps_worker_alive: regression guard — happy path does NOT over-eagerly clean up a live worker. Validated: against the unpatched code, the race test fails with 'orphan worker was not cleaned up — closed_workers=[]'. Live E2E against the live Python environment confirmed the cleanup fires exactly when the race happens.
This commit is contained in:
parent
37524a574e
commit
c567adb58a
2 changed files with 196 additions and 2 deletions
|
|
@ -1088,7 +1088,23 @@ def _(rid, params: dict) -> dict:
|
|||
}
|
||||
|
||||
def _build() -> None:
|
||||
session = _sessions[sid]
|
||||
session = _sessions.get(sid)
|
||||
if session is None:
|
||||
# session.close ran before the build thread got scheduled.
|
||||
ready.set()
|
||||
return
|
||||
|
||||
# Track what we allocate so we can clean up if session.close
|
||||
# races us to the finish line. session.close pops _sessions[sid]
|
||||
# unconditionally and tries to close the slash_worker it finds;
|
||||
# if _build is still mid-construction when close runs, close
|
||||
# finds slash_worker=None / notify unregistered and returns
|
||||
# cleanly — leaving us, the build thread, to later install the
|
||||
# worker + notify on an orphaned session dict. The finally
|
||||
# block below detects the orphan and cleans up instead of
|
||||
# leaking a subprocess and a global notify registration.
|
||||
worker = None
|
||||
notify_registered = False
|
||||
try:
|
||||
tokens = _set_session_context(key)
|
||||
try:
|
||||
|
|
@ -1100,13 +1116,15 @@ def _(rid, params: dict) -> dict:
|
|||
session["agent"] = agent
|
||||
|
||||
try:
|
||||
session["slash_worker"] = _SlashWorker(key, getattr(agent, "model", _resolve_model()))
|
||||
worker = _SlashWorker(key, getattr(agent, "model", _resolve_model()))
|
||||
session["slash_worker"] = worker
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
try:
|
||||
from tools.approval import register_gateway_notify, load_permanent_allowlist
|
||||
register_gateway_notify(key, lambda data: _emit("approval.request", sid, data))
|
||||
notify_registered = True
|
||||
load_permanent_allowlist()
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -1122,6 +1140,23 @@ def _(rid, params: dict) -> dict:
|
|||
session["agent_error"] = str(e)
|
||||
_emit("error", sid, {"message": f"agent init failed: {e}"})
|
||||
finally:
|
||||
# Orphan check: if session.close raced us and popped
|
||||
# _sessions[sid] while we were building, the dict we just
|
||||
# populated is unreachable. Clean up the subprocess and
|
||||
# the global notify registration ourselves — session.close
|
||||
# couldn't see them at the time it ran.
|
||||
if _sessions.get(sid) is not session:
|
||||
if worker is not None:
|
||||
try:
|
||||
worker.close()
|
||||
except Exception:
|
||||
pass
|
||||
if notify_registered:
|
||||
try:
|
||||
from tools.approval import unregister_gateway_notify
|
||||
unregister_gateway_notify(key)
|
||||
except Exception:
|
||||
pass
|
||||
ready.set()
|
||||
|
||||
threading.Thread(target=_build, daemon=True).start()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue