mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(environments): prevent terminal hang when commands background children (#8340)
When a user's command backgrounds a child (`cmd &`, `setsid cmd & disown`, etc.), the backgrounded grandchild inherits the write-end of our stdout pipe via fork(). The old `for line in proc.stdout` drain never EOF'd until the grandchild closed the pipe — so for a uvicorn server, the terminal tool hung indefinitely (users reported the whole session deadlocking when asking the agent to restart a backend). Fix: switch _drain() to select()-based non-blocking reads and stop draining shortly after bash exits even if the pipe hasn't EOF'd. Any output the grandchild writes after that point goes to an orphaned pipe, which is exactly what the user asked for when they said '&'. Adds regression tests covering the issue's exact repro and 5 related patterns (plain bg, setsid+disown, streaming output, high volume, timeout, UTF-8).
This commit is contained in:
parent
611657487f
commit
0a02fbd842
2 changed files with 162 additions and 11 deletions
|
|
@ -9,6 +9,7 @@ or a temp file (local).
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import select
|
||||
import shlex
|
||||
import subprocess
|
||||
import threading
|
||||
|
|
@ -436,17 +437,53 @@ class BaseEnvironment(ABC):
|
|||
"""
|
||||
output_chunks: list[str] = []
|
||||
|
||||
# Non-blocking drain via select().
|
||||
#
|
||||
# The old pattern — ``for line in proc.stdout`` — blocks on
|
||||
# ``readline()`` until the pipe reaches EOF. When the user's command
|
||||
# backgrounds a process (``cmd &``, ``setsid cmd & disown``, etc.),
|
||||
# that backgrounded grandchild inherits the write-end of our stdout
|
||||
# pipe via ``fork()``. Even after ``bash`` itself exits, the pipe
|
||||
# stays open because the grandchild still holds it — so the drain
|
||||
# thread never returns and the tool hangs for the full lifetime of
|
||||
# the grandchild (issue #8340: users reported indefinite hangs when
|
||||
# restarting uvicorn with ``setsid ... & disown``).
|
||||
#
|
||||
# The fix: select() with a short poll interval, and stop draining
|
||||
# shortly after ``bash`` exits even if the pipe hasn't EOF'd yet.
|
||||
# Any output the grandchild writes after that point goes to an
|
||||
# orphaned pipe (harmless — the kernel reaps it when our end closes).
|
||||
def _drain():
|
||||
try:
|
||||
for line in proc.stdout:
|
||||
output_chunks.append(line)
|
||||
except UnicodeDecodeError:
|
||||
output_chunks.clear()
|
||||
output_chunks.append(
|
||||
"[binary output detected — raw bytes not displayable]"
|
||||
)
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
fd = proc.stdout.fileno()
|
||||
idle_after_exit = 0
|
||||
while True:
|
||||
try:
|
||||
ready, _, _ = select.select([fd], [], [], 0.1)
|
||||
except (ValueError, OSError):
|
||||
break # fd already closed
|
||||
if ready:
|
||||
try:
|
||||
chunk = os.read(fd, 4096)
|
||||
except (ValueError, OSError):
|
||||
break
|
||||
if not chunk:
|
||||
break # true EOF — all writers closed
|
||||
try:
|
||||
output_chunks.append(chunk.decode("utf-8"))
|
||||
except UnicodeDecodeError:
|
||||
output_chunks.clear()
|
||||
output_chunks.append(
|
||||
"[binary output detected — raw bytes not displayable]"
|
||||
)
|
||||
break
|
||||
idle_after_exit = 0
|
||||
elif proc.poll() is not None:
|
||||
# bash is gone and the pipe was idle for ~100ms. Give
|
||||
# it two more cycles to catch any buffered tail, then
|
||||
# stop — otherwise we wait forever on a grandchild pipe.
|
||||
idle_after_exit += 1
|
||||
if idle_after_exit >= 3:
|
||||
break
|
||||
|
||||
drain_thread = threading.Thread(target=_drain, daemon=True)
|
||||
drain_thread.start()
|
||||
|
|
@ -553,7 +590,10 @@ class BaseEnvironment(ABC):
|
|||
pass # cleanup is best-effort
|
||||
raise
|
||||
|
||||
drain_thread.join(timeout=5)
|
||||
# Drain thread now exits promptly after bash does (~300ms idle
|
||||
# check). A short join is enough; a long one would be a bug since
|
||||
# it means the non-blocking loop itself stopped cooperating.
|
||||
drain_thread.join(timeout=2)
|
||||
|
||||
try:
|
||||
proc.stdout.close()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue