diff --git a/tests/tools/test_local_background_child_hang.py b/tests/tools/test_local_background_child_hang.py new file mode 100644 index 0000000000..e8014e875c --- /dev/null +++ b/tests/tools/test_local_background_child_hang.py @@ -0,0 +1,111 @@ +"""Regression tests for issue #8340. + +When a user command backgrounds a child process (``cmd &``, ``setsid cmd & +disown``, etc.), the backgrounded grandchild inherits the write-end of our +stdout pipe via fork(). Before the fix, the drain thread's blocking +``for line in proc.stdout`` would never see EOF until that grandchild +closed the pipe — causing the terminal tool to hang for the full lifetime +of the backgrounded service (indefinitely for a uvicorn server). + +The fix switches ``_drain()`` to select()-based non-blocking reads and +stops draining shortly after bash exits even if the pipe hasn't EOF'd. +""" +import json +import subprocess +import time + +import pytest + +from tools.environments.local import LocalEnvironment + + +def _pkill(pattern: str) -> None: + subprocess.run(f"pkill -9 -f {pattern!r} 2>/dev/null", shell=True) + + +@pytest.fixture +def local_env(): + env = LocalEnvironment(cwd="/tmp") + try: + yield env + finally: + env.cleanup() + + +class TestBackgroundChildDoesNotHang: + """Regression guard for issue #8340.""" + + def test_plain_background_returns_promptly(self, local_env): + """``cmd &`` with no output redirection must not hang on pipe inherit.""" + marker = "hermes_8340_plain_bg" + cmd = f'python3 -c "import time; time.sleep(60)" & echo {marker}' + try: + t0 = time.monotonic() + result = local_env.execute(cmd, timeout=15) + elapsed = time.monotonic() - t0 + + assert elapsed < 4.0, ( + f"terminal_tool hung for {elapsed:.1f}s — drain thread " + f"is still blocking on backgrounded child's inherited pipe fd" + ) + assert result["returncode"] == 0 + assert marker in result["output"] + finally: + _pkill("time.sleep(60)") + + def test_setsid_disown_pattern_returns_promptly(self, local_env): + """The exact pattern from the issue: setsid ... & disown.""" + cmd = ( + 'setsid python3 -c "import time; time.sleep(60)" ' + '> /dev/null 2>&1 < /dev/null & disown; echo started' + ) + try: + t0 = time.monotonic() + result = local_env.execute(cmd, timeout=15) + elapsed = time.monotonic() - t0 + + assert elapsed < 4.0, f"setsid+disown path hung for {elapsed:.1f}s" + assert result["returncode"] == 0 + assert "started" in result["output"] + finally: + _pkill("time.sleep(60)") + + def test_foreground_streaming_output_still_captured(self, local_env): + """Sanity: incremental output over time must still be captured in full.""" + cmd = 'for i in 1 2 3; do echo "tick $i"; sleep 0.2; done; echo done' + t0 = time.monotonic() + result = local_env.execute(cmd, timeout=10) + elapsed = time.monotonic() - t0 + + # Loop body sleeps ~0.6s total — elapsed should be close to that. + assert 0.5 < elapsed < 3.0 + assert result["returncode"] == 0 + for expected in ("tick 1", "tick 2", "tick 3", "done"): + assert expected in result["output"], f"missing {expected!r}" + + def test_high_volume_output_complete(self, local_env): + """Sanity: select-based drain must not drop lines under load.""" + result = local_env.execute("seq 1 3000", timeout=10) + lines = result["output"].strip().split("\n") + assert result["returncode"] == 0 + assert len(lines) == 3000 + assert lines[0] == "1" + assert lines[-1] == "3000" + + def test_timeout_path_still_works(self, local_env): + """Foreground command exceeding timeout must still be killed.""" + t0 = time.monotonic() + result = local_env.execute("sleep 30", timeout=2) + elapsed = time.monotonic() - t0 + + assert elapsed < 4.0 + assert result["returncode"] == 124 + assert "timed out" in result["output"].lower() + + def test_utf8_output_decoded_correctly(self, local_env): + """Multibyte UTF-8 chunks must decode cleanly under select-based reads.""" + result = local_env.execute("echo 日本語 café résumé", timeout=5) + assert result["returncode"] == 0 + assert "日本語" in result["output"] + assert "café" in result["output"] + assert "résumé" in result["output"] diff --git a/tools/environments/base.py b/tools/environments/base.py index 1bc08449e4..a7f0a849b5 100644 --- a/tools/environments/base.py +++ b/tools/environments/base.py @@ -9,6 +9,7 @@ or a temp file (local). import json import logging import os +import select import shlex import subprocess import threading @@ -436,17 +437,53 @@ class BaseEnvironment(ABC): """ output_chunks: list[str] = [] + # Non-blocking drain via select(). + # + # The old pattern — ``for line in proc.stdout`` — blocks on + # ``readline()`` until the pipe reaches EOF. When the user's command + # backgrounds a process (``cmd &``, ``setsid cmd & disown``, etc.), + # that backgrounded grandchild inherits the write-end of our stdout + # pipe via ``fork()``. Even after ``bash`` itself exits, the pipe + # stays open because the grandchild still holds it — so the drain + # thread never returns and the tool hangs for the full lifetime of + # the grandchild (issue #8340: users reported indefinite hangs when + # restarting uvicorn with ``setsid ... & disown``). + # + # The fix: select() with a short poll interval, and stop draining + # shortly after ``bash`` exits even if the pipe hasn't EOF'd yet. + # Any output the grandchild writes after that point goes to an + # orphaned pipe (harmless — the kernel reaps it when our end closes). def _drain(): - try: - for line in proc.stdout: - output_chunks.append(line) - except UnicodeDecodeError: - output_chunks.clear() - output_chunks.append( - "[binary output detected — raw bytes not displayable]" - ) - except (ValueError, OSError): - pass + fd = proc.stdout.fileno() + idle_after_exit = 0 + while True: + try: + ready, _, _ = select.select([fd], [], [], 0.1) + except (ValueError, OSError): + break # fd already closed + if ready: + try: + chunk = os.read(fd, 4096) + except (ValueError, OSError): + break + if not chunk: + break # true EOF — all writers closed + try: + output_chunks.append(chunk.decode("utf-8")) + except UnicodeDecodeError: + output_chunks.clear() + output_chunks.append( + "[binary output detected — raw bytes not displayable]" + ) + break + idle_after_exit = 0 + elif proc.poll() is not None: + # bash is gone and the pipe was idle for ~100ms. Give + # it two more cycles to catch any buffered tail, then + # stop — otherwise we wait forever on a grandchild pipe. + idle_after_exit += 1 + if idle_after_exit >= 3: + break drain_thread = threading.Thread(target=_drain, daemon=True) drain_thread.start() @@ -553,7 +590,10 @@ class BaseEnvironment(ABC): pass # cleanup is best-effort raise - drain_thread.join(timeout=5) + # Drain thread now exits promptly after bash does (~300ms idle + # check). A short join is enough; a long one would be a bug since + # it means the non-blocking loop itself stopped cooperating. + drain_thread.join(timeout=2) try: proc.stdout.close()