fix(process): reconcile session.exited against real child exit in poll/wait (#17430)

When a background terminal process spawns a descendant daemon that
inherits the stdout pipe (e.g. 'hermes update' triggering a gateway
systemctl restart), the reader thread's stdout.read() never returns EOF
and its finally: block never runs. session.exited stays False forever,
so process(action='poll') returns 'running' indefinitely even though
the direct child exited long ago.

Issue #17327: Feishu user polled 74 times over 7 minutes before killing
the gateway manually.

Fix: add _reconcile_local_exit() that checks the direct Popen.poll()
before trusting session.exited. If the direct child has exited, drain
any immediately-readable bytes non-blocking and flip session.exited.
Called from poll() and wait(). The stuck reader thread remains blocked
but is a daemon thread and gets reaped with the process.

Safe no-op for env/PTY sessions, already-exited sessions, and live
children (returns None from Popen.poll()).
This commit is contained in:
Teknium 2026-04-29 04:59:21 -07:00 committed by GitHub
parent 13683c0842
commit 20b759cd02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 208 additions and 0 deletions

View file

@ -800,6 +800,78 @@ class ProcessRegistry:
session = self._running.get(session_id) or self._finished.get(session_id)
return self._refresh_detached_session(session)
def _reconcile_local_exit(self, session: "ProcessSession") -> None:
"""Reconcile session.exited against the real child process state.
The reader thread (`_reader_loop`) sets `session.exited = True` only
in its `finally` block, which runs when `stdout.read()` returns EOF.
If the direct `Popen` child has exited but a descendant process (e.g.
a daemon spawned by `hermes update` restarting the gateway) is still
holding the stdout pipe open, the reader blocks forever and poll()
keeps returning "running" indefinitely (issue #17327 — 74 polls over
7 minutes on Feishu).
This helper closes that window: when `session.exited` is still False
but the direct child's `Popen.poll()` reports an exit code, drain any
readable bytes non-blocking and flip `session.exited`. The orphaned
reader thread remains stuck on its blocking `read()` but is a daemon
thread and will be reaped with the process.
Safe no-op on sessions without a local `Popen` (env/PTY), already-
exited sessions, and detached-recovered sessions.
"""
if session is None or session.exited:
return
proc = getattr(session, "process", None)
if proc is None:
return
try:
rc = proc.poll()
except Exception:
return
if rc is None:
return # Direct child still running — reader block is legitimate.
# Direct child exited. Try to drain any bytes the reader hasn't
# consumed yet. This is best-effort: if the pipe is held open by a
# descendant, the non-blocking read returns what's immediately
# available and we stop.
drained = ""
stdout = getattr(proc, "stdout", None)
if stdout is not None and not _IS_WINDOWS:
try:
import fcntl
fd = stdout.fileno()
flags = fcntl.fcntl(fd, fcntl.F_GETFL)
fcntl.fcntl(fd, fcntl.F_SETFL, flags | os.O_NONBLOCK)
try:
chunk = stdout.read()
if chunk:
drained = chunk if isinstance(chunk, str) else chunk.decode("utf-8", errors="replace")
except (BlockingIOError, OSError, ValueError):
pass
finally:
try:
fcntl.fcntl(fd, fcntl.F_SETFL, flags)
except Exception:
pass
except Exception as e:
logger.debug("Non-blocking drain failed for %s: %s", session.id, e)
with session._lock:
if drained:
session.output_buffer += drained
if len(session.output_buffer) > session.max_output_chars:
session.output_buffer = session.output_buffer[-session.max_output_chars:]
session.exited = True
session.exit_code = rc
logger.info(
"Reconciled session %s: direct child exited with code %s but reader "
"was still blocked (orphaned pipe). Flipped to exited.",
session.id, rc,
)
self._move_to_finished(session)
def poll(self, session_id: str) -> dict:
"""Check status and get new output for a background process."""
from tools.ansi_strip import strip_ansi
@ -808,6 +880,10 @@ class ProcessRegistry:
if session is None:
return {"status": "not_found", "error": f"No process with ID {session_id}"}
# Reconcile against real child state before reading session.exited.
# Guards against orphaned-pipe reader hangs (issue #17327).
self._reconcile_local_exit(session)
with session._lock:
output_preview = strip_ansi(session.output_buffer[-1000:]) if session.output_buffer else ""
@ -898,6 +974,10 @@ class ProcessRegistry:
while time.monotonic() < deadline:
session = self._refresh_detached_session(session)
# Reconcile against real child state — guards against orphaned-
# pipe reader hangs where the reader is blocked but the direct
# child has already exited (issue #17327).
self._reconcile_local_exit(session)
if session.exited:
self._completion_consumed.add(session_id)
result = {