fix(process): reconcile session.exited against real child exit in poll/wait (#17430)

When a background terminal process spawns a descendant daemon that
inherits the stdout pipe (e.g. 'hermes update' triggering a gateway
systemctl restart), the reader thread's stdout.read() never returns EOF
and its finally: block never runs. session.exited stays False forever,
so process(action='poll') returns 'running' indefinitely even though
the direct child exited long ago.

Issue #17327: Feishu user polled 74 times over 7 minutes before killing
the gateway manually.

Fix: add _reconcile_local_exit() that checks the direct Popen.poll()
before trusting session.exited. If the direct child has exited, drain
any immediately-readable bytes non-blocking and flip session.exited.
Called from poll() and wait(). The stuck reader thread remains blocked
but is a daemon thread and gets reaped with the process.

Safe no-op for env/PTY sessions, already-exited sessions, and live
children (returns None from Popen.poll()).
This commit is contained in:
Teknium 2026-04-29 04:59:21 -07:00 committed by GitHub
parent 13683c0842
commit 20b759cd02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 208 additions and 0 deletions

View file

@ -103,6 +103,134 @@ class TestGetAndPoll:
assert result["exit_code"] == 0
# =========================================================================
# Orphaned-pipe reconciliation (issue #17327)
# =========================================================================
@pytest.mark.skipif(sys.platform == "win32", reason="POSIX-only: uses setsid/fcntl")
class TestOrphanedPipeReconciliation:
"""Regression tests for issue #17327.
`hermes update` in Feishu spawned a background subprocess that restarted
the gateway; the direct child exited quickly but a descendant daemon
held the stdout pipe open. `_reader_loop.finally` never ran, so
`session.exited` stayed False and the agent polled 74 times over 7
minutes, all returning `status: running`.
The fix is `_reconcile_local_exit()`: poll() and wait() now check the
direct `Popen.poll()` before trusting `session.exited`.
"""
def test_reconcile_flips_exited_when_direct_child_done(self, registry):
"""Direct child exited but reader thread is blocked on orphaned pipe."""
# Simulate the orphaned-pipe scenario: direct child exited, but a
# descendant holds stdout open so the reader never sees EOF.
# Approach: spawn `sh -c 'sleep 10 &'` with setsid — sh forks the
# sleep into a new session group, exits immediately, but sleep
# inherits the stdout pipe and keeps it open.
proc = subprocess.Popen(
["sh", "-c", "exec 1>&2; ( sleep 30 ) & disown; exit 0"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
preexec_fn=os.setsid,
)
s = _make_session(sid="proc_orphan_test")
s.process = proc
s.pid = proc.pid
registry._running[s.id] = s
# Wait for the direct child to exit. We don't start a reader thread,
# so session.exited stays False (mimicking the stuck-reader state).
assert _wait_until(lambda: proc.poll() is not None, timeout=5.0), (
"Direct child should exit quickly (sh exits, sleep descendant "
"holds the pipe open)"
)
# Before the fix: poll would return "running" forever.
# After the fix: poll reconciles against proc.poll() and flips.
assert s.exited is False # Precondition: reader hasn't updated it.
result = registry.poll(s.id)
assert result["status"] == "exited", (
f"Expected reconciled 'exited' status; got {result!r}. "
"This is issue #17327 — reader is blocked on orphaned pipe."
)
assert result["exit_code"] == 0
assert s.exited is True
assert s.id in registry._finished
assert s.id not in registry._running
# Clean up the orphaned descendant.
try:
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass
def test_reconcile_noop_when_child_still_running(self, registry):
"""Reconcile must NOT flip exited when the direct child is alive."""
proc = _spawn_python_sleep(5.0)
s = _make_session(sid="proc_running_test")
s.process = proc
s.pid = proc.pid
registry._running[s.id] = s
result = registry.poll(s.id)
assert result["status"] == "running"
assert s.exited is False
proc.kill()
proc.wait()
def test_reconcile_noop_on_already_exited(self, registry):
"""Reconcile is a no-op when session.exited is already True."""
s = _make_session(sid="proc_already_exited", exited=True, exit_code=7)
s.process = MagicMock()
s.process.poll = MagicMock(return_value=0) # Would say exit 0
registry._finished[s.id] = s
registry._reconcile_local_exit(s)
# Must not overwrite the existing exit_code with proc.poll()'s 0.
assert s.exit_code == 7
def test_reconcile_noop_on_no_process(self, registry):
"""Reconcile is a no-op for sessions without a local Popen (env/PTY)."""
s = _make_session(sid="proc_no_popen")
assert getattr(s, "process", None) is None
# Must not raise.
registry._reconcile_local_exit(s)
assert s.exited is False
def test_wait_returns_when_reader_blocked(self, registry):
"""wait() must also reconcile — not just poll()."""
proc = subprocess.Popen(
["sh", "-c", "( sleep 30 ) & disown; exit 0"],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
preexec_fn=os.setsid,
)
s = _make_session(sid="proc_wait_orphan")
s.process = proc
s.pid = proc.pid
registry._running[s.id] = s
assert _wait_until(lambda: proc.poll() is not None, timeout=5.0)
start = time.monotonic()
result = registry.wait(s.id, timeout=10)
elapsed = time.monotonic() - start
assert result["status"] == "exited", result
assert elapsed < 5.0, (
f"wait() should return ~immediately via reconcile; took {elapsed:.1f}s"
)
try:
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
except (ProcessLookupError, PermissionError):
pass
# =========================================================================
# Read log
# =========================================================================