fix(dashboard): close PTY WebSocket on child EOF to stop FD leak (#54028) (#54123)

* fix(dashboard): close PTY WebSocket on child EOF to stop FD leak

The /api/pty handler's reader task returns on child EOF, but the writer
loop stayed blocked on ws.receive() until the browser sent a disconnect.
When the browser socket is half-open (no FIN delivered — common on
macOS/launchd), that disconnect never arrives, so the handler never
reaches its finally and the PTY master fd + child process leak. With
dashboard auto-reconnect (#52962), every dropped socket then spawns a
fresh PTY on top of the orphaned one, exhausting file descriptors within
hours (EMFILE / Errno 24).

Fix: the reader task now closes the WebSocket in a finally when the child
EOFs or the send side breaks, which unblocks ws.receive() so the existing
finally runs bridge.close(). The writer loop also guards ws.receive()
against the RuntimeError Starlette raises once the socket is closed.

Reported by @fifteenzhang.

Fixes #54028

* docs: add infographic for #54028 PTY FD leak fix
This commit is contained in:
Teknium 2026-06-28 02:42:21 -07:00 committed by GitHub
parent 7ef04ae7a7
commit 6d879d486b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 69 additions and 13 deletions

View file

@ -12188,26 +12188,44 @@ async def pty_ws(ws: WebSocket) -> None:
# --- reader task: PTY master → WebSocket ----------------------------
async def pump_pty_to_ws() -> None:
while True:
chunk = await loop.run_in_executor(
None, bridge.read, _PTY_READ_CHUNK_TIMEOUT
)
if chunk is None: # EOF
return
if not chunk: # no data this tick; yield control and retry
await asyncio.sleep(0)
continue
try:
while True:
chunk = await loop.run_in_executor(
None, bridge.read, _PTY_READ_CHUNK_TIMEOUT
)
if chunk is None: # EOF
return
if not chunk: # no data this tick; yield control and retry
await asyncio.sleep(0)
continue
try:
await ws.send_bytes(chunk)
except Exception:
return
finally:
# The child has exited (EOF) or the send side broke. Close the
# WebSocket so the writer loop's ``ws.receive()`` returns instead
# of blocking forever — otherwise, when the browser's socket is
# half-open (no FIN delivered, common on macOS/launchd) the
# handler never reaches its ``finally`` and the PTY's fds leak.
# With dashboard auto-reconnect (#52962) every dropped socket then
# stacks a fresh PTY on top of the orphaned one, exhausting fds.
try:
await ws.send_bytes(chunk)
await ws.close()
except Exception:
return
pass
reader_task = asyncio.create_task(pump_pty_to_ws())
# --- writer loop: WebSocket → PTY master ----------------------------
try:
while True:
msg = await ws.receive()
try:
msg = await ws.receive()
except RuntimeError:
# Raised when ws.receive() is called after the socket is
# already disconnected (e.g. closed by the reader task above).
break
msg_type = msg.get("type")
if msg_type == "websocket.disconnect":
break

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.2 MiB

View file

@ -16,6 +16,7 @@ pytestmark = pytest.mark.skipif(
class _OneFrameBridge:
def __init__(self):
self._sent = False
self.closed = False
@classmethod
def spawn(cls, *args, **kwargs):
@ -34,7 +35,7 @@ class _OneFrameBridge:
pass
def close(self):
pass
self.closed = True
@pytest.fixture
@ -128,3 +129,40 @@ def test_fresh_param_ignores_channel_active_session_file(pty_client, monkeypatch
assert captured["resume"] is None
assert captured["active_session_file"] == str(active_file)
assert not active_file.exists()
def test_child_eof_closes_socket_and_bridge(pty_client, monkeypatch):
"""Child EOF must close the WS server-side and reap the PTY.
Regression for the FD leak (#54028): the reader task hits EOF when the
PTY child exits, but if the browser's socket is half-open (no FIN), the
writer loop's ``ws.receive()`` would block forever and the PTY fds would
never be closed. The reader now closes the WebSocket on EOF so the
handler's ``finally`` runs ``bridge.close()``.
"""
ws, client, token = pty_client
bridges = []
class _RecordingBridge(_OneFrameBridge):
@classmethod
def spawn(cls, *args, **kwargs):
b = cls()
bridges.append(b)
return b
monkeypatch.setattr(ws.PtyBridge, "spawn", _RecordingBridge.spawn)
monkeypatch.setattr(
ws, "_resolve_chat_argv", lambda **kw: (["fake-hermes-tui"], None, None)
)
# The client never sends a disconnect of its own — it only reads the one
# frame then the server side must tear everything down on child EOF.
with client.websocket_connect(_url(token, channel="eof-chan")) as conn:
assert conn.receive_bytes() == b"ready"
# Server closes the socket after the child EOFs; receiving again
# surfaces the close rather than hanging.
with pytest.raises(Exception):
conn.receive_bytes()
assert len(bridges) == 1
assert bridges[0].closed is True