From a06d0198cd23086d96db66df1335e2d06b3a0207 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 28 Jun 2026 03:58:18 -0700 Subject: [PATCH] fix(dashboard): reap PTY bridge on child EOF, not only in writer finally (#54190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /api/pty handler only closed the PtyBridge in the writer loop's finally. On child EOF the reader task closes the WebSocket, but if the handler task is cancelled the instant the socket closes, the writer's finally can be skipped and the PTY fds leak (#54028) — the FD-leak the regression test guards. Under dashboard auto-reconnect this stacks orphaned PTYs until fds are exhausted. Reap the bridge in the reader's EOF finally too (close() is idempotent), so the PTY is reaped independently of the writer-loop cancellation race. Harden the regression test to poll for teardown instead of asserting on the same tick. Was flaky on main (2/20); now 25/25. --- hermes_cli/web_server.py | 10 ++++++++++ tests/hermes_cli/test_web_server_pty_reconnect.py | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 583b83b0410..f489d43bcc6 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -12210,6 +12210,16 @@ async def pty_ws(ws: WebSocket) -> None: # handler never reaches its ``finally`` and the PTY's fds leak. # With dashboard auto-reconnect (#52962) every dropped socket then # stacks a fresh PTY on top of the orphaned one, exhausting fds. + # + # Reap the bridge here too (close() is idempotent): on child EOF the + # writer loop's ``finally`` is the usual closer, but if the handler + # task is cancelled the instant we close the WS, that ``finally`` + # can be skipped, leaking the PTY. Closing from the EOF path makes + # the reap independent of that cancellation race (#54028). + try: + await asyncio.to_thread(bridge.close) + except Exception: + pass try: await ws.close() except Exception: diff --git a/tests/hermes_cli/test_web_server_pty_reconnect.py b/tests/hermes_cli/test_web_server_pty_reconnect.py index 5a399407084..a4d12687383 100644 --- a/tests/hermes_cli/test_web_server_pty_reconnect.py +++ b/tests/hermes_cli/test_web_server_pty_reconnect.py @@ -165,4 +165,12 @@ def test_child_eof_closes_socket_and_bridge(pty_client, monkeypatch): conn.receive_bytes() assert len(bridges) == 1 + # bridge.close() runs in the handler's `finally` via asyncio.to_thread, + # which can lag the client-side context exit by a tick or two. Poll briefly + # instead of asserting immediately so the teardown isn't a race. + import time + + deadline = time.monotonic() + 5.0 + while not bridges[0].closed and time.monotonic() < deadline: + time.sleep(0.01) assert bridges[0].closed is True