diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index ccb20c60fc7..583b83b0410 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -12188,26 +12188,44 @@ async def pty_ws(ws: WebSocket) -> None: # --- reader task: PTY master → WebSocket ---------------------------- async def pump_pty_to_ws() -> None: - while True: - chunk = await loop.run_in_executor( - None, bridge.read, _PTY_READ_CHUNK_TIMEOUT - ) - if chunk is None: # EOF - return - if not chunk: # no data this tick; yield control and retry - await asyncio.sleep(0) - continue + try: + while True: + chunk = await loop.run_in_executor( + None, bridge.read, _PTY_READ_CHUNK_TIMEOUT + ) + if chunk is None: # EOF + return + if not chunk: # no data this tick; yield control and retry + await asyncio.sleep(0) + continue + try: + await ws.send_bytes(chunk) + except Exception: + return + finally: + # The child has exited (EOF) or the send side broke. Close the + # WebSocket so the writer loop's ``ws.receive()`` returns instead + # of blocking forever — otherwise, when the browser's socket is + # half-open (no FIN delivered, common on macOS/launchd) the + # handler never reaches its ``finally`` and the PTY's fds leak. + # With dashboard auto-reconnect (#52962) every dropped socket then + # stacks a fresh PTY on top of the orphaned one, exhausting fds. try: - await ws.send_bytes(chunk) + await ws.close() except Exception: - return + pass reader_task = asyncio.create_task(pump_pty_to_ws()) # --- writer loop: WebSocket → PTY master ---------------------------- try: while True: - msg = await ws.receive() + try: + msg = await ws.receive() + except RuntimeError: + # Raised when ws.receive() is called after the socket is + # already disconnected (e.g. closed by the reader task above). + break msg_type = msg.get("type") if msg_type == "websocket.disconnect": break diff --git a/infographic/pr-54028-pty-fd-leak/infographic.png b/infographic/pr-54028-pty-fd-leak/infographic.png new file mode 100644 index 00000000000..15eca67af4d Binary files /dev/null and b/infographic/pr-54028-pty-fd-leak/infographic.png differ diff --git a/tests/hermes_cli/test_web_server_pty_reconnect.py b/tests/hermes_cli/test_web_server_pty_reconnect.py index 4f7878f53d0..5a399407084 100644 --- a/tests/hermes_cli/test_web_server_pty_reconnect.py +++ b/tests/hermes_cli/test_web_server_pty_reconnect.py @@ -16,6 +16,7 @@ pytestmark = pytest.mark.skipif( class _OneFrameBridge: def __init__(self): self._sent = False + self.closed = False @classmethod def spawn(cls, *args, **kwargs): @@ -34,7 +35,7 @@ class _OneFrameBridge: pass def close(self): - pass + self.closed = True @pytest.fixture @@ -128,3 +129,40 @@ def test_fresh_param_ignores_channel_active_session_file(pty_client, monkeypatch assert captured["resume"] is None assert captured["active_session_file"] == str(active_file) assert not active_file.exists() + + +def test_child_eof_closes_socket_and_bridge(pty_client, monkeypatch): + """Child EOF must close the WS server-side and reap the PTY. + + Regression for the FD leak (#54028): the reader task hits EOF when the + PTY child exits, but if the browser's socket is half-open (no FIN), the + writer loop's ``ws.receive()`` would block forever and the PTY fds would + never be closed. The reader now closes the WebSocket on EOF so the + handler's ``finally`` runs ``bridge.close()``. + """ + ws, client, token = pty_client + bridges = [] + + class _RecordingBridge(_OneFrameBridge): + @classmethod + def spawn(cls, *args, **kwargs): + b = cls() + bridges.append(b) + return b + + monkeypatch.setattr(ws.PtyBridge, "spawn", _RecordingBridge.spawn) + monkeypatch.setattr( + ws, "_resolve_chat_argv", lambda **kw: (["fake-hermes-tui"], None, None) + ) + + # The client never sends a disconnect of its own — it only reads the one + # frame then the server side must tear everything down on child EOF. + with client.websocket_connect(_url(token, channel="eof-chan")) as conn: + assert conn.receive_bytes() == b"ready" + # Server closes the socket after the child EOFs; receiving again + # surfaces the close rather than hanging. + with pytest.raises(Exception): + conn.receive_bytes() + + assert len(bridges) == 1 + assert bridges[0].closed is True