mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
* fix(dashboard): close PTY WebSocket on child EOF to stop FD leak The /api/pty handler's reader task returns on child EOF, but the writer loop stayed blocked on ws.receive() until the browser sent a disconnect. When the browser socket is half-open (no FIN delivered — common on macOS/launchd), that disconnect never arrives, so the handler never reaches its finally and the PTY master fd + child process leak. With dashboard auto-reconnect (#52962), every dropped socket then spawns a fresh PTY on top of the orphaned one, exhausting file descriptors within hours (EMFILE / Errno 24). Fix: the reader task now closes the WebSocket in a finally when the child EOFs or the send side breaks, which unblocks ws.receive() so the existing finally runs bridge.close(). The writer loop also guards ws.receive() against the RuntimeError Starlette raises once the socket is closed. Reported by @fifteenzhang. Fixes #54028 * docs: add infographic for #54028 PTY FD leak fix
This commit is contained in:
parent
7ef04ae7a7
commit
6d879d486b
3 changed files with 69 additions and 13 deletions
|
|
@ -12188,26 +12188,44 @@ async def pty_ws(ws: WebSocket) -> None:
|
|||
|
||||
# --- reader task: PTY master → WebSocket ----------------------------
|
||||
async def pump_pty_to_ws() -> None:
|
||||
while True:
|
||||
chunk = await loop.run_in_executor(
|
||||
None, bridge.read, _PTY_READ_CHUNK_TIMEOUT
|
||||
)
|
||||
if chunk is None: # EOF
|
||||
return
|
||||
if not chunk: # no data this tick; yield control and retry
|
||||
await asyncio.sleep(0)
|
||||
continue
|
||||
try:
|
||||
while True:
|
||||
chunk = await loop.run_in_executor(
|
||||
None, bridge.read, _PTY_READ_CHUNK_TIMEOUT
|
||||
)
|
||||
if chunk is None: # EOF
|
||||
return
|
||||
if not chunk: # no data this tick; yield control and retry
|
||||
await asyncio.sleep(0)
|
||||
continue
|
||||
try:
|
||||
await ws.send_bytes(chunk)
|
||||
except Exception:
|
||||
return
|
||||
finally:
|
||||
# The child has exited (EOF) or the send side broke. Close the
|
||||
# WebSocket so the writer loop's ``ws.receive()`` returns instead
|
||||
# of blocking forever — otherwise, when the browser's socket is
|
||||
# half-open (no FIN delivered, common on macOS/launchd) the
|
||||
# handler never reaches its ``finally`` and the PTY's fds leak.
|
||||
# With dashboard auto-reconnect (#52962) every dropped socket then
|
||||
# stacks a fresh PTY on top of the orphaned one, exhausting fds.
|
||||
try:
|
||||
await ws.send_bytes(chunk)
|
||||
await ws.close()
|
||||
except Exception:
|
||||
return
|
||||
pass
|
||||
|
||||
reader_task = asyncio.create_task(pump_pty_to_ws())
|
||||
|
||||
# --- writer loop: WebSocket → PTY master ----------------------------
|
||||
try:
|
||||
while True:
|
||||
msg = await ws.receive()
|
||||
try:
|
||||
msg = await ws.receive()
|
||||
except RuntimeError:
|
||||
# Raised when ws.receive() is called after the socket is
|
||||
# already disconnected (e.g. closed by the reader task above).
|
||||
break
|
||||
msg_type = msg.get("type")
|
||||
if msg_type == "websocket.disconnect":
|
||||
break
|
||||
|
|
|
|||
BIN
infographic/pr-54028-pty-fd-leak/infographic.png
Normal file
BIN
infographic/pr-54028-pty-fd-leak/infographic.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.2 MiB |
|
|
@ -16,6 +16,7 @@ pytestmark = pytest.mark.skipif(
|
|||
class _OneFrameBridge:
|
||||
def __init__(self):
|
||||
self._sent = False
|
||||
self.closed = False
|
||||
|
||||
@classmethod
|
||||
def spawn(cls, *args, **kwargs):
|
||||
|
|
@ -34,7 +35,7 @@ class _OneFrameBridge:
|
|||
pass
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
self.closed = True
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
|
@ -128,3 +129,40 @@ def test_fresh_param_ignores_channel_active_session_file(pty_client, monkeypatch
|
|||
assert captured["resume"] is None
|
||||
assert captured["active_session_file"] == str(active_file)
|
||||
assert not active_file.exists()
|
||||
|
||||
|
||||
def test_child_eof_closes_socket_and_bridge(pty_client, monkeypatch):
|
||||
"""Child EOF must close the WS server-side and reap the PTY.
|
||||
|
||||
Regression for the FD leak (#54028): the reader task hits EOF when the
|
||||
PTY child exits, but if the browser's socket is half-open (no FIN), the
|
||||
writer loop's ``ws.receive()`` would block forever and the PTY fds would
|
||||
never be closed. The reader now closes the WebSocket on EOF so the
|
||||
handler's ``finally`` runs ``bridge.close()``.
|
||||
"""
|
||||
ws, client, token = pty_client
|
||||
bridges = []
|
||||
|
||||
class _RecordingBridge(_OneFrameBridge):
|
||||
@classmethod
|
||||
def spawn(cls, *args, **kwargs):
|
||||
b = cls()
|
||||
bridges.append(b)
|
||||
return b
|
||||
|
||||
monkeypatch.setattr(ws.PtyBridge, "spawn", _RecordingBridge.spawn)
|
||||
monkeypatch.setattr(
|
||||
ws, "_resolve_chat_argv", lambda **kw: (["fake-hermes-tui"], None, None)
|
||||
)
|
||||
|
||||
# The client never sends a disconnect of its own — it only reads the one
|
||||
# frame then the server side must tear everything down on child EOF.
|
||||
with client.websocket_connect(_url(token, channel="eof-chan")) as conn:
|
||||
assert conn.receive_bytes() == b"ready"
|
||||
# Server closes the socket after the child EOFs; receiving again
|
||||
# surfaces the close rather than hanging.
|
||||
with pytest.raises(Exception):
|
||||
conn.receive_bytes()
|
||||
|
||||
assert len(bridges) == 1
|
||||
assert bridges[0].closed is True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue