diff --git a/run_agent.py b/run_agent.py
index cb5dbf4b1..944217e6b 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -4366,6 +4366,43 @@ class AIAgent:
                 self._client_log_context(),
             )
             return client
+        # Enable TCP keepalives so dead connections are detected (#10324).
+        # If a provider vanishes mid-stream without the local side being
+        # told, a blocking read can wait forever and the agent hangs.
+        # With every tunable applied, probing starts after 30s idle and the
+        # peer is declared dead after 3 unanswered probes 10s apart (~60s).
+        # A platform lacking a tunable keeps its system default for it.
+        # NOTE(review): supplying our own transport may bypass httpx's
+        # environment-proxy handling and the SDK's default client settings
+        # (timeouts, limits, redirects) -- confirm.
+        if "http_client" not in client_kwargs:
+            try:
+                import httpx as _httpx
+                import socket as _socket
+                _sock_opts = [(_socket.SOL_SOCKET, _socket.SO_KEEPALIVE, 1)]
+                # Idle time before the first probe: TCP_KEEPIDLE where it
+                # exists, otherwise the macOS spelling TCP_KEEPALIVE.
+                if hasattr(_socket, "TCP_KEEPIDLE"):
+                    _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPIDLE, 30))
+                elif hasattr(_socket, "TCP_KEEPALIVE"):
+                    _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPALIVE, 30))
+                # Probe interval and count are guarded one by one so a
+                # platform missing either still gets the other.
+                if hasattr(_socket, "TCP_KEEPINTVL"):
+                    _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPINTVL, 10))
+                if hasattr(_socket, "TCP_KEEPCNT"):
+                    _sock_opts.append((_socket.IPPROTO_TCP, _socket.TCP_KEEPCNT, 3))
+                _keepalive_client = _httpx.Client(
+                    transport=_httpx.HTTPTransport(socket_options=_sock_opts),
+                )
+                # Rebind to a copy so the caller's dict is left untouched.
+                client_kwargs = {**client_kwargs, "http_client": _keepalive_client}
+            except Exception:
+                # Best-effort: keep the default transport, but leave a trace.
+                logger.debug(
+                    "TCP keepalive injection skipped; using default transport",
+                    exc_info=True,
+                )
         client = OpenAI(**client_kwargs)
         logger.info(
             "OpenAI client created (%s, shared=%s) %s",