[pyfr] 23/88: Enable the CUDA backend to exploit GPUDirect.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Nov 16 12:05:26 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit 69bb1904c2ef52504a55fd8cacd23bbd77ee4399
Author: Freddie Witherden <freddie at witherden.org>
Date: Sun Apr 24 09:22:16 2016 -0700
Enable the CUDA backend to exploit GPUDirect.
By pairing the CUDA backend with a CUDA-aware MPI library and setting

    [backend-cuda]
    mpi-type = cuda-aware

PyFR will pass CUDA device pointers directly down to the MPI library,
eliminating the intermediate page-locked host buffer and the associated
device-to-host and host-to-device copies.
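A multi-rank run is then launched in the usual way; for example, under
a CUDA-aware MPI build (the mesh and configuration file names below are
placeholders):

    mpiexec -n 2 pyfr run -b cuda mesh.pyfrm config.ini

With mpi-type = standard (the default) the same command works with any
MPI library, at the cost of staging exchange buffers through the host.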
---
doc/src/user_guide.rst | 7 +++++-
pyfr/backends/cuda/base.py | 5 ++++
pyfr/backends/cuda/packing.py | 57 ++++++++++++++++++++++++++++---------------
pyfr/backends/cuda/types.py | 24 +++++++++++++-----
setup.py | 2 +-
5 files changed, 67 insertions(+), 28 deletions(-)
diff --git a/doc/src/user_guide.rst b/doc/src/user_guide.rst
index acf4e81..07c3d53 100644
--- a/doc/src/user_guide.rst
+++ b/doc/src/user_guide.rst
@@ -37,7 +37,7 @@ The CUDA backend targets NVIDIA GPUs with a compute capability of 2.0
or greater. The backend requires:
1. `CUDA <https://developer.nvidia.com/cuda-downloads>`_ >= 4.2
-2. `pycuda <http://mathema.tician.de/software/pycuda/>`_ >= 2011.2
+2. `pycuda <http://mathema.tician.de/software/pycuda/>`_ >= 2015.1
MIC Backend
^^^^^^^^^^^
@@ -195,11 +195,16 @@ Parameterises the CUDA backend with
*int*
+3. ``mpi-type`` --- type of MPI library that is being used:
+
+ ``standard`` | ``cuda-aware``
+
Example::
[backend-cuda]
device-id = round-robin
gimmik-max-nnz = 512
+ mpi-type = standard
[backend-mic]
^^^^^^^^^^^^^^^^
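A note on the documentation hunk above: mpi-type = cuda-aware only
declares what the MPI library can do; it does not make the library
CUDA-aware. With Open MPI, for instance, one way to check whether the
installed library was built with CUDA support is (Open MPI-specific;
other implementations differ):

    ompi_info --parsable --all | grep mpi_built_with_cuda_support:value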
diff --git a/pyfr/backends/cuda/base.py b/pyfr/backends/cuda/base.py
index b379810..2402440 100644
--- a/pyfr/backends/cuda/base.py
+++ b/pyfr/backends/cuda/base.py
@@ -36,6 +36,11 @@ class CUDABackend(BaseBackend):
# Take the required alignment to be 128 bytes
self.alignb = 128
+ # Get the MPI runtime type
+ self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
+ if self.mpitype not in {'standard', 'cuda-aware'}:
+ raise ValueError('Invalid CUDA backend MPI type')
+
# Some CUDA devices share L1 cache and shared memory; on these
# devices CUDA allows us to specify a preference between L1
# cache and shared memory. For the sake of CUBLAS (which
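The hunk above is a read-with-default-then-validate pattern. A minimal
stand-alone sketch of the same pattern using the stdlib configparser
(PyFR's own cfg object takes the default positionally, rather than via
the fallback keyword used here):

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read_string('[backend-cuda]\nmpi-type = cuda-aware\n')

    # Read the option, defaulting to 'standard' when it is absent
    mpitype = cfg.get('backend-cuda', 'mpi-type', fallback='standard')

    # Reject anything outside the two supported values up front
    if mpitype not in {'standard', 'cuda-aware'}:
        raise ValueError('Invalid CUDA backend MPI type')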
diff --git a/pyfr/backends/cuda/packing.py b/pyfr/backends/cuda/packing.py
index 193d581..d2c1654 100644
--- a/pyfr/backends/cuda/packing.py
+++ b/pyfr/backends/cuda/packing.py
@@ -2,7 +2,7 @@
import pycuda.driver as cuda
-from pyfr.backends.base import ComputeKernel
+from pyfr.backends.base import ComputeKernel, NullComputeKernel
from pyfr.backends.base.packing import BasePackingKernels
from pyfr.backends.cuda.provider import CUDAKernelProvider, get_grid_for_block
@@ -22,30 +22,47 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
block = (128, 1, 1)
grid = get_grid_for_block(block, v.n)
- # Create a CUDA event
- event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
+ # If MPI is CUDA aware then we just need to pack the buffer
+ if self.backend.mpitype == 'cuda-aware':
+ class PackXchgViewKernel(ComputeKernel):
+ def run(self, queue):
+ scomp = queue.cuda_stream_comp
- class PackXchgViewKernel(ComputeKernel):
- def run(self, queue):
- scomp = queue.cuda_stream_comp
- scopy = queue.cuda_stream_copy
+ # Pack
+ kern.prepared_async_call(
+ grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
+ v.mapping, v.cstrides or 0, v.rstrides or 0, m
+ )
+ # Otherwise, we need to both pack the buffer and copy it back
+ else:
+ # Create a CUDA event
+ event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
- # Pack
- kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
- v.nvcol, v.basedata, v.mapping,
- v.cstrides or 0, v.rstrides or 0, m)
+ class PackXchgViewKernel(ComputeKernel):
+ def run(self, queue):
+ scomp = queue.cuda_stream_comp
+ scopy = queue.cuda_stream_copy
- # Copy the packed buffer to the host
- event.record(scomp)
- scopy.wait_for_event(event)
- cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
+ # Pack
+ kern.prepared_async_call(
+ grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
+ v.mapping, v.cstrides or 0, v.rstrides or 0, m
+ )
+
+ # Copy the packed buffer to the host
+ event.record(scomp)
+ scopy.wait_for_event(event)
+ cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
return PackXchgViewKernel()
def unpack(self, mv):
- class UnpackXchgMatrixKernel(ComputeKernel):
- def run(self, queue):
- cuda.memcpy_htod_async(mv.data, mv.hdata,
- queue.cuda_stream_comp)
+ if self.backend.mpitype == 'cuda-aware':
+ return NullComputeKernel()
+ else:
+ class UnpackXchgMatrixKernel(ComputeKernel):
+ def run(self, queue):
+ cuda.memcpy_htod_async(mv.data, mv.hdata,
+ queue.cuda_stream_comp)
- return UnpackXchgMatrixKernel()
+ return UnpackXchgMatrixKernel()
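The standard path above hinges on a stream/event fence: the
device-to-host copy issued on the copy stream must not begin until the
pack kernel on the compute stream has completed. A self-contained
sketch of that idiom with pycuda, substituting an asynchronous
host-to-device copy for the packing kernel (all names are local to this
example):

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- initialises a CUDA context
    import pycuda.driver as cuda

    n = 1024
    scomp, scopy = cuda.Stream(), cuda.Stream()

    src = cuda.pagelocked_empty(n, np.float64)
    src[:] = np.arange(n)
    dst = cuda.pagelocked_empty(n, np.float64)
    dbuf = cuda.mem_alloc(src.nbytes)

    # 'Pack' on the compute stream (stand-in for the packing kernel)
    cuda.memcpy_htod_async(dbuf, src, scomp)

    # Fence: make the copy stream wait until the pack has finished
    event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
    event.record(scomp)
    scopy.wait_for_event(event)

    # Drain the packed buffer back to the host on the copy stream
    cuda.memcpy_dtoh_async(dst, dbuf, scopy)
    scopy.synchronize()

    assert (dst == src).all()

In the cuda-aware branch both the fence and the copy disappear: the
packed device buffer is handed to MPI as-is, and unpack degenerates to
a NullComputeKernel.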
diff --git a/pyfr/backends/cuda/types.py b/pyfr/backends/cuda/types.py
index 7770183..8ca891e 100644
--- a/pyfr/backends/cuda/types.py
+++ b/pyfr/backends/cuda/types.py
@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
+from ctypes import c_int, c_ssize_t, c_void_p, pythonapi, py_object
+
import numpy as np
import pycuda.driver as cuda
@@ -7,10 +9,15 @@ import pyfr.backends.base as base
from pyfr.util import lazyprop
+_make_pybuf = pythonapi.PyMemoryView_FromMemory
+_make_pybuf.argtypes = [c_void_p, c_ssize_t, c_int]
+_make_pybuf.restype = py_object
+
+
class CUDAMatrixBase(base.MatrixBase):
def onalloc(self, basedata, offset):
- self.basedata = int(basedata)
- self.data = self.basedata + offset
+ self.basedata = basedata
+ self.data = int(self.basedata) + offset
self.offset = offset
# Process any initial value
@@ -53,7 +60,7 @@ class CUDAMatrix(CUDAMatrixBase, base.Matrix):
class CUDAMatrixRSlice(base.MatrixRSlice):
@lazyprop
def data(self):
- return int(self.parent.basedata + self.offset)
+ return int(self.parent.basedata) + int(self.offset)
@property
def _as_parameter_(self):
@@ -81,9 +88,14 @@ class CUDAXchgMatrix(CUDAMatrix, base.XchgMatrix):
# Call the standard matrix constructor
super().__init__(backend, ioshape, initval, extent, aliases, tags)
- # Allocate a page-locked buffer on the host for MPI to send/recv from
- self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
- self.dtype, 'C')
+ # If MPI is CUDA-aware then construct a buffer out of our CUDA
+ # device allocation and pass this directly to MPI
+ if backend.mpitype == 'cuda-aware':
+ self.hdata = _make_pybuf(self.data, self.nbytes, 0x200)
+ # Otherwise, allocate a buffer on the host for MPI to send/recv from
+ else:
+ self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
+ self.dtype, 'C')
class CUDAXchgView(base.XchgView):
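The ctypes shim added at the top of this file is the crux of the
change: PyMemoryView_FromMemory wraps a raw pointer in a Python buffer
object, and the 0x200 flag is CPython's PyBUF_WRITE, so the resulting
memoryview is writable. Since the pointer here is a CUDA device
address, only a CUDA-aware MPI may safely dereference the buffer. A
sketch of the trick in isolation (the buffer size and the closing
comment are illustrative):

    from ctypes import c_int, c_ssize_t, c_void_p, pythonapi, py_object

    import pycuda.autoinit  # noqa: F401 -- initialises a CUDA context
    import pycuda.driver as cuda

    _make_pybuf = pythonapi.PyMemoryView_FromMemory
    _make_pybuf.argtypes = [c_void_p, c_ssize_t, c_int]
    _make_pybuf.restype = py_object

    PYBUF_WRITE = 0x200  # the flag the patch passes as a literal

    nbytes = 512
    dptr = cuda.mem_alloc(nbytes)

    # A memoryview whose underlying pointer is the *device* address
    buf = _make_pybuf(int(dptr), nbytes, PYBUF_WRITE)
    assert buf.nbytes == nbytes

    # With a CUDA-aware mpi4py one could now do, e.g.
    #   comm.Send([buf, MPI.BYTE], dest=1)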
diff --git a/setup.py b/setup.py
index 4970ec3..b24b7f3 100755
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@ install_requires = [
# Soft dependencies
extras_require = {
- 'cuda': ['pycuda >= 2011.2'],
+ 'cuda': ['pycuda >= 2015.1'],
'mic': ['pymic >= 0.7'],
'opencl': ['pyopencl >= 2015.2.4']
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git