[pyfr] 23/88: Enable the CUDA backend to exploit GPUDirect.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Nov 16 12:05:26 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository pyfr.
commit 69bb1904c2ef52504a55fd8cacd23bbd77ee4399
Author: Freddie Witherden <freddie at witherden.org>
Date: Sun Apr 24 09:22:16 2016 -0700
Enable the CUDA backend to exploit GPUDirect.
By pairing the CUDA backend with a CUDA-aware MPI library and setting

    [backend-cuda]
    mpi-type = cuda-aware

PyFR will pass CUDA device pointers directly down to the MPI library,
eliminating the intermediate page-locked host buffer and the associated
device-to-host and host-to-device copies.
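A multi-rank run is then launched in the usual way; for example, under
a CUDA-aware MPI build (the mesh and configuration file names below are
placeholders):

    mpiexec -n 2 pyfr run -b cuda mesh.pyfrm config.ini

With mpi-type = standard (the default) the same command works with any
MPI library, at the cost of staging exchange buffers through the host.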
---
doc/src/user_guide.rst | 7 +++++-
pyfr/backends/cuda/base.py | 5 ++++
pyfr/backends/cuda/packing.py | 57 ++++++++++++++++++++++++++++---------------
pyfr/backends/cuda/types.py | 24 +++++++++++++-----
setup.py | 2 +-
5 files changed, 67 insertions(+), 28 deletions(-)
diff --git a/doc/src/user_guide.rst b/doc/src/user_guide.rst
index acf4e81..07c3d53 100644
--- a/doc/src/user_guide.rst
+++ b/doc/src/user_guide.rst
@@ -37,7 +37,7 @@ The CUDA backend targets NVIDIA GPUs with a compute capability of 2.0
or greater. The backend requires:
1. `CUDA <https://developer.nvidia.com/cuda-downloads>`_ >= 4.2
-2. `pycuda <http://mathema.tician.de/software/pycuda/>`_ >= 2011.2
+2. `pycuda <http://mathema.tician.de/software/pycuda/>`_ >= 2015.1
MIC Backend
^^^^^^^^^^^
@@ -195,11 +195,16 @@ Parameterises the CUDA backend with
*int*
+3. ``mpi-type`` --- type of MPI library that is being used:
+
+ ``standard`` | ``cuda-aware``
+
Example::
[backend-cuda]
device-id = round-robin
gimmik-max-nnz = 512
+ mpi-type = standard
[backend-mic]
^^^^^^^^^^^^^^^^
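A note on the documentation hunk above: mpi-type = cuda-aware only
declares what the MPI library can do; it does not make the library
CUDA-aware. With Open MPI, for instance, one way to check whether the
installed library was built with CUDA support is (Open MPI-specific;
other implementations differ):

    ompi_info --parsable --all | grep mpi_built_with_cuda_support:value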
diff --git a/pyfr/backends/cuda/base.py b/pyfr/backends/cuda/base.py
index b379810..2402440 100644
--- a/pyfr/backends/cuda/base.py
+++ b/pyfr/backends/cuda/base.py
@@ -36,6 +36,11 @@ class CUDABackend(BaseBackend):
# Take the required alignment to be 128 bytes
self.alignb = 128
+ # Get the MPI runtime type
+ self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
+ if self.mpitype not in {'standard', 'cuda-aware'}:
+ raise ValueError('Invalid CUDA backend MPI type')
+
# Some CUDA devices share L1 cache and shared memory; on these
# devices CUDA allows us to specify a preference between L1
# cache and shared memory. For the sake of CUBLAS (which
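The hunk above is a read-with-default-then-validate pattern. A minimal
stand-alone sketch of the same pattern using the stdlib configparser
(PyFR's own cfg object takes the default positionally, rather than via
the fallback keyword used here):

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read_string('[backend-cuda]\nmpi-type = cuda-aware\n')

    # Read the option, defaulting to 'standard' when it is absent
    mpitype = cfg.get('backend-cuda', 'mpi-type', fallback='standard')

    # Reject anything outside the two supported values up front
    if mpitype not in {'standard', 'cuda-aware'}:
        raise ValueError('Invalid CUDA backend MPI type')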
diff --git a/pyfr/backends/cuda/packing.py b/pyfr/backends/cuda/packing.py
index 193d581..d2c1654 100644
--- a/pyfr/backends/cuda/packing.py
+++ b/pyfr/backends/cuda/packing.py
@@ -2,7 +2,7 @@
import pycuda.driver as cuda
-from pyfr.backends.base import ComputeKernel
+from pyfr.backends.base import ComputeKernel, NullComputeKernel
from pyfr.backends.base.packing import BasePackingKernels
from pyfr.backends.cuda.provider import CUDAKernelProvider, get_grid_for_block
@@ -22,30 +22,47 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
block = (128, 1, 1)
grid = get_grid_for_block(block, v.n)
- # Create a CUDA event
- event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
+ # If MPI is CUDA aware then we just need to pack the buffer
+ if self.backend.mpitype == 'cuda-aware':
+ class PackXchgViewKernel(ComputeKernel):
+ def run(self, queue):
+ scomp = queue.cuda_stream_comp
- class PackXchgViewKernel(ComputeKernel):
- def run(self, queue):
- scomp = queue.cuda_stream_comp
- scopy = queue.cuda_stream_copy
+ # Pack
+ kern.prepared_async_call(
+ grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
+ v.mapping, v.cstrides or 0, v.rstrides or 0, m
+ )
+ # Otherwise, we need to both pack the buffer and copy it back
+ else:
+ # Create a CUDA event
+ event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
- # Pack
- kern.prepared_async_call(grid, block, scomp, v.n, v.nvrow,
- v.nvcol, v.basedata, v.mapping,
- v.cstrides or 0, v.rstrides or 0, m)
+ class PackXchgViewKernel(ComputeKernel):
+ def run(self, queue):
+ scomp = queue.cuda_stream_comp
+ scopy = queue.cuda_stream_copy
- # Copy the packed buffer to the host
- event.record(scomp)
- scopy.wait_for_event(event)
- cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
+ # Pack
+ kern.prepared_async_call(
+ grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
+ v.mapping, v.cstrides or 0, v.rstrides or 0, m
+ )
+
+ # Copy the packed buffer to the host
+ event.record(scomp)
+ scopy.wait_for_event(event)
+ cuda.memcpy_dtoh_async(m.hdata, m.data, scopy)
return PackXchgViewKernel()
def unpack(self, mv):
- class UnpackXchgMatrixKernel(ComputeKernel):
- def run(self, queue):
- cuda.memcpy_htod_async(mv.data, mv.hdata,
- queue.cuda_stream_comp)
+ if self.backend.mpitype == 'cuda-aware':
+ return NullComputeKernel()
+ else:
+ class UnpackXchgMatrixKernel(ComputeKernel):
+ def run(self, queue):
+ cuda.memcpy_htod_async(mv.data, mv.hdata,
+ queue.cuda_stream_comp)
- return UnpackXchgMatrixKernel()
+ return UnpackXchgMatrixKernel()
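The standard path above hinges on a stream/event fence: the
device-to-host copy issued on the copy stream must not begin until the
pack kernel on the compute stream has completed. A self-contained
sketch of that idiom with pycuda, substituting an asynchronous
host-to-device copy for the packing kernel (all names are local to this
example):

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- initialises a CUDA context
    import pycuda.driver as cuda

    n = 1024
    scomp, scopy = cuda.Stream(), cuda.Stream()

    src = cuda.pagelocked_empty(n, np.float64)
    src[:] = np.arange(n)
    dst = cuda.pagelocked_empty(n, np.float64)
    dbuf = cuda.mem_alloc(src.nbytes)

    # 'Pack' on the compute stream (stand-in for the packing kernel)
    cuda.memcpy_htod_async(dbuf, src, scomp)

    # Fence: make the copy stream wait until the pack has finished
    event = cuda.Event(cuda.event_flags.DISABLE_TIMING)
    event.record(scomp)
    scopy.wait_for_event(event)

    # Drain the packed buffer back to the host on the copy stream
    cuda.memcpy_dtoh_async(dst, dbuf, scopy)
    scopy.synchronize()

    assert (dst == src).all()

In the cuda-aware branch both the fence and the copy disappear: the
packed device buffer is handed to MPI as-is, and unpack degenerates to
a NullComputeKernel.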
diff --git a/pyfr/backends/cuda/types.py b/pyfr/backends/cuda/types.py
index 7770183..8ca891e 100644
--- a/pyfr/backends/cuda/types.py
+++ b/pyfr/backends/cuda/types.py
@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
+from ctypes import c_int, c_ssize_t, c_void_p, pythonapi, py_object
+
import numpy as np
import pycuda.driver as cuda
@@ -7,10 +9,15 @@ import pyfr.backends.base as base
from pyfr.util import lazyprop
+_make_pybuf = pythonapi.PyMemoryView_FromMemory
+_make_pybuf.argtypes = [c_void_p, c_ssize_t, c_int]
+_make_pybuf.restype = py_object
+
+
class CUDAMatrixBase(base.MatrixBase):
def onalloc(self, basedata, offset):
- self.basedata = int(basedata)
- self.data = self.basedata + offset
+ self.basedata = basedata
+ self.data = int(self.basedata) + offset
self.offset = offset
# Process any initial value
@@ -53,7 +60,7 @@ class CUDAMatrix(CUDAMatrixBase, base.Matrix):
class CUDAMatrixRSlice(base.MatrixRSlice):
@lazyprop
def data(self):
- return int(self.parent.basedata + self.offset)
+ return int(self.parent.basedata) + int(self.offset)
@property
def _as_parameter_(self):
@@ -81,9 +88,14 @@ class CUDAXchgMatrix(CUDAMatrix, base.XchgMatrix):
# Call the standard matrix constructor
super().__init__(backend, ioshape, initval, extent, aliases, tags)
- # Allocate a page-locked buffer on the host for MPI to send/recv from
- self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
- self.dtype, 'C')
+ # If MPI is CUDA-aware then construct a buffer out of our CUDA
+ # device allocation and pass this directly to MPI
+ if backend.mpitype == 'cuda-aware':
+ self.hdata = _make_pybuf(self.data, self.nbytes, 0x200)
+ # Otherwise, allocate a buffer on the host for MPI to send/recv from
+ else:
+ self.hdata = cuda.pagelocked_empty((self.nrow, self.ncol),
+ self.dtype, 'C')
class CUDAXchgView(base.XchgView):
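The ctypes shim added at the top of this file is the crux of the
change: PyMemoryView_FromMemory wraps a raw pointer in a Python buffer
object, and the 0x200 flag is CPython's PyBUF_WRITE, so the resulting
memoryview is writable. Since the pointer here is a CUDA device
address, only a CUDA-aware MPI may safely dereference the buffer. A
sketch of the trick in isolation (the buffer size and the closing
comment are illustrative):

    from ctypes import c_int, c_ssize_t, c_void_p, pythonapi, py_object

    import pycuda.autoinit  # noqa: F401 -- initialises a CUDA context
    import pycuda.driver as cuda

    _make_pybuf = pythonapi.PyMemoryView_FromMemory
    _make_pybuf.argtypes = [c_void_p, c_ssize_t, c_int]
    _make_pybuf.restype = py_object

    PYBUF_WRITE = 0x200  # the flag the patch passes as a literal

    nbytes = 512
    dptr = cuda.mem_alloc(nbytes)

    # A memoryview whose underlying pointer is the *device* address
    buf = _make_pybuf(int(dptr), nbytes, PYBUF_WRITE)
    assert buf.nbytes == nbytes

    # With a CUDA-aware mpi4py one could now do, e.g.
    #   comm.Send([buf, MPI.BYTE], dest=1)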
diff --git a/setup.py b/setup.py
index 4970ec3..b24b7f3 100755
--- a/setup.py
+++ b/setup.py
@@ -102,7 +102,7 @@ install_requires = [
# Soft dependencies
extras_require = {
- 'cuda': ['pycuda >= 2011.2'],
+ 'cuda': ['pycuda >= 2015.1'],
'mic': ['pymic >= 0.7'],
'opencl': ['pyopencl >= 2015.2.4']
}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git