[pyfr] 01/88: Rework axnpby to operate on matrices and add support for masking.

Wed Nov 16 12:05:23 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository pyfr.

commit a07bef79a1f3a044bb7109472785f1c2ff6d61d1
Author: Freddie Witherden <freddie at witherden.org>
Date:   Wed Feb 3 21:34:31 2016 +0000

    Rework axnpby to operate on matrices and add support for masking.
---
 pyfr/backends/cuda/blasext.py            | 33 ++++++++++++---------
 pyfr/backends/cuda/kernels/axnpby.mako   | 41 +++++++++++++++-----------
 pyfr/backends/opencl/blasext.py          | 22 +++++++-------
 pyfr/backends/opencl/kernels/axnpby.mako | 39 +++++++++++++++----------
 pyfr/backends/openmp/blasext.py          | 22 ++++++++------
 pyfr/backends/openmp/kernels/axnpby.mako | 49 +++++++++++++++-----------------
 6 files changed, 115 insertions(+), 91 deletions(-)

diff --git a/pyfr/backends/cuda/blasext.py b/pyfr/backends/cuda/blasext.py
index 812f02f..1d89cf1 100644
--- a/pyfr/backends/cuda/blasext.py
+++ b/pyfr/backends/cuda/blasext.py
@@ -2,36 +2,43 @@
 
 import numpy as np
 import pycuda.driver as cuda
-from pycuda.gpuarray import GPUArray, splay
+from pycuda.gpuarray import GPUArray
 from pycuda.reduction import ReductionKernel
 
-from pyfr.backends.cuda.provider import CUDAKernelProvider
+from pyfr.backends.cuda.provider import CUDAKernelProvider, get_grid_for_block
 from pyfr.backends.base import ComputeKernel
 from pyfr.nputil import npdtype_to_ctype
 
 
 class CUDABlasExtKernels(CUDAKernelProvider):
-    def axnpby(self, y, *xn):
-        if any(y.traits != x.traits for x in xn):
+    def axnpby(self, *arr, subdims=None):
+        if any(arr[0].traits != x.traits for x in arr[1:]):
             raise ValueError('Incompatible matrix types')
 
-        nv, cnt = len(xn), y.leaddim*y.nrow
+        nv = len(arr)
+        ncola, ncolb = arr[0].datashape[1:]
+        nrow, ldim, lsdim, dtype = arr[0].traits
 
         # Render the kernel template
-        src = self.backend.lookup.get_template('axnpby').render(n=nv)
+        src = self.backend.lookup.get_template('axnpby').render(
+            subdims=subdims or range(ncola), nv=nv
+        )
 
-        # Build
+        # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32] + [np.intp, y.dtype]*(1 + nv))
+                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
 
-        # Compute a suitable block and grid
-        grid, block = splay(cnt)
+        # Determine the grid/block
+        block = (128, 1, 1)
+        grid = get_grid_for_block(block, ncolb)
 
         class AxnpbyKernel(ComputeKernel):
-            def run(self, queue, beta, *alphan):
-                args = [i for axn in zip(xn, alphan) for i in axn]
+            def run(self, queue, *consts):
+                args = list(arr) + list(consts)
+
                 kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
-                                         cnt, y, beta, *args)
+                                         nrow, ncolb, ldim, lsdim,
+                                         *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/cuda/kernels/axnpby.mako b/pyfr/backends/cuda/kernels/axnpby.mako
index 4b2db1e..d2fbfe5 100644
--- a/pyfr/backends/cuda/kernels/axnpby.mako
+++ b/pyfr/backends/cuda/kernels/axnpby.mako
@@ -3,22 +3,31 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 __global__ void
-axnpby(int n, fpdtype_t* y, fpdtype_t beta,
-       ${', '.join('const fpdtype_t* x{0}, fpdtype_t a{0}'.format(i)
-                   for i in range(n))})
+axnpby(int nrow, int ncolb, int ldim, int lsdim,
+       ${', '.join('fpdtype_t* __restrict__ x' + str(i) for i in range(nv))},
+       ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
 {
-    int strt = blockIdx.x*blockDim.x + threadIdx.x;
-    int incr = gridDim.x*blockDim.x;
+    int j = blockIdx.x*blockDim.x + threadIdx.x;
+    int idx;
 
-    for (int i = strt; i < n; i += incr)
-    {
-        fpdtype_t axn = ${pyfr.dot('a{j}', 'x{j}[i]', j=n)};
-
-        if (beta == 0.0)
-            y[i] = axn;
-        else if (beta == 1.0)
-            y[i] += axn;
-        else
-            y[i] = beta*y[i] + axn;
-    }
+    % for k in subdims:
+    if (j < ncolb && a0 == 0.0)
+        for (int i = 0; i < nrow; ++i)
+        {
+            idx = i*ldim + ${k}*lsdim + j;
+            x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        }
+    else if (j < ncolb && a0 == 1.0)
+        for (int i = 0; i < nrow; ++i)
+        {
+            idx = i*ldim + ${k}*lsdim + j;
+            x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        }
+    else if (j < ncolb)
+        for (int i = 0; i < nrow; ++i)
+        {
+            idx = i*ldim + ${k}*lsdim + j;
+            x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+        }
+    % endfor
 }
diff --git a/pyfr/backends/opencl/blasext.py b/pyfr/backends/opencl/blasext.py
index 45be9e6..98faf5d 100644
--- a/pyfr/backends/opencl/blasext.py
+++ b/pyfr/backends/opencl/blasext.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pyopencl as cl
-from pyopencl.array import Array, splay
+from pyopencl.array import Array
 from pyopencl.reduction import ReductionKernel
 
 from pyfr.backends.opencl.provider import OpenCLKernelProvider
@@ -11,30 +11,28 @@ from pyfr.nputil import npdtype_to_ctype
 
 
 class OpenCLBlasExtKernels(OpenCLKernelProvider):
-    def axnpby(self, *arr):
+    def axnpby(self, *arr, subdims=None):
         if any(arr[0].traits != x.traits for x in arr[1:]):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        nrow, leaddim, leadsubdim, dtype = arr[0].traits
+        ncola, ncolb = arr[0].datashape[1:]
+        nrow, ldim, lsdim, dtype = arr[0].traits
 
         # Render the kernel template
-        src = self.backend.lookup.get_template('axnpby').render(nv=nv)
+        src = self.backend.lookup.get_template('axnpby').render(
+            subdims=subdims or range(ncola), nv=nv
+        )
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32] + [np.intp]*nv + [dtype]*nv)
-
-        # Determine the total element count in the matrices
-        cnt = leaddim*nrow
-
-        # Compute a suitable global and local workgroup sizes
-        gs, ls = splay(self.backend.qdflt, cnt)
+                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
 
         class AxnpbyKernel(ComputeKernel):
             def run(self, queue, *consts):
                 args = [x.data for x in arr] + list(consts)
-                kern(queue.cl_queue_comp, gs, ls, cnt, *args)
+                kern(queue.cl_queue_comp, (ncolb,), None, nrow, ncolb,
+                     ldim, lsdim, *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/opencl/kernels/axnpby.mako b/pyfr/backends/opencl/kernels/axnpby.mako
index b0079ca..4d00d3d 100644
--- a/pyfr/backends/opencl/kernels/axnpby.mako
+++ b/pyfr/backends/opencl/kernels/axnpby.mako
@@ -3,22 +3,31 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 __kernel void
-axnpby(int n,
+axnpby(int nrow, int ncolb, int ldim, int lsdim,
        ${', '.join('__global fpdtype_t* restrict x' + str(i) for i in range(nv))},
-       ${', '.join('fpdtype_t alpha' + str(i) for i in range(nv))})
+       ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
 {
-    int strt = get_global_id(0);
-    int incr = get_global_size(0);
+    int j = get_global_id(0);
+    int idx;
 
-    for (int i = strt; i < n; i += incr)
-    {
-        fpdtype_t axn = ${pyfr.dot('alpha{j}', 'x{j}[i]', j=(1, nv))};
-
-        if (alpha0 == 0.0)
-            x0[i] = axn;
-        else if (alpha0 == 1.0)
-            x0[i] += axn;
-        else
-            x0[i] = alpha0*x0[i] + axn;
-    }
+    % for k in subdims:
+    if (j < ncolb && a0 == 0.0)
+        for (int i = 0; i < nrow; ++i)
+        {
+            idx = i*ldim + ${k}*lsdim + j;
+            x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        }
+    else if (j < ncolb && a0 == 1.0)
+        for (int i = 0; i < nrow; ++i)
+        {
+            idx = i*ldim + ${k}*lsdim + j;
+            x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        }
+    else if (j < ncolb)
+        for (int i = 0; i < nrow; ++i)
+        {
+            idx = i*ldim + ${k}*lsdim + j;
+            x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+        }
+    % endfor
 }
diff --git a/pyfr/backends/openmp/blasext.py b/pyfr/backends/openmp/blasext.py
index 278e026..c97bf4c 100644
--- a/pyfr/backends/openmp/blasext.py
+++ b/pyfr/backends/openmp/blasext.py
@@ -7,23 +7,27 @@ from pyfr.backends.base import ComputeKernel
 
 
 class OpenMPBlasExtKernels(OpenMPKernelProvider):
-    def axnpby(self, y, *xn):
-        if any(y.traits != x.traits for x in xn):
+    def axnpby(self, *arr, subdims=None):
+        if any(arr[0].traits != x.traits for x in arr[1:]):
             raise ValueError('Incompatible matrix types')
 
-        nv, cnt = len(xn), y.leaddim*y.nrow
+        nv = len(arr)
+        ncola, ncolb = arr[0].datashape[1:]
+        nrow, ldim, lsdim, dtype = arr[0].traits
 
         # Render the kernel template
-        src = self.backend.lookup.get_template('axnpby').render(n=nv)
+        src = self.backend.lookup.get_template('axnpby').render(
+            subdims=subdims or range(ncola), nv=nv
+        )
 
-        # Build
+        # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32] + [np.intp, y.dtype]*(1 + nv))
+                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
 
         class AxnpbyKernel(ComputeKernel):
-            def run(self, queue, beta, *alphan):
-                args = [i for axn in zip(xn, alphan) for i in axn]
-                kern(cnt, y, beta, *args)
+            def run(self, queue, *consts):
+                args = list(arr) + list(consts)
+                kern(nrow, ncolb, ldim, lsdim, *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/openmp/kernels/axnpby.mako b/pyfr/backends/openmp/kernels/axnpby.mako
index a72c0de..de55a3f 100644
--- a/pyfr/backends/openmp/kernels/axnpby.mako
+++ b/pyfr/backends/openmp/kernels/axnpby.mako
@@ -2,36 +2,33 @@
 <%inherit file='base'/>
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
-static PYFR_NOINLINE void
-axnpby_inner(int n, fpdtype_t *__restrict__ y, fpdtype_t beta,
-             ${', '.join('const fpdtype_t *__restrict__ x{0}, '
-                         'fpdtype_t a{0}'.format(i) for i in range(n))})
-{
-    for (int i = 0; i < n; i++)
-    {
-        fpdtype_t axn = ${pyfr.dot('a{j}', 'x{j}[i]', j=n)};
-
-        if (beta == 0.0)
-            y[i] = axn;
-        else if (beta == 1.0)
-            y[i] += axn;
-        else
-            y[i] = beta*y[i] + axn;
-    }
-}
-
 void
-axnpby(int n, fpdtype_t *__restrict__ y, fpdtype_t beta,
-       ${', '.join('const fpdtype_t *__restrict__ x{0}, '
-                   'fpdtype_t a{0}'.format(i) for i in range(n))})
+axnpby(int nrow, int ncolb, int ldim, int lsdim,
+       ${', '.join('fpdtype_t *__restrict__ x' + str(i) for i in range(nv))},
+       ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
 {
     #pragma omp parallel
     {
-        int begin, end;
-        loop_sched_1d(n, PYFR_ALIGN_BYTES / sizeof(fpdtype_t), &begin, &end);
+        int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+        int rb, re, cb, ce;
+        loop_sched_2d(nrow, ncolb, align, &rb, &re, &cb, &ce);
+
+        for (int r = rb; r < re; r++)
+        {
+            % for k in subdims:
+            for (int i = cb; i < ce; i++)
+            {
+                int idx = i + ldim*r + ${k}*lsdim;
+                fpdtype_t axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
 
-        axnpby_inner(end - begin, y + begin, beta,
-                     ${', '.join('x{0} + begin, a{0}'.format(i)
-                                 for i in range(n))});
+                if (a0 == 0.0)
+                    x0[idx] = axn;
+                else if (a0 == 1.0)
+                    x0[idx] += axn;
+                else
+                    x0[idx] = a0*x0[idx] + axn;
+            }
+            % endfor
+        }
     }
 }

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git