[pyfr] 56/88: Switch from SoA to AoSoA(k).

Wed Nov 16 12:05:29 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository pyfr.

commit 61c45aa98cf0a41d72282cb3eca66edb5e4c8348
Author: Freddie Witherden <freddie at witherden.org>
Date:   Sat Jun 25 21:43:07 2016 -0700

    Switch from SoA to AoSoA(k).
    
    This reduces memory usage and improves performance.
---
 pyfr/backends/base/backend.py            | 11 ++--
 pyfr/backends/base/generator.py          | 57 ++++++++++++--------
 pyfr/backends/base/kernels.py            |  5 +-
 pyfr/backends/base/types.py              | 73 ++++++++++++++++---------
 pyfr/backends/cuda/base.py               |  3 ++
 pyfr/backends/cuda/blasext.py            | 11 ++--
 pyfr/backends/cuda/generator.py          | 11 ++--
 pyfr/backends/cuda/kernels/axnpby.mako   | 16 +++---
 pyfr/backends/cuda/kernels/base.mako     |  4 ++
 pyfr/backends/cuda/kernels/pack.mako     |  7 ++-
 pyfr/backends/cuda/packing.py            |  6 +--
 pyfr/backends/cuda/types.py              | 12 ++---
 pyfr/backends/mic/base.py                |  3 ++
 pyfr/backends/mic/blasext.py             |  9 ++--
 pyfr/backends/mic/generator.py           | 92 ++++++++++++++++++--------------
 pyfr/backends/mic/kernels/axnpby.mako    | 40 ++++++++++----
 pyfr/backends/mic/kernels/base.mako      |  1 +
 pyfr/backends/mic/kernels/pack.mako      |  9 ++--
 pyfr/backends/mic/packing.py             |  4 +-
 pyfr/backends/mic/types.py               | 12 ++---
 pyfr/backends/opencl/base.py             |  3 ++
 pyfr/backends/opencl/blasext.py          | 10 ++--
 pyfr/backends/opencl/generator.py        | 10 ++--
 pyfr/backends/opencl/kernels/axnpby.mako | 16 +++---
 pyfr/backends/opencl/kernels/base.mako   |  4 ++
 pyfr/backends/opencl/kernels/pack.mako   |  5 +-
 pyfr/backends/opencl/packing.py          |  4 +-
 pyfr/backends/opencl/types.py            | 12 ++---
 pyfr/backends/openmp/base.py             |  3 ++
 pyfr/backends/openmp/blasext.py          | 10 ++--
 pyfr/backends/openmp/generator.py        | 92 ++++++++++++++++++--------------
 pyfr/backends/openmp/kernels/axnpby.mako | 37 ++++++++++---
 pyfr/backends/openmp/kernels/base.mako   |  1 +
 pyfr/backends/openmp/kernels/pack.mako   |  7 ++-
 pyfr/backends/openmp/packing.py          |  4 +-
 pyfr/backends/openmp/types.py            | 15 ++----
 pyfr/solvers/base/elements.py            |  7 ++-
 37 files changed, 375 insertions(+), 251 deletions(-)

diff --git a/pyfr/backends/base/backend.py b/pyfr/backends/base/backend.py
index 40ab6a4..e28e9f2 100644
--- a/pyfr/backends/base/backend.py
+++ b/pyfr/backends/base/backend.py
@@ -56,7 +56,8 @@ class BaseBackend(object, metaclass=ABCMeta):
     @lazyprop
     def lookup(self):
         pkg = 'pyfr.backends.{0}.kernels'.format(self.name)
-        dfltargs = dict(alignb=self.alignb, fpdtype=self.fpdtype, math=math)
+        dfltargs = dict(alignb=self.alignb, fpdtype=self.fpdtype,
+                        soasz=self.soasz, math=math)
 
         return DottedTemplateLookup(pkg, dfltargs)
 
@@ -147,12 +148,12 @@ class BaseBackend(object, metaclass=ABCMeta):
     def xchg_matrix_for_view(self, view, tags=set()):
         return self.xchg_matrix((view.nvrow, view.nvcol, view.n), tags=tags)
 
-    def view(self, matmap, rcmap, stridemap=None, vshape=tuple(), tags=set()):
-        return self.view_cls(self, matmap, rcmap, stridemap, vshape, tags)
+    def view(self, matmap, rcmap, rstridemap=None, vshape=tuple(), tags=set()):
+        return self.view_cls(self, matmap, rcmap, rstridemap, vshape, tags)
 
-    def xchg_view(self, matmap, rcmap, stridemap=None, vshape=tuple(),
+    def xchg_view(self, matmap, rcmap, rstridemap=None, vshape=tuple(),
                   tags=set()):
-        return self.xchg_view_cls(self, matmap, rcmap, stridemap, vshape,
+        return self.xchg_view_cls(self, matmap, rcmap, rstridemap, vshape,
                                   tags)
 
     def kernel(self, name, *args, **kwargs):
diff --git a/pyfr/backends/base/generator.py b/pyfr/backends/base/generator.py
index 9a7e7ca..e9bf888 100644
--- a/pyfr/backends/base/generator.py
+++ b/pyfr/backends/base/generator.py
@@ -104,7 +104,7 @@ class BaseKernelGenerator(object, metaclass=ABCMeta):
 
             # View
             if va.isview:
-                argt.append([np.intp]*(2 + va.ncdim))
+                argt.append([np.intp]*(2 + (va.ncdim == 2)))
             # Non-stacked vector or MPI type
             elif self.ndim == 1 and (va.ncdim == 0 or va.ismpi):
                 argt.append([np.intp])
@@ -118,7 +118,7 @@ class BaseKernelGenerator(object, metaclass=ABCMeta):
         # Return
         return self.ndim, argn, argt
 
-    def needs_lsdim(self, arg):
+    def needs_ldim(self, arg):
         return ((self.ndim == 2 and not arg.isbroadcast) or
                 (arg.ncdim > 0 and not arg.ismpi))
 
@@ -127,42 +127,55 @@ class BaseKernelGenerator(object, metaclass=ABCMeta):
         pass
 
     def _deref_arg_view(self, arg):
-        ptns = ['{0}_v[{0}_vix[_x]]',
-                r'{0}_v[{0}_vix[_x] + {0}_vcstri[_x]*\1]',
-                r'{0}_v[{0}_vix[_x] + {0}_vrstri[_x]*\1 + {0}_vcstri[_x]*\2]']
+        ptns = ['{0}_v[{0}_vix[X_IDX]]',
+                r'{0}_v[{0}_vix[X_IDX] + SOA_SZ*\1]',
+                r'{0}_v[{0}_vix[X_IDX] + {0}_vrstri[X_IDX]*\1 + SOA_SZ*\2]']
 
         return ptns[arg.ncdim].format(arg.name)
 
     def _deref_arg_array_1d(self, arg):
-        # Leading (sub) dimension
-        lsdim = 'lsd' + arg.name if not arg.ismpi else '_nx'
+        # Leading dimension
+        ldim = 'ld' + arg.name if not arg.ismpi else '_nx'
 
-        # Vector: name_v[_x]
+        # Vector:
+        #   name => name_v[X_IDX]
         if arg.ncdim == 0:
-            ix = '_x'
-        # Stacked vector: name_v[lsdim*\1 + _x]
+            ix = 'X_IDX'
+        # Stacked vector:
+        #   name[\1] => name_v[ldim*\1 + X_IDX]
         elif arg.ncdim == 1:
-            ix = r'{0}*\1 + _x'.format(lsdim)
-        # Doubly stacked vector: name_v[(nv*\1 + \2)*lsdim + _x]
+            ix = r'{0}*\1 + X_IDX'.format(ldim)
+        # Doubly stacked MPI vector:
+        #   name[\1][\2] => name_v[(nv*\1 + \2)*ldim + X_IDX]
+        elif arg.ismpi:
+            ix = r'({0}*\1 + \2)*{1} + X_IDX'.format(arg.cdims[1], ldim)
+        # Doubly stacked vector:
+        #   name[\1][\2] => name_v[ldim*\1 + X_IDX_AOSOA(\2, nv)]
         else:
-            ix = r'({0}*\1 + \2)*{1} + _x'.format(arg.cdims[1], lsdim)
+            ix = (r'ld{0}*\1 + X_IDX_AOSOA(\2, {1})'
+                   .format(arg.name, arg.cdims[1]))
 
         return '{0}_v[{1}]'.format(arg.name, ix)
 
     def _deref_arg_array_2d(self, arg):
-        # Broadcast vector: name_v[_x]
+        # Broadcast vector:
+        #   name => name_v[X_IDX]
         if arg.isbroadcast:
-            ix = '_x'
-        # Matrix: name_v[lsdim*_y + _x]
+            ix = 'X_IDX'
+        # Matrix:
+        #   name => name_v[ldim*_y + X_IDX]
         elif arg.ncdim == 0:
-            ix = 'lsd{}*_y + _x'.format(arg.name)
-        # Stacked matrix: name_v[(_y*nv + \1)*lsdim + _x]
+            ix = 'ld{0}*_y + X_IDX'.format(arg.name)
+        # Stacked matrix:
+        #   name[\1] => name_v[ldim*_y + X_IDX_AOSOA(\1, nv)]
         elif arg.ncdim == 1:
-            ix = r'(_y*{0} + \1)*lsd{1} + _x'.format(arg.cdims[0], arg.name)
-        # Doubly stacked matrix: name_v[((\1*_ny + _y)*nv + \2)*lsdim + _x]
+            ix = (r'ld{0}*_y + X_IDX_AOSOA(\1, {1})'
+                   .format(arg.name, arg.cdims[0]))
+        # Doubly stacked matrix:
+        #   name[\1][\2] => name_v[(\1*ny + _y)*ldim + X_IDX_AOSOA(\2, nv)]
         else:
-            ix = (r'((\1*_ny + _y)*{0} + \2)*lsd{1} + _x'
-                  .format(arg.cdims[1], arg.name))
+            ix = (r'(\1*_ny + _y)*ld{0} + X_IDX_AOSOA(\2, {1})'
+                   .format(arg.name, arg.cdims[1]))
 
         return '{0}_v[{1}]'.format(arg.name, ix)
 
diff --git a/pyfr/backends/base/kernels.py b/pyfr/backends/base/kernels.py
index fe07006..ef00a34 100644
--- a/pyfr/backends/base/kernels.py
+++ b/pyfr/backends/base/kernels.py
@@ -118,7 +118,7 @@ class BasePointwiseKernelProvider(BaseKernelProvider, metaclass=ABCMeta):
 
             # Matrix
             if isinstance(ka, mattypes):
-                arglst += [ka, ka.leadsubdim] if len(atypes) == 2 else [ka]
+                arglst += [ka, ka.leaddim] if len(atypes) == 2 else [ka]
             # View
             elif isinstance(ka, viewtypes):
                 if isinstance(ka, self.backend.view_cls):
@@ -127,8 +127,7 @@ class BasePointwiseKernelProvider(BaseKernelProvider, metaclass=ABCMeta):
                     view = ka.view
 
                 arglst += [view.basedata, view.mapping]
-                arglst += [view.cstrides] if len(atypes) >= 3 else []
-                arglst += [view.rstrides] if len(atypes) == 4 else []
+                arglst += [view.rstrides] if len(atypes) == 3 else []
             # Other; let the backend handle it
             else:
                 arglst.append(ka)
diff --git a/pyfr/backends/base/types.py b/pyfr/backends/base/types.py
index 73f7c27..d1f3735 100644
--- a/pyfr/backends/base/types.py
+++ b/pyfr/backends/base/types.py
@@ -25,23 +25,26 @@ class MatrixBase(object, metaclass=ABCMeta):
 
         if ndim == 2:
             nrow, ncol = shape
-        elif ndim == 3 or ndim == 4:
-            nrow = shape[0] if ndim == 3 else shape[0]*shape[1]
-            ncol = shape[-2]*shape[-1] + (1 - shape[-2])*(shape[-1] % -ldmod)
+            aosoashape = shape
+        else:
+            nvar, narr, k = shape[-2], shape[-1], backend.soasz
+            nparr = narr - narr % -k
 
-        # Pad the final dimension
-        shape[-1] -= shape[-1] % -ldmod
+            nrow = shape[0] if ndim == 3 else shape[0]*shape[1]
+            ncol = nvar*nparr
+            aosoashape = shape[:-2] + [nparr // k, nvar, k]
 
         # Assign
         self.nrow, self.ncol = int(nrow), int(ncol)
-        self.ioshape, self.datashape = ioshape, shape
+
+        self.datashape = aosoashape
+        self.ioshape = ioshape
 
         self.leaddim = self.ncol - (self.ncol % -ldmod)
-        self.leadsubdim = shape[-1]
 
         self.pitch = self.leaddim*self.itemsize
         self.nbytes = self.nrow*self.pitch
-        self.traits = (self.nrow, self.leaddim, self.leadsubdim, self.dtype)
+        self.traits = (self.nrow, self.leaddim, self.dtype)
 
         # Process the initial value
         if initval is not None:
@@ -76,6 +79,28 @@ class MatrixBase(object, metaclass=ABCMeta):
     def _get(self):
         pass
 
+    def _pack(self, ary):
+        # If necessary convert from SoA to AoSoA packing
+        if ary.ndim > 2:
+            n, k = ary.shape[-1], self.backend.soasz
+
+            ary = np.pad(ary, [(0, 0)]*(ary.ndim - 1) + [(0, -n % k)],
+                         mode='constant')
+            ary = ary.reshape(ary.shape[:-1] + (-1, k)).swapaxes(-2, -3)
+            ary = ary.reshape(self.nrow, self.ncol)
+
+        return np.ascontiguousarray(ary, dtype=self.dtype)
+
+    def _unpack(self, ary):
+        # If necessary unpack from AoSoA to SoA
+        if len(self.ioshape) > 2:
+            ary = ary.reshape(self.datashape)
+            ary = ary.swapaxes(-2, -3)
+            ary = ary.reshape(self.ioshape[:-1] + (-1,))
+            ary = ary[..., :self.ioshape[-1]]
+
+        return ary
+
 
 class Matrix(MatrixBase):
     _base_tags = {'dense'}
@@ -114,10 +139,10 @@ class MatrixRSlice(object):
         self.p, self.q = int(p), int(q)
         self.nrow, self.ncol = self.q - self.p, mat.ncol
         self.dtype, self.itemsize = mat.dtype, mat.itemsize
-        self.leaddim, self.leadsubdim = mat.leaddim, mat.leadsubdim
+        self.leaddim = mat.leaddim
 
         self.pitch = self.leaddim*self.itemsize
-        self.traits = (self.nrow, self.leaddim, self.leadsubdim, self.dtype)
+        self.traits = (self.nrow, self.leaddim, self.dtype)
 
         self.tags = mat.tags | {'slice'}
 
@@ -184,11 +209,11 @@ class MatrixBank(Sequence):
 
 
 class View(object):
-    def __init__(self, backend, matmap, rcmap, stridemap, vshape, tags):
+    def __init__(self, backend, matmap, rcmap, rstridemap, vshape, tags):
         self.n = len(matmap)
         self.nvrow = vshape[-2] if len(vshape) == 2 else 1
         self.nvcol = vshape[-1] if len(vshape) >= 1 else 1
-        self.rstrides = self.cstrides = None
+        self.rstrides = None
 
         # Get the different matrices which we map onto
         self._mats = [backend.mats[i] for i in np.unique(matmap)]
@@ -211,6 +236,9 @@ class View(object):
         if any(m.dtype != self.refdtype for m in self._mats):
             raise TypeError('Mixed data types are not supported')
 
+        # SoA size
+        k = backend.soasz
+
         # Base offsets and leading dimensions for each point
         offset = np.empty(self.n, dtype=np.int32)
         leaddim = np.empty(self.n, dtype=np.int32)
@@ -219,32 +247,27 @@ class View(object):
             ix = np.where(matmap == m.mid)
             offset[ix], leaddim[ix] = m.offset // m.itemsize, m.leaddim
 
-        # Go from matrices + (row, column) indcies to displacements
-        # relative to the base allocation address
-        mapping = (offset + rcmap[:,0]*leaddim + rcmap[:,1])[None,:]
+        # Row/column displacements
+        rowdisp = rcmap[:, 0]*leaddim
+        coldisp = (rcmap[:, 1] // k)*(self.nvcol*k) + rcmap[:, 1] % k
+
+        mapping = (offset + rowdisp + coldisp)[None,:]
         self.mapping = backend.base_matrix_cls(
             backend, np.int32, (1, self.n), mapping, None, None, tags
         )
 
         # Row strides
         if self.nvrow > 1:
-            rstrides = (stridemap[:,0]*leaddim)[None,:]
+            rstrides = (rstridemap*leaddim)[None,:]
             self.rstrides = backend.base_matrix_cls(
                 backend, np.int32, (1, self.n), rstrides, None, None, tags
             )
 
-        # Column strides
-        if self.nvcol > 1:
-            cstrides = stridemap[:,-1][None,:]
-            self.cstrides = backend.base_matrix_cls(
-                backend, np.int32, (1, self.n), cstrides, None, None, tags
-            )
-
 
 class XchgView(object):
-    def __init__(self, backend, matmap, rcmap, stridemap, vshape, tags):
+    def __init__(self, backend, matmap, rcmap, rstridemap, vshape, tags):
         # Create a normal view
-        self.view = backend.view(matmap, rcmap, stridemap, vshape, tags)
+        self.view = backend.view(matmap, rcmap, rstridemap, vshape, tags)
 
         # Dimensions
         self.n = n = self.view.n
diff --git a/pyfr/backends/cuda/base.py b/pyfr/backends/cuda/base.py
index 2296df4..2541f9a 100644
--- a/pyfr/backends/cuda/base.py
+++ b/pyfr/backends/cuda/base.py
@@ -35,6 +35,9 @@ class CUDABackend(BaseBackend):
         # Take the required alignment to be 128 bytes
         self.alignb = 128
 
+        # Take the SoA size to be 32 elements
+        self.soasz = 32
+
         # Get the MPI runtime type
         self.mpitype = cfg.get('backend-cuda', 'mpi-type', 'standard')
         if self.mpitype not in {'standard', 'cuda-aware'}:
diff --git a/pyfr/backends/cuda/blasext.py b/pyfr/backends/cuda/blasext.py
index bc4df16..96b94fe 100644
--- a/pyfr/backends/cuda/blasext.py
+++ b/pyfr/backends/cuda/blasext.py
@@ -16,17 +16,17 @@ class CUDABlasExtKernels(CUDAKernelProvider):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        ncola, ncolb = arr[0].datashape[1:]
-        nrow, ldim, lsdim, dtype = arr[0].traits
+        nrow, ldim, dtype = arr[0].traits
+        ncola, ncolb = arr[0].ioshape[1:]
 
         # Render the kernel template
         src = self.backend.lookup.get_template('axnpby').render(
-            subdims=subdims or range(ncola), nv=nv
+            subdims=subdims or range(ncola), ncola=ncola, nv=nv
         )
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
 
         # Determine the grid/block
         block = (128, 1, 1)
@@ -37,8 +37,7 @@ class CUDABlasExtKernels(CUDAKernelProvider):
                 args = list(arr) + list(consts)
 
                 kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
-                                         nrow, ncolb, ldim, lsdim,
-                                         *args)
+                                         nrow, ncolb, ldim, *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/cuda/generator.py b/pyfr/backends/cuda/generator.py
index 46ae875..c2e7e28 100644
--- a/pyfr/backends/cuda/generator.py
+++ b/pyfr/backends/cuda/generator.py
@@ -24,10 +24,14 @@ class CUDAKernelGenerator(BaseKernelGenerator):
         return '''{spec}
                {{
                    int _x = blockIdx.x*blockDim.x + threadIdx.x;
+                   #define X_IDX (_x)
+                   #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
                    {limits}
                    {{
                        {body}
                    }}
+                   #undef X_IDX
+                   #undef X_IDX_AOSOA
                }}'''.format(spec=spec, limits=limits, body=self.body)
 
     def _render_spec(self):
@@ -45,9 +49,6 @@ class CUDAKernelGenerator(BaseKernelGenerator):
                 kargs.append('const int* __restrict__ {0.name}_vix'
                              .format(va))
 
-                if va.ncdim >= 1:
-                    kargs.append('const int* __restrict__ {0.name}_vcstri'
-                                 .format(va))
                 if va.ncdim == 2:
                     kargs.append('const int* __restrict__ {0.name}_vrstri'
                                  .format(va))
@@ -59,7 +60,7 @@ class CUDAKernelGenerator(BaseKernelGenerator):
                 kargs.append('{0} {1.dtype}* __restrict__ {1.name}_v'
                              .format(const, va).strip())
 
-                if self.needs_lsdim(va):
-                    kargs.append('int lsd{0.name}'.format(va))
+                if self.needs_ldim(va):
+                    kargs.append('int ld{0.name}'.format(va))
 
         return '__global__ void {0}({1})'.format(self.name, ', '.join(kargs))
diff --git a/pyfr/backends/cuda/kernels/axnpby.mako b/pyfr/backends/cuda/kernels/axnpby.mako
index d2fbfe5..c19f273 100644
--- a/pyfr/backends/cuda/kernels/axnpby.mako
+++ b/pyfr/backends/cuda/kernels/axnpby.mako
@@ -3,31 +3,35 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 __global__ void
-axnpby(int nrow, int ncolb, int ldim, int lsdim,
+axnpby(int nrow, int ncolb, int ldim,
        ${', '.join('fpdtype_t* __restrict__ x' + str(i) for i in range(nv))},
        ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
 {
     int j = blockIdx.x*blockDim.x + threadIdx.x;
     int idx;
 
-    % for k in subdims:
     if (j < ncolb && a0 == 0.0)
         for (int i = 0; i < nrow; ++i)
         {
-            idx = i*ldim + ${k}*lsdim + j;
+        % for k in subdims:
+            idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
             x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        % endfor
         }
     else if (j < ncolb && a0 == 1.0)
         for (int i = 0; i < nrow; ++i)
         {
-            idx = i*ldim + ${k}*lsdim + j;
+        % for k in subdims:
+            idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
             x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        % endfor
         }
     else if (j < ncolb)
         for (int i = 0; i < nrow; ++i)
         {
-            idx = i*ldim + ${k}*lsdim + j;
+        % for k in subdims:
+            idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
             x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+        % endfor
         }
-    % endfor
 }
diff --git a/pyfr/backends/cuda/kernels/base.mako b/pyfr/backends/cuda/kernels/base.mako
index 4b6e3b7..5eeff1e 100644
--- a/pyfr/backends/cuda/kernels/base.mako
+++ b/pyfr/backends/cuda/kernels/base.mako
@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
+// AoSoA macros
+#define SOA_SZ ${soasz}
+#define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)
+
 // Typedefs
 typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
 
diff --git a/pyfr/backends/cuda/kernels/pack.mako b/pyfr/backends/cuda/kernels/pack.mako
index a0e503f..1e987e3 100644
--- a/pyfr/backends/cuda/kernels/pack.mako
+++ b/pyfr/backends/cuda/kernels/pack.mako
@@ -5,9 +5,8 @@ __global__ void
 pack_view(int n, int nrv, int ncv,
           const fpdtype_t* __restrict__ v,
           const int* __restrict__ vix,
-          const int* __restrict__ vcstri,
           const int* __restrict__ vrstri,
-          fpdtype_t* __restrict__  pmat)
+          fpdtype_t* __restrict__ pmat)
 {
     int i = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -15,9 +14,9 @@ pack_view(int n, int nrv, int ncv,
         pmat[i] = v[vix[i]];
     else if (i < n && nrv == 1)
         for (int c = 0; c < ncv; ++c)
-            pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+            pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
     else if (i < n)
         for (int r = 0; r < nrv; ++r)
             for (int c = 0; c < ncv; ++c)
-                pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + vcstri[i]*c];
+                pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c];
 }
diff --git a/pyfr/backends/cuda/packing.py b/pyfr/backends/cuda/packing.py
index d2c1654..a8168f3 100644
--- a/pyfr/backends/cuda/packing.py
+++ b/pyfr/backends/cuda/packing.py
@@ -16,7 +16,7 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
         src = self.backend.lookup.get_template('pack').render()
 
         # Build
-        kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+        kern = self._build_kernel('pack_view', src, 'iiiPPPP')
 
         # Compute the grid and thread-block size
         block = (128, 1, 1)
@@ -31,7 +31,7 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
                     # Pack
                     kern.prepared_async_call(
                         grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
-                        v.mapping, v.cstrides or 0, v.rstrides or 0, m
+                        v.mapping, v.rstrides or 0, m
                     )
         # Otherwise, we need to both pack the buffer and copy it back
         else:
@@ -46,7 +46,7 @@ class CUDAPackingKernels(CUDAKernelProvider, BasePackingKernels):
                     # Pack
                     kern.prepared_async_call(
                         grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
-                        v.mapping, v.cstrides or 0, v.rstrides or 0, m
+                        v.mapping, v.rstrides or 0, m
                     )
 
                     # Copy the packed buffer to the host
diff --git a/pyfr/backends/cuda/types.py b/pyfr/backends/cuda/types.py
index 8ca891e..70e7aa2 100644
--- a/pyfr/backends/cuda/types.py
+++ b/pyfr/backends/cuda/types.py
@@ -29,18 +29,18 @@ class CUDAMatrixBase(base.MatrixBase):
 
     def _get(self):
         # Allocate an empty buffer
-        buf = np.empty(self.datashape, dtype=self.dtype)
+        buf = np.empty((self.nrow, self.leaddim), dtype=self.dtype)
 
         # Copy
         cuda.memcpy_dtoh(buf, self.data)
 
-        # Slice to give the expected I/O shape
-        return buf[...,:self.ioshape[-1]]
+        # Unpack
+        return self._unpack(buf[:, :self.ncol])
 
     def _set(self, ary):
-        # Allocate a new buffer with suitable padding and assign
-        buf = np.zeros(self.datashape, dtype=self.dtype)
-        buf[...,:self.ioshape[-1]] = ary
+        # Allocate a new buffer with suitable padding and pack it
+        buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
+        buf[:, :self.ncol] = self._pack(ary)
 
         # Copy
         cuda.memcpy_htod(self.data, buf)
diff --git a/pyfr/backends/mic/base.py b/pyfr/backends/mic/base.py
index 2a74f15..cbb1b31 100644
--- a/pyfr/backends/mic/base.py
+++ b/pyfr/backends/mic/base.py
@@ -30,6 +30,9 @@ class MICBackend(BaseBackend):
         # Take the alignment requirement to be 64-bytes
         self.alignb = 64
 
+        # Compute the SoA size
+        self.soasz = self.alignb // np.dtype(self.fpdtype).itemsize
+
         from pyfr.backends.mic import (blasext, cblas, packing, provider,
                                        types)
 
diff --git a/pyfr/backends/mic/blasext.py b/pyfr/backends/mic/blasext.py
index f00a284..e0aba5b 100644
--- a/pyfr/backends/mic/blasext.py
+++ b/pyfr/backends/mic/blasext.py
@@ -12,8 +12,8 @@ class MICBlasExtKernels(MICKernelProvider):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        ncola, ncolb = arr[0].datashape[1:]
-        nrow, ldim, lsdim, dtype = arr[0].traits
+        nrow, ldim, dtype = arr[0].traits
+        ncola, ncolb = arr[0].ioshape[1:]
 
         # Render the kernel template
         src = self.backend.lookup.get_template('axnpby').render(
@@ -22,13 +22,12 @@ class MICBlasExtKernels(MICKernelProvider):
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
 
         class AxnpbyKernel(ComputeKernel):
             def run(self, queue, *consts):
                 args = [x.data for x in arr] + list(consts)
-                queue.mic_stream_comp.invoke(kern, nrow, ncolb, ldim, lsdim,
-                                             *args)
+                queue.mic_stream_comp.invoke(kern, nrow, ncolb, ldim, *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/mic/generator.py b/pyfr/backends/mic/generator.py
index 8befc78..dc2ab76 100644
--- a/pyfr/backends/mic/generator.py
+++ b/pyfr/backends/mic/generator.py
@@ -9,42 +9,58 @@ class MICKernelGenerator(BaseKernelGenerator):
         spec, unpack = self._render_spec_unpack()
 
         if self.ndim == 1:
-            tpl = '''{spec}
-                  {{
-                      {unpack}
-                      #pragma omp parallel
-                      {{
-                          int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
-                          int cb, ce;
-                          loop_sched_1d(_nx, align, &cb, &ce);
-                          #pragma omp simd
-                          for (int _x = cb; _x < ce; _x++)
-                          {{
-                              {body}
-                          }}
-                      }}
-                  }}'''
+            inner = '''
+                    int cb, ce;
+                    loop_sched_1d(_nx, align, &cb, &ce);
+                    int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+                    for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+                    {{
+                        #pragma omp simd
+                        for (int _xj = 0; _xj < SOA_SZ; _xj++)
+                        {{
+                            {body}
+                        }}
+                    }}
+                    for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi; _xj++)
+                    {{
+                        {body}
+                    }}'''.format(body=self.body)
         else:
-            tpl = '''{spec}
-                  {{
-                      {unpack}
-                      #pragma omp parallel
-                      {{
-                          int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
-                          int rb, re, cb, ce;
-                          loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
-                          for (int _y = rb; _y < re; _y++)
-                          {{
-                              #pragma omp simd
-                              for (int _x = cb; _x < ce; _x++)
-                              {{
-                                  {body}
-                              }}
-                          }}
-                      }}
-                  }}'''
+            inner = '''
+                    int rb, re, cb, ce;
+                    loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
+                    int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+                    for (int _y = rb; _y < re; _y++)
+                    {{
+                        for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+                        {{
+                            #pragma omp simd
+                            for (int _xj = 0; _xj < SOA_SZ; _xj++)
+                            {{
+                                {body}
+                            }}
+                        }}
+                        for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi;
+                             _xj++)
+                        {{
+                            {body}
+                        }}
+                    }}'''.format(body=self.body)
 
-        return tpl.format(spec=spec, unpack=unpack, body=self.body)
+        return '''{spec}
+               {{
+                   {unpack}
+                   #define X_IDX (_xi + _xj)
+                   #define X_IDX_AOSOA(v, nv)\
+                       ((_xi/SOA_SZ*(nv) + (v))*SOA_SZ + _xj)
+                   int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+                   #pragma omp parallel
+                   {{
+                       {inner}
+                   }}
+                   #undef X_IDX
+                   #undef X_IDX_AOSOA
+               }}'''.format(spec=spec, unpack=unpack, inner=inner)
 
     def _render_spec_unpack(self):
         # Start by unpacking the dimensions
@@ -68,10 +84,6 @@ class MICKernelGenerator(BaseKernelGenerator):
                 kpack.append('const int *{0.name}_vix = *arg{{0}};'
                              .format(va))
 
-                if va.ncdim >= 1:
-                    kspec.append('void **arg{0}')
-                    kpack.append('const int *{0.name}_vcstri = *arg{{0}};'
-                                 .format(va))
                 if va.ncdim == 2:
                     kspec.append('void **arg{0}')
                     kpack.append('const int *{0.name}_vrstri = *arg{{0}};'
@@ -85,9 +97,9 @@ class MICKernelGenerator(BaseKernelGenerator):
                 kpack.append('{0} {1.dtype}* {1.name}_v = *arg{{0}};'
                              .format(const, va).strip())
 
-                if self.needs_lsdim(va):
+                if self.needs_ldim(va):
                     kspec.append('long *arg{0}')
-                    kpack.append('int lsd{0.name} = *arg{{0}};'.format(va))
+                    kpack.append('int ld{0.name} = *arg{{0}};'.format(va))
 
         # Number the arguments
         params = ', '.join(a.format(i) for i, a in enumerate(kspec))
diff --git a/pyfr/backends/mic/kernels/axnpby.mako b/pyfr/backends/mic/kernels/axnpby.mako
index ddbea4b..96f1211 100644
--- a/pyfr/backends/mic/kernels/axnpby.mako
+++ b/pyfr/backends/mic/kernels/axnpby.mako
@@ -3,12 +3,12 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 void
-axnpby(long *nrowp, long *ncolbp, long *ldimp, long *lsdimp,
+axnpby(long *nrowp, long *ncolbp, long *ldimp,
        ${', '.join('fpdtype_t **xp' + str(i) for i in range(nv))},
        ${', '.join('double *ap' + str(i) for i in range(nv))})
 {
+    #define X_IDX_AOSOA(v, nv) ((ci/SOA_SZ*(nv) + (v))*SOA_SZ + cj)
     int ldim = *ldimp;
-    int lsdim = *lsdimp;
 
 % for i in range(nv):
     fpdtype_t *x${i} = *xp${i};
@@ -18,16 +18,37 @@ axnpby(long *nrowp, long *ncolbp, long *ldimp, long *lsdimp,
     #pragma omp parallel
     {
         int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
-        int rb, re, cb, ce;
-        loop_sched_2d(*nrowp, *ncolbp, align, &rb, &re, &cb, &ce);
+        int rb, re, cb, ce, idx;
+        fpdtype_t axn;
+        loop_sched_2d(nrow, ncolb, align, &rb, &re, &cb, &ce);
+        int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
 
         for (int r = rb; r < re; r++)
         {
-        % for k in subdims:
-            for (int i = cb; i < ce; i++)
+            for (int ci = cb; ci < cb + nci; ci += SOA_SZ)
             {
-                int idx = i + ldim*r + ${k}*lsdim;
-                fpdtype_t axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+                #pragma omp simd
+                for (int cj = 0; cj < SOA_SZ; cj++)
+                {
+                % for k in subdims:
+                    idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+                    axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+
+                    if (a0 == 0.0)
+                        x0[idx] = axn;
+                    else if (a0 == 1.0)
+                        x0[idx] += axn;
+                    else
+                        x0[idx] = a0*x0[idx] + axn;
+                % endfor
+                }
+            }
+
+            for (int ci = cb + nci, cj = 0; cj < ce - ci; cj++)
+            {
+            % for k in subdims:
+                idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+                axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
 
                 if (a0 == 0.0)
                     x0[idx] = axn;
@@ -35,8 +56,9 @@ axnpby(long *nrowp, long *ncolbp, long *ldimp, long *lsdimp,
                     x0[idx] += axn;
                 else
                     x0[idx] = a0*x0[idx] + axn;
+            % endfor
             }
-        % endfor
         }
     }
+    #undef X_IDX_AOSOA
 }
diff --git a/pyfr/backends/mic/kernels/base.mako b/pyfr/backends/mic/kernels/base.mako
index 7cd4d75..2980406 100644
--- a/pyfr/backends/mic/kernels/base.mako
+++ b/pyfr/backends/mic/kernels/base.mako
@@ -6,6 +6,7 @@
 #include <tgmath.h>
 
 #define PYFR_ALIGN_BYTES ${alignb}
+#define SOA_SZ ${soasz}
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
 #define max(a, b) ((a) > (b) ? (a) : (b))
diff --git a/pyfr/backends/mic/kernels/pack.mako b/pyfr/backends/mic/kernels/pack.mako
index 9f32a92..8b7874d 100644
--- a/pyfr/backends/mic/kernels/pack.mako
+++ b/pyfr/backends/mic/kernels/pack.mako
@@ -3,7 +3,7 @@
 
 void
 pack_view(long *n_a, long *nrv_a, long *ncv_a,
-          void **v_a, void **vix_a, void **vcstri_a, void **vrstri_a,
+          void **v_a, void **vix_a, void **vrstri_a,
           void **pmat_a)
 {
     int n = *n_a;
@@ -12,7 +12,6 @@ pack_view(long *n_a, long *nrv_a, long *ncv_a,
 
     fpdtype_t *v = *v_a;
     int *vix = *vix_a;
-    int *vcstri = (vcstri_a) ? *vcstri_a : 0;
     int *vrstri = (vrstri_a) ? *vrstri_a : 0;
     fpdtype_t *pmat = *pmat_a;
 
@@ -22,11 +21,11 @@ pack_view(long *n_a, long *nrv_a, long *ncv_a,
     else if (nrv == 1)
         for (int i = 0; i < n; i++)
             for (int c = 0; c < ncv; c++)
-                pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+                pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
     else
         for (int i = 0; i < n; i++)
             for (int r = 0; r < nrv; r++)
                 for (int c = 0; c < ncv; c++)
-                    pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r
-                                                + vcstri[i]*c];
+                    pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r +
+                                                SOA_SZ*c];
 }
diff --git a/pyfr/backends/mic/packing.py b/pyfr/backends/mic/packing.py
index 361965c..11dd7e6 100644
--- a/pyfr/backends/mic/packing.py
+++ b/pyfr/backends/mic/packing.py
@@ -14,13 +14,13 @@ class MICPackingKernels(MICKernelProvider, BasePackingKernels):
         src = self.backend.lookup.get_template('pack').render()
 
         # Build
-        kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+        kern = self._build_kernel('pack_view', src, 'iiiPPPP')
 
         class PackXchgViewKernel(ComputeKernel):
             def run(self, queue):
                 # Kernel arguments
                 args = [v.n, v.nvrow, v.nvcol, v.basedata.dev_ptr,
-                        v.mapping, v.cstrides, v.rstrides, m]
+                        v.mapping, v.rstrides, m]
                 args = [getattr(arg, 'data', arg) for arg in args]
 
                 # Pack
diff --git a/pyfr/backends/mic/types.py b/pyfr/backends/mic/types.py
index 7abe53f..ec83c77 100644
--- a/pyfr/backends/mic/types.py
+++ b/pyfr/backends/mic/types.py
@@ -21,7 +21,7 @@ class MICMatrixBase(base.MatrixBase):
 
     def _get(self):
         # Allocate an empty buffer
-        buf = np.empty(self.datashape, dtype=self.dtype)
+        buf = np.empty((self.nrow, self.leaddim), dtype=self.dtype)
 
         # Copy using the default stream
         self.backend.sdflt.transfer_device2host(
@@ -30,13 +30,13 @@ class MICMatrixBase(base.MatrixBase):
         )
         self.backend.sdflt.sync()
 
-        # Slice to give the expected I/O shape
-        return buf[...,:self.ioshape[-1]]
+        # Unpack
+        return self._unpack(buf[:, :self.ncol])
 
     def _set(self, ary):
-        # Allocate a new buffer with suitable padding and assign
-        buf = np.zeros(self.datashape, dtype=self.dtype)
-        buf[...,:self.ioshape[-1]] = ary
+        # Allocate a new buffer with suitable padding and pack it
+        buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
+        buf[:, :self.ncol] = self._pack(ary)
 
         # Copy using the default stream
         self.backend.sdflt.transfer_host2device(
diff --git a/pyfr/backends/opencl/base.py b/pyfr/backends/opencl/base.py
index fea11c7..03a67b4 100644
--- a/pyfr/backends/opencl/base.py
+++ b/pyfr/backends/opencl/base.py
@@ -53,6 +53,9 @@ class OpenCLBackend(BaseBackend):
         # Compute the alignment requirement for the context
         self.alignb = device.mem_base_addr_align // 8
 
+        # Compute the SoA size
+        self.soasz = 2*self.alignb // np.dtype(self.fpdtype).itemsize
+
         from pyfr.backends.opencl import (blasext, clblas, gimmik, packing,
                                           provider, types)
 
diff --git a/pyfr/backends/opencl/blasext.py b/pyfr/backends/opencl/blasext.py
index c4e378a..f388770 100644
--- a/pyfr/backends/opencl/blasext.py
+++ b/pyfr/backends/opencl/blasext.py
@@ -16,23 +16,23 @@ class OpenCLBlasExtKernels(OpenCLKernelProvider):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        ncola, ncolb = arr[0].datashape[1:]
-        nrow, ldim, lsdim, dtype = arr[0].traits
+        nrow, ldim, dtype = arr[0].traits
+        ncola, ncolb = arr[0].ioshape[1:]
 
         # Render the kernel template
         src = self.backend.lookup.get_template('axnpby').render(
-            subdims=subdims or range(ncola), nv=nv
+            subdims=subdims or range(ncola), ncola=ncola, nv=nv
         )
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
 
         class AxnpbyKernel(ComputeKernel):
             def run(self, queue, *consts):
                 args = [x.data for x in arr] + list(consts)
                 kern(queue.cl_queue_comp, (ncolb,), None, nrow, ncolb,
-                     ldim, lsdim, *args)
+                     ldim, *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/opencl/generator.py b/pyfr/backends/opencl/generator.py
index 94320ba..d6e7d1e 100644
--- a/pyfr/backends/opencl/generator.py
+++ b/pyfr/backends/opencl/generator.py
@@ -24,10 +24,14 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
         return '''{spec}
                {{
                    int _x = get_global_id(0);
+                   #define X_IDX (_x)
+                   #define X_IDX_AOSOA(v, nv) SOA_IX(X_IDX, v, nv)
                    {limits}
                    {{
                        {body}
                    }}
+                   #undef X_IDX
+                   #undef X_IDX_AOSOA
                }}'''.format(spec=spec, limits=limits, body=self.body)
 
     def _render_spec(self):
@@ -46,8 +50,6 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
                 ka.append('__global {0.dtype}* restrict {0.name}_v')
                 ka.append('__global const int* restrict {0.name}_vix')
 
-                if va.ncdim >= 1:
-                    ka.append('__global const int* restrict {0.name}_vcstri')
                 if va.ncdim == 2:
                     ka.append('__global const int* restrict {0.name}_vrstri')
             # Arrays
@@ -57,8 +59,8 @@ class OpenCLKernelGenerator(BaseKernelGenerator):
                 else:
                     ka.append('__global {0.dtype}* restrict {0.name}_v')
 
-                if self.needs_lsdim(va):
-                    ka.append('int lsd{0.name}')
+                if self.needs_ldim(va):
+                    ka.append('int ld{0.name}')
 
             # Format
             kargs.extend(k.format(va) for k in ka)
diff --git a/pyfr/backends/opencl/kernels/axnpby.mako b/pyfr/backends/opencl/kernels/axnpby.mako
index 4d00d3d..14c346e 100644
--- a/pyfr/backends/opencl/kernels/axnpby.mako
+++ b/pyfr/backends/opencl/kernels/axnpby.mako
@@ -3,31 +3,35 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 __kernel void
-axnpby(int nrow, int ncolb, int ldim, int lsdim,
+axnpby(int nrow, int ncolb, int ldim,
        ${', '.join('__global fpdtype_t* restrict x' + str(i) for i in range(nv))},
        ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
 {
     int j = get_global_id(0);
     int idx;
 
-    % for k in subdims:
     if (j < ncolb && a0 == 0.0)
         for (int i = 0; i < nrow; ++i)
         {
-            idx = i*ldim + ${k}*lsdim + j;
+        % for k in subdims:
+            idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
             x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        % endfor
         }
     else if (j < ncolb && a0 == 1.0)
         for (int i = 0; i < nrow; ++i)
         {
-            idx = i*ldim + ${k}*lsdim + j;
+        % for k in subdims:
+            idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
             x0[idx] += ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+        % endfor
         }
     else if (j < ncolb)
         for (int i = 0; i < nrow; ++i)
         {
-            idx = i*ldim + ${k}*lsdim + j;
+        % for k in subdims:
+            idx = i*ldim + SOA_IX(j, ${k}, ${ncola});
             x0[idx] = ${pyfr.dot('a{l}', 'x{l}[idx]', l=nv)};
+        % endfor
         }
-    % endfor
 }
diff --git a/pyfr/backends/opencl/kernels/base.mako b/pyfr/backends/opencl/kernels/base.mako
index 13ce4d8..a4ae785 100644
--- a/pyfr/backends/opencl/kernels/base.mako
+++ b/pyfr/backends/opencl/kernels/base.mako
@@ -6,6 +6,10 @@
 # pragma OPENCL EXTENSION cl_khr_fp64: enable
 #endif
 
+// AoSoA macros
+#define SOA_SZ ${soasz}
+#define SOA_IX(a, v, nv) ((((a) / SOA_SZ)*(nv) + (v))*SOA_SZ + (a) % SOA_SZ)
+
 // Typedefs
 typedef ${pyfr.npdtype_to_ctype(fpdtype)} fpdtype_t;
 
diff --git a/pyfr/backends/opencl/kernels/pack.mako b/pyfr/backends/opencl/kernels/pack.mako
index 04e3762..64647fd 100644
--- a/pyfr/backends/opencl/kernels/pack.mako
+++ b/pyfr/backends/opencl/kernels/pack.mako
@@ -5,7 +5,6 @@ __kernel void
 pack_view(int n, int nrv, int ncv,
           __global const fpdtype_t* restrict v,
           __global const int* restrict vix,
-          __global const int* restrict vcstri,
           __global const int* restrict vrstri,
           __global fpdtype_t* restrict pmat)
 {
@@ -15,9 +14,9 @@ pack_view(int n, int nrv, int ncv,
         pmat[i] = v[vix[i]];
     else if (i < n && nrv == 1)
         for (int c = 0; c < ncv; ++c)
-            pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+            pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
     else if (i < n)
         for (int r = 0; r < nrv; ++r)
             for (int c = 0; c < ncv; ++c)
-                pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + vcstri[i]*c];
+                pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r + SOA_SZ*c];
 }
diff --git a/pyfr/backends/opencl/packing.py b/pyfr/backends/opencl/packing.py
index c0b88b0..8a731fe 100644
--- a/pyfr/backends/opencl/packing.py
+++ b/pyfr/backends/opencl/packing.py
@@ -17,13 +17,13 @@ class OpenCLPackingKernels(OpenCLKernelProvider, BasePackingKernels):
         src = self.backend.lookup.get_template('pack').render()
 
         # Build
-        kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*5)
+        kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*4)
 
         class PackXchgViewKernel(ComputeKernel):
             def run(self, queue):
                 # Kernel arguments
                 args = [v.n, v.nvrow, v.nvcol, v.basedata, v.mapping,
-                        v.cstrides, v.rstrides, m]
+                        v.rstrides, m]
                 args = [getattr(arg, 'data', arg) for arg in args]
 
                 # Pack
diff --git a/pyfr/backends/opencl/types.py b/pyfr/backends/opencl/types.py
index 31abf18..3cc5cfd 100644
--- a/pyfr/backends/opencl/types.py
+++ b/pyfr/backends/opencl/types.py
@@ -22,18 +22,18 @@ class OpenCLMatrixBase(base.MatrixBase):
 
     def _get(self):
         # Allocate an empty buffer
-        buf = np.empty(self.datashape, dtype=self.dtype)
+        buf = np.empty((self.nrow, self.leaddim), dtype=self.dtype)
 
         # Copy
         cl.enqueue_copy(self.backend.qdflt, buf, self.data)
 
-        # Slice to give the expected I/O shape
-        return buf[...,:self.ioshape[-1]]
+        # Unpack
+        return self._unpack(buf[:, :self.ncol])
 
     def _set(self, ary):
-        # Allocate a new buffer with suitable padding and assign
-        buf = np.zeros(self.datashape, dtype=self.dtype)
-        buf[...,:self.ioshape[-1]] = ary
+        # Allocate a new buffer with suitable padding and pack it
+        buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
+        buf[:, :self.ncol] = self._pack(ary)
 
         # Copy
         cl.enqueue_copy(self.backend.qdflt, self.data, buf)
diff --git a/pyfr/backends/openmp/base.py b/pyfr/backends/openmp/base.py
index 115d657..6c6a6e5 100644
--- a/pyfr/backends/openmp/base.py
+++ b/pyfr/backends/openmp/base.py
@@ -14,6 +14,9 @@ class OpenMPBackend(BaseBackend):
         # Take the alignment requirement to be 32-bytes
         self.alignb = 32
 
+        # Compute the SoA size
+        self.soasz = self.alignb // np.dtype(self.fpdtype).itemsize
+
         from pyfr.backends.openmp import (blasext, cblas, gimmik, packing,
                                           provider, types)
 
diff --git a/pyfr/backends/openmp/blasext.py b/pyfr/backends/openmp/blasext.py
index 8343004..ebc29a9 100644
--- a/pyfr/backends/openmp/blasext.py
+++ b/pyfr/backends/openmp/blasext.py
@@ -12,22 +12,22 @@ class OpenMPBlasExtKernels(OpenMPKernelProvider):
             raise ValueError('Incompatible matrix types')
 
         nv = len(arr)
-        ncola, ncolb = arr[0].datashape[1:]
-        nrow, ldim, lsdim, dtype = arr[0].traits
+        nrow, ldim, dtype = arr[0].traits
+        ncola, ncolb = arr[0].ioshape[1:]
 
         # Render the kernel template
         src = self.backend.lookup.get_template('axnpby').render(
-            subdims=subdims or range(ncola), nv=nv
+            subdims=subdims or range(ncola), ncola=ncola, nv=nv
         )
 
         # Build the kernel
         kern = self._build_kernel('axnpby', src,
-                                  [np.int32]*4 + [np.intp]*nv + [dtype]*nv)
+                                  [np.int32]*3 + [np.intp]*nv + [dtype]*nv)
 
         class AxnpbyKernel(ComputeKernel):
             def run(self, queue, *consts):
                 args = list(arr) + list(consts)
-                kern(nrow, ncolb, ldim, lsdim, *args)
+                kern(nrow, ncolb, ldim, *args)
 
         return AxnpbyKernel()
 
diff --git a/pyfr/backends/openmp/generator.py b/pyfr/backends/openmp/generator.py
index c9677b4..2b0874c 100644
--- a/pyfr/backends/openmp/generator.py
+++ b/pyfr/backends/openmp/generator.py
@@ -5,45 +5,58 @@ from pyfr.backends.base.generator import BaseKernelGenerator
 
 class OpenMPKernelGenerator(BaseKernelGenerator):
     def render(self):
-        # Kernel spec
-        spec = self._render_spec()
-
         if self.ndim == 1:
-            tpl = '''
-                  {spec}
-                  {{
-                      #pragma omp parallel
-                      {{
-                          int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
-                          int cb, ce;
-                          loop_sched_1d(_nx, align, &cb, &ce);
-                          #pragma omp simd
-                          for (int _x = cb; _x < ce; _x++)
-                          {{
-                              {body}
-                          }}
-                      }}
-                  }}'''
+            inner = '''
+                    int cb, ce;
+                    loop_sched_1d(_nx, align, &cb, &ce);
+                    int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+                    for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+                    {{
+                        #pragma omp simd
+                        for (int _xj = 0; _xj < SOA_SZ; _xj++)
+                        {{
+                            {body}
+                        }}
+                    }}
+                    for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi; _xj++)
+                    {{
+                        {body}
+                    }}'''.format(body=self.body)
         else:
-            tpl = '''{spec}
-                  {{
-                      #pragma omp parallel
-                      {{
-                          int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
-                          int rb, re, cb, ce;
-                          loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
-                          for (int _y = rb; _y < re; _y++)
-                          {{
-                              #pragma omp simd
-                              for (int _x = cb; _x < ce; _x++)
-                              {{
-                                  {body}
-                              }}
-                          }}
-                      }}
-                  }}'''
+            inner = '''
+                    int rb, re, cb, ce;
+                    loop_sched_2d(_ny, _nx, align, &rb, &re, &cb, &ce);
+                    int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
+                    for (int _y = rb; _y < re; _y++)
+                    {{
+                        for (int _xi = cb; _xi < cb + nci; _xi += SOA_SZ)
+                        {{
+                            #pragma omp simd
+                            for (int _xj = 0; _xj < SOA_SZ; _xj++)
+                            {{
+                                {body}
+                            }}
+                        }}
+                        for (int _xi = cb + nci, _xj = 0; _xj < ce - _xi;
+                             _xj++)
+                        {{
+                            {body}
+                        }}
+                    }}'''.format(body=self.body)
 
-        return tpl.format(spec=spec, body=self.body)
+        return '''{spec}
+               {{
+                   #define X_IDX (_xi + _xj)
+                   #define X_IDX_AOSOA(v, nv)\
+                       ((_xi/SOA_SZ*(nv) + (v))*SOA_SZ + _xj)
+                   int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
+                   #pragma omp parallel
+                   {{
+                       {inner}
+                   }}
+                   #undef X_IDX
+                   #undef X_IDX_AOSOA
+               }}'''.format(spec=self._render_spec(), inner=inner)
 
     def _render_spec(self):
         # We first need the argument list; starting with the dimensions
@@ -60,9 +73,6 @@ class OpenMPKernelGenerator(BaseKernelGenerator):
                 kargs.append('const int* __restrict__ {0.name}_vix'
                              .format(va))
 
-                if va.ncdim >= 1:
-                    kargs.append('const int* __restrict__ {0.name}_vcstri'
-                                 .format(va))
                 if va.ncdim == 2:
                     kargs.append('const int* __restrict__ {0.name}_vrstri'
                                  .format(va))
@@ -74,7 +84,7 @@ class OpenMPKernelGenerator(BaseKernelGenerator):
                 kargs.append('{0} {1.dtype}* __restrict__ {1.name}_v'
                              .format(const, va).strip())
 
-                if self.needs_lsdim(va):
-                    kargs.append('int lsd{0.name}'.format(va))
+                if self.needs_ldim(va):
+                    kargs.append('int ld{0.name}'.format(va))
 
         return 'void {0}({1})'.format(self.name, ', '.join(kargs))
diff --git a/pyfr/backends/openmp/kernels/axnpby.mako b/pyfr/backends/openmp/kernels/axnpby.mako
index de55a3f..7e2d591 100644
--- a/pyfr/backends/openmp/kernels/axnpby.mako
+++ b/pyfr/backends/openmp/kernels/axnpby.mako
@@ -3,23 +3,45 @@
 <%namespace module='pyfr.backends.base.makoutil' name='pyfr'/>
 
 void
-axnpby(int nrow, int ncolb, int ldim, int lsdim,
+axnpby(int nrow, int ncolb, int ldim,
        ${', '.join('fpdtype_t *__restrict__ x' + str(i) for i in range(nv))},
        ${', '.join('fpdtype_t a' + str(i) for i in range(nv))})
 {
+    #define X_IDX_AOSOA(v, nv) ((ci/SOA_SZ*(nv) + (v))*SOA_SZ + cj)
     #pragma omp parallel
     {
         int align = PYFR_ALIGN_BYTES / sizeof(fpdtype_t);
-        int rb, re, cb, ce;
+        int rb, re, cb, ce, idx;
+        fpdtype_t axn;
         loop_sched_2d(nrow, ncolb, align, &rb, &re, &cb, &ce);
+        int nci = ((ce - cb) / SOA_SZ)*SOA_SZ;
 
         for (int r = rb; r < re; r++)
         {
-            % for k in subdims:
-            for (int i = cb; i < ce; i++)
+            for (int ci = cb; ci < cb + nci; ci += SOA_SZ)
+            {
+                #pragma omp simd
+                for (int cj = 0; cj < SOA_SZ; cj++)
+                {
+                % for k in subdims:
+                    idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+                    axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+
+                    if (a0 == 0.0)
+                        x0[idx] = axn;
+                    else if (a0 == 1.0)
+                        x0[idx] += axn;
+                    else
+                        x0[idx] = a0*x0[idx] + axn;
+                % endfor
+                }
+            }
+
+            for (int ci = cb + nci, cj = 0; cj < ce - ci; cj++)
             {
-                int idx = i + ldim*r + ${k}*lsdim;
-                fpdtype_t axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
+            % for k in subdims:
+                idx = r*ldim + X_IDX_AOSOA(${k}, ${ncola});
+                axn = ${pyfr.dot('a{l}', 'x{l}[idx]', l=(1, nv))};
 
                 if (a0 == 0.0)
                     x0[idx] = axn;
@@ -27,8 +49,9 @@ axnpby(int nrow, int ncolb, int ldim, int lsdim,
                     x0[idx] += axn;
                 else
                     x0[idx] = a0*x0[idx] + axn;
-            }
             % endfor
+            }
         }
     }
+    #undef X_IDX_AOSOA
 }
diff --git a/pyfr/backends/openmp/kernels/base.mako b/pyfr/backends/openmp/kernels/base.mako
index 7cd4d75..2980406 100644
--- a/pyfr/backends/openmp/kernels/base.mako
+++ b/pyfr/backends/openmp/kernels/base.mako
@@ -6,6 +6,7 @@
 #include <tgmath.h>
 
 #define PYFR_ALIGN_BYTES ${alignb}
+#define SOA_SZ ${soasz}
 
 #define min(a, b) ((a) < (b) ? (a) : (b))
 #define max(a, b) ((a) > (b) ? (a) : (b))
diff --git a/pyfr/backends/openmp/kernels/pack.mako b/pyfr/backends/openmp/kernels/pack.mako
index 2ccf3d1..ba72df5 100644
--- a/pyfr/backends/openmp/kernels/pack.mako
+++ b/pyfr/backends/openmp/kernels/pack.mako
@@ -5,7 +5,6 @@ void
 pack_view(int n, int nrv, int ncv,
           const fpdtype_t *__restrict__ v,
           const int *__restrict__ vix,
-          const int *__restrict__ vcstri,
           const int *__restrict__ vrstri,
           fpdtype_t *__restrict__  pmat)
 {
@@ -15,11 +14,11 @@ pack_view(int n, int nrv, int ncv,
     else if (nrv == 1)
         for (int i = 0; i < n; i++)
             for (int c = 0; c < ncv; c++)
-                pmat[c*n + i] = v[vix[i] + vcstri[i]*c];
+                pmat[c*n + i] = v[vix[i] + SOA_SZ*c];
     else
         for (int i = 0; i < n; i++)
             for (int r = 0; r < nrv; r++)
                 for (int c = 0; c < ncv; c++)
-                    pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r
-                                                + vcstri[i]*c];
+                    pmat[(r*ncv + c)*n + i] = v[vix[i] + vrstri[i]*r +
+                                                SOA_SZ*c];
 }
diff --git a/pyfr/backends/openmp/packing.py b/pyfr/backends/openmp/packing.py
index ef916c4..ed1548a 100644
--- a/pyfr/backends/openmp/packing.py
+++ b/pyfr/backends/openmp/packing.py
@@ -14,12 +14,12 @@ class OpenMPPackingKernels(OpenMPKernelProvider, BasePackingKernels):
         src = self.backend.lookup.get_template('pack').render()
 
         # Build
-        kern = self._build_kernel('pack_view', src, 'iiiPPPPP')
+        kern = self._build_kernel('pack_view', src, 'iiiPPPP')
 
         class PackXchgViewKernel(ComputeKernel):
             def run(self, queue):
                 kern(v.n, v.nvrow, v.nvcol, v.basedata, v.mapping,
-                     v.cstrides or 0, v.rstrides or 0, m)
+                     v.rstrides or 0, m)
 
         return PackXchgViewKernel()
 
diff --git a/pyfr/backends/openmp/types.py b/pyfr/backends/openmp/types.py
index a2726aa..a888a65 100644
--- a/pyfr/backends/openmp/types.py
+++ b/pyfr/backends/openmp/types.py
@@ -9,7 +9,8 @@ class OpenMPMatrixBase(base.MatrixBase):
         self.basedata = basedata.ctypes.data
 
         self.data = basedata[offset:offset + self.nrow*self.pitch]
-        self.data = self.data.view(self.dtype).reshape(self.datashape)
+        self.data = self.data.view(self.dtype)
+        self.data = self.data.reshape(self.nrow, self.leaddim)
 
         self.offset = offset
 
@@ -24,12 +25,10 @@ class OpenMPMatrixBase(base.MatrixBase):
         del self._initval
 
     def _get(self):
-        # Trim any padding in the final dimension
-        return self.data[...,:self.ioshape[-1]]
+        return self._unpack(self.data[:, :self.ncol])
 
     def _set(self, ary):
-        # Assign
-        self.data[...,:ary.shape[-1]] = ary
+        self.data[:, :self.ncol] = self._pack(ary)
 
 
 class OpenMPMatrix(OpenMPMatrixBase, base.Matrix):
@@ -41,11 +40,7 @@ class OpenMPMatrix(OpenMPMatrixBase, base.Matrix):
 class OpenMPMatrixRSlice(base.MatrixRSlice):
     @lazyprop
     def data(self):
-        # Since slices do not retain any information about the
-        # high-order structure of an array it is fine to compact mat
-        # down to two dimensions and simply slice this
-        mat = self.parent
-        return mat.data.reshape(mat.nrow, mat.leaddim)[self.p:self.q]
+        return self.parent.data[self.p:self.q]
 
     @lazyprop
     def _as_parameter_(self):
diff --git a/pyfr/solvers/base/elements.py b/pyfr/solvers/base/elements.py
index 2bab10a..4b453f2 100644
--- a/pyfr/solvers/base/elements.py
+++ b/pyfr/solvers/base/elements.py
@@ -347,17 +347,16 @@ class BaseElements(object, metaclass=ABCMeta):
         nfp = self.nfacefpts[fidx]
 
         rcmap = [(fpidx, eidx) for fpidx in self._srtd_face_fpts[fidx][eidx]]
-        cstri = ((self._scal_fpts.leadsubdim,),)*nfp
 
-        return (self._scal_fpts.mid,)*nfp, rcmap, cstri
+        return (self._scal_fpts.mid,)*nfp, rcmap
 
     def get_vect_fpts_for_inter(self, eidx, fidx):
         nfp = self.nfacefpts[fidx]
 
         rcmap = [(fpidx, eidx) for fpidx in self._srtd_face_fpts[fidx][eidx]]
-        rcstri = ((self.nfpts, self._vect_fpts.leadsubdim),)*nfp
+        rstri = (self.nfpts,)*nfp
 
-        return (self._vect_fpts.mid,)*nfp, rcmap, rcstri
+        return (self._vect_fpts.mid,)*nfp, rcmap, rstri
 
     def get_ploc_for_inter(self, eidx, fidx):
         fpts_idx = self._srtd_face_fpts[fidx][eidx]

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/pyfr.git