[arrayfire] 98/284: moved the left over fns to cpu kernel namespace

Sun Feb 7 18:59:23 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.

commit 7d7f32ffd165f952e85cfe8d711ba147afbbe65d
Author: pradeep <pradeep at arrayfire.com>
Date:   Sat Dec 19 16:04:37 2015 -0500

    moved the left over fns to cpu kernel namespace
---
 .../nearest_neighbour.hpp}                         |  71 +--
 src/backend/cpu/{orb.cpp => kernel/orb.hpp}        | 308 +-----------
 src/backend/cpu/{random.cpp => kernel/random.hpp}  |  98 +---
 src/backend/cpu/kernel/range.hpp                   |  52 +++
 src/backend/cpu/kernel/reduce.hpp                  |  71 +++
 src/backend/cpu/kernel/regions.hpp                 | 194 ++++++++
 .../cpu/{reorder.cpp => kernel/reorder.hpp}        |  44 +-
 src/backend/cpu/{resize.cpp => kernel/resize.hpp}  |  55 +--
 src/backend/cpu/{rotate.cpp => kernel/rotate.hpp}  |  56 +--
 src/backend/cpu/{scan.cpp => kernel/scan.hpp}      |  61 +--
 src/backend/cpu/kernel/select.hpp                  | 124 +++++
 src/backend/cpu/kernel/shift.hpp                   |  69 +++
 src/backend/cpu/{ => kernel}/sift_nonfree.hpp      |   0
 src/backend/cpu/{sobel.cpp => kernel/sobel.hpp}    |  47 +-
 src/backend/cpu/kernel/sort.hpp                    |  51 ++
 .../{sort_by_key.cpp => kernel/sort_by_key.hpp}    |  80 +---
 .../cpu/{sort_index.cpp => kernel/sort_index.hpp}  |  59 +--
 src/backend/cpu/{susan.cpp => kernel/susan.hpp}    |  77 +--
 src/backend/cpu/kernel/tile.hpp                    |  55 +++
 .../cpu/{transform.cpp => kernel/transform.hpp}    |  60 +--
 .../cpu/{transpose.cpp => kernel/transpose.hpp}    |  70 +--
 src/backend/cpu/kernel/triangle.hpp                |  61 +++
 src/backend/cpu/{unwrap.cpp => kernel/unwrap.hpp}  |  57 +--
 src/backend/cpu/{wrap.cpp => kernel/wrap.hpp}      |  55 +--
 src/backend/cpu/nearest_neighbour.cpp              | 131 +-----
 src/backend/cpu/orb.cpp                            | 520 +--------------------
 src/backend/cpu/random.cpp                         | 176 ++-----
 src/backend/cpu/range.cpp                          |  46 +-
 src/backend/cpu/reduce.cpp                         |  60 +--
 src/backend/cpu/regions.cpp                        | 175 +------
 src/backend/cpu/reorder.cpp                        |  39 +-
 src/backend/cpu/resize.cpp                         | 166 +------
 src/backend/cpu/rotate.cpp                         |  71 +--
 src/backend/cpu/scan.cpp                           |  61 +--
 src/backend/cpu/select.cpp                         | 103 +---
 src/backend/cpu/shift.cpp                          |  52 +--
 src/backend/cpu/sift.cpp                           |   2 +-
 src/backend/cpu/sobel.cpp                          |  71 +--
 src/backend/cpu/sort.cpp                           |  44 +-
 src/backend/cpu/sort_by_key.cpp                    |  83 +---
 src/backend/cpu/sort_index.cpp                     |  61 +--
 src/backend/cpu/susan.cpp                          |  84 +---
 src/backend/cpu/tile.cpp                           |  38 +-
 src/backend/cpu/transform.cpp                      |  93 +---
 src/backend/cpu/transform_interp.hpp               |   2 +
 src/backend/cpu/transpose.cpp                      | 115 +----
 src/backend/cpu/triangle.cpp                       |  42 +-
 src/backend/cpu/unwrap.cpp                         |  67 +--
 src/backend/cpu/wrap.cpp                           |  66 +--
 49 files changed, 883 insertions(+), 3360 deletions(-)

diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/kernel/nearest_neighbour.hpp
similarity index 56%
copy from src/backend/cpu/nearest_neighbour.cpp
copy to src/backend/cpu/kernel/nearest_neighbour.hpp
index b6f50c2..4916463 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/kernel/nearest_neighbour.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,19 +7,14 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/dim4.hpp>
+#pragma once
 #include <af/defines.h>
-#include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
 
 namespace cpu
 {
+namespace kernel
+{
 
 #if defined(_WIN32) || defined(_MSC_VER)
 
@@ -92,9 +87,9 @@ struct dist_op<ushort, To, AF_SHD>
 };
 
 template<typename T, typename To, af_match_type dist_type>
-void nearest_neighbour_(Array<uint> idx, Array<To> dist,
-                        const Array<T> query, const Array<T> train,
-                        const uint dist_dim, const uint n_dist)
+void nearest_neighbour(Array<uint> idx, Array<To> dist,
+                       const Array<T> query, const Array<T> train,
+                       const uint dist_dim, const uint n_dist)
 {
     uint sample_dim = (dist_dim == 0) ? 1 : 0;
     const dim4 qDims = query.dims();
@@ -144,57 +139,5 @@ void nearest_neighbour_(Array<uint> idx, Array<To> dist,
     }
 }
 
-template<typename T, typename To>
-void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
-                       const Array<T>& query, const Array<T>& train,
-                       const uint dist_dim, const uint n_dist,
-                       const af_match_type dist_type)
-{
-    if (n_dist > 1) {
-        CPU_NOT_SUPPORTED();
-    }
-
-    query.eval();
-    train.eval();
-
-    uint sample_dim  = (dist_dim == 0) ? 1 : 0;
-    const dim4 qDims = query.dims();
-    const dim4 outDims(n_dist, qDims[sample_dim]);
-
-    idx  = createEmptyArray<uint>(outDims);
-    dist = createEmptyArray<To  >(outDims);
-
-    switch(dist_type) {
-        case AF_SAD:
-            getQueue().enqueue(nearest_neighbour_<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
-            break;
-        case AF_SSD:
-            getQueue().enqueue(nearest_neighbour_<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
-            break;
-        case AF_SHD:
-            getQueue().enqueue(nearest_neighbour_<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
-            break;
-        default:
-            AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
-    }
 }
-
-#define INSTANTIATE(T, To)                                                              \
-    template void nearest_neighbour<T, To>(Array<uint>& idx, Array<To>& dist,           \
-                                         const Array<T>& query, const Array<T>& train,  \
-                                         const uint dist_dim, const uint n_dist,        \
-                                         const af_match_type dist_type);
-
-INSTANTIATE(float , float)
-INSTANTIATE(double, double)
-INSTANTIATE(int   , int)
-INSTANTIATE(uint  , uint)
-INSTANTIATE(intl  , intl)
-INSTANTIATE(uintl , uintl)
-INSTANTIATE(uchar , uint)
-INSTANTIATE(ushort, uint)
-INSTANTIATE(short , int)
-
-INSTANTIATE(uintl , uint)    // For Hamming
-
 }
diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/kernel/orb.hpp
similarity index 52%
copy from src/backend/cpu/orb.cpp
copy to src/backend/cpu/kernel/orb.hpp
index 4b6629c..acd508c 100644
--- a/src/backend/cpu/orb.cpp
+++ b/src/backend/cpu/kernel/orb.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,27 +7,15 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/dim4.hpp>
+#pragma once
 #include <af/defines.h>
-#include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-#include <resize.hpp>
-#include <fast.hpp>
-#include <sort_index.hpp>
-#include <convolve.hpp>
-#include <memory.hpp>
-#include <cstring>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
+#include <utility.hpp>
 
 namespace cpu
 {
-
-static const float PI_VAL = 3.14159265358979323846f;
+namespace kernel
+{
 
 // Reference pattern, generated for a patch size of 31x31, as suggested by
 // original ORB paper
@@ -299,24 +287,6 @@ const int ref_pat[REF_PAT_LENGTH] = {
 };
 
 template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
 void keep_features(
     float* x_out,
     float* y_out,
@@ -535,273 +505,5 @@ void extract_orb(
 
 
 
-template<typename T, typename convAccT>
-unsigned orb(Array<float> &x, Array<float> &y,
-             Array<float> &score, Array<float> &ori,
-             Array<float> &size, Array<uint> &desc,
-             const Array<T>& image,
-             const float fast_thr, const unsigned max_feat,
-             const float scl_fctr, const unsigned levels,
-             const bool blur_img)
-{
-    image.eval();
-    getQueue().sync();
-
-    unsigned patch_size = REF_PAT_SIZE;
-
-    const af::dim4 idims = image.dims();
-    unsigned min_side = std::min(idims[0], idims[1]);
-    unsigned max_levels = 0;
-    float scl_sum = 0.f;
-
-    for (unsigned i = 0; i < levels; i++) {
-        min_side /= scl_fctr;
-
-        // Minimum image side for a descriptor to be computed
-        if (min_side < patch_size || max_levels == levels) break;
-
-        max_levels++;
-        scl_sum += 1.f / (float)std::pow(scl_fctr,(float)i);
-    }
-
-    std::vector<float*> h_x_pyr(max_levels);
-    std::vector<float*> h_y_pyr(max_levels);
-    std::vector<float*> h_score_pyr(max_levels);
-    std::vector<float*> h_ori_pyr(max_levels);
-    std::vector<float*> h_size_pyr(max_levels);
-    std::vector<unsigned*> h_desc_pyr(max_levels);
-
-    std::vector<unsigned> feat_pyr(max_levels);
-    unsigned total_feat = 0;
-
-    // Compute number of features to keep for each level
-    std::vector<unsigned> lvl_best(max_levels);
-    unsigned feat_sum = 0;
-    for (unsigned i = 0; i < max_levels-1; i++) {
-        float lvl_scl = (float)std::pow(scl_fctr,(float)i);
-        lvl_best[i] = ceil((max_feat / scl_sum) / lvl_scl);
-        feat_sum += lvl_best[i];
-    }
-    lvl_best[max_levels-1] = max_feat - feat_sum;
-
-    // Maintain a reference to previous level image
-    Array<T> prev_img = createEmptyArray<T>(af::dim4());
-    af::dim4 prev_ldims;
-
-    af::dim4 gauss_dims(9);
-    T* h_gauss = nullptr;
-    Array<T> gauss_filter = createEmptyArray<T>(af::dim4());
-
-    for (unsigned i = 0; i < max_levels; i++) {
-        af::dim4 ldims;
-        const float lvl_scl = (float)std::pow(scl_fctr,(float)i);
-        Array<T> lvl_img = createEmptyArray<T>(af::dim4());
-
-        if (i == 0) {
-            // First level is used in its original size
-            lvl_img = image;
-            ldims = image.dims();
-
-            prev_img = image;
-            prev_ldims = image.dims();
-        }
-        else {
-            // Resize previous level image to current level dimensions
-            ldims[0] = round(idims[0] / lvl_scl);
-            ldims[1] = round(idims[1] / lvl_scl);
-
-            lvl_img = resize<T>(prev_img, ldims[0], ldims[1], AF_INTERP_BILINEAR);
-            lvl_img.eval();
-            getQueue().sync();
-
-            prev_img = lvl_img;
-            prev_ldims = lvl_img.dims();
-        }
-
-
-        Array<float> x_feat = createEmptyArray<float>(dim4());
-        Array<float> y_feat = createEmptyArray<float>(dim4());
-        Array<float> score_feat = createEmptyArray<float>(dim4());
-
-        // Round feature size to nearest odd integer
-        float size = 2.f * floor(patch_size / 2.f) + 1.f;
-
-        // Avoid keeping features that might be too wide and might not fit on
-        // the image, sqrt(2.f) is the radius when angle is 45 degrees and
-        // represents widest case possible
-        unsigned edge = ceil(size * sqrt(2.f) / 2.f);
-
-        unsigned lvl_feat = fast(x_feat, y_feat, score_feat,
-                                 lvl_img, fast_thr, 9, 1, 0.15f, edge);
-        x_feat.eval();
-        y_feat.eval();
-        score_feat.eval();
-        getQueue().sync();
-
-        if (lvl_feat == 0) {
-            continue;
-        }
-
-        float* h_x_feat = x_feat.get();
-        float* h_y_feat = y_feat.get();
-
-        float* h_x_harris = memAlloc<float>(lvl_feat);
-        float* h_y_harris = memAlloc<float>(lvl_feat);
-        float* h_score_harris = memAlloc<float>(lvl_feat);
-
-        // Calculate Harris responses
-        // Good block_size >= 7 (must be an odd number)
-        unsigned usable_feat = 0;
-        harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
-                                  h_x_feat, h_y_feat, nullptr,
-                                  lvl_feat, &usable_feat,
-                                  lvl_img,
-                                  7, 0.04f, patch_size);
-
-        if (usable_feat == 0) {
-            memFree(h_x_harris);
-            memFree(h_y_harris);
-            memFree(h_score_harris);
-            continue;
-        }
-
-        // Sort features according to Harris responses
-        af::dim4 usable_feat_dims(usable_feat);
-        Array<float> score_harris = createDeviceDataArray<float>(usable_feat_dims, h_score_harris);
-        Array<float> harris_sorted = createEmptyArray<float>(af::dim4());
-        Array<unsigned> harris_idx = createEmptyArray<unsigned>(af::dim4());
-
-        sort_index<float, false>(harris_sorted, harris_idx, score_harris, 0);
-        harris_sorted.eval();
-        harris_idx.eval();
-        getQueue().sync();
-
-        usable_feat = std::min(usable_feat, lvl_best[i]);
-
-        if (usable_feat == 0) {
-            memFree(h_x_harris);
-            memFree(h_y_harris);
-            continue;
-        }
-
-        float* h_x_lvl = memAlloc<float>(usable_feat);
-        float* h_y_lvl = memAlloc<float>(usable_feat);
-        float* h_score_lvl = memAlloc<float>(usable_feat);
-
-        // Keep only features with higher Harris responses
-        keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
-                         h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(),
-                         nullptr, usable_feat);
-
-        memFree(h_x_harris);
-        memFree(h_y_harris);
-
-        float* h_ori_lvl = memAlloc<float>(usable_feat);
-        float* h_size_lvl = memAlloc<float>(usable_feat);
-
-        // Compute orientation of features
-        centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
-                          lvl_img, patch_size);
-
-        Array<T> lvl_filt = createEmptyArray<T>(dim4());
-
-        if (blur_img) {
-            // Calculate a separable Gaussian kernel, if one is not already stored
-            if (!h_gauss) {
-                h_gauss = memAlloc<T>(gauss_dims[0]);
-                gaussian1D(h_gauss, gauss_dims[0], 2.f);
-                gauss_filter = createDeviceDataArray<T>(gauss_dims, h_gauss);
-            }
-
-            // Filter level image with Gaussian kernel to reduce noise sensitivity
-            lvl_filt = convolve2<T, convAccT, false>(lvl_img, gauss_filter, gauss_filter);
-        }
-        lvl_filt.eval();
-        getQueue().sync();
-
-        // Compute ORB descriptors
-        unsigned* h_desc_lvl = memAlloc<unsigned>(usable_feat * 8);
-        memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned));
-        if (blur_img)
-            extract_orb<T>(h_desc_lvl, usable_feat,
-                           h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
-                           lvl_filt, lvl_scl, patch_size);
-        else
-            extract_orb<T>(h_desc_lvl, usable_feat,
-                           h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
-                           lvl_img, lvl_scl, patch_size);
-
-        // Store results to pyramids
-        total_feat += usable_feat;
-        feat_pyr[i] = usable_feat;
-        h_x_pyr[i] = h_x_lvl;
-        h_y_pyr[i] = h_y_lvl;
-        h_score_pyr[i] = h_score_lvl;
-        h_ori_pyr[i] = h_ori_lvl;
-        h_size_pyr[i] = h_size_lvl;
-        h_desc_pyr[i] = h_desc_lvl;
-
-    }
-
-    if (total_feat > 0 ) {
-
-        // Allocate feature Arrays
-        const af::dim4 total_feat_dims(total_feat);
-        const af::dim4 desc_dims(8, total_feat);
-
-        x     = createEmptyArray<float>(total_feat_dims);
-        y     = createEmptyArray<float>(total_feat_dims);
-        score = createEmptyArray<float>(total_feat_dims);
-        ori   = createEmptyArray<float>(total_feat_dims);
-        size  = createEmptyArray<float>(total_feat_dims);
-        desc  = createEmptyArray<uint >(desc_dims);
-
-        float* h_x = x.get();
-        float* h_y = y.get();
-        float* h_score = score.get();
-        float* h_ori = ori.get();
-        float* h_size = size.get();
-
-        unsigned* h_desc = desc.get();
-
-        unsigned offset = 0;
-        for (unsigned i = 0; i < max_levels; i++) {
-            if (feat_pyr[i] == 0)
-                continue;
-
-            if (i > 0)
-                offset += feat_pyr[i-1];
-
-            memcpy(h_x+offset, h_x_pyr[i], feat_pyr[i] * sizeof(float));
-            memcpy(h_y+offset, h_y_pyr[i], feat_pyr[i] * sizeof(float));
-            memcpy(h_score+offset, h_score_pyr[i], feat_pyr[i] * sizeof(float));
-            memcpy(h_ori+offset, h_ori_pyr[i], feat_pyr[i] * sizeof(float));
-            memcpy(h_size+offset, h_size_pyr[i], feat_pyr[i] * sizeof(float));
-
-            memcpy(h_desc+(offset*8), h_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned));
-
-            memFree(h_x_pyr[i]);
-            memFree(h_y_pyr[i]);
-            memFree(h_score_pyr[i]);
-            memFree(h_ori_pyr[i]);
-            memFree(h_size_pyr[i]);
-            memFree(h_desc_pyr[i]);
-        }
-    }
-
-    return total_feat;
 }
-
-#define INSTANTIATE(T, convAccT)                                                        \
-    template unsigned orb<T, convAccT>(Array<float> &x, Array<float> &y,                \
-                                       Array<float> &score, Array<float> &ori,          \
-                                       Array<float> &size, Array<uint> &desc,           \
-                                       const Array<T>& image,                           \
-                                       const float fast_thr, const unsigned max_feat,   \
-                                       const float scl_fctr, const unsigned levels,     \
-                                       const bool blur_img);
-
-INSTANTIATE(float , float )
-INSTANTIATE(double, double)
-
 }
diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/kernel/random.hpp
similarity index 61%
copy from src/backend/cpu/random.cpp
copy to src/backend/cpu/kernel/random.hpp
index 8c83ad6..357cbd2 100644
--- a/src/backend/cpu/random.cpp
+++ b/src/backend/cpu/kernel/random.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,22 +7,20 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
 #include <type_traits>
 #include <random>
 #include <algorithm>
 #include <functional>
 #include <limits>
 #include <type_traits>
-#include <af/array.h>
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <Array.hpp>
-#include <random.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
 
 namespace cpu
 {
+namespace kernel
+{
 
 using namespace std;
 
@@ -76,7 +74,7 @@ static bool is_first = true;
 #define GLOBAL 1
 
 template<typename T>
-void randn_(Array<T> out)
+void randn(Array<T> out)
 {
     static unsigned long long my_seed = 0;
     if (is_first) {
@@ -98,15 +96,7 @@ void randn_(Array<T> out)
 }
 
 template<typename T>
-Array<T> randn(const af::dim4 &dims)
-{
-    Array<T> outArray = createEmptyArray<T>(dims);
-    getQueue().enqueue(randn_<T>, outArray);
-    return outArray;
-}
-
-template<typename T>
-void randu_(Array<T> out)
+void randu(Array<T> out)
 {
     static unsigned long long my_seed = 0;
     if (is_first) {
@@ -128,7 +118,7 @@ void randu_(Array<T> out)
 }
 
 template<>
-void randu_(Array<char> out)
+void randu(Array<char> out)
 {
     static unsigned long long my_seed = 0;
     if (is_first) {
@@ -149,75 +139,5 @@ void randu_(Array<char> out)
     }
 }
 
-template<typename T>
-Array<T> randu(const af::dim4 &dims)
-{
-    Array<T> outArray = createEmptyArray<T>(dims);
-    getQueue().enqueue(randu_<T>, outArray);
-    return outArray;
-}
-
-#define INSTANTIATE_UNIFORM(T)                              \
-    template Array<T>  randu<T>    (const af::dim4 &dims);
-
-INSTANTIATE_UNIFORM(float)
-INSTANTIATE_UNIFORM(double)
-INSTANTIATE_UNIFORM(cfloat)
-INSTANTIATE_UNIFORM(cdouble)
-INSTANTIATE_UNIFORM(int)
-INSTANTIATE_UNIFORM(uint)
-INSTANTIATE_UNIFORM(intl)
-INSTANTIATE_UNIFORM(uintl)
-INSTANTIATE_UNIFORM(uchar)
-INSTANTIATE_UNIFORM(short)
-INSTANTIATE_UNIFORM(ushort)
-
-#define INSTANTIATE_NORMAL(T)                              \
-    template Array<T>  randn<T>(const af::dim4 &dims);
-
-INSTANTIATE_NORMAL(float)
-INSTANTIATE_NORMAL(double)
-INSTANTIATE_NORMAL(cfloat)
-INSTANTIATE_NORMAL(cdouble)
-
-template<>
-Array<char> randu(const af::dim4 &dims)
-{
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = urand<float>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = urand<float>(generator);
-        my_seed = gen_seed;
-    }
-
-    Array<char> outArray = createEmptyArray<char>(dims);
-    char *outPtr = outArray.get();
-    for (int i = 0; i < (int)outArray.elements(); i++) {
-        outPtr[i] = gen() > 0.5;
-    }
-    return outArray;
-}
-
-void setSeed(const uintl seed)
-{
-    auto f = [=](const uintl seed){
-        generator.seed(seed);
-        is_first = false;
-        gen_seed = seed;
-    };
-    getQueue().enqueue(f, seed);
 }
-
-uintl getSeed()
-{
-    getQueue().sync();
-    return gen_seed;
-}
-
 }
diff --git a/src/backend/cpu/kernel/range.hpp b/src/backend/cpu/kernel/range.hpp
new file mode 100644
index 0000000..b244a19
--- /dev/null
+++ b/src/backend/cpu/kernel/range.hpp
@@ -0,0 +1,52 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, int dim>   // T: element type; dim: axis whose coordinate is stored
+void range(Array<T> output)     // Fills output's buffer in place; nothing is returned
+{
+    T* out = output.get();
+
+    const dim4 dims = output.dims();
+    const dim4 strides = output.strides();
+
+    for(dim_t w = 0; w < dims[3]; w++) {
+        dim_t offW = w * strides[3];                   // offset contributed by axis 3
+        for(dim_t z = 0; z < dims[2]; z++) {
+            dim_t offWZ = offW + z * strides[2];       // ... plus axis 2
+            for(dim_t y = 0; y < dims[1]; y++) {
+                dim_t offWZY = offWZ + y * strides[1]; // ... plus axis 1
+                for(dim_t x = 0; x < dims[0]; x++) {
+                    dim_t id = offWZY + x;  // axis 0 is stepped by 1; strides[0] is not consulted
+                    if(dim == 0) {          // dim is a template constant, so one branch applies per instantiation
+                        out[id] = x;        // coordinate is converted from dim_t to T on assignment
+                    } else if(dim == 1) {
+                        out[id] = y;
+                    } else if(dim == 2) {
+                        out[id] = z;
+                    } else if(dim == 3) {
+                        out[id] = w;
+                    }                       // any other dim value leaves out[id] untouched
+                }
+            }
+        }
+    }
+}
+
+}
+}
+
diff --git a/src/backend/cpu/kernel/reduce.hpp b/src/backend/cpu/kernel/reduce.hpp
new file mode 100644
index 0000000..85119dc
--- /dev/null
+++ b/src/backend/cpu/kernel/reduce.hpp
@@ -0,0 +1,71 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<af_op_t op, typename Ti, typename To, int D>  // D: number of axes still to be walked
+struct reduce_dim  // Recursive functor: loops over output axis D-1 and delegates to reduce_dim<..., D-1>
+{
+    void operator()(Array<To> out, const dim_t outOffset,     // outOffset: element offset into out's buffer
+                    const Array<Ti> in, const dim_t inOffset, // inOffset: element offset into in's buffer
+                    const int dim, bool change_nan, double nanval) // forwarded unchanged to the next level
+    {
+        static const int D1 = D - 1;  // axis handled at this level
+        static reduce_dim<op, Ti, To, D1> reduce_dim_next;  // function-local static functor for the next level
+
+        const af::dim4 ostrides = out.strides();
+        const af::dim4 istrides = in.strides();
+        const af::dim4 odims    = out.dims();
+
+        for (dim_t i = 0; i < odims[D1]; i++) {  // bounds come from the output's extent along axis D1
+            reduce_dim_next(out, outOffset + i * ostrides[D1],  // advance the output offset along axis D1
+                            in, inOffset + i * istrides[D1],    // advance the input offset along the same axis
+                            dim, change_nan, nanval);
+        }
+    }
+};
+
+template<af_op_t op, typename Ti, typename To>
+struct reduce_dim<op, Ti, To, 0>  // Recursion end: folds the input along axis `dim` into one output element
+{
+
+    Transform<Ti, To, op> transform;  // applied to each input element; its result is stored as To
+    Binary<To, op> reduce;            // supplies init() and the combining call used below
+    void operator()(Array<To> out, const dim_t outOffset,     // result is written at out.get() + outOffset
+                    const Array<Ti> in, const dim_t inOffset, // reading starts at in.get() + inOffset
+                    const int dim, bool change_nan, double nanval) // dim: axis of `in` that is folded
+    {
+        const af::dim4 istrides = in.strides();
+        const af::dim4 idims    = in.dims();
+
+        To * const outPtr = out.get() + outOffset;
+        Ti const * const inPtr = in.get() + inOffset;
+        dim_t stride = istrides[dim];  // element step between consecutive entries along `dim`
+
+        To out_val = reduce.init();    // starting value provided by the binary operation
+        for (dim_t i = 0; i < idims[dim]; i++) {
+            To in_val = transform(inPtr[i * stride]);
+            if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;  // optionally substitute nanval where IS_NAN is true
+            out_val = reduce(in_val, out_val);  // new element is passed first, running value second
+        }
+
+        *outPtr = out_val;  // exactly one output element is written per call
+    }
+};
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/regions.hpp b/src/backend/cpu/kernel/regions.hpp
new file mode 100644
index 0000000..863ebc5
--- /dev/null
+++ b/src/backend/cpu/kernel/regions.hpp
@@ -0,0 +1,194 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+class LabelNode  // Disjoint-set node for one label: stores label, set-wide minimum label, rank and parent link
+{
+private:
+    T label;            // label this node was created for; there is no setter
+    T minLabel;         // smallest label recorded for this node's set (updated via setMinLabel)
+    unsigned rank;      // rank value compared by setUnion when merging
+    LabelNode* parent;  // parent link; a node whose parent is itself is a root
+
+public:
+    LabelNode() : label(0), minLabel(0), rank(0), parent(this) { }  // label 0, starts as its own root
+    LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { }  // singleton set for `label`
+
+    T getLabel()  // returns the node's label by value
+    {
+        return label;
+    }
+
+    T getMinLabel()  // returns the stored minimum label by value
+    {
+        return minLabel;
+    }
+
+    LabelNode* getParent()  // returns the parent pointer (equals this for a root)
+    {
+        return parent;
+    }
+
+    unsigned getRank()  // returns the stored rank
+    {
+        return rank;
+    }
+
+    void setMinLabel(T l)  // overwrites the stored minimum label; no comparison is done here
+    {
+        minLabel = l;
+    }
+
+    void setParent(LabelNode* p)  // re-links this node under p; the pointer is not checked
+    {
+        parent = p;
+    }
+
+    void setRank(unsigned r)  // overwrites the stored rank
+    {
+        rank = r;
+    }
+};
+
+template<typename T>
+static LabelNode<T>* find(LabelNode<T>* x)  // Returns the root of x's set, following parent links recursively
+{
+    if (x->getParent() != x)                 // x is not a root
+        x->setParent(find(x->getParent()));  // path compression: re-link x directly to the root
+    return x->getParent();                   // after the step above this is the root (x itself if x is a root)
+}
+
+template<typename T>
+static void setUnion(LabelNode<T>* x, LabelNode<T>* y)  // Merges the sets containing x and y; no-op if already joined
+{
+    LabelNode<T>* xRoot = find(x);
+    LabelNode<T>* yRoot = find(y);
+    if (xRoot == yRoot)  // same root: already in one set
+        return;
+
+    T xMinLabel = xRoot->getMinLabel();
+    T yMinLabel = yRoot->getMinLabel();
+    xRoot->setMinLabel(min(xMinLabel, yMinLabel));  // both roots get the smaller minimum, so whichever
+    yRoot->setMinLabel(min(xMinLabel, yMinLabel));  // one ends up as the merged root carries it
+
+    if (xRoot->getRank() < yRoot->getRank())       // lower-rank root is attached under the higher-rank one
+        xRoot->setParent(yRoot);
+    else if (xRoot->getRank() > yRoot->getRank())
+        yRoot->setParent(xRoot);
+    else {                                         // equal ranks: y goes under x ...
+        yRoot->setParent(xRoot);
+        xRoot->setRank(xRoot->getRank() + 1);      // ... and x's rank is incremented
+    }
+}
+
+template<typename T>  // T: label type written to `out`
+void regions(Array<T> out, const Array<char> in, af_connectivity connectivity)  // Labels groups of neighbouring non-zero pixels of `in`
+{   // Three passes: provisional labels + label merging, resolve to set minimum, renumber to close gaps
+    const af::dim4 in_dims = in.dims();  // only in_dims[0] and in_dims[1] are used below
+    const char *in_ptr  = in.get();
+    T    *out_ptr = out.get();
+
+    // Map from provisional label to its disjoint-set node
+    typedef typename std::map<T, LabelNode<T>* > label_map_t;
+    typedef typename label_map_t::iterator label_map_iterator_t;
+
+    label_map_t lmap;
+
+    // Initial label (0 is never assigned to a non-zero pixel)
+    T label = (T)1;
+
+    for (int j = 0; j < (int)in_dims[1]; j++) {      // pass 1: j walks axis 1
+        for (int i = 0; i < (int)in_dims[0]; i++) {  // i walks axis 0
+            int idx = j * in_dims[0] + i;  // linear index with in_dims[0] as row length; strides are not consulted
+            if (in_ptr[idx] != 0) {        // zero input pixels are skipped and their out entries never written
+                std::vector<T> l;          // labels of already-visited neighbours
+
+                // Test neighbors (reads out_ptr at earlier positions; NOTE(review): presumably `out` is zero-filled by the caller - confirm)
+                if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0)      // neighbour at (i-1, j)
+                    l.push_back(out_ptr[j * in_dims[0] + i-1]);
+                if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0)    // neighbour at (i, j-1)
+                    l.push_back(out_ptr[(j-1) * in_dims[0] + i]);
+                if (connectivity == AF_CONNECTIVITY_8 && i > 0 &&         // diagonal (i-1, j-1), 8-connectivity only
+                        j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0)
+                    l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]);
+                if (connectivity == AF_CONNECTIVITY_8 &&                  // diagonal (i+1, j-1), 8-connectivity only
+                        i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0)  // note: tests != 0 where the others test > 0
+                    l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]);
+
+                if (!l.empty()) {
+                    T minl = l[0];
+                    for (size_t k = 0; k < l.size(); k++) {
+                        minl = min(l[k], minl);
+                        label_map_iterator_t cur_map = lmap.find(l[k]);
+                        LabelNode<T> *node = cur_map->second;  // iterator is dereferenced without an end() check
+                        // Group labels of the same region under a disjoint set
+                        for (size_t m = k+1; m < l.size(); m++)
+                            setUnion(node, lmap.find(l[m])->second);
+                    }
+                    // Set label to smallest neighbor label
+                    out_ptr[idx] = minl;
+                }
+                else {
+                    // Insert new label in map
+                    LabelNode<T> *node = new LabelNode<T>(label);  // NOTE(review): no matching delete appears in this function
+                    lmap.insert(std::pair<T, LabelNode<T>* >(label, node));
+                    out_ptr[idx] = label++;  // assign the current label, then advance the counter
+                }
+            }
+        }
+    }
+
+    std::set<T> removed;  // provisional labels that were merged into a set with a smaller label
+
+    for (int j = 0; j < (int)in_dims[1]; j++) {      // pass 2: replace each label by its set's minimum
+        for (int i = 0; i < (int)in_dims[0]; i++) {
+            int idx = j * (int)in_dims[0] + i;
+            if (in_ptr[idx] != 0) {
+                T l = out_ptr[idx];
+                label_map_iterator_t cur_map = lmap.find(l);
+
+                if (cur_map != lmap.end()) {
+                    LabelNode<T>* node = cur_map->second;
+
+                    LabelNode<T>* node_root = find(node);
+                    out_ptr[idx] = node_root->getMinLabel();  // the root holds the minimum label of the merged set
+
+                    // Mark removed labels (those that are part of a region
+                    // that contains a smaller label)
+                    if (node->getMinLabel() < l || node_root->getMinLabel() < l)
+                        removed.insert(l);
+                    if (node->getLabel() > node->getMinLabel())
+                        removed.insert(node->getLabel());
+                }
+            }
+        }
+    }
+
+    // Calculate final labels (ensure final labels are sequential)
+    for (int j = 0; j < (int)in_dims[1]; j++) {      // pass 3: close the gaps left by removed labels
+        for (int i = 0; i < (int)in_dims[0]; i++) {
+            int idx = j * (int)in_dims[0] + i;
+            if (out_ptr[idx] > 0) {
+                out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx]));  // subtract the count of removed labels smaller than this one
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/kernel/reorder.hpp
similarity index 57%
copy from src/backend/cpu/reorder.cpp
copy to src/backend/cpu/kernel/reorder.hpp
index 1ad7dad..c10c96e 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/kernel/reorder.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,18 +7,17 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <reorder.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<typename T>
-void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
+void reorder(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
 {
     T* outPtr = out.get();
     const T* inPtr = in.get();
@@ -51,35 +50,6 @@ void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::d
     }
 }
 
-template<typename T>
-Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
-{
-    in.eval();
-
-    const af::dim4 iDims = in.dims();
-    af::dim4 oDims(0);
-    for(int i = 0; i < 4; i++)
-        oDims[i] = iDims[rdims[i]];
-
-    Array<T> out = createEmptyArray<T>(oDims);
-    getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
-    return out;
 }
-
-#define INSTANTIATE(T)                                                         \
-    template Array<T> reorder<T>(const Array<T> &in, const af::dim4 &rdims);  \
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
+
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/kernel/resize.hpp
similarity index 79%
copy from src/backend/cpu/resize.cpp
copy to src/backend/cpu/kernel/resize.hpp
index 8fb2edc..19d7ec7 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/kernel/resize.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,18 +7,14 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <resize.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <math.hpp>
-#include <types.hpp>
-#include <af/traits.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
 
 namespace cpu
 {
+namespace kernel
+{
 
 /**
  * noop function for round to avoid compilation
@@ -160,7 +156,7 @@ struct resize_op<T, AF_INTERP_LOWER>
 };
 
 template<typename T, af_interp_type method>
-void resize_(Array<T> out, const Array<T> in)
+void resize(Array<T> out, const Array<T> in)
 {
     af::dim4 idims    = in.dims();
     af::dim4 odims    = out.dims();
@@ -177,44 +173,5 @@ void resize_(Array<T> out, const Array<T> in)
     }
 }
 
-template<typename T>
-Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
-                const af_interp_type method)
-{
-    af::dim4 idims = in.dims();
-    af::dim4 odims(odim0, odim1, idims[2], idims[3]);
-    // Create output placeholder
-    Array<T> out = createValueArray(odims, (T)0);
-    out.eval();
-    in.eval();
-
-    switch(method) {
-        case AF_INTERP_NEAREST:
-            getQueue().enqueue(resize_<T, AF_INTERP_NEAREST>, out, in); break;
-        case AF_INTERP_BILINEAR:
-            getQueue().enqueue(resize_<T, AF_INTERP_BILINEAR>, out, in); break;
-        case AF_INTERP_LOWER:
-            getQueue().enqueue(resize_<T, AF_INTERP_LOWER>, out, in); break;
-        default: break;
-    }
-    return out;
 }
-
-#define INSTANTIATE(T)                                                                     \
-    template Array<T> resize<T> (const Array<T> &in, const dim_t odim0, const dim_t odim1, \
-                                 const af_interp_type method);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/kernel/rotate.hpp
similarity index 61%
copy from src/backend/cpu/rotate.cpp
copy to src/backend/cpu/kernel/rotate.hpp
index 5687d69..6e4f758 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/kernel/rotate.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,20 +7,19 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <rotate.hpp>
 #include <math.hpp>
-#include <stdexcept>
 #include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-#include "transform_interp.hpp"
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<typename T, af_interp_type method>
-void rotate_(Array<T> output, const Array<T> input, const float theta)
+void rotate(Array<T> output, const Array<T> input, const float theta)
 {
     const af::dim4 odims    = output.dims();
     const af::dim4 idims    = input.dims();
@@ -80,48 +79,5 @@ void rotate_(Array<T> output, const Array<T> input, const float theta)
     }
 }
 
-template<typename T>
-Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
-                 const af_interp_type method)
-{
-    in.eval();
-
-    Array<T> out = createEmptyArray<T>(odims);
-
-    switch(method) {
-        case AF_INTERP_NEAREST:
-            getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
-            break;
-        case AF_INTERP_BILINEAR:
-            getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
-            break;
-        case AF_INTERP_LOWER:
-            getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
-            break;
-        default:
-            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-            break;
-    }
-
-    return out;
 }
-
-
-#define INSTANTIATE(T)                                                              \
-    template Array<T> rotate(const Array<T> &in, const float theta,                 \
-                             const af::dim4 &odims, const af_interp_type method);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/kernel/scan.hpp
similarity index 53%
copy from src/backend/cpu/scan.cpp
copy to src/backend/cpu/kernel/scan.hpp
index 39157ca..0bcfe7d 100644
--- a/src/backend/cpu/scan.cpp
+++ b/src/backend/cpu/kernel/scan.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,20 +7,14 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <complex>
-#include <af/dim4.hpp>
+#pragma once
 #include <af/defines.h>
-#include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <scan.hpp>
-#include <ops.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<af_op_t op, typename Ti, typename To, int D>
 struct scan_dim
@@ -74,52 +68,5 @@ struct scan_dim<op, Ti, To, 0>
     }
 };
 
-template<af_op_t op, typename Ti, typename To>
-Array<To> scan(const Array<Ti>& in, const int dim)
-{
-    dim4 dims     = in.dims();
-    Array<To> out = createValueArray<To>(dims, 0);
-    out.eval();
-    in.eval();
-
-    switch (in.ndims()) {
-        case 1:
-            scan_dim<op, Ti, To, 1> func1;
-            getQueue().enqueue(func1, out, 0, in, 0, dim);
-            break;
-        case 2:
-            scan_dim<op, Ti, To, 2> func2;
-            getQueue().enqueue(func2, out, 0, in, 0, dim);
-            break;
-        case 3:
-            scan_dim<op, Ti, To, 3> func3;
-            getQueue().enqueue(func3, out, 0, in, 0, dim);
-            break;
-        case 4:
-            scan_dim<op, Ti, To, 4> func4;
-            getQueue().enqueue(func4, out, 0, in, 0, dim);
-            break;
-    }
-
-    return out;
 }
-
-#define INSTANTIATE(ROp, Ti, To)                                        \
-    template Array<To> scan<ROp, Ti, To>(const Array<Ti> &in, const int dim); \
-
-//accum
-INSTANTIATE(af_add_t, float  , float  )
-INSTANTIATE(af_add_t, double , double )
-INSTANTIATE(af_add_t, cfloat , cfloat )
-INSTANTIATE(af_add_t, cdouble, cdouble)
-INSTANTIATE(af_add_t, int    , int    )
-INSTANTIATE(af_add_t, uint   , uint   )
-INSTANTIATE(af_add_t, intl   , intl   )
-INSTANTIATE(af_add_t, uintl  , uintl  )
-INSTANTIATE(af_add_t, char   , int    )
-INSTANTIATE(af_add_t, uchar  , uint   )
-INSTANTIATE(af_add_t, short  , int    )
-INSTANTIATE(af_add_t, ushort , uint   )
-INSTANTIATE(af_notzero_t, char  , uint)
-
 }
diff --git a/src/backend/cpu/kernel/select.hpp b/src/backend/cpu/kernel/select.hpp
new file mode 100644
index 0000000..1099c7e
--- /dev/null
+++ b/src/backend/cpu/kernel/select.hpp
@@ -0,0 +1,124 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Element-wise selection: out = cond ? a : b for every element of `out`.
+// Along each dimension an input is advanced only when its extent equals the
+// output's extent; otherwise its first entry along that dimension is reused.
+// Offsets are kept in dim_t rather than int so that they are not truncated
+// when an array holds more elements than a 32-bit int can index.
+template<typename T>
+void select(Array<T> out, const Array<char> cond, const Array<T> a, const Array<T> b)
+{
+    const af::dim4 adims    = a.dims();
+    const af::dim4 astrides = a.strides();
+    const af::dim4 bdims    = b.dims();
+    const af::dim4 bstrides = b.strides();
+    const af::dim4 cdims    = cond.dims();
+    const af::dim4 cstrides = cond.strides();
+    const af::dim4 odims    = out.dims();
+    const af::dim4 ostrides = out.strides();
+
+    // Step per index along each dimension: the stride when the extent matches
+    // the output, 0 when the input is broadcast. Dimension 0 is addressed with
+    // a unit step, so only a 0/1 flag is needed there.
+    dim_t astep[4], bstep[4], cstep[4];
+    astep[0] = (adims[0] == odims[0]) ? 1 : 0;
+    bstep[0] = (bdims[0] == odims[0]) ? 1 : 0;
+    cstep[0] = (cdims[0] == odims[0]) ? 1 : 0;
+    for (int d = 1; d < 4; d++) {
+        astep[d] = (adims[d] == odims[d]) ? astrides[d] : 0;
+        bstep[d] = (bdims[d] == odims[d]) ? bstrides[d] : 0;
+        cstep[d] = (cdims[d] == odims[d]) ? cstrides[d] : 0;
+    }
+
+    const T    *aptr = a.get();
+    const T    *bptr = b.get();
+    T          *optr = out.get();
+    const char *cptr = cond.get();
+
+    for (dim_t l = 0; l < odims[3]; l++) {
+        const dim_t o_off3 = ostrides[3] * l;
+        const dim_t a_off3 = astep[3] * l;
+        const dim_t b_off3 = bstep[3] * l;
+        const dim_t c_off3 = cstep[3] * l;
+        for (dim_t k = 0; k < odims[2]; k++) {
+            const dim_t o_off2 = ostrides[2] * k + o_off3;
+            const dim_t a_off2 = astep[2] * k + a_off3;
+            const dim_t b_off2 = bstep[2] * k + b_off3;
+            const dim_t c_off2 = cstep[2] * k + c_off3;
+            for (dim_t j = 0; j < odims[1]; j++) {
+                const dim_t o_off1 = ostrides[1] * j + o_off2;
+                const dim_t a_off1 = astep[1] * j + a_off2;
+                const dim_t b_off1 = bstep[1] * j + b_off2;
+                const dim_t c_off1 = cstep[1] * j + c_off2;
+                for (dim_t i = 0; i < odims[0]; i++) {
+                    // Any non-zero condition byte selects `a`.
+                    const bool cval = cptr[c_off1 + cstep[0] * i];
+                    const T    aval = aptr[a_off1 + astep[0] * i];
+                    const T    bval = bptr[b_off1 + bstep[0] * i];
+                    optr[o_off1 + i] = cval ? aval : bval;
+                }
+            }
+        }
+    }
+}
+
+// Scalar-operand selection: out = (cond != flip) ? a : b for every element
+// of `out`, so `flip` inverts the sense of `cond`. `cond` and `a` are indexed
+// with the output's coordinates; offsets are dim_t to avoid int truncation.
+template<typename T, bool flip>
+void select_scalar(Array<T> out, const Array<char> cond, const Array<T> a, const double b)
+{
+    const af::dim4 astrides = a.strides();
+    const af::dim4 cstrides = cond.strides();
+    const af::dim4 odims    = out.dims();
+    const af::dim4 ostrides = out.strides();
+
+    const T    *aptr = a.get();
+    T          *optr = out.get();
+    const char *cptr = cond.get();
+    // Convert the scalar once so elements taken from `a` are copied as T and
+    // never round-trip through double (which loses precision for 64-bit ints).
+    const T bval = static_cast<T>(b);
+
+    for (dim_t l = 0; l < odims[3]; l++) {
+        const dim_t o_off3 = ostrides[3] * l;
+        const dim_t a_off3 = astrides[3] * l;
+        const dim_t c_off3 = cstrides[3] * l;
+        for (dim_t k = 0; k < odims[2]; k++) {
+            const dim_t o_off2 = ostrides[2] * k + o_off3;
+            const dim_t a_off2 = astrides[2] * k + a_off3;
+            const dim_t c_off2 = cstrides[2] * k + c_off3;
+            for (dim_t j = 0; j < odims[1]; j++) {
+                const dim_t o_off1 = ostrides[1] * j + o_off2;
+                const dim_t a_off1 = astrides[1] * j + a_off2;
+                const dim_t c_off1 = cstrides[1] * j + c_off2;
+                for (dim_t i = 0; i < odims[0]; i++) {
+                    // Normalise to bool before flipping (bytes may not be 0/1).
+                    const bool take_a = (cptr[c_off1 + i] != 0) != flip;
+                    optr[o_off1 + i] = take_a ? aptr[a_off1 + i] : bval;
+                }
+            }
+        }
+    }
+}
+
+
+
+}
+}
diff --git a/src/backend/cpu/kernel/shift.hpp b/src/backend/cpu/kernel/shift.hpp
new file mode 100644
index 0000000..8beb975
--- /dev/null
+++ b/src/backend/cpu/kernel/shift.hpp
@@ -0,0 +1,69 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <cassert>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// One-step wrap-around: subtracts dim once i has reached it and leaves
+// smaller values untouched (no general modulo is performed).
+static inline dim_t simple_mod(const dim_t i, const dim_t dim)
+{ return (i >= dim) ? (i - dim) : i; }
+
+// Circularly shifts `in` by sdims[d] elements along each dimension d and
+// writes the result to `out`; both arrays are walked with out's extents.
+// Offsets are kept in dim_t so that they are not truncated to int.
+template<typename T>
+void shift(Array<T> out, const Array<T> in, const af::dim4 sdims)
+{
+    T*       outPtr = out.get();
+    const T* inPtr  = in.get();
+
+    const af::dim4 oDims = out.dims();
+    const af::dim4 ist   = in.strides();
+    const af::dim4 ost   = out.strides();
+
+    // The loops map each output index back to an input index, so every shift
+    // is converted to a non-negative offset in [0, oDims[d]]: a negative
+    // shift s becomes (-s) % dim, a positive one wraps the other way round.
+    dim_t off[4];
+    for (int d = 0; d < 4; d++) {
+        off[d] = -(sdims[d] % oDims[d]) + oDims[d] * (sdims[d] > 0);
+        assert(off[d] >= 0 && off[d] <= oDims[d]);
+    }
+
+    for (dim_t ow = 0; ow < oDims[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        const dim_t iw = simple_mod(ow + off[3], oDims[3]);
+        const dim_t iW = iw * ist[3];
+        for (dim_t oz = 0; oz < oDims[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            const dim_t iz  = simple_mod(oz + off[2], oDims[2]);
+            const dim_t iZW = iW + iz * ist[2];
+            for (dim_t oy = 0; oy < oDims[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                const dim_t iy   = simple_mod(oy + off[1], oDims[1]);
+                const dim_t iYZW = iZW + iy * ist[1];
+                for (dim_t ox = 0; ox < oDims[0]; ox++) {
+                    const dim_t ix = simple_mod(ox + off[0], oDims[0]);
+                    outPtr[oYZW + ox] = inPtr[iYZW + ix];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/sift_nonfree.hpp b/src/backend/cpu/kernel/sift_nonfree.hpp
similarity index 100%
rename from src/backend/cpu/sift_nonfree.hpp
rename to src/backend/cpu/kernel/sift_nonfree.hpp
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/kernel/sobel.hpp
similarity index 65%
copy from src/backend/cpu/sobel.cpp
copy to src/backend/cpu/kernel/sobel.hpp
index ba47ba9..49d33cd 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/kernel/sobel.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,26 +7,21 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/dim4.hpp>
+#pragma once
 #include <af/defines.h>
-#include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <sobel.hpp>
-#include <convolve.hpp>
-#include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
+#include <cassert>
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<typename Ti, typename To, bool isDX>
 void derivative(Array<To> output, const Array<Ti> input)
 {
-    const dim4 dims    = input.dims();
-    const dim4 strides = input.strides();
+    const af::dim4 dims    = input.dims();
+    const af::dim4 strides = input.strides();
           To* optr     = output.get();
     const Ti* iptr     = input.get();
 
@@ -87,33 +82,5 @@ void derivative(Array<To> output, const Array<Ti> input)
     }
 }
 
-template<typename Ti, typename To>
-std::pair< Array<To>, Array<To> >
-sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
-{
-    img.eval();
-    // ket_size is for future proofing, this argument is not used
-    // currently
-    Array<To> dx = createEmptyArray<To>(img.dims());
-    Array<To> dy = createEmptyArray<To>(img.dims());
-
-    getQueue().enqueue(derivative<Ti, To, true >, dx, img);
-    getQueue().enqueue(derivative<Ti, To, false>, dy, img);
-
-    return std::make_pair(dx, dy);
 }
-
-#define INSTANTIATE(Ti, To)                                               \
-    template std::pair< Array<To>, Array<To> >                            \
-    sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size);
-
-INSTANTIATE(float , float)
-INSTANTIATE(double, double)
-INSTANTIATE(int   , int)
-INSTANTIATE(uint  , int)
-INSTANTIATE(char  , int)
-INSTANTIATE(uchar , int)
-INSTANTIATE(short , int)
-INSTANTIATE(ushort, int)
-
 }
diff --git a/src/backend/cpu/kernel/sort.hpp b/src/backend/cpu/kernel/sort.hpp
new file mode 100644
index 0000000..cba07fa
--- /dev/null
+++ b/src/backend/cpu/kernel/sort.hpp
@@ -0,0 +1,51 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+#include <math.hpp>
+#include <algorithm>
+#include <numeric>
+#include <err_cpu.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Based off of http://stackoverflow.com/a/12399290
+// Sorts `val` in place along dimension 0: each run of val.dims()[0] elements
+// located through the strides of dimensions 1-3 is sorted independently,
+// ascending when isAscending is true and descending otherwise.
+template<typename T, bool isAscending>
+void sort0(Array<T> val)
+{
+    T *val_ptr = val.get();
+    const af::dim4 dims    = val.dims();
+    const af::dim4 strides = val.strides();
+
+    for (dim_t w = 0; w < dims[3]; w++) {
+        const dim_t valW = w * strides[3];
+        for (dim_t z = 0; z < dims[2]; z++) {
+            const dim_t valWZ = valW + z * strides[2];
+            for (dim_t y = 0; y < dims[1]; y++) {
+                T *first = val_ptr + (valWZ + y * strides[1]);
+                T *last  = first + dims[0];
+                // isAscending is a compile-time constant, so the comparator is
+                // handed to std::sort directly; no std::function is needed.
+                if (isAscending) { std::sort(first, last, std::less<T>()); }
+                else             { std::sort(first, last, std::greater<T>()); }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/kernel/sort_by_key.hpp
similarity index 52%
copy from src/backend/cpu/sort_by_key.cpp
copy to src/backend/cpu/kernel/sort_by_key.hpp
index d2ebd42..77713a7 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/kernel/sort_by_key.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,37 +7,26 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <sort_by_key.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <algorithm>
 #include <numeric>
 #include <queue>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
-using std::queue;
-using std::async;
+#include <err_cpu.hpp>
 
 namespace cpu
 {
-
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
+namespace kernel
+{
 
 template<typename Tk, typename Tv, bool isAscending>
 void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
                   const Array<Tk> ikey, const Array<Tv> ival)
 {
-    function<bool(Tk, Tk)> op = greater<Tk>();
-    if(isAscending) { op = less<Tk>(); }
+    function<bool(Tk, Tk)> op = std::greater<Tk>();
+    if(isAscending) { op = std::less<Tk>(); }
 
     // Get pointers and initialize original index locations
         uint *oidx_ptr = oidx.get();
@@ -92,58 +81,5 @@ void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
     return;
 }
 
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename Tk, typename Tv, bool isAscending>
-void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
-           const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
-{
-    ikey.eval();
-    ival.eval();
-
-    okey = createEmptyArray<Tk>(ikey.dims());
-    oval = createEmptyArray<Tv>(ival.dims());
-    Array<uint> oidx = createValueArray(ikey.dims(), 0u);
-    oidx.eval();
-
-    switch(dim) {
-        case 0: getQueue().enqueue(sort0_by_key<Tk, Tv, isAscending>,
-                                   okey, oval, oidx, ikey, ival); break;
-        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-    }
 }
-
-#define INSTANTIATE(Tk, Tv)                                             \
-    template void                                                       \
-    sort_by_key<Tk, Tv, true>(Array<Tk> &okey, Array<Tv> &oval,         \
-                              const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim); \
-    template void                                                       \
-    sort_by_key<Tk, Tv,false>(Array<Tk> &okey, Array<Tv> &oval,         \
-                              const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim); \
-
-#define INSTANTIATE1(Tk)       \
-    INSTANTIATE(Tk, float)     \
-    INSTANTIATE(Tk, double)    \
-    INSTANTIATE(Tk, int)       \
-    INSTANTIATE(Tk, uint)      \
-    INSTANTIATE(Tk, char)      \
-    INSTANTIATE(Tk, uchar)     \
-    INSTANTIATE(Tk, short)     \
-    INSTANTIATE(Tk, ushort)    \
-    INSTANTIATE(Tk, intl)      \
-    INSTANTIATE(Tk, uintl)     \
-
-
-INSTANTIATE1(float)
-INSTANTIATE1(double)
-INSTANTIATE1(int)
-INSTANTIATE1(uint)
-INSTANTIATE1(char)
-INSTANTIATE1(uchar)
-INSTANTIATE1(short)
-INSTANTIATE1(ushort)
-INSTANTIATE1(intl)
-INSTANTIATE1(uintl)
-
 }
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/kernel/sort_index.hpp
similarity index 52%
copy from src/backend/cpu/sort_index.cpp
copy to src/backend/cpu/kernel/sort_index.hpp
index f941534..d2de05a 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/kernel/sort_index.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,35 +7,28 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <sort_index.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <algorithm>
 #include <numeric>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
+#include <err_cpu.hpp>
 
 namespace cpu
 {
+namespace kernel
+{
 
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
 template<typename T, bool isAscending>
-void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
+void sort0_index(Array<T> val, Array<uint> idx, const Array<T> in)
 {
     // initialize original index locations
        uint *idx_ptr = idx.get();
           T *val_ptr = val.get();
     const T *in_ptr  = in.get();
-    function<bool(T, T)> op = greater<T>();
-    if(isAscending) { op = less<T>(); }
+    function<bool(T, T)> op = std::greater<T>();
+    if(isAscending) { op = std::less<T>(); }
 
     std::vector<uint> seq_vec(idx.dims()[0]);
     std::iota(seq_vec.begin(), seq_vec.end(), 0);
@@ -73,39 +66,5 @@ void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
     return;
 }
 
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T, bool isAscending>
-void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
-{
-    in.eval();
-
-    val = createEmptyArray<T>(in.dims());
-    idx = createEmptyArray<uint>(in.dims());
-    switch(dim) {
-        case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
-        default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
-    }
 }
-
-#define INSTANTIATE(T)                                                  \
-    template void sort_index<T, true>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
-                                      const uint dim);                  \
-    template void sort_index<T,false>(Array<T> &val, Array<uint> &idx, const Array<T> &in, \
-                                      const uint dim);                  \
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-//INSTANTIATE(cfloat)
-//INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(char)
-INSTANTIATE(uchar)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-
 }
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/kernel/susan.hpp
similarity index 55%
copy from src/backend/cpu/susan.cpp
copy to src/backend/cpu/kernel/susan.hpp
index c278908..f543967 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/kernel/susan.hpp
@@ -1,25 +1,20 @@
 /*******************************************************
- * Copyright (c) 2015, Arrayfire
- * all rights reserved.
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
  *
- * This file is distributed under 3-clause bsd license.
- * the complete license agreement can be obtained at:
- * http://Arrayfire.com/licenses/bsd-3-clause
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/features.h>
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <cmath>
-#include <math.hpp>
-#include <memory>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::features;
-using std::shared_ptr;
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<typename T>
 void susan_responses(Array<T> output, const Array<T> input,
@@ -100,59 +95,5 @@ void non_maximal(Array<float> xcoords, Array<float> ycoords, Array<float> respon
     }
 }
 
-template<typename T>
-unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
-               const Array<T> &in,
-               const unsigned radius, const float diff_thr, const float geom_thr,
-               const float feature_ratio, const unsigned edge)
-{
-    in.eval();
-
-    dim4 idims = in.dims();
-    const unsigned corner_lim = in.elements() * feature_ratio;
-
-    auto x_corners    = createEmptyArray<float>(dim4(corner_lim));
-    auto y_corners    = createEmptyArray<float>(dim4(corner_lim));
-    auto resp_corners = createEmptyArray<float>(dim4(corner_lim));
-    auto response     = createEmptyArray<T>(dim4(in.elements()));
-    auto corners_found= std::shared_ptr<unsigned>(memAlloc<unsigned>(1), memFree<unsigned>);
-    corners_found.get()[0] = 0;
-
-    getQueue().enqueue(susan_responses<T>, response, in, idims[0], idims[1],
-                       radius, diff_thr, geom_thr, edge);
-    getQueue().enqueue(non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
-                       idims[0], idims[1], response, edge, corner_lim);
-    getQueue().sync();
-
-    const unsigned corners_out = min((corners_found.get())[0], corner_lim);
-    if (corners_out == 0) {
-        x_out    = createEmptyArray<float>(dim4());
-        y_out    = createEmptyArray<float>(dim4());
-        resp_out = createEmptyArray<float>(dim4());
-        return 0;
-    } else {
-        x_out = x_corners;
-        y_out = y_corners;
-        resp_out = resp_corners;
-        x_out.resetDims(dim4(corners_out));
-        y_out.resetDims(dim4(corners_out));
-        resp_out.resetDims(dim4(corners_out));
-        return corners_out;
-    }
 }
-
-#define INSTANTIATE(T) \
-template unsigned susan<T>(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,   \
-                           const Array<T> &in, const unsigned radius, const float diff_thr,     \
-                           const float geom_thr, const float feature_ratio, const unsigned edge);
-
-INSTANTIATE(float )
-INSTANTIATE(double)
-INSTANTIATE(char  )
-INSTANTIATE(int   )
-INSTANTIATE(uint  )
-INSTANTIATE(uchar )
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/kernel/tile.hpp b/src/backend/cpu/kernel/tile.hpp
new file mode 100644
index 0000000..3ad3009
--- /dev/null
+++ b/src/backend/cpu/kernel/tile.hpp
@@ -0,0 +1,55 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+// Fills `out` by repeating `in`: the output element at (ox, oy, oz, ow) is
+// copied from the input element at the same coordinates taken modulo the
+// input extents.
+template<typename T>
+void tile(Array<T> out, const Array<T> in)
+{
+    T*       dst = out.get();
+    const T* src = in.get();
+
+    const af::dim4 srcDims    = in.dims();
+    const af::dim4 dstDims    = out.dims();
+    const af::dim4 srcStrides = in.strides();
+    const af::dim4 dstStrides = out.strides();
+
+    for (dim_t ow = 0; ow < dstDims[3]; ++ow) {
+        const dim_t srcW = (ow % srcDims[3]) * srcStrides[3];
+        const dim_t dstW = ow * dstStrides[3];
+
+        for (dim_t oz = 0; oz < dstDims[2]; ++oz) {
+            const dim_t srcZW = srcW + (oz % srcDims[2]) * srcStrides[2];
+            const dim_t dstZW = dstW + oz * dstStrides[2];
+
+            for (dim_t oy = 0; oy < dstDims[1]; ++oy) {
+                const dim_t srcYZW = srcZW + (oy % srcDims[1]) * srcStrides[1];
+                const dim_t dstYZW = dstZW + oy * dstStrides[1];
+
+                // Dimension 0 is addressed with a unit step on both sides.
+                for (dim_t ox = 0; ox < dstDims[0]; ++ox) {
+                    dst[dstYZW + ox] = src[srcYZW + (ox % srcDims[0])];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/kernel/transform.hpp
similarity index 62%
copy from src/backend/cpu/transform.cpp
copy to src/backend/cpu/kernel/transform.hpp
index a7287ce..d97613a 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/kernel/transform.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,17 +7,15 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <transform.hpp>
-#include <math.hpp>
-#include <stdexcept>
 #include <err_cpu.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-#include "transform_interp.hpp"
 
 namespace cpu
 {
+namespace kernel
+{
 
 template <typename T>
 void calc_affine_inverse(T *txo, const T *txi)
@@ -48,8 +46,8 @@ void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse)
 }
 
 template<typename T, af_interp_type method>
-void transform_(Array<T> output, const Array<T> input,
-                const Array<float> transform, const bool inverse)
+void transform(Array<T> output, const Array<T> input,
+               const Array<float> transform, const bool inverse)
 {
     const af::dim4 idims    = input.dims();
     const af::dim4 odims    = output.dims();
@@ -103,49 +101,5 @@ void transform_(Array<T> output, const Array<T> input,
     }
 }
 
-template<typename T>
-Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
-                    const af_interp_type method, const bool inverse)
-{
-    in.eval();
-    transform.eval();
-
-    Array<T> out = createEmptyArray<T>(odims);
-
-    switch(method) {
-        case AF_INTERP_NEAREST :
-            getQueue().enqueue(transform_<T, AF_INTERP_NEAREST >, out, in, transform, inverse);
-            break;
-        case AF_INTERP_BILINEAR:
-            getQueue().enqueue(transform_<T, AF_INTERP_BILINEAR>, out, in, transform, inverse);
-            break;
-        case AF_INTERP_LOWER   :
-            getQueue().enqueue(transform_<T, AF_INTERP_LOWER   >, out, in, transform, inverse);
-            break;
-        default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break;
-    }
-
-    return out;
 }
-
-
-#define INSTANTIATE(T)                                                              \
-template Array<T> transform(const Array<T> &in, const Array<float> &transform,      \
-                            const af::dim4 &odims, const af_interp_type method,     \
-                            const bool inverse);
-
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/kernel/transpose.hpp
similarity index 65%
copy from src/backend/cpu/transpose.cpp
copy to src/backend/cpu/kernel/transpose.hpp
index 7e7eec1..576de87 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/kernel/transpose.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,30 +7,16 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/dim4.hpp>
+#pragma once
 #include <af/defines.h>
-#include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <transpose.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-#include <utility>
-#include <cassert>
-
-using af::dim4;
+#include <utility.hpp>
+#include <err_cpu.hpp>
 
 namespace cpu
 {
-
-static inline unsigned getIdx(const dim4 &strides,
-        int i, int j = 0, int k = 0, int l = 0)
+namespace kernel
 {
-    return (l * strides[3] +
-            k * strides[2] +
-            j * strides[1] +
-            i );
-}
 
 template<typename T>
 T getConjugate(const T &in)
@@ -52,7 +38,7 @@ cdouble getConjugate(const cdouble &in)
 }
 
 template<typename T, bool conjugate>
-void transpose_(Array<T> output, const Array<T> input)
+void transpose(Array<T> output, const Array<T> input)
 {
     const dim4 odims    = output.dims();
     const dim4 ostrides = output.strides();
@@ -86,24 +72,9 @@ void transpose_(Array<T> output, const Array<T> input)
 }
 
 template<typename T>
-void transpose_(Array<T> out, const Array<T> in, const bool conjugate)
-{
-    return (conjugate ? transpose_<T, true>(out, in) : transpose_<T, false>(out, in));
-}
-
-template<typename T>
-Array<T> transpose(const Array<T> &in, const bool conjugate)
+void transpose(Array<T> out, const Array<T> in, const bool conjugate)
 {
-    in.eval();
-
-    const dim4 inDims  = in.dims();
-    const dim4 outDims = dim4(inDims[1],inDims[0],inDims[2],inDims[3]);
-    // create an array with first two dimensions swapped
-    Array<T> out  = createEmptyArray<T>(outDims);
-
-    getQueue().enqueue(transpose_<T>, out, in, conjugate);
-
-    return out;
+    return (conjugate ? transpose<T, true>(out, in) : transpose<T, false>(out, in));
 }
 
 template<typename T, bool conjugate>
@@ -142,33 +113,10 @@ void transpose_inplace(Array<T> input)
 }
 
 template<typename T>
-void transpose_inplace_(Array<T> in, const bool conjugate)
+void transpose_inplace(Array<T> in, const bool conjugate)
 {
     return (conjugate ? transpose_inplace<T, true >(in) : transpose_inplace<T, false>(in));
 }
 
-template<typename T>
-void transpose_inplace(Array<T> &in, const bool conjugate)
-{
-    in.eval();
-    getQueue().enqueue(transpose_inplace_<T>, in, conjugate);
 }
-
-#define INSTANTIATE(T)                                                      \
-    template Array<T> transpose(const Array<T> &in, const bool conjugate);  \
-    template void transpose_inplace(Array<T> &in, const bool conjugate);
-
-INSTANTIATE(float  )
-INSTANTIATE(cfloat )
-INSTANTIATE(double )
-INSTANTIATE(cdouble)
-INSTANTIATE(char   )
-INSTANTIATE(int    )
-INSTANTIATE(uint   )
-INSTANTIATE(uchar  )
-INSTANTIATE(intl   )
-INSTANTIATE(uintl  )
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/kernel/triangle.hpp b/src/backend/cpu/kernel/triangle.hpp
new file mode 100644
index 0000000..7059de5
--- /dev/null
+++ b/src/backend/cpu/kernel/triangle.hpp
@@ -0,0 +1,61 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#pragma once
+#include <af/defines.h>
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T, bool is_upper, bool is_unit_diag>
+void triangle(Array<T> out, const Array<T> in)
+{
+    T *o = out.get();
+    const T *i = in.get();
+    // Writes every element of out: a kept element is copied from in (or set to scalar<T>(1) where ox == oy when is_unit_diag), all others are set to scalar<T>(0).
+    af::dim4 odm = out.dims();
+    // Loop bounds come from out only; in is read at the same 4-D coordinate, each array using its own strides.
+    af::dim4 ost = out.strides();
+    af::dim4 ist = in.strides();
+
+    for(dim_t ow = 0; ow < odm[3]; ow++) {
+        const dim_t oW = ow * ost[3];
+        const dim_t iW = ow * ist[3];
+
+        for(dim_t oz = 0; oz < odm[2]; oz++) {
+            const dim_t oZW = oW + oz * ost[2];
+            const dim_t iZW = iW + oz * ist[2];
+
+            for(dim_t oy = 0; oy < odm[1]; oy++) {
+                const dim_t oYZW = oZW + oy * ost[1];
+                const dim_t iYZW = iZW + oy * ist[1];
+
+                for(dim_t ox = 0; ox < odm[0]; ox++) {
+                    const dim_t oMem = oYZW + ox;  // stride along dim 0 is not applied; NOTE(review): presumably unit stride - confirm
+                    const dim_t iMem = iYZW + ox;
+                    // Keep when ox <= oy (is_upper) or ox >= oy (otherwise); the ox == oy diagonal is kept in both cases.
+                    bool cond = is_upper ? (oy >= ox) : (oy <= ox);
+                    bool do_unit_diag = (is_unit_diag && ox == oy);
+                    if(cond) {
+                        o[oMem] = do_unit_diag ? scalar<T>(1) : i[iMem];
+                    } else {
+                        o[oMem] = scalar<T>(0);
+                    }
+
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/kernel/unwrap.hpp
similarity index 63%
copy from src/backend/cpu/unwrap.cpp
copy to src/backend/cpu/kernel/unwrap.hpp
index 41423c7..1d996ff 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/kernel/unwrap.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,17 +7,15 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <unwrap.hpp>
-#include <stdexcept>
 #include <err_cpu.hpp>
-#include <dispatch.hpp>
-#include <math.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<typename T, int d>
 void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
@@ -79,50 +77,5 @@ void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
     }
 }
 
-template<typename T>
-Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
-                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
-{
-    in.eval();
-
-    af::dim4 idims = in.dims();
-    dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
-    dim_t ny = (idims[1] + 2 * py - wy) / sy + 1;
-
-    af::dim4 odims(wx * wy, nx * ny, idims[2], idims[3]);
-
-    if (!is_column) {
-        std::swap(odims[0], odims[1]);
-    }
-
-    Array<T> outArray = createEmptyArray<T>(odims);
-
-    if (is_column) {
-        getQueue().enqueue(unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
-    } else {
-        getQueue().enqueue(unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
-    }
-
-    return outArray;
 }
-
-
-#define INSTANTIATE(T)                                                                  \
-    template Array<T> unwrap<T> (const Array<T> &in, const dim_t wx, const dim_t wy,    \
-                    const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column);
-
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/kernel/wrap.hpp
similarity index 62%
copy from src/backend/cpu/wrap.cpp
copy to src/backend/cpu/kernel/wrap.hpp
index 3ff54de..70be3ad 100644
--- a/src/backend/cpu/wrap.cpp
+++ b/src/backend/cpu/kernel/wrap.hpp
@@ -7,17 +7,15 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <af/defines.h>
 #include <Array.hpp>
-#include <wrap.hpp>
-#include <stdexcept>
 #include <err_cpu.hpp>
-#include <dispatch.hpp>
-#include <math.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
 
 namespace cpu
 {
+namespace kernel
+{
 
 template<typename T, int d>
 void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
@@ -78,50 +76,5 @@ void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
     }
 }
 
-template<typename T>
-Array<T> wrap(const Array<T> &in,
-              const dim_t ox, const dim_t oy,
-              const dim_t wx, const dim_t wy,
-              const dim_t sx, const dim_t sy,
-              const dim_t px, const dim_t py,
-              const bool is_column)
-{
-    af::dim4 idims = in.dims();
-    af::dim4 odims(ox, oy, idims[2], idims[3]);
-
-    Array<T> out = createValueArray<T>(odims, scalar<T>(0));
-    out.eval();
-    in.eval();
-
-    if (is_column) {
-        getQueue().enqueue(wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
-    } else {
-        getQueue().enqueue(wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
-    }
-
-    return out;
 }
-
-
-#define INSTANTIATE(T)                                          \
-    template Array<T> wrap<T> (const Array<T> &in,              \
-                               const dim_t ox, const dim_t oy,  \
-                               const dim_t wx, const dim_t wy,  \
-                               const dim_t sx, const dim_t sy,  \
-                               const dim_t px, const dim_t py,  \
-                               const bool is_column);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-INSTANTIATE(cfloat)
-INSTANTIATE(cdouble)
-INSTANTIATE(int)
-INSTANTIATE(uint)
-INSTANTIATE(intl)
-INSTANTIATE(uintl)
-INSTANTIATE(uchar)
-INSTANTIATE(char)
-INSTANTIATE(short)
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/nearest_neighbour.cpp b/src/backend/cpu/nearest_neighbour.cpp
index b6f50c2..8292562 100644
--- a/src/backend/cpu/nearest_neighbour.cpp
+++ b/src/backend/cpu/nearest_neighbour.cpp
@@ -11,139 +11,16 @@
 #include <af/defines.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
 #include <handle.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/nearest_neighbour.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-#if defined(_WIN32) || defined(_MSC_VER)
-
-#include <intrin.h>
-#define __builtin_popcount __popcnt
-
-#endif
-
-template<typename T, typename To, af_match_type dist_type>
-struct dist_op
-{
-    To operator()(T v1, T v2)
-    {
-        return v1 - v2;     // Garbage distance
-    }
-};
-
-template<typename T, typename To>
-struct dist_op<T, To, AF_SAD>
-{
-    To operator()(T v1, T v2)
-    {
-        return std::abs((double)v1 - (double)v2);
-    }
-};
-
-template<typename T, typename To>
-struct dist_op<T, To, AF_SSD>
-{
-    To operator()(T v1, T v2)
-    {
-        return (v1 - v2) * (v1 - v2);
-    }
-};
-
-template<typename To>
-struct dist_op<uint, To, AF_SHD>
-{
-    To operator()(uint v1, uint v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename To>
-struct dist_op<uintl, To, AF_SHD>
-{
-    To operator()(uintl v1, uintl v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename To>
-struct dist_op<uchar, To, AF_SHD>
-{
-    To operator()(uchar v1, uchar v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename To>
-struct dist_op<ushort, To, AF_SHD>
-{
-    To operator()(ushort v1, ushort v2)
-    {
-        return __builtin_popcount(v1 ^ v2);
-    }
-};
-
-template<typename T, typename To, af_match_type dist_type>
-void nearest_neighbour_(Array<uint> idx, Array<To> dist,
-                        const Array<T> query, const Array<T> train,
-                        const uint dist_dim, const uint n_dist)
-{
-    uint sample_dim = (dist_dim == 0) ? 1 : 0;
-    const dim4 qDims = query.dims();
-    const dim4 tDims = train.dims();
-
-    const unsigned distLength = qDims[dist_dim];
-    const unsigned nQuery = qDims[sample_dim];
-    const unsigned nTrain = tDims[sample_dim];
-
-    const T* qPtr = query.get();
-    const T* tPtr = train.get();
-    uint* iPtr = idx.get();
-    To* dPtr = dist.get();
-
-    dist_op<T, To, dist_type> op;
-
-    for (unsigned i = 0; i < nQuery; i++) {
-        To best_dist = limit_max<To>();
-        unsigned best_idx  = 0;
-
-        for (unsigned j = 0; j < nTrain; j++) {
-            To local_dist = 0;
-            for (unsigned k = 0; k < distLength; k++) {
-                size_t qIdx, tIdx;
-                if (sample_dim == 0) {
-                    qIdx = k * qDims[0] + i;
-                    tIdx = k * tDims[0] + j;
-                }
-                else {
-                    qIdx = i * qDims[0] + k;
-                    tIdx = j * tDims[0] + k;
-                }
-
-                local_dist += op(qPtr[qIdx], tPtr[tIdx]);
-            }
-
-            if (local_dist < best_dist) {
-                best_dist = local_dist;
-                best_idx  = j;
-            }
-        }
-
-        size_t oIdx;
-        oIdx = i;
-        iPtr[oIdx] = best_idx;
-        dPtr[oIdx] = best_dist;
-    }
-}
-
 template<typename T, typename To>
 void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
                        const Array<T>& query, const Array<T>& train,
@@ -166,13 +43,13 @@ void nearest_neighbour(Array<uint>& idx, Array<To>& dist,
 
     switch(dist_type) {
         case AF_SAD:
-            getQueue().enqueue(nearest_neighbour_<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
+            getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SAD>, idx, dist, query, train, dist_dim, n_dist);
             break;
         case AF_SSD:
-            getQueue().enqueue(nearest_neighbour_<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
+            getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SSD>, idx, dist, query, train, dist_dim, n_dist);
             break;
         case AF_SHD:
-            getQueue().enqueue(nearest_neighbour_<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
+            getQueue().enqueue(kernel::nearest_neighbour<T, To, AF_SHD>, idx, dist, query, train, dist_dim, n_dist);
             break;
         default:
             AF_ERROR("Unsupported dist_type", AF_ERR_NOT_CONFIGURED);
diff --git a/src/backend/cpu/orb.cpp b/src/backend/cpu/orb.cpp
index 4b6629c..00fe820 100644
--- a/src/backend/cpu/orb.cpp
+++ b/src/backend/cpu/orb.cpp
@@ -11,7 +11,6 @@
 #include <af/defines.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
 #include <handle.hpp>
 #include <resize.hpp>
 #include <fast.hpp>
@@ -21,520 +20,13 @@
 #include <cstring>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/orb.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-static const float PI_VAL = 3.14159265358979323846f;
-
-// Reference pattern, generated for a patch size of 31x31, as suggested by
-// original ORB paper
-#define REF_PAT_SIZE 31
-#define REF_PAT_SAMPLES 256
-#define REF_PAT_COORDS 4
-#define REF_PAT_LENGTH (REF_PAT_SAMPLES*REF_PAT_COORDS)
-
-// Current reference pattern was borrowed from OpenCV, to build a pattern with
-// similar quality, a training process must be applied, as described in
-// sections 4.2 and 4.3 of the original ORB paper.
-const int ref_pat[REF_PAT_LENGTH] = {
-    8,-3, 9,5,
-    4,2, 7,-12,
-    -11,9, -8,2,
-    7,-12, 12,-13,
-    2,-13, 2,12,
-    1,-7, 1,6,
-    -2,-10, -2,-4,
-    -13,-13, -11,-8,
-    -13,-3, -12,-9,
-    10,4, 11,9,
-    -13,-8, -8,-9,
-    -11,7, -9,12,
-    7,7, 12,6,
-    -4,-5, -3,0,
-    -13,2, -12,-3,
-    -9,0, -7,5,
-    12,-6, 12,-1,
-    -3,6, -2,12,
-    -6,-13, -4,-8,
-    11,-13, 12,-8,
-    4,7, 5,1,
-    5,-3, 10,-3,
-    3,-7, 6,12,
-    -8,-7, -6,-2,
-    -2,11, -1,-10,
-    -13,12, -8,10,
-    -7,3, -5,-3,
-    -4,2, -3,7,
-    -10,-12, -6,11,
-    5,-12, 6,-7,
-    5,-6, 7,-1,
-    1,0, 4,-5,
-    9,11, 11,-13,
-    4,7, 4,12,
-    2,-1, 4,4,
-    -4,-12, -2,7,
-    -8,-5, -7,-10,
-    4,11, 9,12,
-    0,-8, 1,-13,
-    -13,-2, -8,2,
-    -3,-2, -2,3,
-    -6,9, -4,-9,
-    8,12, 10,7,
-    0,9, 1,3,
-    7,-5, 11,-10,
-    -13,-6, -11,0,
-    10,7, 12,1,
-    -6,-3, -6,12,
-    10,-9, 12,-4,
-    -13,8, -8,-12,
-    -13,0, -8,-4,
-    3,3, 7,8,
-    5,7, 10,-7,
-    -1,7, 1,-12,
-    3,-10, 5,6,
-    2,-4, 3,-10,
-    -13,0, -13,5,
-    -13,-7, -12,12,
-    -13,3, -11,8,
-    -7,12, -4,7,
-    6,-10, 12,8,
-    -9,-1, -7,-6,
-    -2,-5, 0,12,
-    -12,5, -7,5,
-    3,-10, 8,-13,
-    -7,-7, -4,5,
-    -3,-2, -1,-7,
-    2,9, 5,-11,
-    -11,-13, -5,-13,
-    -1,6, 0,-1,
-    5,-3, 5,2,
-    -4,-13, -4,12,
-    -9,-6, -9,6,
-    -12,-10, -8,-4,
-    10,2, 12,-3,
-    7,12, 12,12,
-    -7,-13, -6,5,
-    -4,9, -3,4,
-    7,-1, 12,2,
-    -7,6, -5,1,
-    -13,11, -12,5,
-    -3,7, -2,-6,
-    7,-8, 12,-7,
-    -13,-7, -11,-12,
-    1,-3, 12,12,
-    2,-6, 3,0,
-    -4,3, -2,-13,
-    -1,-13, 1,9,
-    7,1, 8,-6,
-    1,-1, 3,12,
-    9,1, 12,6,
-    -1,-9, -1,3,
-    -13,-13, -10,5,
-    7,7, 10,12,
-    12,-5, 12,9,
-    6,3, 7,11,
-    5,-13, 6,10,
-    2,-12, 2,3,
-    3,8, 4,-6,
-    2,6, 12,-13,
-    9,-12, 10,3,
-    -8,4, -7,9,
-    -11,12, -4,-6,
-    1,12, 2,-8,
-    6,-9, 7,-4,
-    2,3, 3,-2,
-    6,3, 11,0,
-    3,-3, 8,-8,
-    7,8, 9,3,
-    -11,-5, -6,-4,
-    -10,11, -5,10,
-    -5,-8, -3,12,
-    -10,5, -9,0,
-    8,-1, 12,-6,
-    4,-6, 6,-11,
-    -10,12, -8,7,
-    4,-2, 6,7,
-    -2,0, -2,12,
-    -5,-8, -5,2,
-    7,-6, 10,12,
-    -9,-13, -8,-8,
-    -5,-13, -5,-2,
-    8,-8, 9,-13,
-    -9,-11, -9,0,
-    1,-8, 1,-2,
-    7,-4, 9,1,
-    -2,1, -1,-4,
-    11,-6, 12,-11,
-    -12,-9, -6,4,
-    3,7, 7,12,
-    5,5, 10,8,
-    0,-4, 2,8,
-    -9,12, -5,-13,
-    0,7, 2,12,
-    -1,2, 1,7,
-    5,11, 7,-9,
-    3,5, 6,-8,
-    -13,-4, -8,9,
-    -5,9, -3,-3,
-    -4,-7, -3,-12,
-    6,5, 8,0,
-    -7,6, -6,12,
-    -13,6, -5,-2,
-    1,-10, 3,10,
-    4,1, 8,-4,
-    -2,-2, 2,-13,
-    2,-12, 12,12,
-    -2,-13, 0,-6,
-    4,1, 9,3,
-    -6,-10, -3,-5,
-    -3,-13, -1,1,
-    7,5, 12,-11,
-    4,-2, 5,-7,
-    -13,9, -9,-5,
-    7,1, 8,6,
-    7,-8, 7,6,
-    -7,-4, -7,1,
-    -8,11, -7,-8,
-    -13,6, -12,-8,
-    2,4, 3,9,
-    10,-5, 12,3,
-    -6,-5, -6,7,
-    8,-3, 9,-8,
-    2,-12, 2,8,
-    -11,-2, -10,3,
-    -12,-13, -7,-9,
-    -11,0, -10,-5,
-    5,-3, 11,8,
-    -2,-13, -1,12,
-    -1,-8, 0,9,
-    -13,-11, -12,-5,
-    -10,-2, -10,11,
-    -3,9, -2,-13,
-    2,-3, 3,2,
-    -9,-13, -4,0,
-    -4,6, -3,-10,
-    -4,12, -2,-7,
-    -6,-11, -4,9,
-    6,-3, 6,11,
-    -13,11, -5,5,
-    11,11, 12,6,
-    7,-5, 12,-2,
-    -1,12, 0,7,
-    -4,-8, -3,-2,
-    -7,1, -6,7,
-    -13,-12, -8,-13,
-    -7,-2, -6,-8,
-    -8,5, -6,-9,
-    -5,-1, -4,5,
-    -13,7, -8,10,
-    1,5, 5,-13,
-    1,0, 10,-13,
-    9,12, 10,-1,
-    5,-8, 10,-9,
-    -1,11, 1,-13,
-    -9,-3, -6,2,
-    -1,-10, 1,12,
-    -13,1, -8,-10,
-    8,-11, 10,-6,
-    2,-13, 3,-6,
-    7,-13, 12,-9,
-    -10,-10, -5,-7,
-    -10,-8, -8,-13,
-    4,-6, 8,5,
-    3,12, 8,-13,
-    -4,2, -3,-3,
-    5,-13, 10,-12,
-    4,-13, 5,-1,
-    -9,9, -4,3,
-    0,3, 3,-9,
-    -12,1, -6,1,
-    3,2, 4,-8,
-    -10,-10, -10,9,
-    8,-13, 12,12,
-    -8,-12, -6,-5,
-    2,2, 3,7,
-    10,6, 11,-8,
-    6,8, 8,-12,
-    -7,10, -6,5,
-    -3,-9, -3,9,
-    -1,-13, -1,5,
-    -3,-7, -3,4,
-    -8,-2, -8,3,
-    4,2, 12,12,
-    2,-5, 3,11,
-    6,-9, 11,-13,
-    3,-1, 7,12,
-    11,-1, 12,4,
-    -3,0, -3,6,
-    4,-11, 4,12,
-    2,-4, 2,1,
-    -10,-6, -8,1,
-    -13,7, -11,1,
-    -13,12, -11,-13,
-    6,0, 11,-13,
-    0,-1, 1,4,
-    -13,3, -9,-2,
-    -9,8, -6,-3,
-    -13,-6, -8,-2,
-    5,-9, 8,10,
-    2,7, 3,-9,
-    -1,-6, -1,-1,
-    9,5, 11,-2,
-    11,-3, 12,-8,
-    3,0, 3,5,
-    -1,4, 0,10,
-    3,-6, 4,5,
-    -13,0, -10,5,
-    5,8, 12,11,
-    8,9, 9,-6,
-    7,-4, 8,-12,
-    -10,4, -10,9,
-    7,3, 12,4,
-    9,-7, 10,-2,
-    7,0, 12,-2,
-    -1,-6, 0,-11,
-};
-
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * PI_VAL * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
-void keep_features(
-    float* x_out,
-    float* y_out,
-    float* score_out,
-    float* size_out,
-    const float* x_in,
-    const float* y_in,
-    const float* score_in,
-    const unsigned* score_idx,
-    const float* size_in,
-    const unsigned n_feat)
-{
-    // Keep only the first n_feat features
-    for (unsigned f = 0; f < n_feat; f++) {
-        x_out[f] = x_in[score_idx[f]];
-        y_out[f] = y_in[score_idx[f]];
-        score_out[f] = score_in[f];
-        if (size_in != nullptr && size_out != nullptr)
-            size_out[f] = size_in[score_idx[f]];
-    }
-}
-
-template<typename T, bool use_scl>
-void harris_response(
-    float* x_out,
-    float* y_out,
-    float* score_out,
-    float* size_out,
-    const float* x_in,
-    const float* y_in,
-    const float* scl_in,
-    const unsigned total_feat,
-    unsigned* usable_feat,
-    const Array<T>& image,
-    const unsigned block_size,
-    const float k_thr,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    const T* image_ptr = image.get();
-    for (unsigned f = 0; f < total_feat; f++) {
-        unsigned x, y;
-        float scl = 1.f;
-        if (use_scl) {
-            // Update x and y coordinates according to scale
-            scl = scl_in[f];
-            x = (unsigned)round(x_in[f] * scl);
-            y = (unsigned)round(y_in[f] * scl);
-        }
-        else {
-            x = (unsigned)round(x_in[f]);
-            y = (unsigned)round(y_in[f]);
-        }
-
-        // Round feature size to nearest odd integer
-        float size = 2.f * floor((patch_size * scl) / 2.f) + 1.f;
-
-        // Avoid keeping features that might be too wide and might not fit on
-        // the image, sqrt(2.f) is the radius when angle is 45 degrees and
-        // represents widest case possible
-        unsigned patch_r = ceil(size * sqrt(2.f) / 2.f);
-        if (x < patch_r || y < patch_r || x >= idims[1] - patch_r || y >= idims[0] - patch_r)
-            continue;
-
-        unsigned r = block_size / 2;
-
-        float ixx = 0.f, iyy = 0.f, ixy = 0.f;
-        unsigned block_size_sq = block_size * block_size;
-        for (unsigned k = 0; k < block_size_sq; k++) {
-            int i = k / block_size - r;
-            int j = k % block_size - r;
-
-            // Calculate local x and y derivatives
-            float ix = image_ptr[(x+i+1) * idims[0] + y+j] - image_ptr[(x+i-1) * idims[0] + y+j];
-            float iy = image_ptr[(x+i) * idims[0] + y+j+1] - image_ptr[(x+i) * idims[0] + y+j-1];
-
-            // Accumulate second order derivatives
-            ixx += ix*ix;
-            iyy += iy*iy;
-            ixy += ix*iy;
-        }
-
-        unsigned idx = *usable_feat;
-        *usable_feat += 1;
-        float tr = ixx + iyy;
-        float det = ixx*iyy - ixy*ixy;
-
-        // Calculate Harris responses
-        float resp = det - k_thr * (tr*tr);
-
-        // Scale factor
-        // TODO: improve response scaling
-        float rscale = 0.001f;
-        rscale = rscale * rscale * rscale * rscale;
-
-        x_out[idx] = x;
-        y_out[idx] = y;
-        score_out[idx] = resp * rscale;
-        if (use_scl)
-            size_out[idx] = size;
-    }
-}
-
-template<typename T>
-void centroid_angle(
-    const float* x_in,
-    const float* y_in,
-    float* orientation_out,
-    const unsigned total_feat,
-    const Array<T>& image,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    const T* image_ptr = image.get();
-    for (unsigned f = 0; f < total_feat; f++) {
-        unsigned x = (unsigned)round(x_in[f]);
-        unsigned y = (unsigned)round(y_in[f]);
-
-        unsigned r = patch_size / 2;
-        if (x < r || y < r || x > idims[1] - r || y > idims[0] - r)
-            continue;
-
-        T m01 = (T)0, m10 = (T)0;
-        unsigned patch_size_sq = patch_size * patch_size;
-        for (unsigned k = 0; k < patch_size_sq; k++) {
-            int i = k / patch_size - r;
-            int j = k % patch_size - r;
-
-            // Calculate first order moments
-            T p = image_ptr[(x+i) * idims[0] + y+j];
-            m01 += j * p;
-            m10 += i * p;
-        }
-
-        float angle = atan2(m01, m10);
-        orientation_out[f] = angle;
-    }
-}
-
-template<typename T>
-inline T get_pixel(
-    unsigned x,
-    unsigned y,
-    const float ori,
-    const unsigned size,
-    const int dist_x,
-    const int dist_y,
-    const Array<T>& image,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    const T* image_ptr = image.get();
-    float ori_sin = sin(ori);
-    float ori_cos = cos(ori);
-    float patch_scl = (float)size / (float)patch_size;
-
-    // Calculate point coordinates based on orientation and size
-    x += round(dist_x * patch_scl * ori_cos - dist_y * patch_scl * ori_sin);
-    y += round(dist_x * patch_scl * ori_sin + dist_y * patch_scl * ori_cos);
-
-    return image_ptr[x * idims[0] + y];
-}
-
-template<typename T>
-void extract_orb(
-    unsigned* desc_out,
-    const unsigned n_feat,
-    float* x_in_out,
-    float* y_in_out,
-    const float* ori_in,
-    float* size_out,
-    const Array<T>& image,
-    const float scl,
-    const unsigned patch_size)
-{
-    const af::dim4 idims = image.dims();
-    for (unsigned f = 0; f < n_feat; f++) {
-        unsigned x = (unsigned)round(x_in_out[f]);
-        unsigned y = (unsigned)round(y_in_out[f]);
-        float ori = ori_in[f];
-        unsigned size = patch_size;
-
-        unsigned r = ceil(patch_size * sqrt(2.f) / 2.f);
-        if (x < r || y < r || x >= idims[1] - r || y >= idims[0] - r)
-            continue;
-
-        // Descriptor fixed at 256 bits for now
-        // Storing descriptor as a vector of 8 x 32-bit unsigned numbers
-        for (unsigned i = 0; i < 8; i++) {
-            unsigned v = 0;
-
-            // j < 32 for 256 bits descriptor
-            for (unsigned j = 0; j < 32; j++) {
-                // Get position from distribution pattern and values of points p1 and p2
-                int dist_x = ref_pat[i*32*4 + j*4];
-                int dist_y = ref_pat[i*32*4 + j*4+1];
-                T p1 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);
-
-                dist_x = ref_pat[i*32*4 + j*4+2];
-                dist_y = ref_pat[i*32*4 + j*4+3];
-                T p2 = get_pixel(x, y, ori, size, dist_x, dist_y, image, patch_size);
-
-                // Calculate bit based on p1 and p2 and shifts it to correct position
-                v |= (p1 < p2) << j;
-            }
-
-            // Store 32 bits of descriptor
-            desc_out[f * 8 + i] += v;
-        }
-
-        x_in_out[f] = round(x * scl);
-        y_in_out[f] = round(y * scl);
-        size_out[f] = patch_size * scl;
-    }
-}
-
-
-
 template<typename T, typename convAccT>
 unsigned orb(Array<float> &x, Array<float> &y,
              Array<float> &score, Array<float> &ori,
@@ -652,7 +144,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
         // Calculate Harris responses
         // Good block_size >= 7 (must be an odd number)
         unsigned usable_feat = 0;
-        harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
+        kernel::harris_response<T, false>(h_x_harris, h_y_harris, h_score_harris, nullptr,
                                   h_x_feat, h_y_feat, nullptr,
                                   lvl_feat, &usable_feat,
                                   lvl_img,
@@ -689,7 +181,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
         float* h_score_lvl = memAlloc<float>(usable_feat);
 
         // Keep only features with higher Harris responses
-        keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
+        kernel::keep_features<T>(h_x_lvl, h_y_lvl, h_score_lvl, nullptr,
                          h_x_harris, h_y_harris, harris_sorted.get(), harris_idx.get(),
                          nullptr, usable_feat);
 
@@ -700,7 +192,7 @@ unsigned orb(Array<float> &x, Array<float> &y,
         float* h_size_lvl = memAlloc<float>(usable_feat);
 
         // Compute orientation of features
-        centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
+        kernel::centroid_angle<T>(h_x_lvl, h_y_lvl, h_ori_lvl, usable_feat,
                           lvl_img, patch_size);
 
         Array<T> lvl_filt = createEmptyArray<T>(dim4());
@@ -723,11 +215,11 @@ unsigned orb(Array<float> &x, Array<float> &y,
         unsigned* h_desc_lvl = memAlloc<unsigned>(usable_feat * 8);
         memset(h_desc_lvl, 0, usable_feat * 8 * sizeof(unsigned));
         if (blur_img)
-            extract_orb<T>(h_desc_lvl, usable_feat,
+            kernel::extract_orb<T>(h_desc_lvl, usable_feat,
                            h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
                            lvl_filt, lvl_scl, patch_size);
         else
-            extract_orb<T>(h_desc_lvl, usable_feat,
+            kernel::extract_orb<T>(h_desc_lvl, usable_feat,
                            h_x_lvl, h_y_lvl, h_ori_lvl, h_size_lvl,
                            lvl_img, lvl_scl, patch_size);
 
diff --git a/src/backend/cpu/random.cpp b/src/backend/cpu/random.cpp
index 8c83ad6..55cf295 100644
--- a/src/backend/cpu/random.cpp
+++ b/src/backend/cpu/random.cpp
@@ -7,12 +7,6 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <type_traits>
-#include <random>
-#include <algorithm>
-#include <functional>
-#include <limits>
-#include <type_traits>
 #include <af/array.h>
 #include <af/dim4.hpp>
 #include <af/defines.h>
@@ -20,140 +14,16 @@
 #include <random.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/random.hpp>
 
 namespace cpu
 {
 
-using namespace std;
-
-template<typename T>
-using is_arithmetic_t       = typename enable_if< is_arithmetic<T>::value,      function<T()>>::type;
-template<typename T>
-using is_complex_t          = typename enable_if< is_complex<T>::value,         function<T()>>::type;
-template<typename T>
-using is_floating_point_t   = typename enable_if< is_floating_point<T>::value,  function<T()>>::type;
-
-template<typename T, typename GenType>
-is_arithmetic_t<T>
-urand(GenType &generator)
-{
-    typedef typename conditional<   is_floating_point<T>::value,
-                                    uniform_real_distribution<T>,
-#if OS_WIN
-                                    uniform_int_distribution<unsigned>>::type dist;
-#else
-                                    uniform_int_distribution<T >> ::type dist;
-#endif
-    return bind(dist(), generator);
-}
-
-template<typename T, typename GenType>
-is_complex_t<T>
-urand(GenType &generator)
-{
-    auto func = urand<typename T::value_type>(generator);
-    return [func] () { return T(func(), func());};
-}
-
-template<typename T, typename GenType>
-is_floating_point_t<T>
-nrand(GenType &generator)
-{
-    return bind(normal_distribution<T>(), generator);
-}
-
-template<typename T, typename GenType>
-is_complex_t<T>
-nrand(GenType &generator)
-{
-    auto func = nrand<typename T::value_type>(generator);
-    return [func] () { return T(func(), func());};
-}
-
-static default_random_engine generator;
-static unsigned long long gen_seed = 0;
-static bool is_first = true;
-#define GLOBAL 1
-
-template<typename T>
-void randn_(Array<T> out)
-{
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = nrand<T>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = nrand<T>(generator);
-        my_seed = gen_seed;
-    }
-
-    T *outPtr = out.get();
-    for (int i = 0; i < (int)out.elements(); i++) {
-        outPtr[i] = gen();
-    }
-}
-
-template<typename T>
-Array<T> randn(const af::dim4 &dims)
-{
-    Array<T> outArray = createEmptyArray<T>(dims);
-    getQueue().enqueue(randn_<T>, outArray);
-    return outArray;
-}
-
-template<typename T>
-void randu_(Array<T> out)
-{
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = urand<T>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = urand<T>(generator);
-        my_seed = gen_seed;
-    }
-
-    T *outPtr = out.get();
-    for (int i = 0; i < (int)out.elements(); i++) {
-        outPtr[i] = gen();
-    }
-}
-
-template<>
-void randu_(Array<char> out)
-{
-    static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
-    }
-
-    static auto gen = urand<float>(generator);
-
-    if (my_seed != gen_seed) {
-        gen = urand<float>(generator);
-        my_seed = gen_seed;
-    }
-
-    char *outPtr = out.get();
-    for (int i = 0; i < (int)out.elements(); i++) {
-        outPtr[i] = gen() > 0.5;
-    }
-}
-
 template<typename T>
 Array<T> randu(const af::dim4 &dims)
 {
     Array<T> outArray = createEmptyArray<T>(dims);
-    getQueue().enqueue(randu_<T>, outArray);
+    getQueue().enqueue(kernel::randu<T>, outArray);
     return outArray;
 }
 
@@ -172,6 +42,14 @@ INSTANTIATE_UNIFORM(uchar)
 INSTANTIATE_UNIFORM(short)
 INSTANTIATE_UNIFORM(ushort)
 
+template<typename T>
+Array<T> randn(const af::dim4 &dims)
+{
+    Array<T> outArray = createEmptyArray<T>(dims);
+    getQueue().enqueue(kernel::randn<T>, outArray);
+    return outArray;
+}
+
 #define INSTANTIATE_NORMAL(T)                              \
     template Array<T>  randn<T>(const af::dim4 &dims);
 
@@ -184,32 +62,36 @@ template<>
 Array<char> randu(const af::dim4 &dims)
 {
     static unsigned long long my_seed = 0;
-    if (is_first) {
-        setSeed(gen_seed);
-        my_seed = gen_seed;
+    if (kernel::is_first) {
+        setSeed(kernel::gen_seed);
+        my_seed = kernel::gen_seed;
     }
 
-    static auto gen = urand<float>(generator);
+    static auto gen = kernel::urand<float>(kernel::generator);
 
-    if (my_seed != gen_seed) {
-        gen = urand<float>(generator);
-        my_seed = gen_seed;
+    if (my_seed != kernel::gen_seed) {
+        gen = kernel::urand<float>(kernel::generator);
+        my_seed = kernel::gen_seed;
     }
 
     Array<char> outArray = createEmptyArray<char>(dims);
-    char *outPtr = outArray.get();
-    for (int i = 0; i < (int)outArray.elements(); i++) {
-        outPtr[i] = gen() > 0.5;
-    }
+    auto func = [=](Array<char> outArray) {
+        char *outPtr = outArray.get();
+        for (int i = 0; i < (int)outArray.elements(); i++) {
+            outPtr[i] = gen() > 0.5;
+        }
+    };
+    getQueue().enqueue(func, outArray);
+
     return outArray;
 }
 
 void setSeed(const uintl seed)
 {
     auto f = [=](const uintl seed){
-        generator.seed(seed);
-        is_first = false;
-        gen_seed = seed;
+        kernel::generator.seed(seed);
+        kernel::is_first = false;
+        kernel::gen_seed = seed;
     };
     getQueue().enqueue(f, seed);
 }
@@ -217,7 +99,7 @@ void setSeed(const uintl seed)
 uintl getSeed()
 {
     getQueue().sync();
-    return gen_seed;
+    return kernel::gen_seed;
 }
 
 }
diff --git a/src/backend/cpu/range.cpp b/src/backend/cpu/range.cpp
index 7837db5..b5ba5f8 100644
--- a/src/backend/cpu/range.cpp
+++ b/src/backend/cpu/range.cpp
@@ -16,47 +16,11 @@
 #include <numeric>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/range.hpp>
 
 namespace cpu
 {
 
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T, int dim>
-void range(Array<T> output)
-{
-    T* out = output.get();
-
-    const dim4 dims = output.dims();
-    const dim4 strides = output.strides();
-
-    for(dim_t w = 0; w < dims[3]; w++) {
-        dim_t offW = w * strides[3];
-        for(dim_t z = 0; z < dims[2]; z++) {
-            dim_t offWZ = offW + z * strides[2];
-            for(dim_t y = 0; y < dims[1]; y++) {
-                dim_t offWZY = offWZ + y * strides[1];
-                for(dim_t x = 0; x < dims[0]; x++) {
-                    dim_t id = offWZY + x;
-                    if(dim == 0) {
-                        out[id] = x;
-                    } else if(dim == 1) {
-                        out[id] = y;
-                    } else if(dim == 2) {
-                        out[id] = z;
-                    } else if(dim == 3) {
-                        out[id] = w;
-                    }
-                }
-            }
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
 template<typename T>
 Array<T> range(const dim4& dims, const int seq_dim)
 {
@@ -69,10 +33,10 @@ Array<T> range(const dim4& dims, const int seq_dim)
 
     Array<T> out = createEmptyArray<T>(dims);
     switch(_seq_dim) {
-        case 0: getQueue().enqueue(range<T, 0>, out); break;
-        case 1: getQueue().enqueue(range<T, 1>, out); break;
-        case 2: getQueue().enqueue(range<T, 2>, out); break;
-        case 3: getQueue().enqueue(range<T, 3>, out); break;
+        case 0: getQueue().enqueue(kernel::range<T, 0>, out); break;
+        case 1: getQueue().enqueue(kernel::range<T, 1>, out); break;
+        case 2: getQueue().enqueue(kernel::range<T, 2>, out); break;
+        case 3: getQueue().enqueue(kernel::range<T, 3>, out); break;
         default : AF_ERROR("Invalid rep selection", AF_ERR_ARG);
     }
 
diff --git a/src/backend/cpu/reduce.cpp b/src/backend/cpu/reduce.cpp
index cce1226..cd44b5e 100644
--- a/src/backend/cpu/reduce.cpp
+++ b/src/backend/cpu/reduce.cpp
@@ -15,9 +15,9 @@
 #include <ops.hpp>
 #include <functional>
 #include <complex>
-
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/reduce.hpp>
 
 using af::dim4;
 
@@ -38,56 +38,6 @@ struct Binary<cdouble, af_add_t>
 namespace cpu
 {
 
-template<af_op_t op, typename Ti, typename To, int D>
-struct reduce_dim
-{
-    void operator()(Array<To> out, const dim_t outOffset,
-                    const Array<Ti> in, const dim_t inOffset,
-                    const int dim, bool change_nan, double nanval)
-    {
-        static const int D1 = D - 1;
-        static reduce_dim<op, Ti, To, D1> reduce_dim_next;
-
-        const dim4 ostrides = out.strides();
-        const dim4 istrides = in.strides();
-        const dim4 odims    = out.dims();
-
-        for (dim_t i = 0; i < odims[D1]; i++) {
-            reduce_dim_next(out, outOffset + i * ostrides[D1],
-                            in, inOffset + i * istrides[D1],
-                            dim, change_nan, nanval);
-        }
-    }
-};
-
-template<af_op_t op, typename Ti, typename To>
-struct reduce_dim<op, Ti, To, 0>
-{
-
-    Transform<Ti, To, op> transform;
-    Binary<To, op> reduce;
-    void operator()(Array<To> out, const dim_t outOffset,
-                    const Array<Ti> in, const dim_t inOffset,
-                    const int dim, bool change_nan, double nanval)
-    {
-        const dim4 istrides = in.strides();
-        const dim4 idims    = in.dims();
-
-        To * const outPtr = out.get() + outOffset;
-        Ti const * const inPtr = in.get() + inOffset;
-        dim_t stride = istrides[dim];
-
-        To out_val = reduce.init();
-        for (dim_t i = 0; i < idims[dim]; i++) {
-            To in_val = transform(inPtr[i * stride]);
-            if (change_nan) in_val = IS_NAN(in_val) ? nanval : in_val;
-            out_val = reduce(in_val, out_val);
-        }
-
-        *outPtr = out_val;
-    }
-};
-
 template<af_op_t op, typename Ti, typename To>
 using reduce_dim_func = std::function<void(Array<To>, const dim_t,
                                            const Array<Ti>, const dim_t,
@@ -101,10 +51,10 @@ Array<To> reduce(const Array<Ti> &in, const int dim, bool change_nan, double nan
     in.eval();
 
     Array<To> out = createEmptyArray<To>(odims);
-    static const reduce_dim_func<op, Ti, To>  reduce_funcs[4] = { reduce_dim<op, Ti, To, 1>()
-                                                                , reduce_dim<op, Ti, To, 2>()
-                                                                , reduce_dim<op, Ti, To, 3>()
-                                                                , reduce_dim<op, Ti, To, 4>()};
+    static const reduce_dim_func<op, Ti, To>  reduce_funcs[4] = { kernel::reduce_dim<op, Ti, To, 1>()
+                                                                , kernel::reduce_dim<op, Ti, To, 2>()
+                                                                , kernel::reduce_dim<op, Ti, To, 3>()
+                                                                , kernel::reduce_dim<op, Ti, To, 4>()};
 
     getQueue().enqueue(reduce_funcs[in.ndims() - 1], out, 0, in, 0, dim, change_nan, nanval);
 
diff --git a/src/backend/cpu/regions.cpp b/src/backend/cpu/regions.cpp
index f7309c8..ffac11c 100644
--- a/src/backend/cpu/regions.cpp
+++ b/src/backend/cpu/regions.cpp
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/regions.hpp>
 
 using af::dim4;
 
@@ -26,186 +27,14 @@ namespace cpu
 {
 
 template<typename T>
-class LabelNode
-{
-private:
-    T label;
-    T minLabel;
-    unsigned rank;
-    LabelNode* parent;
-
-public:
-    LabelNode() : label(0), minLabel(0), rank(0), parent(this) { }
-    LabelNode(T label) : label(label), minLabel(label), rank(0), parent(this) { }
-
-    T getLabel()
-    {
-        return label;
-    }
-
-    T getMinLabel()
-    {
-        return minLabel;
-    }
-
-    LabelNode* getParent()
-    {
-        return parent;
-    }
-
-    unsigned getRank()
-    {
-        return rank;
-    }
-
-    void setMinLabel(T l)
-    {
-        minLabel = l;
-    }
-
-    void setParent(LabelNode* p)
-    {
-        parent = p;
-    }
-
-    void setRank(unsigned r)
-    {
-        rank = r;
-    }
-};
-
-template<typename T>
-static LabelNode<T>* find(LabelNode<T>* x)
-{
-    if (x->getParent() != x)
-        x->setParent(find(x->getParent()));
-    return x->getParent();
-}
-
-template<typename T>
-static void setUnion(LabelNode<T>* x, LabelNode<T>* y)
-{
-    LabelNode<T>* xRoot = find(x);
-    LabelNode<T>* yRoot = find(y);
-    if (xRoot == yRoot)
-        return;
-
-    T xMinLabel = xRoot->getMinLabel();
-    T yMinLabel = yRoot->getMinLabel();
-    xRoot->setMinLabel(min(xMinLabel, yMinLabel));
-    yRoot->setMinLabel(min(xMinLabel, yMinLabel));
-
-    if (xRoot->getRank() < yRoot->getRank())
-        xRoot->setParent(yRoot);
-    else if (xRoot->getRank() > yRoot->getRank())
-        yRoot->setParent(xRoot);
-    else {
-        yRoot->setParent(xRoot);
-        xRoot->setRank(xRoot->getRank() + 1);
-    }
-}
-
-template<typename T>
 Array<T> regions(const Array<char> &in, af_connectivity connectivity)
 {
     in.eval();
 
-    // Create output placeholder
     Array<T> out = createValueArray(in.dims(), (T)0);
     out.eval();
 
-    auto func = [=] (Array<T> out, const Array<char> in, af_connectivity connectivity) {
-        const dim4 in_dims = in.dims();
-        const char *in_ptr  = in.get();
-        T    *out_ptr = out.get();
-
-        // Map labels
-        typedef typename std::map<T, LabelNode<T>* > label_map_t;
-        typedef typename label_map_t::iterator label_map_iterator_t;
-
-        label_map_t lmap;
-
-        // Initial label
-        T label = (T)1;
-
-        for (int j = 0; j < (int)in_dims[1]; j++) {
-            for (int i = 0; i < (int)in_dims[0]; i++) {
-                int idx = j * in_dims[0] + i;
-                if (in_ptr[idx] != 0) {
-                    std::vector<T> l;
-
-                    // Test neighbors
-                    if (i > 0 && out_ptr[j * (int)in_dims[0] + i-1] > 0)
-                        l.push_back(out_ptr[j * in_dims[0] + i-1]);
-                    if (j > 0 && out_ptr[(j-1) * (int)in_dims[0] + i] > 0)
-                        l.push_back(out_ptr[(j-1) * in_dims[0] + i]);
-                    if (connectivity == AF_CONNECTIVITY_8 && i > 0 &&
-                            j > 0 && out_ptr[(j-1) * in_dims[0] + i-1] > 0)
-                        l.push_back(out_ptr[(j-1) * in_dims[0] + i-1]);
-                    if (connectivity == AF_CONNECTIVITY_8 &&
-                            i < (int)in_dims[0] - 1 && j > 0 && out_ptr[(j-1) * in_dims[0] + i+1] != 0)
-                        l.push_back(out_ptr[(j-1) * in_dims[0] + i+1]);
-
-                    if (!l.empty()) {
-                        T minl = l[0];
-                        for (size_t k = 0; k < l.size(); k++) {
-                            minl = min(l[k], minl);
-                            label_map_iterator_t cur_map = lmap.find(l[k]);
-                            LabelNode<T> *node = cur_map->second;
-                            // Group labels of the same region under a disjoint set
-                            for (size_t m = k+1; m < l.size(); m++)
-                                setUnion(node, lmap.find(l[m])->second);
-                        }
-                        // Set label to smallest neighbor label
-                        out_ptr[idx] = minl;
-                    }
-                    else {
-                        // Insert new label in map
-                        LabelNode<T> *node = new LabelNode<T>(label);
-                        lmap.insert(std::pair<T, LabelNode<T>* >(label, node));
-                        out_ptr[idx] = label++;
-                    }
-                }
-            }
-        }
-
-        std::set<T> removed;
-
-        for (int j = 0; j < (int)in_dims[1]; j++) {
-            for (int i = 0; i < (int)in_dims[0]; i++) {
-                int idx = j * (int)in_dims[0] + i;
-                if (in_ptr[idx] != 0) {
-                    T l = out_ptr[idx];
-                    label_map_iterator_t cur_map = lmap.find(l);
-
-                    if (cur_map != lmap.end()) {
-                        LabelNode<T>* node = cur_map->second;
-
-                        LabelNode<T>* node_root = find(node);
-                        out_ptr[idx] = node_root->getMinLabel();
-
-                        // Mark removed labels (those that are part of a region
-                        // that contains a smaller label)
-                        if (node->getMinLabel() < l || node_root->getMinLabel() < l)
-                            removed.insert(l);
-                        if (node->getLabel() > node->getMinLabel())
-                            removed.insert(node->getLabel());
-                    }
-                }
-            }
-        }
-
-        // Calculate final neighbors (ensure final labels are sequential)
-        for (int j = 0; j < (int)in_dims[1]; j++) {
-            for (int i = 0; i < (int)in_dims[0]; i++) {
-                int idx = j * (int)in_dims[0] + i;
-                if (out_ptr[idx] > 0) {
-                    out_ptr[idx] -= distance(removed.begin(), removed.lower_bound(out_ptr[idx]));
-                }
-            }
-        }
-    };
-    getQueue().enqueue(func, out, in, connectivity);
+    getQueue().enqueue(kernel::regions<T>, out, in, connectivity);
 
     return out;
 }
diff --git a/src/backend/cpu/reorder.cpp b/src/backend/cpu/reorder.cpp
index 1ad7dad..162039b 100644
--- a/src/backend/cpu/reorder.cpp
+++ b/src/backend/cpu/reorder.cpp
@@ -9,49 +9,14 @@
 
 #include <Array.hpp>
 #include <reorder.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/reorder.hpp>
 
 namespace cpu
 {
 
 template<typename T>
-void reorder_(Array<T> out, const Array<T> in, const af::dim4 oDims, const af::dim4 rdims)
-{
-    T* outPtr = out.get();
-    const T* inPtr = in.get();
-
-    const af::dim4 ist = in.strides();
-    const af::dim4 ost = out.strides();
-
-
-    dim_t ids[4]  = {0};
-    for(dim_t ow = 0; ow < oDims[3]; ow++) {
-        const dim_t oW = ow * ost[3];
-        ids[rdims[3]] = ow;
-        for(dim_t oz = 0; oz < oDims[2]; oz++) {
-            const dim_t oZW = oW + oz * ost[2];
-            ids[rdims[2]] = oz;
-            for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                const dim_t oYZW = oZW + oy * ost[1];
-                ids[rdims[1]] = oy;
-                for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                    const dim_t oIdx = oYZW + ox;
-
-                    ids[rdims[0]] = ox;
-                    const dim_t iIdx = ids[3] * ist[3] + ids[2] * ist[2] +
-                                          ids[1] * ist[1] + ids[0];
-
-                    outPtr[oIdx] = inPtr[iIdx];
-                }
-            }
-        }
-    }
-}
-
-template<typename T>
 Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
 {
     in.eval();
@@ -62,7 +27,7 @@ Array<T> reorder(const Array<T> &in, const af::dim4 &rdims)
         oDims[i] = iDims[rdims[i]];
 
     Array<T> out = createEmptyArray<T>(oDims);
-    getQueue().enqueue(reorder_<T>, out, in, oDims, rdims);
+    getQueue().enqueue(kernel::reorder<T>, out, in, oDims, rdims);
     return out;
 }
 
diff --git a/src/backend/cpu/resize.cpp b/src/backend/cpu/resize.cpp
index 8fb2edc..9a5c85b 100644
--- a/src/backend/cpu/resize.cpp
+++ b/src/backend/cpu/resize.cpp
@@ -9,174 +9,16 @@
 
 #include <Array.hpp>
 #include <resize.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <math.hpp>
 #include <types.hpp>
 #include <af/traits.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/resize.hpp>
 
 namespace cpu
 {
 
-/**
- * noop function for round to avoid compilation
- * issues due to lack of this function in C90 based
- * compilers, it is only present in C99 and C++11
- *
- * This is not a full fledged implementation, this function
- * is to be used only for positive numbers, i m using it here
- * for calculating dimensions of arrays
- */
-dim_t round2int(float value)
-{
-    return (dim_t)(value+0.5f);
-}
-
-using std::conditional;
-using std::is_same;
-
-template<typename T>
-using wtype_t = typename conditional<is_same<T, double>::value, double, float>::type;
-
-template<typename T>
-using vtype_t = typename conditional<is_complex<T>::value,
-                                     T, wtype_t<T>
-                                    >::type;
-
-template<typename T, af_interp_type method>
-struct resize_op
-{
-    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-              const af::dim4 &ostrides, const af::dim4 &istrides,
-              const dim_t x, const dim_t y)
-    {
-        return;
-    }
-};
-
-template<typename T>
-struct resize_op<T, AF_INTERP_NEAREST>
-{
-    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-            const af::dim4 &ostrides, const af::dim4 &istrides,
-            const dim_t x, const dim_t y)
-    {
-        // Compute Indices
-        dim_t i_x = round2int((float)x / (odims[0] / (float)idims[0]));
-        dim_t i_y = round2int((float)y / (odims[1] / (float)idims[1]));
-
-        if (i_x >= idims[0]) i_x = idims[0] - 1;
-        if (i_y >= idims[1]) i_y = idims[1] - 1;
-
-        dim_t i_off = i_y * istrides[1] + i_x;
-        dim_t o_off =   y * ostrides[1] + x;
-        // Copy values from all channels
-        for(dim_t w = 0; w < odims[3]; w++) {
-            dim_t wost = w * ostrides[3];
-            dim_t wist = w * istrides[3];
-            for(dim_t z = 0; z < odims[2]; z++) {
-                outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist];
-            }
-        }
-    }
-};
-
-template<typename T>
-struct resize_op<T, AF_INTERP_BILINEAR>
-{
-    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-            const af::dim4 &ostrides, const af::dim4 &istrides,
-            const dim_t x, const dim_t y)
-    {
-        // Compute Indices
-        float f_x = (float)x / (odims[0] / (float)idims[0]);
-        float f_y = (float)y / (odims[1] / (float)idims[1]);
-
-        dim_t i1_x  = floor(f_x);
-        dim_t i1_y  = floor(f_y);
-
-        if (i1_x >= idims[0]) i1_x = idims[0] - 1;
-        if (i1_y >= idims[1]) i1_y = idims[1] - 1;
-
-        float b   = f_x - i1_x;
-        float a   = f_y - i1_y;
-
-        dim_t i2_x  = (i1_x + 1 >= idims[0] ? idims[0] - 1 : i1_x + 1);
-        dim_t i2_y  = (i1_y + 1 >= idims[1] ? idims[1] - 1 : i1_y + 1);
-
-        typedef typename dtype_traits<T>::base_type BT;
-        typedef wtype_t<BT> WT;
-        typedef vtype_t<T> VT;
-
-        dim_t o_off = y * ostrides[1] + x;
-        // Copy values from all channels
-        for(dim_t w = 0; w < odims[3]; w++) {
-            dim_t wst = w * istrides[3];
-            for(dim_t z = 0; z < odims[2]; z++) {
-                dim_t zst = z * istrides[2];
-                dim_t channel_off = zst + wst;
-                VT p1 = inPtr[i1_y * istrides[1] + i1_x + channel_off];
-                VT p2 = inPtr[i2_y * istrides[1] + i1_x + channel_off];
-                VT p3 = inPtr[i1_y * istrides[1] + i2_x + channel_off];
-                VT p4 = inPtr[i2_y * istrides[1] + i2_x + channel_off];
-
-                outPtr[o_off + z * ostrides[2] + w * ostrides[3]] =
-                                scalar<WT>((1.0f - a) * (1.0f - b)) * p1 +
-                                scalar<WT>((    a   ) * (1.0f - b)) * p2 +
-                                scalar<WT>((1.0f - a) * (    b   )) * p3 +
-                                scalar<WT>((    a   ) * (    b   )) * p4;
-            }
-        }
-    }
-};
-
-template<typename T>
-struct resize_op<T, AF_INTERP_LOWER>
-{
-    void operator()(T *outPtr, const T *inPtr, const af::dim4 &odims, const af::dim4 &idims,
-            const af::dim4 &ostrides, const af::dim4 &istrides,
-            const dim_t x, const dim_t y)
-    {
-        // Compute Indices
-        dim_t i_x = floor((float)x / (odims[0] / (float)idims[0]));
-        dim_t i_y = floor((float)y / (odims[1] / (float)idims[1]));
-
-        if (i_x >= idims[0]) i_x = idims[0] - 1;
-        if (i_y >= idims[1]) i_y = idims[1] - 1;
-
-        dim_t i_off = i_y * istrides[1] + i_x;
-        dim_t o_off =   y * ostrides[1] + x;
-        // Copy values from all channels
-        for(dim_t w = 0; w < odims[3]; w++) {
-            dim_t wost = w * ostrides[3];
-            dim_t wist = w * istrides[3];
-            for(dim_t z = 0; z < odims[2]; z++) {
-                outPtr[o_off + z * ostrides[2] + wost] = inPtr[i_off + z * istrides[2] + wist];
-            }
-        }
-    }
-};
-
-template<typename T, af_interp_type method>
-void resize_(Array<T> out, const Array<T> in)
-{
-    af::dim4 idims    = in.dims();
-    af::dim4 odims    = out.dims();
-    const T *inPtr    = in.get();
-          T *outPtr   = out.get();
-    af::dim4 ostrides = out.strides();
-    af::dim4 istrides = in.strides();
-
-    resize_op<T, method> op;
-    for(dim_t y = 0; y < odims[1]; y++) {
-        for(dim_t x = 0; x < odims[0]; x++) {
-            op(outPtr, inPtr, odims, idims, ostrides, istrides, x, y);
-        }
-    }
-}
-
 template<typename T>
 Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
                 const af_interp_type method)
@@ -190,11 +32,11 @@ Array<T> resize(const Array<T> &in, const dim_t odim0, const dim_t odim1,
 
     switch(method) {
         case AF_INTERP_NEAREST:
-            getQueue().enqueue(resize_<T, AF_INTERP_NEAREST>, out, in); break;
+            getQueue().enqueue(kernel::resize<T, AF_INTERP_NEAREST>, out, in); break;
         case AF_INTERP_BILINEAR:
-            getQueue().enqueue(resize_<T, AF_INTERP_BILINEAR>, out, in); break;
+            getQueue().enqueue(kernel::resize<T, AF_INTERP_BILINEAR>, out, in); break;
         case AF_INTERP_LOWER:
-            getQueue().enqueue(resize_<T, AF_INTERP_LOWER>, out, in); break;
+            getQueue().enqueue(kernel::resize<T, AF_INTERP_LOWER>, out, in); break;
         default: break;
     }
     return out;
diff --git a/src/backend/cpu/rotate.cpp b/src/backend/cpu/rotate.cpp
index 5687d69..e81ee04 100644
--- a/src/backend/cpu/rotate.cpp
+++ b/src/backend/cpu/rotate.cpp
@@ -9,77 +9,14 @@
 
 #include <Array.hpp>
 #include <rotate.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
 #include "transform_interp.hpp"
+#include <kernel/rotate.hpp>
 
 namespace cpu
 {
 
-template<typename T, af_interp_type method>
-void rotate_(Array<T> output, const Array<T> input, const float theta)
-{
-    const af::dim4 odims    = output.dims();
-    const af::dim4 idims    = input.dims();
-    const af::dim4 ostrides = output.strides();
-    const af::dim4 istrides = input.strides();
-
-    const T* in   = input.get();
-          T* out  = output.get();
-    dim_t nimages = idims[2];
-
-    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
-                 const af::dim4 &, const af::dim4 &,
-                 const dim_t, const dim_t, const dim_t, const dim_t);
-
-    const float c = cos(-theta), s = sin(-theta);
-    float tx, ty;
-    {
-        const float nx = 0.5 * (idims[0] - 1);
-        const float ny = 0.5 * (idims[1] - 1);
-        const float mx = 0.5 * (odims[0] - 1);
-        const float my = 0.5 * (odims[1] - 1);
-        const float sx = (mx * c + my *-s);
-        const float sy = (mx * s + my * c);
-        tx = -(sx - nx);
-        ty = -(sy - ny);
-    }
-
-    const float tmat[6] = {std::round( c * 1000) / 1000.0f,
-                           std::round(-s * 1000) / 1000.0f,
-                           std::round(tx * 1000) / 1000.0f,
-                           std::round( s * 1000) / 1000.0f,
-                           std::round( c * 1000) / 1000.0f,
-                           std::round(ty * 1000) / 1000.0f,
-                          };
-
-    switch(method) {
-        case AF_INTERP_NEAREST:
-            t_fn = &transform_n;
-            break;
-        case AF_INTERP_BILINEAR:
-            t_fn = &transform_b;
-            break;
-        case AF_INTERP_LOWER:
-            t_fn = &transform_l;
-            break;
-        default:
-            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-            break;
-    }
-
-
-    // Do transform for image
-    for(int yy = 0; yy < (int)odims[1]; yy++) {
-        for(int xx = 0; xx < (int)odims[0]; xx++) {
-            t_fn(out, in, tmat, idims, ostrides, istrides, nimages, 0, xx, yy);
-        }
-    }
-}
-
 template<typename T>
 Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
                  const af_interp_type method)
@@ -90,13 +27,13 @@ Array<T> rotate(const Array<T> &in, const float theta, const af::dim4 &odims,
 
     switch(method) {
         case AF_INTERP_NEAREST:
-            getQueue().enqueue(rotate_<T, AF_INTERP_NEAREST>, out, in, theta);
+            getQueue().enqueue(kernel::rotate<T, AF_INTERP_NEAREST>, out, in, theta);
             break;
         case AF_INTERP_BILINEAR:
-            getQueue().enqueue(rotate_<T, AF_INTERP_BILINEAR>, out, in, theta);
+            getQueue().enqueue(kernel::rotate<T, AF_INTERP_BILINEAR>, out, in, theta);
             break;
         case AF_INTERP_LOWER:
-            getQueue().enqueue(rotate_<T, AF_INTERP_LOWER>, out, in, theta);
+            getQueue().enqueue(kernel::rotate<T, AF_INTERP_LOWER>, out, in, theta);
             break;
         default:
             AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
diff --git a/src/backend/cpu/scan.cpp b/src/backend/cpu/scan.cpp
index 39157ca..615744f 100644
--- a/src/backend/cpu/scan.cpp
+++ b/src/backend/cpu/scan.cpp
@@ -16,64 +16,13 @@
 #include <ops.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/scan.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<af_op_t op, typename Ti, typename To, int D>
-struct scan_dim
-{
-    void operator()(Array<To> out, dim_t outOffset,
-                    const Array<Ti> in, dim_t inOffset,
-                    const int dim) const
-    {
-        const dim4 odims    = out.dims();
-        const dim4 ostrides = out.strides();
-        const dim4 istrides = in.strides();
-
-        const int D1 = D - 1;
-        for (dim_t i = 0; i < odims[D1]; i++) {
-            scan_dim<op, Ti, To, D1> func;
-            getQueue().enqueue(func,
-                    out, outOffset + i * ostrides[D1],
-                    in, inOffset + i * istrides[D1], dim);
-            if (D1 == dim) break;
-        }
-    }
-};
-
-template<af_op_t op, typename Ti, typename To>
-struct scan_dim<op, Ti, To, 0>
-{
-    void operator()(Array<To> output, dim_t outOffset,
-                    const Array<Ti> input,  dim_t inOffset,
-                    const int dim) const
-    {
-        const Ti* in = input.get() + inOffset;
-              To* out= output.get()+ outOffset;
-
-        const dim4 ostrides = output.strides();
-        const dim4 istrides = input.strides();
-        const dim4 idims    = input.dims();
-
-        dim_t istride = istrides[dim];
-        dim_t ostride = ostrides[dim];
-
-        Transform<Ti, To, op> transform;
-        // FIXME: Change the name to something better
-        Binary<To, op> scan;
-
-        To out_val = scan.init();
-        for (dim_t i = 0; i < idims[dim]; i++) {
-            To in_val = transform(in[i * istride]);
-            out_val = scan(in_val, out_val);
-            out[i * ostride] = out_val;
-        }
-    }
-};
-
 template<af_op_t op, typename Ti, typename To>
 Array<To> scan(const Array<Ti>& in, const int dim)
 {
@@ -84,19 +33,19 @@ Array<To> scan(const Array<Ti>& in, const int dim)
 
     switch (in.ndims()) {
         case 1:
-            scan_dim<op, Ti, To, 1> func1;
+            kernel::scan_dim<op, Ti, To, 1> func1;
             getQueue().enqueue(func1, out, 0, in, 0, dim);
             break;
         case 2:
-            scan_dim<op, Ti, To, 2> func2;
+            kernel::scan_dim<op, Ti, To, 2> func2;
             getQueue().enqueue(func2, out, 0, in, 0, dim);
             break;
         case 3:
-            scan_dim<op, Ti, To, 3> func3;
+            kernel::scan_dim<op, Ti, To, 3> func3;
             getQueue().enqueue(func3, out, 0, in, 0, dim);
             break;
         case 4:
-            scan_dim<op, Ti, To, 4> func4;
+            kernel::scan_dim<op, Ti, To, 4> func4;
             getQueue().enqueue(func4, out, 0, in, 0, dim);
             break;
     }
diff --git a/src/backend/cpu/select.cpp b/src/backend/cpu/select.cpp
index 4a219ed..d9a6795 100644
--- a/src/backend/cpu/select.cpp
+++ b/src/backend/cpu/select.cpp
@@ -6,12 +6,13 @@
  * The complete license agreement can be obtained at:
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
+
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
 #include <select.hpp>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/select.hpp>
 
 using af::dim4;
 
@@ -25,66 +26,7 @@ void select(Array<T> &out, const Array<char> &cond, const Array<T> &a, const Arr
     cond.eval();
     a.eval();
     b.eval();
-    auto func = [=] (Array<T> out, const Array<char> cond, const Array<T> a, const Array<T> b) {
-        dim4 adims = a.dims();
-        dim4 astrides = a.strides();
-        dim4 bdims = b.dims();
-        dim4 bstrides = b.strides();
-
-        dim4 cdims = cond.dims();
-        dim4 cstrides = cond.strides();
-
-        dim4 odims = out.dims();
-        dim4 ostrides = out.strides();
-
-        bool is_a_same[] = {adims[0] == odims[0], adims[1] == odims[1],
-            adims[2] == odims[2], adims[3] == odims[3]};
-
-        bool is_b_same[] = {bdims[0] == odims[0], bdims[1] == odims[1],
-            bdims[2] == odims[2], bdims[3] == odims[3]};
-
-        bool is_c_same[] = {cdims[0] == odims[0], cdims[1] == odims[1],
-            cdims[2] == odims[2], cdims[3] == odims[3]};
-
-        const T *aptr = a.get();
-        const T *bptr = b.get();
-        T *optr = out.get();
-        const char *cptr = cond.get();
-
-        for (int l = 0; l < odims[3]; l++) {
-
-            int o_off3   = ostrides[3] * l;
-            int a_off3   = astrides[3] * is_a_same[3] * l;
-            int b_off3   = bstrides[3] * is_b_same[3] * l;
-            int c_off3   = cstrides[3] * is_c_same[3] * l;
-
-            for (int k = 0; k < odims[2]; k++) {
-
-                int o_off2   = ostrides[2] * k + o_off3;
-                int a_off2   = astrides[2] * is_a_same[2] * k + a_off3;
-                int b_off2   = bstrides[2] * is_b_same[2] * k + b_off3;
-                int c_off2   = cstrides[2] * is_c_same[2] * k + c_off3;
-
-                for (int j = 0; j < odims[1]; j++) {
-
-                    int o_off1   = ostrides[1] * j + o_off2;
-                    int a_off1   = astrides[1] * is_a_same[1] * j + a_off2;
-                    int b_off1   = bstrides[1] * is_b_same[1] * j + b_off2;
-                    int c_off1   = cstrides[1] * is_c_same[1] * j + c_off2;
-
-                    for (int i = 0; i < odims[0]; i++) {
-
-                        bool cval = is_c_same[0] ? cptr[c_off1 + i] : cptr[c_off1];
-                        T    aval = is_a_same[0] ? aptr[a_off1 + i] : aptr[a_off1];
-                        T    bval = is_b_same[0] ? bptr[b_off1 + i] : bptr[b_off1];
-                        T    oval = cval ? aval : bval;
-                        optr[o_off1 + i] = oval;
-                    }
-                }
-            }
-        }
-    };
-    getQueue().enqueue(func, out, cond, a, b);
+    getQueue().enqueue(kernel::select<T>, out, cond, a, b);
 }
 
 template<typename T, bool flip>
@@ -93,44 +35,7 @@ void select_scalar(Array<T> &out, const Array<char> &cond, const Array<T> &a, co
     out.eval();
     cond.eval();
     a.eval();
-    auto func = [=] (Array<T> out, const Array<char> cond, const Array<T> a, const double b) {
-        dim4 astrides = a.strides();
-        dim4 cstrides = cond.strides();
-
-        dim4 odims = out.dims();
-        dim4 ostrides = out.strides();
-
-        const T *aptr = a.get();
-        T *optr = out.get();
-        const char *cptr = cond.get();
-
-        for (int l = 0; l < odims[3]; l++) {
-
-            int o_off3 = ostrides[3] * l;
-            int a_off3 = astrides[3] * l;
-            int c_off3 = cstrides[3] * l;
-
-            for (int k = 0; k < odims[2]; k++) {
-
-                int o_off2 = ostrides[2] * k + o_off3;
-                int a_off2 = astrides[2] * k + a_off3;
-                int c_off2 = cstrides[2] * k + c_off3;
-
-                for (int j = 0; j < odims[1]; j++) {
-
-                    int o_off1 = ostrides[1] * j + o_off2;
-                    int a_off1 = astrides[1] * j + a_off2;
-                    int c_off1 = cstrides[1] * j + c_off2;
-
-                    for (int i = 0; i < odims[0]; i++) {
-
-                        optr[o_off1 + i] = (flip ^ cptr[c_off1 + i]) ? aptr[a_off1 + i] : b;
-                    }
-                }
-            }
-        }
-    };
-    getQueue().enqueue(func, out, cond, a, b);
+    getQueue().enqueue(kernel::select_scalar<T, flip>, out, cond, a, b);
 }
 
 #define INSTANTIATE(T)                                              \
diff --git a/src/backend/cpu/shift.cpp b/src/backend/cpu/shift.cpp
index 766427b..eca1e50 100644
--- a/src/backend/cpu/shift.cpp
+++ b/src/backend/cpu/shift.cpp
@@ -9,20 +9,13 @@
 
 #include <Array.hpp>
 #include <shift.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <cassert>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/shift.hpp>
 
 namespace cpu
 {
 
-static inline dim_t simple_mod(const dim_t i, const dim_t dim)
-{
-    return (i < dim) ? i : (i - dim);
-}
-
 template<typename T>
 Array<T> shift(const Array<T> &in, const int sdims[4])
 {
@@ -31,48 +24,7 @@ Array<T> shift(const Array<T> &in, const int sdims[4])
     Array<T> out = createEmptyArray<T>(in.dims());
     const af::dim4 temp(sdims[0], sdims[1], sdims[2], sdims[3]);
 
-    auto func = [=] (Array<T> out, const Array<T> in, const af::dim4 sdims) {
-
-        T* outPtr = out.get();
-        const T* inPtr = in.get();
-
-        const af::dim4 oDims = out.dims();
-        const af::dim4 ist   = in.strides();
-        const af::dim4 ost   = out.strides();
-
-        int sdims_[4];
-        // Need to do this because we are mapping output to input in the kernel
-        for(int i = 0; i < 4; i++) {
-            // sdims_[i] will always be positive and always [0, oDims[i]].
-            // Negative shifts are converted to position by going the other way round
-            sdims_[i] = -(sdims[i] % (int)oDims[i]) + oDims[i] * (sdims[i] > 0);
-            assert(sdims_[i] >= 0 && sdims_[i] <= oDims[i]);
-        }
-
-        for(dim_t ow = 0; ow < oDims[3]; ow++) {
-            const int oW = ow * ost[3];
-            const int iw = simple_mod((ow + sdims_[3]), oDims[3]);
-            const int iW = iw * ist[3];
-            for(dim_t oz = 0; oz < oDims[2]; oz++) {
-                const int oZW = oW + oz * ost[2];
-                const int iz = simple_mod((oz + sdims_[2]), oDims[2]);
-                const int iZW = iW + iz * ist[2];
-                for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                    const int oYZW = oZW + oy * ost[1];
-                    const int iy = simple_mod((oy + sdims_[1]), oDims[1]);
-                    const int iYZW = iZW + iy * ist[1];
-                    for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                        const int oIdx = oYZW + ox;
-                        const int ix = simple_mod((ox + sdims_[0]), oDims[0]);
-                        const int iIdx = iYZW + ix;
-
-                        outPtr[oIdx] = inPtr[iIdx];
-                    }
-                }
-            }
-        }
-    };
-    getQueue().enqueue(func, out, in, temp);
+    getQueue().enqueue(kernel::shift<T>, out, in, temp);
 
     return out;
 }
diff --git a/src/backend/cpu/sift.cpp b/src/backend/cpu/sift.cpp
index 70bb11d..4b20f8a 100644
--- a/src/backend/cpu/sift.cpp
+++ b/src/backend/cpu/sift.cpp
@@ -22,7 +22,7 @@
 #include <vector>
 
 #ifdef AF_BUILD_SIFT
-#include <sift_nonfree.hpp>
+#include <kernel/sift_nonfree.hpp>
 #endif
 
 using af::dim4;
diff --git a/src/backend/cpu/sobel.cpp b/src/backend/cpu/sobel.cpp
index ba47ba9..161266d 100644
--- a/src/backend/cpu/sobel.cpp
+++ b/src/backend/cpu/sobel.cpp
@@ -13,80 +13,15 @@
 #include <Array.hpp>
 #include <sobel.hpp>
 #include <convolve.hpp>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/sobel.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename Ti, typename To, bool isDX>
-void derivative(Array<To> output, const Array<Ti> input)
-{
-    const dim4 dims    = input.dims();
-    const dim4 strides = input.strides();
-          To* optr     = output.get();
-    const Ti* iptr     = input.get();
-
-    for(dim_t b3=0; b3<dims[3]; ++b3) {
-    for(dim_t b2=0; b2<dims[2]; ++b2) {
-
-        for(dim_t j=0; j<dims[1]; ++j) {
-
-            int joff  = j;
-            int _joff = j-1;
-            int joff_ = j+1;
-            int joffset = j*strides[1];
-
-            for(dim_t i=0; i<dims[0]; ++i) {
-
-                To accum = To(0);
-
-                int  ioff = i;
-                int _ioff = i-1;
-                int ioff_ = i+1;
-
-                To NW = (_ioff>=0 && _joff>=0) ?
-                        iptr[_joff*strides[1]+_ioff*strides[0]] : 0;
-                To SW = (ioff_<(int)dims[0] && _joff>=0) ?
-                        iptr[_joff*strides[1]+ioff_*strides[0]] : 0;
-                To NE = (_ioff>=0 && joff_<(int)dims[1]) ?
-                        iptr[joff_*strides[1]+_ioff*strides[0]] : 0;
-                To SE = (ioff_<(int)dims[0] && joff_<(int)dims[1]) ?
-                        iptr[joff_*strides[1]+ioff_*strides[0]] : 0;
-
-                if (isDX) {
-                    To W  = _joff>=0 ?
-                            iptr[_joff*strides[1]+ioff*strides[0]] : 0;
-
-                    To E  = joff_<(int)dims[1] ?
-                            iptr[joff_*strides[1]+ioff*strides[0]] : 0;
-
-                    accum = NW+SW - (NE+SE) + 2*(W-E);
-                } else {
-                    To N  = _ioff>=0 ?
-                            iptr[joff*strides[1]+_ioff*strides[0]] : 0;
-
-                    To S  = ioff_<(int)dims[0] ?
-                            iptr[joff*strides[1]+ioff_*strides[0]] : 0;
-
-                    accum = NW+NE - (SW+SE) + 2*(N-S);
-                }
-
-                optr[joffset+i*strides[0]] = accum;
-            }
-        }
-
-        optr += strides[2];
-        iptr += strides[2];
-    }
-    optr += strides[3];
-    iptr += strides[3];
-    }
-}
-
 template<typename Ti, typename To>
 std::pair< Array<To>, Array<To> >
 sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
@@ -97,8 +32,8 @@ sobelDerivatives(const Array<Ti> &img, const unsigned &ker_size)
     Array<To> dx = createEmptyArray<To>(img.dims());
     Array<To> dy = createEmptyArray<To>(img.dims());
 
-    getQueue().enqueue(derivative<Ti, To, true >, dx, img);
-    getQueue().enqueue(derivative<Ti, To, false>, dy, img);
+    getQueue().enqueue(kernel::derivative<Ti, To, true >, dx, img);
+    getQueue().enqueue(kernel::derivative<Ti, To, false>, dy, img);
 
     return std::make_pair(dx, dy);
 }
diff --git a/src/backend/cpu/sort.cpp b/src/backend/cpu/sort.cpp
index cbdb50e..6a0465c 100644
--- a/src/backend/cpu/sort.cpp
+++ b/src/backend/cpu/sort.cpp
@@ -11,55 +11,15 @@
 #include <sort.hpp>
 #include <math.hpp>
 #include <copy.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <algorithm>
 #include <functional>
 #include <platform.hpp>
 #include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
+#include <kernel/sort.hpp>
 
 namespace cpu
 {
 
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-
-// Based off of http://stackoverflow.com/a/12399290
-template<typename T, bool isAscending>
-void sort0(Array<T> val)
-{
-    // initialize original index locations
-    T *val_ptr = val.get();
-
-    function<bool(T, T)> op = greater<T>();
-    if(isAscending) { op = less<T>(); }
-
-    T *comp_ptr = nullptr;
-    for(dim_t w = 0; w < val.dims()[3]; w++) {
-        dim_t valW = w * val.strides()[3];
-        for(dim_t z = 0; z < val.dims()[2]; z++) {
-            dim_t valWZ = valW + z * val.strides()[2];
-            for(dim_t y = 0; y < val.dims()[1]; y++) {
-
-                dim_t valOffset = valWZ + y * val.strides()[1];
-
-                comp_ptr = val_ptr + valOffset;
-                std::sort(comp_ptr, comp_ptr + val.dims()[0], op);
-            }
-        }
-    }
-    return;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
 template<typename T, bool isAscending>
 Array<T> sort(const Array<T> &in, const unsigned dim)
 {
@@ -67,7 +27,7 @@ Array<T> sort(const Array<T> &in, const unsigned dim)
 
     Array<T> out = copyArray<T>(in);
     switch(dim) {
-        case 0: getQueue().enqueue(sort0<T, isAscending>, out); break;
+        case 0: getQueue().enqueue(kernel::sort0<T, isAscending>, out); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
     return out;
diff --git a/src/backend/cpu/sort_by_key.cpp b/src/backend/cpu/sort_by_key.cpp
index d2ebd42..409b825 100644
--- a/src/backend/cpu/sort_by_key.cpp
+++ b/src/backend/cpu/sort_by_key.cpp
@@ -9,92 +9,13 @@
 
 #include <Array.hpp>
 #include <sort_by_key.hpp>
-#include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
-#include <algorithm>
-#include <numeric>
-#include <queue>
 #include <platform.hpp>
 #include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
-using std::function;
-using std::queue;
-using std::async;
+#include <kernel/sort_by_key.hpp>
 
 namespace cpu
 {
 
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-
-template<typename Tk, typename Tv, bool isAscending>
-void sort0_by_key(Array<Tk> okey, Array<Tv> oval, Array<uint> oidx,
-                  const Array<Tk> ikey, const Array<Tv> ival)
-{
-    function<bool(Tk, Tk)> op = greater<Tk>();
-    if(isAscending) { op = less<Tk>(); }
-
-    // Get pointers and initialize original index locations
-        uint *oidx_ptr = oidx.get();
-          Tk *okey_ptr = okey.get();
-          Tv *oval_ptr = oval.get();
-    const Tk *ikey_ptr = ikey.get();
-    const Tv *ival_ptr = ival.get();
-
-    std::vector<uint> seq_vec(oidx.dims()[0]);
-    std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
-    const Tk *comp_ptr = nullptr;
-    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
-    for(dim_t w = 0; w < ikey.dims()[3]; w++) {
-        dim_t okeyW = w * okey.strides()[3];
-        dim_t ovalW = w * oval.strides()[3];
-        dim_t oidxW = w * oidx.strides()[3];
-        dim_t ikeyW = w * ikey.strides()[3];
-        dim_t ivalW = w * ival.strides()[3];
-
-        for(dim_t z = 0; z < ikey.dims()[2]; z++) {
-            dim_t okeyWZ = okeyW + z * okey.strides()[2];
-            dim_t ovalWZ = ovalW + z * oval.strides()[2];
-            dim_t oidxWZ = oidxW + z * oidx.strides()[2];
-            dim_t ikeyWZ = ikeyW + z * ikey.strides()[2];
-            dim_t ivalWZ = ivalW + z * ival.strides()[2];
-
-            for(dim_t y = 0; y < ikey.dims()[1]; y++) {
-
-                dim_t okeyOffset = okeyWZ + y * okey.strides()[1];
-                dim_t ovalOffset = ovalWZ + y * oval.strides()[1];
-                dim_t oidxOffset = oidxWZ + y * oidx.strides()[1];
-                dim_t ikeyOffset = ikeyWZ + y * ikey.strides()[1];
-                dim_t ivalOffset = ivalWZ + y * ival.strides()[1];
-
-                uint *ptr = oidx_ptr + oidxOffset;
-                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
-                comp_ptr = ikey_ptr + ikeyOffset;
-                std::stable_sort(ptr, ptr + ikey.dims()[0], comparator);
-
-                for (dim_t i = 0; i < oval.dims()[0]; ++i){
-                    uint sortIdx = oidx_ptr[oidxOffset + i];
-                    okey_ptr[okeyOffset + i] = ikey_ptr[ikeyOffset + sortIdx];
-                    oval_ptr[ovalOffset + i] = ival_ptr[ivalOffset + sortIdx];
-                }
-            }
-        }
-    }
-
-    return;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
 template<typename Tk, typename Tv, bool isAscending>
 void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
            const Array<Tk> &ikey, const Array<Tv> &ival, const uint dim)
@@ -108,7 +29,7 @@ void sort_by_key(Array<Tk> &okey, Array<Tv> &oval,
     oidx.eval();
 
     switch(dim) {
-        case 0: getQueue().enqueue(sort0_by_key<Tk, Tv, isAscending>,
+        case 0: getQueue().enqueue(kernel::sort0_by_key<Tk, Tv, isAscending>,
                                    okey, oval, oidx, ikey, ival); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
diff --git a/src/backend/cpu/sort_index.cpp b/src/backend/cpu/sort_index.cpp
index f941534..ed6afea 100644
--- a/src/backend/cpu/sort_index.cpp
+++ b/src/backend/cpu/sort_index.cpp
@@ -10,72 +10,15 @@
 #include <Array.hpp>
 #include <sort_index.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <algorithm>
 #include <numeric>
 #include <platform.hpp>
 #include <async_queue.hpp>
-
-using std::greater;
-using std::less;
-using std::sort;
+#include <kernel/sort_index.hpp>
 
 namespace cpu
 {
 
-///////////////////////////////////////////////////////////////////////////
-// Kernel Functions
-///////////////////////////////////////////////////////////////////////////
-template<typename T, bool isAscending>
-void sort0_index(Array<T> &val, Array<uint> &idx, const Array<T> &in)
-{
-    // initialize original index locations
-       uint *idx_ptr = idx.get();
-          T *val_ptr = val.get();
-    const T *in_ptr  = in.get();
-    function<bool(T, T)> op = greater<T>();
-    if(isAscending) { op = less<T>(); }
-
-    std::vector<uint> seq_vec(idx.dims()[0]);
-    std::iota(seq_vec.begin(), seq_vec.end(), 0);
-
-    const T *comp_ptr = nullptr;
-    auto comparator = [&comp_ptr, &op](size_t i1, size_t i2) {return op(comp_ptr[i1], comp_ptr[i2]);};
-
-    for(dim_t w = 0; w < in.dims()[3]; w++) {
-        dim_t valW = w * val.strides()[3];
-        dim_t idxW = w * idx.strides()[3];
-        dim_t  inW = w *  in.strides()[3];
-        for(dim_t z = 0; z < in.dims()[2]; z++) {
-            dim_t valWZ = valW + z * val.strides()[2];
-            dim_t idxWZ = idxW + z * idx.strides()[2];
-            dim_t  inWZ =  inW + z *  in.strides()[2];
-            for(dim_t y = 0; y < in.dims()[1]; y++) {
-
-                dim_t valOffset = valWZ + y * val.strides()[1];
-                dim_t idxOffset = idxWZ + y * idx.strides()[1];
-                dim_t inOffset  =  inWZ + y *  in.strides()[1];
-
-                uint *ptr = idx_ptr + idxOffset;
-                std::copy(seq_vec.begin(), seq_vec.end(), ptr);
-
-                comp_ptr = in_ptr + inOffset;
-                std::stable_sort(ptr, ptr + in.dims()[0], comparator);
-
-                for (dim_t i = 0; i < val.dims()[0]; ++i){
-                    val_ptr[valOffset + i] = in_ptr[inOffset + idx_ptr[idxOffset + i]];
-                }
-            }
-        }
-    }
-
-    return;
-}
-
-///////////////////////////////////////////////////////////////////////////
-// Wrapper Functions
-///////////////////////////////////////////////////////////////////////////
 template<typename T, bool isAscending>
 void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint dim)
 {
@@ -84,7 +27,7 @@ void sort_index(Array<T> &val, Array<uint> &idx, const Array<T> &in, const uint
     val = createEmptyArray<T>(in.dims());
     idx = createEmptyArray<uint>(in.dims());
     switch(dim) {
-        case 0: getQueue().enqueue(sort0_index<T, isAscending>, val, idx, in); break;
+        case 0: getQueue().enqueue(kernel::sort0_index<T, isAscending>, val, idx, in); break;
         default: AF_ERROR("Not Supported", AF_ERR_NOT_SUPPORTED);
     }
 }
diff --git a/src/backend/cpu/susan.cpp b/src/backend/cpu/susan.cpp
index c278908..6e8d0fe 100644
--- a/src/backend/cpu/susan.cpp
+++ b/src/backend/cpu/susan.cpp
@@ -14,6 +14,7 @@
 #include <memory>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/susan.hpp>
 
 using af::features;
 using std::shared_ptr;
@@ -22,85 +23,6 @@ namespace cpu
 {
 
 template<typename T>
-void susan_responses(Array<T> output, const Array<T> input,
-                     const unsigned idim0, const unsigned idim1,
-                     const int radius, const float t, const float g,
-                     const unsigned border_len)
-{
-    T* resp_out = output.get();
-    const T* in = input.get();
-
-    const unsigned r = border_len;
-    const int rSqrd = radius*radius;
-
-    for (unsigned y = r; y < idim1 - r; ++y) {
-        for (unsigned x = r; x < idim0 - r; ++x) {
-            const unsigned idx = y * idim0 + x;
-            T m_0 = in[idx];
-            float nM = 0.0f;
-
-            for (int i=-radius; i<=radius; ++i) {
-                for (int j=-radius; j<=radius; ++j) {
-                    if (i*i + j*j < rSqrd) {
-                        int p = x + i;
-                        int q = y + j;
-                        T m = in[p + idim0 * q];
-                        float exp_pow = std::pow((m - m_0)/t, 6.0);
-                        float cM = std::exp(-exp_pow);
-                        nM += cM;
-                    }
-                }
-            }
-
-            resp_out[idx] = nM < g ? g - nM : T(0);
-        }
-    }
-}
-
-template<typename T>
-void non_maximal(Array<float> xcoords, Array<float> ycoords, Array<float> response,
-                 shared_ptr<unsigned> counter, const unsigned idim0, const unsigned idim1,
-                 const Array<T> input, const unsigned border_len, const unsigned max_corners)
-{
-    float* x_out    = xcoords.get();
-    float* y_out    = ycoords.get();
-    float* resp_out = response.get();
-    unsigned* count = counter.get();
-    const T* resp_in= input.get();
-
-    // Responses on the border don't have 8-neighbors to compare, discard them
-    const unsigned r = border_len + 1;
-
-    for (unsigned y = r; y < idim1 - r; y++) {
-        for (unsigned x = r; x < idim0 - r; x++) {
-            const T v = resp_in[y * idim0 + x];
-
-            // Find maximum neighborhood response
-            T max_v;
-            max_v = max(resp_in[(y-1) * idim0 + x-1], resp_in[y * idim0 + x-1]);
-            max_v = max(max_v, resp_in[(y+1) * idim0 + x-1]);
-            max_v = max(max_v, resp_in[(y-1) * idim0 + x  ]);
-            max_v = max(max_v, resp_in[(y+1) * idim0 + x  ]);
-            max_v = max(max_v, resp_in[(y-1) * idim0 + x+1]);
-            max_v = max(max_v, resp_in[(y)   * idim0 + x+1]);
-            max_v = max(max_v, resp_in[(y+1) * idim0 + x+1]);
-
-            // Stores corner to {x,y,resp}_out if it's response is maximum compared
-            // to its 8-neighborhood and greater or equal minimum response
-            if (v > max_v) {
-                const unsigned idx = *count;
-                *count += 1;
-                if (idx < max_corners) {
-                    x_out[idx]    = (float)x;
-                    y_out[idx]    = (float)y;
-                    resp_out[idx] = (float)v;
-                }
-            }
-        }
-    }
-}
-
-template<typename T>
 unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
                const Array<T> &in,
                const unsigned radius, const float diff_thr, const float geom_thr,
@@ -118,9 +40,9 @@ unsigned susan(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
     auto corners_found= std::shared_ptr<unsigned>(memAlloc<unsigned>(1), memFree<unsigned>);
     corners_found.get()[0] = 0;
 
-    getQueue().enqueue(susan_responses<T>, response, in, idims[0], idims[1],
+    getQueue().enqueue(kernel::susan_responses<T>, response, in, idims[0], idims[1],
                        radius, diff_thr, geom_thr, edge);
-    getQueue().enqueue(non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
+    getQueue().enqueue(kernel::non_maximal<T>, x_corners, y_corners, resp_corners, corners_found,
                        idims[0], idims[1], response, edge, corner_lim);
     getQueue().sync();
 
diff --git a/src/backend/cpu/tile.cpp b/src/backend/cpu/tile.cpp
index 4f03545..6526917 100644
--- a/src/backend/cpu/tile.cpp
+++ b/src/backend/cpu/tile.cpp
@@ -9,10 +9,9 @@
 
 #include <Array.hpp>
 #include <tile.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/tile.hpp>
 
 namespace cpu
 {
@@ -32,40 +31,7 @@ Array<T> tile(const Array<T> &in, const af::dim4 &tileDims)
 
     Array<T> out = createEmptyArray<T>(oDims);
 
-    auto func = [=] (Array<T> out, const Array<T> in) {
-
-        T* outPtr = out.get();
-        const T* inPtr = in.get();
-
-        const af::dim4 iDims = in.dims();
-        const af::dim4 oDims = out.dims();
-        const af::dim4 ist = in.strides();
-        const af::dim4 ost = out.strides();
-
-        for(dim_t ow = 0; ow < oDims[3]; ow++) {
-            const dim_t iw = ow % iDims[3];
-            const dim_t iW = iw * ist[3];
-            const dim_t oW = ow * ost[3];
-            for(dim_t oz = 0; oz < oDims[2]; oz++) {
-                const dim_t iz = oz % iDims[2];
-                const dim_t iZW = iW + iz * ist[2];
-                const dim_t oZW = oW + oz * ost[2];
-                for(dim_t oy = 0; oy < oDims[1]; oy++) {
-                    const dim_t iy = oy % iDims[1];
-                    const dim_t iYZW = iZW + iy * ist[1];
-                    const dim_t oYZW = oZW + oy * ost[1];
-                    for(dim_t ox = 0; ox < oDims[0]; ox++) {
-                        const dim_t ix = ox % iDims[0];
-                        const dim_t iMem = iYZW + ix;
-                        const dim_t oMem = oYZW + ox;
-                        outPtr[oMem] = inPtr[iMem];
-                    }
-                }
-            }
-        }
-    };
-
-    getQueue().enqueue(func, out, in);
+    getQueue().enqueue(kernel::tile<T>, out, in);
 
     return out;
 }
diff --git a/src/backend/cpu/transform.cpp b/src/backend/cpu/transform.cpp
index a7287ce..fc71458 100644
--- a/src/backend/cpu/transform.cpp
+++ b/src/backend/cpu/transform.cpp
@@ -10,99 +10,14 @@
 #include <Array.hpp>
 #include <transform.hpp>
 #include <math.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
 #include "transform_interp.hpp"
+#include <kernel/transform.hpp>
 
 namespace cpu
 {
 
-template <typename T>
-void calc_affine_inverse(T *txo, const T *txi)
-{
-    T det = txi[0]*txi[4] - txi[1]*txi[3];
-
-    txo[0] = txi[4] / det;
-    txo[1] = txi[3] / det;
-    txo[3] = txi[1] / det;
-    txo[4] = txi[0] / det;
-
-    txo[2] = txi[2] * -txo[0] + txi[5] * -txo[1];
-    txo[5] = txi[2] * -txo[3] + txi[5] * -txo[4];
-}
-
-template <typename T>
-void calc_affine_inverse(T *tmat, const T *tmat_ptr, const bool inverse)
-{
-    // The way kernel is structured, it expects an inverse
-    // transform matrix by default.
-    // If it is an forward transform, then we need its inverse
-    if(inverse) {
-        for(int i = 0; i < 6; i++)
-            tmat[i] = tmat_ptr[i];
-    } else {
-        calc_affine_inverse(tmat, tmat_ptr);
-    }
-}
-
-template<typename T, af_interp_type method>
-void transform_(Array<T> output, const Array<T> input,
-                const Array<float> transform, const bool inverse)
-{
-    const af::dim4 idims    = input.dims();
-    const af::dim4 odims    = output.dims();
-    const af::dim4 istrides = input.strides();
-    const af::dim4 ostrides = output.strides();
-
-    T * out = output.get();
-    const T * in = input.get();
-    const float* tf = transform.get();
-
-    dim_t nimages     = idims[2];
-    // Multiplied in src/backend/transform.cpp
-    dim_t ntransforms = odims[2] / idims[2];
-
-    void (*t_fn)(T *, const T *, const float *, const af::dim4 &,
-                 const af::dim4 &, const af::dim4 &,
-                 const dim_t, const dim_t, const dim_t, const dim_t);
-
-    switch(method) {
-        case AF_INTERP_NEAREST:
-            t_fn = &transform_n;
-            break;
-        case AF_INTERP_BILINEAR:
-            t_fn = &transform_b;
-            break;
-        case AF_INTERP_LOWER:
-            t_fn = &transform_l;
-            break;
-        default:
-            AF_ERROR("Unsupported interpolation type", AF_ERR_ARG);
-            break;
-    }
-
-
-    // For each transform channel
-    for(int t_idx = 0; t_idx < (int)ntransforms; t_idx++) {
-        // Compute inverse if required
-        const float *tmat_ptr = tf + t_idx * 6;
-        float tmat[6];
-        calc_affine_inverse(tmat, tmat_ptr, inverse);
-
-        // Offset for output pointer
-        dim_t o_offset = t_idx * nimages * ostrides[2];
-
-        // Do transform for image
-        for(int yy = 0; yy < (int)odims[1]; yy++) {
-            for(int xx = 0; xx < (int)odims[0]; xx++) {
-                t_fn(out, in, tmat, idims, ostrides, istrides, nimages, o_offset, xx, yy);
-            }
-        }
-    }
-}
-
 template<typename T>
 Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::dim4 &odims,
                     const af_interp_type method, const bool inverse)
@@ -114,13 +29,13 @@ Array<T> transform(const Array<T> &in, const Array<float> &transform, const af::
 
     switch(method) {
         case AF_INTERP_NEAREST :
-            getQueue().enqueue(transform_<T, AF_INTERP_NEAREST >, out, in, transform, inverse);
+            getQueue().enqueue(kernel::transform<T, AF_INTERP_NEAREST >, out, in, transform, inverse);
             break;
         case AF_INTERP_BILINEAR:
-            getQueue().enqueue(transform_<T, AF_INTERP_BILINEAR>, out, in, transform, inverse);
+            getQueue().enqueue(kernel::transform<T, AF_INTERP_BILINEAR>, out, in, transform, inverse);
             break;
         case AF_INTERP_LOWER   :
-            getQueue().enqueue(transform_<T, AF_INTERP_LOWER   >, out, in, transform, inverse);
+            getQueue().enqueue(kernel::transform<T, AF_INTERP_LOWER   >, out, in, transform, inverse);
             break;
         default: AF_ERROR("Unsupported interpolation type", AF_ERR_ARG); break;
     }
diff --git a/src/backend/cpu/transform_interp.hpp b/src/backend/cpu/transform_interp.hpp
index 5ad4750..d90ae38 100644
--- a/src/backend/cpu/transform_interp.hpp
+++ b/src/backend/cpu/transform_interp.hpp
@@ -7,6 +7,8 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
+#pragma once
+#include <math.hpp>
 #include <types.hpp>
 #include <af/traits.hpp>
 
diff --git a/src/backend/cpu/transpose.cpp b/src/backend/cpu/transpose.cpp
index 7e7eec1..32663e1 100644
--- a/src/backend/cpu/transpose.cpp
+++ b/src/backend/cpu/transpose.cpp
@@ -14,7 +14,7 @@
 #include <transpose.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
-
+#include <kernel/transpose.hpp>
 #include <utility>
 #include <cassert>
 
@@ -23,74 +23,6 @@ using af::dim4;
 namespace cpu
 {
 
-static inline unsigned getIdx(const dim4 &strides,
-        int i, int j = 0, int k = 0, int l = 0)
-{
-    return (l * strides[3] +
-            k * strides[2] +
-            j * strides[1] +
-            i );
-}
-
-template<typename T>
-T getConjugate(const T &in)
-{
-    // For non-complex types return same
-    return in;
-}
-
-template<>
-cfloat getConjugate(const cfloat &in)
-{
-    return std::conj(in);
-}
-
-template<>
-cdouble getConjugate(const cdouble &in)
-{
-    return std::conj(in);
-}
-
-template<typename T, bool conjugate>
-void transpose_(Array<T> output, const Array<T> input)
-{
-    const dim4 odims    = output.dims();
-    const dim4 ostrides = output.strides();
-    const dim4 istrides = input.strides();
-
-    T * out = output.get();
-    T const * const in = input.get();
-
-    for (dim_t l = 0; l < odims[3]; ++l) {
-        for (dim_t k = 0; k < odims[2]; ++k) {
-            // Outermost loop handles batch mode
-            // if input has no data along third dimension
-            // this loop runs only once
-            for (dim_t j = 0; j < odims[1]; ++j) {
-                for (dim_t i = 0; i < odims[0]; ++i) {
-                    // calculate array indices based on offsets and strides
-                    // the helper getIdx takes care of indices
-                    const dim_t inIdx  = getIdx(istrides,j,i,k,l);
-                    const dim_t outIdx = getIdx(ostrides,i,j,k,l);
-                    if(conjugate)
-                        out[outIdx] = getConjugate(in[inIdx]);
-                    else
-                        out[outIdx] = in[inIdx];
-                }
-            }
-            // outData and inData pointers doesn't need to be
-            // offset as the getIdx function is taking care
-            // of the batch parameter
-        }
-    }
-}
-
-template<typename T>
-void transpose_(Array<T> out, const Array<T> in, const bool conjugate)
-{
-    return (conjugate ? transpose_<T, true>(out, in) : transpose_<T, false>(out, in));
-}
-
 template<typename T>
 Array<T> transpose(const Array<T> &in, const bool conjugate)
 {
@@ -101,57 +33,16 @@ Array<T> transpose(const Array<T> &in, const bool conjugate)
     // create an array with first two dimensions swapped
     Array<T> out  = createEmptyArray<T>(outDims);
 
-    getQueue().enqueue(transpose_<T>, out, in, conjugate);
+    getQueue().enqueue(kernel::transpose<T>, out, in, conjugate);
 
     return out;
 }
 
-template<typename T, bool conjugate>
-void transpose_inplace(Array<T> input)
-{
-    const dim4 idims    = input.dims();
-    const dim4 istrides = input.strides();
-
-    T * in = input.get();
-
-    for (dim_t l = 0; l < idims[3]; ++l) {
-        for (dim_t k = 0; k < idims[2]; ++k) {
-            // Outermost loop handles batch mode
-            // if input has no data along third dimension
-            // this loop runs only once
-            //
-            // Run only bottom triangle. std::swap swaps with upper triangle
-            for (dim_t j = 0; j < idims[1]; ++j) {
-                for (dim_t i = j + 1; i < idims[0]; ++i) {
-                    // calculate array indices based on offsets and strides
-                    // the helper getIdx takes care of indices
-                    const dim_t iIdx  = getIdx(istrides,j,i,k,l);
-                    const dim_t oIdx = getIdx(istrides,i,j,k,l);
-                    if(conjugate) {
-                        in[iIdx] = getConjugate(in[iIdx]);
-                        in[oIdx] = getConjugate(in[oIdx]);
-                        std::swap(in[iIdx], in[oIdx]);
-                    }
-                    else {
-                        std::swap(in[iIdx], in[oIdx]);
-                    }
-                }
-            }
-        }
-    }
-}
-
-template<typename T>
-void transpose_inplace_(Array<T> in, const bool conjugate)
-{
-    return (conjugate ? transpose_inplace<T, true >(in) : transpose_inplace<T, false>(in));
-}
-
 template<typename T>
 void transpose_inplace(Array<T> &in, const bool conjugate)
 {
     in.eval();
-    getQueue().enqueue(transpose_inplace_<T>, in, conjugate);
+    getQueue().enqueue(kernel::transpose_inplace<T>, in, conjugate);
 }
 
 #define INSTANTIATE(T)                                                      \
diff --git a/src/backend/cpu/triangle.cpp b/src/backend/cpu/triangle.cpp
index 13bee16..2a9553c 100644
--- a/src/backend/cpu/triangle.cpp
+++ b/src/backend/cpu/triangle.cpp
@@ -14,6 +14,7 @@
 #include <math.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/triangle.hpp>
 
 namespace cpu
 {
@@ -21,46 +22,7 @@ namespace cpu
 template<typename T, bool is_upper, bool is_unit_diag>
 void triangle(Array<T> &out, const Array<T> &in)
 {
-    auto func = [=] (Array<T> out, const Array<T> in) {
-        T *o = out.get();
-        const T *i = in.get();
-
-        dim4 odm = out.dims();
-
-        dim4 ost = out.strides();
-        dim4 ist = in.strides();
-
-        for(dim_t ow = 0; ow < odm[3]; ow++) {
-            const dim_t oW = ow * ost[3];
-            const dim_t iW = ow * ist[3];
-
-            for(dim_t oz = 0; oz < odm[2]; oz++) {
-                const dim_t oZW = oW + oz * ost[2];
-                const dim_t iZW = iW + oz * ist[2];
-
-                for(dim_t oy = 0; oy < odm[1]; oy++) {
-                    const dim_t oYZW = oZW + oy * ost[1];
-                    const dim_t iYZW = iZW + oy * ist[1];
-
-                    for(dim_t ox = 0; ox < odm[0]; ox++) {
-                        const dim_t oMem = oYZW + ox;
-                        const dim_t iMem = iYZW + ox;
-
-                        bool cond = is_upper ? (oy >= ox) : (oy <= ox);
-                        bool do_unit_diag = (is_unit_diag && ox == oy);
-                        if(cond) {
-                            o[oMem] = do_unit_diag ? scalar<T>(1) : i[iMem];
-                        } else {
-                            o[oMem] = scalar<T>(0);
-                        }
-
-                    }
-                }
-            }
-        }
-    };
-
-    getQueue().enqueue(func, out, in);
+    getQueue().enqueue(kernel::triangle<T, is_upper, is_unit_diag>, out, in);
 }
 
 template<typename T, bool is_upper, bool is_unit_diag>
diff --git a/src/backend/cpu/unwrap.cpp b/src/backend/cpu/unwrap.cpp
index 41423c7..1aa37a4 100644
--- a/src/backend/cpu/unwrap.cpp
+++ b/src/backend/cpu/unwrap.cpp
@@ -9,76 +9,15 @@
 
 #include <Array.hpp>
 #include <unwrap.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <dispatch.hpp>
 #include <math.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/unwrap.hpp>
 
 namespace cpu
 {
 
-template<typename T, int d>
-void unwrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
-                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
-{
-    const T *inPtr = in.get();
-    T *outPtr      = out.get();
-
-    af::dim4 idims    = in.dims();
-    af::dim4 odims    = out.dims();
-    af::dim4 istrides = in.strides();
-    af::dim4 ostrides = out.strides();
-
-    dim_t nx = (idims[0] + 2 * px - wx) / sx + 1;
-
-    for(dim_t w = 0; w < odims[3]; w++) {
-        for(dim_t z = 0; z < odims[2]; z++) {
-
-            dim_t cOut = w * ostrides[3] + z * ostrides[2];
-            dim_t cIn  = w * istrides[3] + z * istrides[2];
-            const T* iptr = inPtr  + cIn;
-            T* optr_= outPtr + cOut;
-
-            for(dim_t col = 0; col < odims[d]; col++) {
-                // Offset output ptr
-                T* optr = optr_ + col * ostrides[d];
-
-                // Calculate input window index
-                dim_t winy = (col / nx);
-                dim_t winx = (col % nx);
-
-                dim_t startx = winx * sx;
-                dim_t starty = winy * sy;
-
-                dim_t spx = startx - px;
-                dim_t spy = starty - py;
-
-                // Short cut condition ensuring all values within input dimensions
-                bool cond = (spx >= 0 && spx + wx < idims[0] && spy >= 0 && spy + wy < idims[1]);
-
-                for(dim_t y = 0; y < wy; y++) {
-                    for(dim_t x = 0; x < wx; x++) {
-                        dim_t xpad = spx + x;
-                        dim_t ypad = spy + y;
-
-                        dim_t oloc = (y * wx + x);
-                        if (d == 0) oloc *= ostrides[1];
-
-                        if(cond || (xpad >= 0 && xpad < idims[0] && ypad >= 0 && ypad < idims[1])) {
-                            dim_t iloc = (ypad * istrides[1] + xpad * istrides[0]);
-                            optr[oloc] = iptr[iloc];
-                        } else {
-                            optr[oloc] = scalar<T>(0.0);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 template<typename T>
 Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
                 const dim_t sx, const dim_t sy, const dim_t px, const dim_t py, const bool is_column)
@@ -98,9 +37,9 @@ Array<T> unwrap(const Array<T> &in, const dim_t wx, const dim_t wy,
     Array<T> outArray = createEmptyArray<T>(odims);
 
     if (is_column) {
-        getQueue().enqueue(unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
+        getQueue().enqueue(kernel::unwrap_dim<T, 1>, outArray, in, wx, wy, sx, sy, px, py);
     } else {
-        getQueue().enqueue(unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
+        getQueue().enqueue(kernel::unwrap_dim<T, 0>, outArray, in, wx, wy, sx, sy, px, py);
     }
 
     return outArray;
diff --git a/src/backend/cpu/wrap.cpp b/src/backend/cpu/wrap.cpp
index 3ff54de..07487e0 100644
--- a/src/backend/cpu/wrap.cpp
+++ b/src/backend/cpu/wrap.cpp
@@ -9,75 +9,15 @@
 
 #include <Array.hpp>
 #include <wrap.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <dispatch.hpp>
 #include <math.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/wrap.hpp>
 
 namespace cpu
 {
 
-template<typename T, int d>
-void wrap_dim(Array<T> out, const Array<T> in, const dim_t wx, const dim_t wy,
-              const dim_t sx, const dim_t sy, const dim_t px, const dim_t py)
-{
-    const T *inPtr = in.get();
-    T *outPtr      = out.get();
-
-    af::dim4 idims    = in.dims();
-    af::dim4 odims    = out.dims();
-    af::dim4 istrides = in.strides();
-    af::dim4 ostrides = out.strides();
-
-    dim_t nx = (odims[0] + 2 * px - wx) / sx + 1;
-
-    for(dim_t w = 0; w < idims[3]; w++) {
-        for(dim_t z = 0; z < idims[2]; z++) {
-
-            dim_t cIn  = w * istrides[3] + z * istrides[2];
-            dim_t cOut = w * ostrides[3] + z * ostrides[2];
-            const T* iptr_ = inPtr  + cIn;
-            T* optr= outPtr + cOut;
-
-            for(dim_t col = 0; col < idims[d]; col++) {
-                // Offset output ptr
-                const T* iptr = iptr_ + col * istrides[d];
-
-                // Calculate input window index
-                dim_t winy = (col / nx);
-                dim_t winx = (col % nx);
-
-                dim_t startx = winx * sx;
-                dim_t starty = winy * sy;
-
-                dim_t spx = startx - px;
-                dim_t spy = starty - py;
-
-                // Short cut condition ensuring all values within input dimensions
-                bool cond = (spx >= 0 && spx + wx < odims[0] && spy >= 0 && spy + wy < odims[1]);
-
-                for(dim_t y = 0; y < wy; y++) {
-                    for(dim_t x = 0; x < wx; x++) {
-                        dim_t xpad = spx + x;
-                        dim_t ypad = spy + y;
-
-                        dim_t iloc = (y * wx + x);
-                        if (d == 0) iloc *= istrides[1];
-
-                        if(cond || (xpad >= 0 && xpad < odims[0] && ypad >= 0 && ypad < odims[1])) {
-                            dim_t oloc = (ypad * ostrides[1] + xpad * ostrides[0]);
-                            // FIXME: When using threads, atomize this
-                            optr[oloc] += iptr[iloc];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 template<typename T>
 Array<T> wrap(const Array<T> &in,
               const dim_t ox, const dim_t oy,
@@ -94,9 +34,9 @@ Array<T> wrap(const Array<T> &in,
     in.eval();
 
     if (is_column) {
-        getQueue().enqueue(wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
+        getQueue().enqueue(kernel::wrap_dim<T, 1>, out, in, wx, wy, sx, sy, px, py);
     } else {
-        getQueue().enqueue(wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
+        getQueue().enqueue(kernel::wrap_dim<T, 0>, out, in, wx, wy, sx, sy, px, py);
     }
 
     return out;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git