[arrayfire] 91/284: moved diff, fast, gradient, harris, histogram to kernel namespace

Sun Feb 7 18:59:22 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/experimental
in repository arrayfire.

commit 71298c69887f23b3f18354098b6cadd62968158b
Author: pradeep <pradeep at arrayfire.com>
Date:   Sat Dec 19 00:43:53 2015 -0500

    moved diff, fast, gradient, harris, histogram to kernel namespace
---
 src/backend/cpu/diff.cpp                      |  75 +--------
 src/backend/cpu/fast.cpp                      | 227 +-------------------------
 src/backend/cpu/gradient.cpp                  |  69 +-------
 src/backend/cpu/harris.cpp                    | 133 +--------------
 src/backend/cpu/histogram.cpp                 |  32 +---
 src/backend/cpu/kernel/diff.hpp               |  91 +++++++++++
 src/backend/cpu/{fast.cpp => kernel/fast.hpp} | 145 ++--------------
 src/backend/cpu/kernel/gradient.hpp           |  87 ++++++++++
 src/backend/cpu/kernel/harris.hpp             | 139 ++++++++++++++++
 src/backend/cpu/kernel/histogram.hpp          |  47 ++++++
 10 files changed, 396 insertions(+), 649 deletions(-)

diff --git a/src/backend/cpu/diff.cpp b/src/backend/cpu/diff.cpp
index 8f9c0f1..3f639ca 100644
--- a/src/backend/cpu/diff.cpp
+++ b/src/backend/cpu/diff.cpp
@@ -9,62 +9,25 @@
 
 #include <Array.hpp>
 #include <diff.hpp>
-#include <stdexcept>
-#include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/diff.hpp>
 
 namespace cpu
 {
 
-unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
-{
-    return (l * strides[3] +
-            k * strides[2] +
-            j * strides[1] +
-            i);
-}
-
 template<typename T>
 Array<T>  diff1(const Array<T> &in, const int dim)
 {
     in.eval();
-    // Bool for dimension
-    bool is_dim0 = dim == 0;
-    bool is_dim1 = dim == 1;
-    bool is_dim2 = dim == 2;
-    bool is_dim3 = dim == 3;
 
     // Decrement dimension of select dimension
     af::dim4 dims = in.dims();
     dims[dim]--;
 
-    // Create output placeholder
     Array<T> outArray = createEmptyArray<T>(dims);
 
-    auto func = [=] (Array<T> outArray, Array<T> in) {
-        // Get pointers to raw data
-        const T *inPtr = in.get();
-              T *outPtr = outArray.get();
-
-        // TODO: Improve this
-        for(dim_t l = 0; l < dims[3]; l++) {
-            for(dim_t k = 0; k < dims[2]; k++) {
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        // Operation: out[index] = in[index + 1 * dim_size] - in[index]
-                        int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
-                        int jdx = getIdx(in.strides(), in.offsets(),
-                                        i + is_dim0, j + is_dim1,
-                                        k + is_dim2, l + is_dim3);
-                        int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
-                        outPtr[odx] = inPtr[jdx] - inPtr[idx];
-                    }
-                }
-            }
-        }
-    };
-    getQueue().enqueue(func, outArray, in);
+    getQueue().enqueue(kernel::diff1<T>, outArray, in, dim);
 
     return outArray;
 }
@@ -73,46 +36,14 @@ template<typename T>
 Array<T>  diff2(const Array<T> &in, const int dim)
 {
     in.eval();
-    // Bool for dimension
-    bool is_dim0 = dim == 0;
-    bool is_dim1 = dim == 1;
-    bool is_dim2 = dim == 2;
-    bool is_dim3 = dim == 3;
 
     // Decrement dimension of select dimension
     af::dim4 dims = in.dims();
     dims[dim] -= 2;
 
-    // Create output placeholder
     Array<T> outArray = createEmptyArray<T>(dims);
 
-    auto func = [=] (Array<T> outArray, Array<T> in) {
-        // Get pointers to raw data
-        const T *inPtr = in.get();
-            T *outPtr = outArray.get();
-
-        // TODO: Improve this
-        for(dim_t l = 0; l < dims[3]; l++) {
-            for(dim_t k = 0; k < dims[2]; k++) {
-                for(dim_t j = 0; j < dims[1]; j++) {
-                    for(dim_t i = 0; i < dims[0]; i++) {
-                        // Operation: out[index] = in[index + 1 * dim_size] - in[index]
-                        int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
-                        int jdx = getIdx(in.strides(), in.offsets(),
-                                        i + is_dim0, j + is_dim1,
-                                        k + is_dim2, l + is_dim3);
-                        int kdx = getIdx(in.strides(), in.offsets(),
-                                        i + 2 * is_dim0, j + 2 * is_dim1,
-                                        k + 2 * is_dim2, l + 2 * is_dim3);
-                        int odx = getIdx(outArray.strides(), outArray.offsets(), i, j, k, l);
-                        outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
-                    }
-                }
-            }
-        }
-    };
-
-    getQueue().enqueue(func, outArray, in);
+    getQueue().enqueue(kernel::diff2<T>, outArray, in, dim);
 
     return outArray;
 }
diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/fast.cpp
index c8b0514..fe02387 100644
--- a/src/backend/cpu/fast.cpp
+++ b/src/backend/cpu/fast.cpp
@@ -16,234 +16,13 @@
 #include <fast.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/fast.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-inline int clamp(int f, int a, int b)
-{
-    return std::max(a, std::min(f, b));
-}
-
-inline int idx_y(int i)
-{
-    if (i >= 8)
-        return clamp(-(i-8-4), -3, 3);
-
-    return clamp(i-4, -3, 3);
-}
-
-inline int idx_x(int i)
-{
-    if (i < 12)
-        return idx_y(i+4);
-
-    return idx_y(i-12);
-}
-
-inline int idx(int y, int x, unsigned idim0)
-{
-    return x * idim0 + y;
-}
-
-// test_greater()
-// Tests if a pixel x > p + thr
-inline int test_greater(float x, float p, float thr)
-{
-    return (x >= p + thr);
-}
-
-// test_smaller()
-// Tests if a pixel x < p - thr
-inline int test_smaller(float x, float p, float thr)
-{
-    return (x <= p - thr);
-}
-
-// test_pixel()
-// Returns -1 when x < p - thr
-// Returns  0 when x >= p - thr && x <= p + thr
-// Returns  1 when x > p + thr
-template<typename T>
-inline int test_pixel(const T* image, const float p, float thr, int y, int x, unsigned idim0)
-{
-    return -test_smaller((float)image[idx(y,x,idim0)], p, thr) | test_greater((float)image[idx(y,x,idim0)], p, thr);
-}
-
-// abs_diff()
-// Returns absolute difference of x and y
-inline int abs_diff(int x, int y)
-{
-    return abs(x - y);
-}
-inline unsigned abs_diff(unsigned x, unsigned y)
-{
-    return (unsigned)abs((int)x - (int)y);
-}
-inline float abs_diff(float x, float y)
-{
-    return fabs(x - y);
-}
-inline double abs_diff(double x, double y)
-{
-    return fabs(x - y);
-}
-
-template<typename T>
-void locate_features(
-    const Array<T> &in,
-    Array<float> &score,
-    Array<float> &x_out,
-    Array<float> &y_out,
-    Array<float> &score_out,
-    unsigned* count,
-    const float thr,
-    const unsigned arc_length,
-    const unsigned nonmax,
-    const unsigned max_feat,
-    const unsigned edge)
-{
-    dim4 in_dims = in.dims();
-    const T* in_ptr = in.get();
-
-    for (int y = edge; y < (int)(in_dims[0] - edge); y++) {
-        for (int x = edge; x < (int)(in_dims[1] - edge); x++) {
-            float p = in_ptr[idx(y, x, in_dims[0])];
-
-            // Start by testing opposite pixels of the circle that will result in
-            // a non-kepoint
-            int d;
-            d  = test_pixel<T>(in_ptr, p, thr, y-3,   x, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+3,   x, in_dims[0]);
-            if (d == 0)
-                continue;
-
-            d &= test_pixel<T>(in_ptr, p, thr, y-2, x+2, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+2, x-2, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y  , x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y  , x-3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y+2, x+2, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-2, x-2, in_dims[0]);
-            if (d == 0)
-                continue;
-
-            d &= test_pixel<T>(in_ptr, p, thr, y-3, x+1, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+3, x-1, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y-1, x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y+1, x-3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y+1, x+3, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-1, x-3, in_dims[0]);
-            d &= test_pixel<T>(in_ptr, p, thr, y+3, x+1, in_dims[0]) | test_pixel<T>(in_ptr, p, thr, y-3, x-1, in_dims[0]);
-            if (d == 0)
-                continue;
-
-            int sum = 0;
-
-            // Sum responses [-1, 0 or 1] of first arc_length pixels
-            for (int i = 0; i < static_cast<int>(arc_length); i++)
-                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
-
-            // Test maximum and mininmum responses of first segment of arc_length
-            // pixels
-            int max_sum = 0, min_sum = 0;
-            max_sum = std::max(max_sum, sum);
-            min_sum = std::min(min_sum, sum);
-
-            // Sum responses and test the remaining 16-arc_length pixels of the circle
-            for (int i = arc_length; i < 16; i++) {
-                sum -= test_pixel<T>(in_ptr, p, thr, y+idx_y(i-arc_length), x+idx_x(i-arc_length), in_dims[0]);
-                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
-                max_sum = std::max(max_sum, sum);
-                min_sum = std::min(min_sum, sum);
-            }
-
-            // To completely test all possible segments, it's necessary to test
-            // segments that include the top junction of the circle
-            for (int i = 0; i < static_cast<int>(arc_length-1); i++) {
-                sum -= test_pixel<T>(in_ptr, p, thr, y+idx_y(16-arc_length+i), x+idx_x(16-arc_length+i), in_dims[0]);
-                sum += test_pixel<T>(in_ptr, p, thr, y+idx_y(i), x+idx_x(i), in_dims[0]);
-                max_sum = std::max(max_sum, sum);
-                min_sum = std::min(min_sum, sum);
-            }
-
-            float s_bright = 0, s_dark = 0;
-            for (int i = 0; i < 16; i++) {
-                float p_x = (float)in_ptr[idx(y+idx_y(i), x+idx_x(i), in_dims[0])];
-
-                s_bright += test_greater(p_x, p, thr) * (abs_diff(p_x, p) - thr);
-                s_dark   += test_smaller(p_x, p, thr) * (abs_diff(p, p_x) - thr);
-            }
-
-            // If sum at some point was equal to (+-)arc_length, there is a segment
-            // that for which all pixels are much brighter or much brighter than
-            // central pixel p.
-            if (max_sum == static_cast<int>(arc_length) || min_sum == -static_cast<int>(arc_length)) {
-                unsigned j = *count;
-                ++*count;
-                if (j < max_feat) {
-                    float *x_out_ptr = x_out.get();
-                    float *y_out_ptr = y_out.get();
-                    float *score_out_ptr = score_out.get();
-                    x_out_ptr[j]     = static_cast<float>(x);
-                    y_out_ptr[j]     = static_cast<float>(y);
-                    score_out_ptr[j] = static_cast<float>(std::max(s_bright, s_dark));
-                    if (nonmax == 1) {
-                        float* score_ptr = score.get();
-                        score_ptr[idx(y, x, in_dims[0])] = std::max(s_bright, s_dark);
-                    }
-                }
-            }
-        }
-    }
-}
-
-void non_maximal(
-    const Array<float> &score,
-    const Array<float> &x_in,
-    const Array<float> &y_in,
-    Array<float> &x_out,
-    Array<float> &y_out,
-    Array<float> &score_out,
-    unsigned* count,
-    const unsigned total_feat,
-    const unsigned edge)
-{
-    const float *score_ptr = score.get();
-    const float *x_in_ptr = x_in.get();
-    const float *y_in_ptr = y_in.get();
-
-    dim4 score_dims = score.dims();
-
-    for (unsigned k = 0; k < total_feat; k++) {
-        unsigned x = static_cast<unsigned>(round(x_in_ptr[k]));
-        unsigned y = static_cast<unsigned>(round(y_in_ptr[k]));
-
-        float v = score_ptr[y + score_dims[0] * x];
-        float max_v;
-        max_v = std::max(score_ptr[y-1 + score_dims[0] * (x-1)], score_ptr[y-1 + score_dims[0] * x]);
-        max_v = std::max(max_v, score_ptr[y-1 + score_dims[0] * (x+1)]);
-        max_v = std::max(max_v, score_ptr[y   + score_dims[0] * (x-1)]);
-        max_v = std::max(max_v, score_ptr[y   + score_dims[0] * (x+1)]);
-        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x-1)]);
-        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x)  ]);
-        max_v = std::max(max_v, score_ptr[y+1 + score_dims[0] * (x+1)]);
-
-        if (y >= score_dims[1] - edge - 1 || y <= edge + 1 ||
-            x >= score_dims[0] - edge - 1 || x <= edge + 1)
-            continue;
-
-        // Stores keypoint to feat_out if it's response is maximum compared to
-        // its 8-neighborhood
-        if (v > max_v) {
-            unsigned j = *count;
-            ++*count;
-
-            float *x_out_ptr = x_out.get();
-            float *y_out_ptr = y_out.get();
-            float *score_out_ptr = score_out.get();
-
-            x_out_ptr[j]     = static_cast<float>(x);
-            y_out_ptr[j]     = static_cast<float>(y);
-            score_out_ptr[j] = static_cast<float>(v);
-        }
-    }
-}
-
 template<typename T>
 unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
               const Array<T> &in, const float thr, const unsigned arc_length,
@@ -274,7 +53,7 @@ unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
     // Feature counter
     unsigned count = 0;
 
-    locate_features<T>(in, V, x, y, score, &count, thr, arc_length,
+    kernel::locate_features<T>(in, V, x, y, score, &count, thr, arc_length,
                        nonmax, max_feat, edge);
 
     // If more features than max_feat were detected, feat wasn't populated
@@ -293,7 +72,7 @@ unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
         score_total = createEmptyArray<float>(feat_found_dims);
 
         count = 0;
-        non_maximal(V, x, y,
+        kernel::non_maximal(V, x, y,
                     x_total, y_total, score_total,
                     &count, feat_found, edge);
 
diff --git a/src/backend/cpu/gradient.cpp b/src/backend/cpu/gradient.cpp
index 06c15cf..d1a8b0d 100644
--- a/src/backend/cpu/gradient.cpp
+++ b/src/backend/cpu/gradient.cpp
@@ -14,6 +14,7 @@
 #include <err_cpu.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/gradient.hpp>
 
 namespace cpu
 {
@@ -25,73 +26,7 @@ void gradient(Array<T> &grad0, Array<T> &grad1, const Array<T> &in)
     grad1.eval();
     in.eval();
 
-    auto func = [=] (Array<T> grad0, Array<T> grad1, const Array<T> in) {
-        const af::dim4 dims = in.dims();
-
-        T *d_grad0    = grad0.get();
-        T *d_grad1    = grad1.get();
-        const T *d_in = in.get();
-
-        const af::dim4 inst = in.strides();
-        const af::dim4 g0st = grad0.strides();
-        const af::dim4 g1st = grad1.strides();
-
-        T v5 = scalar<T>(0.5);
-        T v1 = scalar<T>(1.0);
-
-        for(dim_t idw = 0; idw < dims[3]; idw++) {
-            const dim_t inW = idw * inst[3];
-            const dim_t g0W = idw * g0st[3];
-            const dim_t g1W = idw * g1st[3];
-            for(dim_t idz = 0; idz < dims[2]; idz++) {
-                const dim_t inZW = inW + idz * inst[2];
-                const dim_t g0ZW = g0W + idz * g0st[2];
-                const dim_t g1ZW = g1W + idz * g1st[2];
-                dim_t xl, xr, yl,yr;
-                T f0, f1;
-                for(dim_t idy = 0; idy < dims[1]; idy++) {
-                    const dim_t inYZW = inZW + idy * inst[1];
-                    const dim_t g0YZW = g0ZW + idy * g0st[1];
-                    const dim_t g1YZW = g1ZW + idy * g1st[1];
-                    if(idy == 0) {
-                        yl = inYZW + inst[1];
-                        yr = inYZW;
-                        f1 = v1;
-                    } else if(idy == dims[1] - 1) {
-                        yl = inYZW;
-                        yr = inYZW - inst[1];
-                        f1 = v1;
-                    } else {
-                        yl = inYZW + inst[1];
-                        yr = inYZW - inst[1];
-                        f1 = v5;
-                    }
-                    for(dim_t idx = 0; idx < dims[0]; idx++) {
-                        const dim_t inMem = inYZW + idx;
-                        const dim_t g0Mem = g0YZW + idx;
-                        const dim_t g1Mem = g1YZW + idx;
-                        if(idx == 0) {
-                            xl = inMem + 1;
-                            xr = inMem;
-                            f0 = v1;
-                        } else if(idx == dims[0] - 1) {
-                            xl = inMem;
-                            xr = inMem - 1;
-                            f0 = v1;
-                        } else {
-                            xl = inMem + 1;
-                            xr = inMem - 1;
-                            f0 = v5;
-                        }
-
-                        d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]);
-                        d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]);
-                    }
-                }
-            }
-        }
-    };
-    getQueue().enqueue(func, grad0, grad1, in);
+    getQueue().enqueue(kernel::gradient<T>, grad0, grad1, in);
 }
 
 #define INSTANTIATE(T)                                                                  \
diff --git a/src/backend/cpu/harris.cpp b/src/backend/cpu/harris.cpp
index b57b940..e5ff906 100644
--- a/src/backend/cpu/harris.cpp
+++ b/src/backend/cpu/harris.cpp
@@ -12,7 +12,6 @@
 #include <af/constants.h>
 #include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
 #include <handle.hpp>
 #include <harris.hpp>
 #include <convolve.hpp>
@@ -21,133 +20,13 @@
 #include <cstring>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/harris.hpp>
 
 using af::dim4;
 
 namespace cpu
 {
 
-template<typename T>
-void gaussian1D(T* out, const int dim, double sigma=0.0)
-{
-    if(!(sigma>0)) sigma = 0.25*dim;
-
-    T sum = (T)0;
-    for(int i=0;i<dim;i++)
-    {
-        int x = i-(dim-1)/2;
-        T el = 1. / sqrt(2 * af::Pi * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
-        out[i] = el;
-        sum   += el;
-    }
-
-    for(int k=0;k<dim;k++)
-        out[k] /= sum;
-}
-
-template<typename T>
-void second_order_deriv(Array<T> ixx, Array<T> ixy, Array<T> iyy,
-                        const unsigned in_len, const Array<T> ix, const Array<T> iy)
-{
-    T* ixx_out     = ixx.get();
-    T* ixy_out     = ixy.get();
-    T* iyy_out     = iyy.get();
-    const T* ix_in = ix.get();
-    const T* iy_in = iy.get();
-    for (unsigned x = 0; x < in_len; x++) {
-        ixx_out[x] = ix_in[x] * ix_in[x];
-        ixy_out[x] = ix_in[x] * iy_in[x];
-        iyy_out[x] = iy_in[x] * iy_in[x];
-    }
-}
-
-template<typename T>
-void harris_responses(Array<T> resp, const unsigned idim0, const unsigned idim1,
-                      const Array<T> ixx, const Array<T> ixy, const Array<T> iyy,
-                      const float k_thr, const unsigned border_len)
-{
-    T* resp_out      = resp.get();
-    const T* ixx_in  = ixx.get();
-    const T* ixy_in  = ixy.get();
-    const T* iyy_in  = iyy.get();
-    const unsigned r = border_len;
-
-    for (unsigned x = r; x < idim1 - r; x++) {
-        for (unsigned y = r; y < idim0 - r; y++) {
-            const unsigned idx = x * idim0 + y;
-
-            // Calculates matrix trace and determinant
-            T tr = ixx_in[idx] + iyy_in[idx];
-            T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx];
-
-            // Calculates local Harris response
-            resp_out[idx] = det - k_thr * (tr*tr);
-        }
-    }
-}
-
-template<typename T>
-void non_maximal(Array<float> xOut, Array<float> yOut, Array<float> respOut, unsigned* count,
-                 const unsigned idim0, const unsigned idim1, const Array<T> respIn,
-                 const float min_resp, const unsigned border_len, const unsigned max_corners)
-{
-    float* x_out = xOut.get();
-    float* y_out = yOut.get();
-    float* resp_out = respOut.get();
-    const T* resp_in = respIn.get();
-    // Responses on the border don't have 8-neighbors to compare, discard them
-    const unsigned r = border_len + 1;
-
-    for (unsigned x = r; x < idim1 - r; x++) {
-        for (unsigned y = r; y < idim0 - r; y++) {
-            const T v = resp_in[x * idim0 + y];
-
-            // Find maximum neighborhood response
-            T max_v;
-            max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]);
-            max_v = max(max_v, resp_in[(x-1) * idim0 + y  ]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y  ]);
-            max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]);
-            max_v = max(max_v, resp_in[(x)   * idim0 + y+1]);
-            max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]);
-
-            // Stores corner to {x,y,resp}_out if it's response is maximum compared
-            // to its 8-neighborhood and greater or equal minimum response
-            if (v > max_v && v >= (T)min_resp) {
-                const unsigned idx = *count;
-                *count += 1;
-                if (idx < max_corners) {
-                    x_out[idx]    = (float)x;
-                    y_out[idx]    = (float)y;
-                    resp_out[idx] = (float)v;
-                }
-            }
-        }
-    }
-}
-
-static void keep_corners(Array<float> xOut, Array<float> yOut, Array<float> respOut,
-                         const Array<float> xIn, const Array<float> yIn,
-                         const Array<float> respIn, const Array<unsigned> respIdx,
-                         const unsigned n_corners)
-{
-    float* x_out = xOut.get();
-    float* y_out = yOut.get();
-    float* resp_out = respOut.get();
-    const float* x_in = xIn.get();
-    const float* y_in = yIn.get();
-    const float* resp_in = respIn.get();
-    const uint* resp_idx = respIdx.get();
-
-    // Keep only the first n_feat features
-    for (unsigned f = 0; f < n_corners; f++) {
-        x_out[f] = x_in[resp_idx[f]];
-        y_out[f] = y_in[resp_idx[f]];
-        resp_out[f] = resp_in[f];
-    }
-}
-
 template<typename T, typename convAccT>
 unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out,
                 const Array<T> &in, const unsigned max_corners, const float min_response,
@@ -164,7 +43,7 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
         for (unsigned i = 0; i < filter_len; i++)
             h_filter[i] = (T)1.f / (filter_len);
     } else {
-        gaussian1D<convAccT>(h_filter, (int)filter_len, sigma);
+        kernel::gaussian1D<convAccT>(h_filter, (int)filter_len, sigma);
     }
     Array<convAccT> filter = createDeviceDataArray<convAccT>(dim4(filter_len), (const void*)h_filter);
 
@@ -181,7 +60,7 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
     Array<T> iyy = createEmptyArray<T>(idims);
 
     // Compute second-order derivatives
-    getQueue().enqueue(second_order_deriv<T>, ixx, ixy, iyy, in.elements(), ix, iy);
+    getQueue().enqueue(kernel::second_order_deriv<T>, ixx, ixy, iyy, in.elements(), ix, iy);
 
     // Convolve second-order derivatives with proper window filter
     ixx = convolve2<T, convAccT, false>(ixx, filter, filter);
@@ -192,7 +71,7 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
 
     Array<T> responses = createEmptyArray<T>(dim4(in.elements()));
 
-    getQueue().enqueue(harris_responses<T>, responses, idims[0], idims[1],
+    getQueue().enqueue(kernel::harris_responses<T>, responses, idims[0], idims[1],
                        ixx, ixy, iyy, k_thr, border_len);
 
     Array<float> xCorners    = createEmptyArray<float>(dim4(corner_lim));
@@ -204,7 +83,7 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
     // Performs non-maximal suppression
     getQueue().sync();
     unsigned corners_found = 0;
-    non_maximal<T>(xCorners, yCorners, respCorners, &corners_found,
+    kernel::non_maximal<T>(xCorners, yCorners, respCorners, &corners_found,
                    idims[0], idims[1], responses, min_r, border_len, corner_lim);
 
     const unsigned corners_out = (max_corners > 0) ?
@@ -226,7 +105,7 @@ unsigned harris(Array<float> &x_out, Array<float> &y_out, Array<float> &resp_out
         resp_out = createEmptyArray<float>(dim4(corners_out));
 
         // Keep only the corners with higher Harris responses
-        getQueue().enqueue(keep_corners, x_out, y_out, resp_out, xCorners, yCorners,
+        getQueue().enqueue(kernel::keep_corners, x_out, y_out, resp_out, xCorners, yCorners,
                            harris_sorted, harris_idx, corners_out);
     } else if (max_corners == 0 && corners_found < corner_lim) {
         x_out = createEmptyArray<float>(dim4(corners_out));
diff --git a/src/backend/cpu/histogram.cpp b/src/backend/cpu/histogram.cpp
index 8fb3e43..7e20247 100644
--- a/src/backend/cpu/histogram.cpp
+++ b/src/backend/cpu/histogram.cpp
@@ -14,6 +14,7 @@
 #include <histogram.hpp>
 #include <platform.hpp>
 #include <async_queue.hpp>
+#include <kernel/histogram.hpp>
 
 using af::dim4;
 
@@ -21,7 +22,8 @@ namespace cpu
 {
 
 template<typename inType, typename outType, bool isLinear>
-Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const double &minval, const double &maxval)
+Array<outType> histogram(const Array<inType> &in,
+        const unsigned &nbins, const double &minval, const double &maxval)
 {
     in.eval();
 
@@ -30,32 +32,8 @@ Array<outType> histogram(const Array<inType> &in, const unsigned &nbins, const d
     Array<outType> out = createValueArray<outType>(outDims, outType(0));
     out.eval();
 
-    auto func = [=](Array<outType> out, const Array<inType> in,
-                    const unsigned nbins, const double minval, const double maxval) {
-        const float step     = (maxval - minval)/(float)nbins;
-        const dim4 inDims    = in.dims();
-        const dim4 iStrides  = in.strides();
-        const dim4 oStrides  = out.strides();
-        const dim_t nElems   = inDims[0]*inDims[1];
-
-        outType *outData    = out.get();
-        const inType* inData= in.get();
-
-        for(dim_t b3 = 0; b3 < outDims[3]; b3++) {
-            for(dim_t b2 = 0; b2 < outDims[2]; b2++) {
-                for(dim_t i=0; i<nElems; i++) {
-                    int idx = isLinear ? i : ((i % inDims[0]) + (i / inDims[0])*iStrides[1]);
-                    int bin = (int)((inData[idx] - minval) / step);
-                    bin = std::max(bin, 0);
-                    bin = std::min(bin, (int)(nbins - 1));
-                    outData[bin]++;
-                }
-                inData  += iStrides[2];
-                outData += oStrides[2];
-            }
-        }
-    };
-    getQueue().enqueue(func, out, in, nbins, minval, maxval);
+    getQueue().enqueue(kernel::histogram<inType, outType, isLinear>,
+            out, in, nbins, minval, maxval);
 
     return out;
 }
diff --git a/src/backend/cpu/kernel/diff.hpp b/src/backend/cpu/kernel/diff.hpp
new file mode 100644
index 0000000..e0693b1
--- /dev/null
+++ b/src/backend/cpu/kernel/diff.hpp
@@ -0,0 +1,91 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+unsigned getIdx(af::dim4 strides, af::dim4 offs, int i, int j = 0, int k = 0, int l = 0)
+{
+    return (l * strides[3] + k * strides[2] + j * strides[1] + i);
+}
+
+
+template<typename T>
+void diff1(Array<T> out, Array<T> const in, int const dim)
+{
+    af::dim4 dims = out.dims();
+    // Bool for dimension
+    bool is_dim0 = dim == 0;
+    bool is_dim1 = dim == 1;
+    bool is_dim2 = dim == 2;
+    bool is_dim3 = dim == 3;
+
+    // Get pointers to raw data
+    const T *inPtr = in.get();
+    T *outPtr = out.get();
+
+    // TODO: Improve this
+    for(dim_t l = 0; l < dims[3]; l++) {
+        for(dim_t k = 0; k < dims[2]; k++) {
+            for(dim_t j = 0; j < dims[1]; j++) {
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+                    int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
+                    int jdx = getIdx(in.strides(), in.offsets(),
+                            i + is_dim0, j + is_dim1,
+                            k + is_dim2, l + is_dim3);
+                    int odx = getIdx(out.strides(), out.offsets(), i, j, k, l);
+                    outPtr[odx] = inPtr[jdx] - inPtr[idx];
+                }
+            }
+        }
+    }
+}
+
+template<typename T>
+void diff2(Array<T> out, Array<T> const in, int const dim)
+{
+    af::dim4 dims = out.dims();
+    // Bool for dimension
+    bool is_dim0 = dim == 0;
+    bool is_dim1 = dim == 1;
+    bool is_dim2 = dim == 2;
+    bool is_dim3 = dim == 3;
+
+    // Get pointers to raw data
+    const T *inPtr = in.get();
+    T *outPtr = out.get();
+
+    // TODO: Improve this
+    for(dim_t l = 0; l < dims[3]; l++) {
+        for(dim_t k = 0; k < dims[2]; k++) {
+            for(dim_t j = 0; j < dims[1]; j++) {
+                for(dim_t i = 0; i < dims[0]; i++) {
+                    // Operation: out[index] = in[index + 1 * dim_size] - in[index]
+                    int idx = getIdx(in.strides(), in.offsets(), i, j, k, l);
+                    int jdx = getIdx(in.strides(), in.offsets(),
+                            i + is_dim0, j + is_dim1,
+                            k + is_dim2, l + is_dim3);
+                    int kdx = getIdx(in.strides(), in.offsets(),
+                            i + 2 * is_dim0, j + 2 * is_dim1,
+                            k + 2 * is_dim2, l + 2 * is_dim3);
+                    int odx = getIdx(out.strides(), out.offsets(), i, j, k, l);
+                    outPtr[odx] = inPtr[kdx] + inPtr[idx] - inPtr[jdx] - inPtr[jdx];
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/fast.cpp b/src/backend/cpu/kernel/fast.hpp
similarity index 64%
copy from src/backend/cpu/fast.cpp
copy to src/backend/cpu/kernel/fast.hpp
index c8b0514..a3971dd 100644
--- a/src/backend/cpu/fast.cpp
+++ b/src/backend/cpu/kernel/fast.hpp
@@ -1,5 +1,5 @@
 /*******************************************************
- * Copyright (c) 2014, ArrayFire
+ * Copyright (c) 2015, ArrayFire
  * All rights reserved.
  *
  * This file is distributed under 3-clause BSD license.
@@ -7,20 +7,14 @@
  * http://arrayfire.com/licenses/BSD-3-Clause
  ********************************************************/
 
-#include <af/dim4.hpp>
-#include <af/defines.h>
-#include <ArrayInfo.hpp>
 #include <Array.hpp>
-#include <err_cpu.hpp>
-#include <handle.hpp>
-#include <fast.hpp>
-#include <platform.hpp>
-#include <async_queue.hpp>
-
-using af::dim4;
 
 namespace cpu
 {
+namespace kernel
+{
+
+using af::dim4;
 
 inline int clamp(int f, int a, int b)
 {
@@ -92,18 +86,11 @@ inline double abs_diff(double x, double y)
 }
 
 template<typename T>
-void locate_features(
-    const Array<T> &in,
-    Array<float> &score,
-    Array<float> &x_out,
-    Array<float> &y_out,
-    Array<float> &score_out,
-    unsigned* count,
-    const float thr,
-    const unsigned arc_length,
-    const unsigned nonmax,
-    const unsigned max_feat,
-    const unsigned edge)
+void locate_features(const Array<T> &in, Array<float> &score,
+                     Array<float> &x_out, Array<float> &y_out,
+                     Array<float> &score_out, unsigned* count, const float thr,
+                     const unsigned arc_length, const unsigned nonmax,
+                     const unsigned max_feat, const unsigned edge)
 {
     dim4 in_dims = in.dims();
     const T* in_ptr = in.get();
@@ -192,16 +179,9 @@ void locate_features(
     }
 }
 
-void non_maximal(
-    const Array<float> &score,
-    const Array<float> &x_in,
-    const Array<float> &y_in,
-    Array<float> &x_out,
-    Array<float> &y_out,
-    Array<float> &score_out,
-    unsigned* count,
-    const unsigned total_feat,
-    const unsigned edge)
+void non_maximal(const Array<float> &score, const Array<float> &x_in, const Array<float> &y_in,
+                 Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
+                 unsigned* count, const unsigned total_feat, const unsigned edge)
 {
     const float *score_ptr = score.get();
     const float *x_in_ptr = x_in.get();
@@ -244,104 +224,5 @@ void non_maximal(
     }
 }
 
-template<typename T>
-unsigned fast(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,
-              const Array<T> &in, const float thr, const unsigned arc_length,
-              const bool nonmax, const float feature_ratio,
-              const unsigned edge)
-{
-    in.eval();
-    getQueue().sync();
-
-    dim4 in_dims = in.dims();
-    const unsigned max_feat = ceil(in.elements() * feature_ratio);
-
-    // Matrix containing scores for detected features, scores are stored in the
-    // same coordinates as features, dimensions should be equal to in.
-    Array<float> V = createEmptyArray<float>(dim4());
-    if (nonmax == 1) {
-        dim4 V_dims(in_dims[0], in_dims[1]);
-        V = createValueArray<float>(V_dims, (float)0);
-        V.eval();
-    }
-
-    // Arrays containing all features detected before non-maximal suppression.
-    dim4 max_feat_dims(max_feat);
-    Array<float> x = createEmptyArray<float>(max_feat_dims);
-    Array<float> y = createEmptyArray<float>(max_feat_dims);
-    Array<float> score = createEmptyArray<float>(max_feat_dims);
-
-    // Feature counter
-    unsigned count = 0;
-
-    locate_features<T>(in, V, x, y, score, &count, thr, arc_length,
-                       nonmax, max_feat, edge);
-
-    // If more features than max_feat were detected, feat wasn't populated
-    // with them anyway, so the real number of features will be that of
-    // max_feat and not count.
-    unsigned feat_found = std::min(max_feat, count);
-    dim4 feat_found_dims(feat_found);
-
-    Array<float> x_total = createEmptyArray<float>(af::dim4());
-    Array<float> y_total = createEmptyArray<float>(af::dim4());
-    Array<float> score_total = createEmptyArray<float>(af::dim4());
-
-    if (nonmax == 1) {
-        x_total     = createEmptyArray<float>(feat_found_dims);
-        y_total     = createEmptyArray<float>(feat_found_dims);
-        score_total = createEmptyArray<float>(feat_found_dims);
-
-        count = 0;
-        non_maximal(V, x, y,
-                    x_total, y_total, score_total,
-                    &count, feat_found, edge);
-
-        feat_found = std::min(max_feat, count);
-    } else {
-        x_total = x;
-        y_total = y;
-        score_total = score;
-    }
-
-    if (feat_found > 0) {
-        feat_found_dims = dim4(feat_found);
-
-        x_out = createEmptyArray<float>(feat_found_dims);
-        y_out = createEmptyArray<float>(feat_found_dims);
-        score_out = createEmptyArray<float>(feat_found_dims);
-
-        float *x_total_ptr = x_total.get();
-        float *y_total_ptr = y_total.get();
-        float *score_total_ptr = score_total.get();
-
-
-        float *x_out_ptr = x_out.get();
-        float *y_out_ptr = y_out.get();
-        float *score_out_ptr = score_out.get();
-
-        for (size_t i = 0; i < feat_found; i++) {
-            x_out_ptr[i] = x_total_ptr[i];
-            y_out_ptr[i] = y_total_ptr[i];
-            score_out_ptr[i] = score_total_ptr[i];
-        }
-    }
-
-    return feat_found;
 }
-
-#define INSTANTIATE(T)                                                                              \
-    template unsigned fast<T>(Array<float> &x_out, Array<float> &y_out, Array<float> &score_out,    \
-                              const Array<T> &in, const float thr, const unsigned arc_length,       \
-                              const bool nonmax, const float feature_ratio, const unsigned edge);
-
-INSTANTIATE(float )
-INSTANTIATE(double)
-INSTANTIATE(char  )
-INSTANTIATE(int   )
-INSTANTIATE(uint  )
-INSTANTIATE(uchar )
-INSTANTIATE(short )
-INSTANTIATE(ushort)
-
 }
diff --git a/src/backend/cpu/kernel/gradient.hpp b/src/backend/cpu/kernel/gradient.hpp
new file mode 100644
index 0000000..c152fb3
--- /dev/null
+++ b/src/backend/cpu/kernel/gradient.hpp
@@ -0,0 +1,87 @@
+/*******************************************************
+ * Copyright (c) 2014, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void gradient(Array<T> grad0, Array<T> grad1, Array<T> const in)
+{
+    const af::dim4 dims = in.dims();
+
+    T *d_grad0    = grad0.get();
+    T *d_grad1    = grad1.get();
+    const T *d_in = in.get();
+
+    const af::dim4 inst = in.strides();
+    const af::dim4 g0st = grad0.strides();
+    const af::dim4 g1st = grad1.strides();
+
+    T v5 = scalar<T>(0.5);
+    T v1 = scalar<T>(1.0);
+
+    for(dim_t idw = 0; idw < dims[3]; idw++) {
+        const dim_t inW = idw * inst[3];
+        const dim_t g0W = idw * g0st[3];
+        const dim_t g1W = idw * g1st[3];
+        for(dim_t idz = 0; idz < dims[2]; idz++) {
+            const dim_t inZW = inW + idz * inst[2];
+            const dim_t g0ZW = g0W + idz * g0st[2];
+            const dim_t g1ZW = g1W + idz * g1st[2];
+            dim_t xl, xr, yl,yr;
+            T f0, f1;
+            for(dim_t idy = 0; idy < dims[1]; idy++) {
+                const dim_t inYZW = inZW + idy * inst[1];
+                const dim_t g0YZW = g0ZW + idy * g0st[1];
+                const dim_t g1YZW = g1ZW + idy * g1st[1];
+                if(idy == 0) {
+                    yl = inYZW + inst[1];
+                    yr = inYZW;
+                    f1 = v1;
+                } else if(idy == dims[1] - 1) {
+                    yl = inYZW;
+                    yr = inYZW - inst[1];
+                    f1 = v1;
+                } else {
+                    yl = inYZW + inst[1];
+                    yr = inYZW - inst[1];
+                    f1 = v5;
+                }
+                for(dim_t idx = 0; idx < dims[0]; idx++) {
+                    const dim_t inMem = inYZW + idx;
+                    const dim_t g0Mem = g0YZW + idx;
+                    const dim_t g1Mem = g1YZW + idx;
+                    if(idx == 0) {
+                        xl = inMem + 1;
+                        xr = inMem;
+                        f0 = v1;
+                    } else if(idx == dims[0] - 1) {
+                        xl = inMem;
+                        xr = inMem - 1;
+                        f0 = v1;
+                    } else {
+                        xl = inMem + 1;
+                        xr = inMem - 1;
+                        f0 = v5;
+                    }
+
+                    d_grad0[g0Mem] = f0 * (d_in[xl] - d_in[xr]);
+                    d_grad1[g1Mem] = f1 * (d_in[yl + idx] - d_in[yr + idx]);
+                }
+            }
+        }
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/harris.hpp b/src/backend/cpu/kernel/harris.hpp
new file mode 100644
index 0000000..db6551b
--- /dev/null
+++ b/src/backend/cpu/kernel/harris.hpp
@@ -0,0 +1,139 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename T>
+void gaussian1D(T* out, const int dim, double sigma=0.0)
+{
+    if(!(sigma>0)) sigma = 0.25*dim;
+
+    T sum = (T)0;
+    for(int i=0;i<dim;i++)
+    {
+        int x = i-(dim-1)/2;
+        T el = 1. / sqrt(2 * af::Pi * sigma*sigma) * exp(-((x*x)/(2*(sigma*sigma))));
+        out[i] = el;
+        sum   += el;
+    }
+
+    for(int k=0;k<dim;k++)
+        out[k] /= sum;
+}
+
+template<typename T>
+void second_order_deriv(Array<T> ixx, Array<T> ixy, Array<T> iyy,
+                        const unsigned in_len, const Array<T> ix, const Array<T> iy)
+{
+    T* ixx_out     = ixx.get();
+    T* ixy_out     = ixy.get();
+    T* iyy_out     = iyy.get();
+    const T* ix_in = ix.get();
+    const T* iy_in = iy.get();
+    for (unsigned x = 0; x < in_len; x++) {
+        ixx_out[x] = ix_in[x] * ix_in[x];
+        ixy_out[x] = ix_in[x] * iy_in[x];
+        iyy_out[x] = iy_in[x] * iy_in[x];
+    }
+}
+
+template<typename T>
+void harris_responses(Array<T> resp, const unsigned idim0, const unsigned idim1,
+                      const Array<T> ixx, const Array<T> ixy, const Array<T> iyy,
+                      const float k_thr, const unsigned border_len)
+{
+    T* resp_out      = resp.get();
+    const T* ixx_in  = ixx.get();
+    const T* ixy_in  = ixy.get();
+    const T* iyy_in  = iyy.get();
+    const unsigned r = border_len;
+
+    for (unsigned x = r; x < idim1 - r; x++) {
+        for (unsigned y = r; y < idim0 - r; y++) {
+            const unsigned idx = x * idim0 + y;
+
+            // Calculates matrix trace and determinant
+            T tr = ixx_in[idx] + iyy_in[idx];
+            T det = ixx_in[idx] * iyy_in[idx] - ixy_in[idx] * ixy_in[idx];
+
+            // Calculates local Harris response
+            resp_out[idx] = det - k_thr * (tr*tr);
+        }
+    }
+}
+
+template<typename T>
+void non_maximal(Array<float> xOut, Array<float> yOut, Array<float> respOut, unsigned* count,
+                 const unsigned idim0, const unsigned idim1, const Array<T> respIn,
+                 const float min_resp, const unsigned border_len, const unsigned max_corners)
+{
+    float* x_out = xOut.get();
+    float* y_out = yOut.get();
+    float* resp_out = respOut.get();
+    const T* resp_in = respIn.get();
+    // Responses on the border don't have 8-neighbors to compare, discard them
+    const unsigned r = border_len + 1;
+
+    for (unsigned x = r; x < idim1 - r; x++) {
+        for (unsigned y = r; y < idim0 - r; y++) {
+            const T v = resp_in[x * idim0 + y];
+
+            // Find maximum neighborhood response
+            T max_v;
+            max_v = max(resp_in[(x-1) * idim0 + y-1], resp_in[x * idim0 + y-1]);
+            max_v = max(max_v, resp_in[(x+1) * idim0 + y-1]);
+            max_v = max(max_v, resp_in[(x-1) * idim0 + y  ]);
+            max_v = max(max_v, resp_in[(x+1) * idim0 + y  ]);
+            max_v = max(max_v, resp_in[(x-1) * idim0 + y+1]);
+            max_v = max(max_v, resp_in[(x)   * idim0 + y+1]);
+            max_v = max(max_v, resp_in[(x+1) * idim0 + y+1]);
+
+            // Stores corner to {x,y,resp}_out if it's response is maximum compared
+            // to its 8-neighborhood and greater or equal minimum response
+            if (v > max_v && v >= (T)min_resp) {
+                const unsigned idx = *count;
+                *count += 1;
+                if (idx < max_corners) {
+                    x_out[idx]    = (float)x;
+                    y_out[idx]    = (float)y;
+                    resp_out[idx] = (float)v;
+                }
+            }
+        }
+    }
+}
+
+static void keep_corners(Array<float> xOut, Array<float> yOut, Array<float> respOut,
+                         const Array<float> xIn, const Array<float> yIn,
+                         const Array<float> respIn, const Array<unsigned> respIdx,
+                         const unsigned n_corners)
+{
+    float* x_out = xOut.get();
+    float* y_out = yOut.get();
+    float* resp_out = respOut.get();
+    const float* x_in = xIn.get();
+    const float* y_in = yIn.get();
+    const float* resp_in = respIn.get();
+    const uint* resp_idx = respIdx.get();
+
+    // Keep only the first n_feat features
+    for (unsigned f = 0; f < n_corners; f++) {
+        x_out[f] = x_in[resp_idx[f]];
+        y_out[f] = y_in[resp_idx[f]];
+        resp_out[f] = resp_in[f];
+    }
+}
+
+}
+}
diff --git a/src/backend/cpu/kernel/histogram.hpp b/src/backend/cpu/kernel/histogram.hpp
new file mode 100644
index 0000000..e26965a
--- /dev/null
+++ b/src/backend/cpu/kernel/histogram.hpp
@@ -0,0 +1,47 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace cpu
+{
+namespace kernel
+{
+
+template<typename inType, typename outType, bool isLinear>
+void histogram(Array<outType> out, Array<inType> const in,
+               unsigned const nbins, double const minval, double const maxval)
+{
+    dim4 const outDims   = out.dims();
+    float const step     = (maxval - minval)/(float)nbins;
+    dim4 const inDims    = in.dims();
+    dim4 const iStrides  = in.strides();
+    dim4 const oStrides  = out.strides();
+    dim_t const nElems   = inDims[0]*inDims[1];
+
+    outType *outData    = out.get();
+    const inType* inData= in.get();
+
+    for(dim_t b3 = 0; b3 < outDims[3]; b3++) {
+        for(dim_t b2 = 0; b2 < outDims[2]; b2++) {
+            for(dim_t i=0; i<nElems; i++) {
+                int idx = isLinear ? i : ((i % inDims[0]) + (i / inDims[0])*iStrides[1]);
+                int bin = (int)((inData[idx] - minval) / step);
+                bin = std::max(bin, 0);
+                bin = std::min(bin, (int)(nbins - 1));
+                outData[bin]++;
+            }
+            inData  += iStrides[2];
+            outData += oStrides[2];
+        }
+    }
+}
+
+}
+}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git