[arrayfire] 183/248: Added OpenCL backend for homography

Tue Nov 17 15:54:25 UTC 2015

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch dfsg-clean
in repository arrayfire.

commit 5ca352a2a3aa3de93279d19eda52e1281ba0456e
Author: Peter Andreas Entschev <peter at arrayfire.com>
Date:   Tue Nov 3 14:39:10 2015 -0500

    Added OpenCL backend for homography
---
 src/backend/opencl/homography.cpp        |  94 ++++++
 src/backend/opencl/homography.hpp        |  22 ++
 src/backend/opencl/kernel/homography.cl  | 514 +++++++++++++++++++++++++++++++
 src/backend/opencl/kernel/homography.hpp | 261 ++++++++++++++++
 4 files changed, 891 insertions(+)

diff --git a/src/backend/opencl/homography.cpp b/src/backend/opencl/homography.cpp
new file mode 100644
index 0000000..f93b0fe
--- /dev/null
+++ b/src/backend/opencl/homography.cpp
@@ -0,0 +1,94 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/dim4.hpp>
+#include <af/defines.h>
+#include <ArrayInfo.hpp>
+#include <Array.hpp>
+#include <err_opencl.hpp>
+#include <handle.hpp>
+#include <arith.hpp>
+#include <random.hpp>
+#include <kernel/homography.hpp>
+#include <algorithm>
+
+#include <iostream>
+#include <cfloat>
+
+using af::dim4;
+
+namespace opencl
+{
+
+#define RANSACConfidence 0.99f
+#define LMEDSConfidence 0.99f
+#define LMEDSOutlierRatio 0.4f
+
+template<typename T>
+int homography(Array<T> &bestH,
+               const Array<float> &x_src,
+               const Array<float> &y_src,
+               const Array<float> &x_dst,
+               const Array<float> &y_dst,
+               const af_homography_type htype,
+               const float inlier_thr,
+               const unsigned iterations)
+{
+    const af::dim4 idims = x_src.dims();
+    const unsigned nsamples = idims[0];
+
+    unsigned iter = iterations;
+    Array<float> err = createEmptyArray<float>(af::dim4());
+    if (htype == AF_LMEDS) {
+        iter = ::std::min(iter, (unsigned)(log(1.f - LMEDSConfidence) / log(1.f - pow(1.f - LMEDSOutlierRatio, 4.f))));
+        err = createValueArray<float>(af::dim4(nsamples, iter), FLT_MAX);
+    }
+    else {
+        // Avoid passing "null" cl_mem object to kernels
+        err = createEmptyArray<float>(af::dim4(1));
+    }
+
+    af::dim4 rdims(4, iter);
+    Array<float> frnd = randu<float>(rdims);
+    Array<float> fctr = createValueArray<float>(rdims, (float)nsamples);
+    Array<float> rnd = arithOp<float, af_mul_t>(frnd, fctr, rdims);
+
+    Array<T> tmpH = createValueArray<T>(af::dim4(9, iter), (T)0);
+    Array<T> tmpA = createValueArray<T>(af::dim4(9, 9, iter), (T)0);
+    Array<T> tmpV = createValueArray<T>(af::dim4(9, 9, iter), (T)0);
+
+    bestH = createValueArray<T>(af::dim4(3, 3), (T)0);
+    switch (htype) {
+    case AF_RANSAC:
+        return kernel::computeH<T, AF_RANSAC>(bestH, tmpH, tmpA, tmpV, err,
+                                              x_src, y_src, x_dst, y_dst,
+                                              rnd, iter, nsamples, inlier_thr);
+        break;
+    case AF_LMEDS:
+        return kernel::computeH<T, AF_LMEDS> (bestH, tmpH, tmpA, tmpV, err,
+                                              x_src, y_src, x_dst, y_dst,
+                                              rnd, iter, nsamples, inlier_thr);
+        break;
+    default:
+        return -1;
+        break;
+    }
+}
+
+#define INSTANTIATE(T)                                                              \
+    template int homography(Array<T> &H,                                            \
+                            const Array<float> &x_src, const Array<float> &y_src,   \
+                            const Array<float> &x_dst, const Array<float> &y_dst,   \
+                            const af_homography_type htype, const float inlier_thr, \
+                            const unsigned iterations);
+
+INSTANTIATE(float )
+INSTANTIATE(double)
+
+}
diff --git a/src/backend/opencl/homography.hpp b/src/backend/opencl/homography.hpp
new file mode 100644
index 0000000..6c926e5
--- /dev/null
+++ b/src/backend/opencl/homography.hpp
@@ -0,0 +1,22 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <Array.hpp>
+
+namespace opencl
+{
+
+template<typename T>
+int homography(Array<T> &H,
+               const Array<float> &x_src, const Array<float> &y_src,
+               const Array<float> &x_dst, const Array<float> &y_dst,
+               const af_homography_type htype, const float inlier_thr,
+               const unsigned iterations);
+
+}
diff --git a/src/backend/opencl/kernel/homography.cl b/src/backend/opencl/kernel/homography.cl
new file mode 100644
index 0000000..0dd8ee5
--- /dev/null
+++ b/src/backend/opencl/kernel/homography.cl
@@ -0,0 +1,514 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+T sq(T a)
+{
+    return a * a;
+}
+
+void jacobi_svd(__global T* S, __global T* V, int m, int n)
+{
+    const int iterations = 30;
+
+    int tid_x = get_local_id(0);
+    int bsz_x = get_local_size(0);
+    int tid_y = get_local_id(1);
+    int gid_y = get_global_id(1);
+
+    __local T acc[512];
+    __local T* acc1 = acc;
+    __local T* acc2 = acc + 256;
+
+    __local T l_S[16*81];
+    __local T l_V[16*81];
+    __local T d[16*9];
+
+    for (int i = 0; i <= 4; i++)
+        l_S[tid_y * 81 + i*bsz_x + tid_x] = S[gid_y * 81 + i*bsz_x + tid_x];
+    if (tid_x == 0)
+        l_S[tid_y * 81 + 80] = S[gid_y * 81 + 80];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Copy first 80 elements
+    for (int i = 0; i <= 4; i++) {
+        T t = l_S[tid_y*81 + tid_x+i*bsz_x];
+        acc1[tid_y*bsz_x + tid_x] += t*t;
+    }
+    if (tid_x < 8)
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < 4)
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < 2)
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < 1) {
+        // Copy last element
+        T t = l_S[tid_y*bsz_x + tid_x+80];
+        acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+1] + t*t;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid_x < n)
+        d[tid_y*9 + tid_x] = acc1[tid_y*bsz_x + tid_x];
+
+    // V is initialized as an identity matrix
+    for (int i = 0; i <= 4; i++) {
+        l_V[tid_y*81 + i*bsz_x + tid_x] = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid_x < m)
+        l_V[tid_y*81 + tid_x*m + tid_x] = 1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int it = 0; it < iterations; it++) {
+        int converged = 0;
+
+        for (int i = 0; i < n-1; i++) {
+            for (int j = i+1; j < n; j++) {
+                __local T* Si = l_S + tid_y*81 + i*m;
+                __local T* Sj = l_S + tid_y*81 + j*m;
+
+                T p = (T)0;
+                for (int k = 0; k < m; k++)
+                    p += Si[k]*Sj[k];
+
+                if (fabs(p) <= EPS*sqrt(d[tid_y*9 + i]*d[tid_y*9 + j]))
+                    continue;
+
+                T y = d[tid_y*9 + i] - d[tid_y*9 + j];
+                T r = hypot(p*2, y);
+                T r2 = r*2;
+                T c, s;
+                if (y >= 0) {
+                    c = sqrt((r + y) / r2);
+                    s = p / (r2*c);
+                }
+                else {
+                    s = sqrt((r - y) / r2);
+                    c = p / (r2*s);
+                }
+
+                if (tid_x < m) {
+                    T t0 = c*Si[tid_x] + s*Sj[tid_x];
+                    T t1 = c*Sj[tid_x] - s*Si[tid_x];
+                    Si[tid_x] = t0;
+                    Sj[tid_x] = t1;
+
+                    acc1[tid_y*16 + tid_x] = t0*t0;
+                    acc2[tid_y*16 + tid_x] = t1*t1;
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (tid_x < 4) {
+                    acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+4];
+                    acc2[tid_y*16 + tid_x] += acc2[tid_y*16 + tid_x+4];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if (tid_x < 2) {
+                    acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+2];
+                    acc2[tid_y*16 + tid_x] += acc2[tid_y*16 + tid_x+2];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+                if (tid_x < 1) {
+                    acc1[tid_y*16 + tid_x] += acc1[tid_y*16 + tid_x+1] + acc1[tid_y*16 + tid_x+8];
+                    acc2[tid_y*16 + tid_x] += acc2[tid_y*16 + tid_x+1] + acc2[tid_y*16 + tid_x+8];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (tid_x == 0) {
+                    d[tid_y*9 + i] = acc1[tid_y*16];
+                    d[tid_y*9 + j] = acc2[tid_y*16];
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                __local T* Vi = l_V + tid_y*81 + i*n;
+                __local T* Vj = l_V + tid_y*81 + j*n;
+
+                if (tid_x < n) {
+                    T t0 = Vi[tid_x] * c + Vj[tid_x] * s;
+                    T t1 = Vj[tid_x] * c - Vi[tid_x] * s;
+
+                    Vi[tid_x] = t0;
+                    Vj[tid_x] = t1;
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                converged = 1;
+            }
+            if (converged == 0)
+                break;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int i = 0; i <= 4; i++)
+        V[gid_y * 81 + tid_x+i*bsz_x] = l_V[tid_y * 81 + tid_x+i*bsz_x];
+    if (tid_x == 0)
+        V[gid_y * 81 + 80] = l_V[tid_y * 81 + 80];
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+int compute_mean_scale(
+    float* x_src_mean,
+    float* y_src_mean,
+    float* x_dst_mean,
+    float* y_dst_mean,
+    float* src_scale,
+    float* dst_scale,
+    float* src_pt_x,
+    float* src_pt_y,
+    float* dst_pt_x,
+    float* dst_pt_y,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    __global const float* rnd,
+    KParam rInfo,
+    int i)
+{
+    const unsigned ridx = rInfo.dims[0] * i;
+    unsigned r[4] = { (unsigned)rnd[ridx],
+                      (unsigned)rnd[ridx+1],
+                      (unsigned)rnd[ridx+2],
+                      (unsigned)rnd[ridx+3] };
+
+    // If one of the points is repeated, it's a bad samples, will still
+    // compute homography to ensure all threads pass barrier()
+    int bad = (r[0] == r[1] || r[0] == r[2] || r[0] == r[3] ||
+               r[1] == r[2] || r[1] == r[3] || r[2] == r[3]);
+
+    for (unsigned j = 0; j < 4; j++) {
+        src_pt_x[j] = x_src[r[j]];
+        src_pt_y[j] = y_src[r[j]];
+        dst_pt_x[j] = x_dst[r[j]];
+        dst_pt_y[j] = y_dst[r[j]];
+    }
+
+    *x_src_mean = (src_pt_x[0] + src_pt_x[1] + src_pt_x[2] + src_pt_x[3]) / 4.f;
+    *y_src_mean = (src_pt_y[0] + src_pt_y[1] + src_pt_y[2] + src_pt_y[3]) / 4.f;
+    *x_dst_mean = (dst_pt_x[0] + dst_pt_x[1] + dst_pt_x[2] + dst_pt_x[3]) / 4.f;
+    *y_dst_mean = (dst_pt_y[0] + dst_pt_y[1] + dst_pt_y[2] + dst_pt_y[3]) / 4.f;
+
+    float src_var = 0.0f, dst_var = 0.0f;
+    for (unsigned j = 0; j < 4; j++) {
+        src_var += sq(src_pt_x[j] - *x_src_mean) + sq(src_pt_y[j] - *y_src_mean);
+        dst_var += sq(dst_pt_x[j] - *x_dst_mean) + sq(dst_pt_y[j] - *y_dst_mean);
+    }
+
+    src_var /= 4.f;
+    dst_var /= 4.f;
+
+    *src_scale = sqrt(2.0f) / sqrt(src_var);
+    *dst_scale = sqrt(2.0f) / sqrt(dst_var);
+
+    return !bad;
+}
+
+#define APTR(Z, Y, X) (A[(Z) * AInfo.dims[0] * AInfo.dims[1] + (Y) * AInfo.dims[0] + (X)])
+
+__kernel void compute_homography(
+    __global T* H,
+    KParam HInfo,
+    __global T* A,
+    KParam AInfo,
+    __global T* V,
+    KParam VInfo,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    __global const float* rnd,
+    KParam rInfo,
+    const unsigned iterations)
+{
+    unsigned i = get_global_id(1);
+
+    if (i < iterations) {
+        float x_src_mean, y_src_mean;
+        float x_dst_mean, y_dst_mean;
+        float src_scale, dst_scale;
+        float src_pt_x[4], src_pt_y[4], dst_pt_x[4], dst_pt_y[4];
+
+        compute_mean_scale(&x_src_mean, &y_src_mean,
+                           &x_dst_mean, &y_dst_mean,
+                           &src_scale, &dst_scale,
+                           src_pt_x, src_pt_y,
+                           dst_pt_x, dst_pt_y,
+                           x_src, y_src, x_dst, y_dst,
+                           rnd, rInfo, i);
+
+        // Compute input matrix
+        for (unsigned j = get_local_id(0); j < 4; j+=get_local_size(0)) {
+            float srcx = (src_pt_x[j] - x_src_mean) * src_scale;
+            float srcy = (src_pt_y[j] - y_src_mean) * src_scale;
+            float dstx = (dst_pt_x[j] - x_dst_mean) * dst_scale;
+            float dsty = (dst_pt_y[j] - y_dst_mean) * dst_scale;
+
+            APTR(i, 3, j*2) = -srcx;
+            APTR(i, 4, j*2) = -srcy;
+            APTR(i, 5, j*2) = -1.0f;
+            APTR(i, 6, j*2) = dsty*srcx;
+            APTR(i, 7, j*2) = dsty*srcy;
+            APTR(i, 8, j*2) = dsty;
+
+            APTR(i, 0, j*2+1) = srcx;
+            APTR(i, 1, j*2+1) = srcy;
+            APTR(i, 2, j*2+1) = 1.0f;
+            APTR(i, 6, j*2+1) = -dstx*srcx;
+            APTR(i, 7, j*2+1) = -dstx*srcy;
+            APTR(i, 8, j*2+1) = -dstx;
+        }
+
+        jacobi_svd(A, V, 9, 9);
+
+        T vH[9], H_tmp[9];
+        for (unsigned j = 0; j < 9; j++)
+            vH[j] = V[i * VInfo.dims[0] * VInfo.dims[1] + 8 * VInfo.dims[0] + j];
+
+        H_tmp[0] = src_scale*x_dst_mean*vH[6] + src_scale*vH[0]/dst_scale;
+        H_tmp[1] = src_scale*x_dst_mean*vH[7] + src_scale*vH[1]/dst_scale;
+        H_tmp[2] = x_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                              (vH[2] - src_scale*y_src_mean*vH[1] - src_scale*x_src_mean*vH[0])/dst_scale;
+
+        H_tmp[3] = src_scale*y_dst_mean*vH[6] + src_scale*vH[3]/dst_scale;
+        H_tmp[4] = src_scale*y_dst_mean*vH[7] + src_scale*vH[4]/dst_scale;
+        H_tmp[5] = y_dst_mean*(vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6]) +
+                              (vH[5] - src_scale*y_src_mean*vH[4] - src_scale*x_src_mean*vH[3])/dst_scale;
+
+        H_tmp[6] = src_scale*vH[6];
+        H_tmp[7] = src_scale*vH[7];
+        H_tmp[8] = vH[8] - src_scale*y_src_mean*vH[7] - src_scale*x_src_mean*vH[6];
+
+        const unsigned Hidx = HInfo.dims[0] * i;
+        __global T* H_ptr = H + Hidx;
+        for (int h = 0; h < 9; h++)
+            H_ptr[h] = H_tmp[h];
+    }
+}
+
+#undef APTR
+
+// LMedS: http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node25.html
+__kernel void eval_homography(
+    __global unsigned* inliers,
+    __global unsigned* idx,
+    __global T* H,
+    KParam HInfo,
+    __global float* err,
+    KParam eInfo,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    __global const float* rnd,
+    const unsigned iterations,
+    const unsigned nsamples,
+    const float inlier_thr)
+{
+    unsigned bid_x = get_group_id(0);
+    unsigned tid_x = get_local_id(0);
+    unsigned i = get_global_id(0);
+
+    __local unsigned l_inliers[256];
+    __local unsigned l_idx[256];
+
+    l_inliers[tid_x] = 0;
+    l_idx[tid_x]     = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (i < iterations) {
+        const unsigned Hidx = HInfo.dims[0] * i;
+        __global T* H_ptr = H + Hidx;
+        T H_tmp[9];
+        for (int h = 0; h < 9; h++)
+            H_tmp[h] = H_ptr[h];
+
+#ifdef RANSAC
+        // Compute inliers
+        unsigned inliers_count = 0;
+        for (unsigned j = 0; j < nsamples; j++) {
+            float z =  H_tmp[6]*x_src[j] + H_tmp[7]*y_src[j] + H_tmp[8];
+            float x = (H_tmp[0]*x_src[j] + H_tmp[1]*y_src[j] + H_tmp[2]) / z;
+            float y = (H_tmp[3]*x_src[j] + H_tmp[4]*y_src[j] + H_tmp[5]) / z;
+
+            float dist = sq(x_dst[j] - x) + sq(y_dst[j] - y);
+            if (dist < inlier_thr*inlier_thr)
+                inliers_count++;
+        }
+
+        l_inliers[tid_x] = inliers_count;
+        l_idx[tid_x]     = i;
+#endif
+#ifdef LMEDS
+        // Compute error
+        for (unsigned j = 0; j < nsamples; j++) {
+            float z =  H_tmp[6]*x_src[j] + H_tmp[7]*y_src[j] + H_tmp[8];
+            float x = (H_tmp[0]*x_src[j] + H_tmp[1]*y_src[j] + H_tmp[2]) / z;
+            float y = (H_tmp[3]*x_src[j] + H_tmp[4]*y_src[j] + H_tmp[5]) / z;
+
+            float dist = sq(x_dst[j] - x) + sq(y_dst[j] - y);
+            err[i*eInfo.dims[0] + j] = sqrt(dist);
+        }
+#endif
+    }
+
+#ifdef RANSAC
+    // Find sample with most inliers
+    for (unsigned tx = 128; tx > 0; tx >>= 1) {
+        if (tid_x < tx) {
+            if (l_inliers[tid_x + tx] > l_inliers[tid_x]) {
+                l_inliers[tid_x] = l_inliers[tid_x + tx];
+                l_idx[tid_x]     = l_idx[tid_x + tx];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    inliers[bid_x] = l_inliers[0];
+    idx[bid_x]     = l_idx[0];
+#endif
+}
+
+__kernel void compute_median(
+    __global float* median,
+    __global unsigned* idx,
+    __global const float* err,
+    KParam eInfo,
+    const unsigned iterations)
+{
+    const unsigned tid = get_local_id(0);
+    const unsigned bid = get_group_id(0);
+    const unsigned i = get_global_id(0);
+
+    __local float l_median[256];
+    __local unsigned l_idx[256];
+
+    l_median[tid] = FLT_MAX;
+    l_idx[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (i < iterations) {
+        const int nsamples = eInfo.dims[0];
+        float m = err[i*nsamples + nsamples / 2];
+        if (nsamples % 2 == 0)
+            m = (m + err[i*nsamples + nsamples / 2 - 1]) * 0.5f;
+
+        l_idx[tid] = i;
+        l_median[tid] = m;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t) {
+            if (l_median[tid + t] < l_median[tid]) {
+                l_median[tid] = l_median[tid + t];
+                l_idx[tid]    = l_idx[tid + t];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    median[bid] = l_median[0];
+    idx[bid] = l_idx[0];
+}
+
+#define DIVUP(A, B) (((A) + (B) - 1) / (B))
+
+__kernel void find_min_median(
+    __global float* minMedian,
+    __global unsigned* minIdx,
+    __global const float* median,
+    KParam mInfo,
+    __global const unsigned* idx)
+{
+    const int tid = get_local_id(0);
+
+    __local float l_minMedian[256];
+    __local unsigned l_minIdx[256];
+
+    l_minMedian[tid] = FLT_MAX;
+    l_minIdx[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    const int loop = DIVUP(mInfo.dims[0], get_local_size(0));
+
+    for (int i = 0; i < loop; i++) {
+        int j = i * get_local_size(0) + tid;
+        if (j < mInfo.dims[0] && median[j] < l_minMedian[tid]) {
+            l_minMedian[tid] = median[j];
+            l_minIdx[tid] = idx[j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t) {
+            if (l_minMedian[tid + t] < l_minMedian[tid]) {
+                l_minMedian[tid] = l_minMedian[tid + t];
+                l_minIdx[tid]    = l_minIdx[tid + t];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    *minMedian = l_minMedian[0];
+    *minIdx = l_minIdx[0];
+}
+
+#undef DIVUP
+
+__kernel void compute_lmeds_inliers(
+    __global unsigned* inliers,
+    __global const T* H,
+    __global const float* x_src,
+    __global const float* y_src,
+    __global const float* x_dst,
+    __global const float* y_dst,
+    const float minMedian,
+    const unsigned nsamples)
+{
+    unsigned tid = get_local_id(0);
+    unsigned bid = get_group_id(0);
+    unsigned i = get_global_id(0);
+
+    __local T l_H[9];
+    __local unsigned l_inliers[256];
+
+    l_inliers[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 9)
+        l_H[tid] = H[tid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    float sigma = fmax(1.4826f * (1 + 5.f/(nsamples - 4)) * (float)sqrt(minMedian), 1e-6f);
+    float dist_thr = sq(2.5f * sigma);
+
+    if (i < nsamples) {
+        float z =  l_H[6]*x_src[i] + l_H[7]*y_src[i] + l_H[8];
+        float x = (l_H[0]*x_src[i] + l_H[1]*y_src[i] + l_H[2]) / z;
+        float y = (l_H[3]*x_src[i] + l_H[4]*y_src[i] + l_H[5]) / z;
+
+        float dist = sq(x_dst[i] - x) + sq(y_dst[i] - y);
+        if (dist <= dist_thr)
+            l_inliers[tid] = 1;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (unsigned t = 128; t > 0; t >>= 1) {
+        if (tid < t)
+            l_inliers[tid] += l_inliers[tid + t];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    inliers[bid] = l_inliers[0];
+}
diff --git a/src/backend/opencl/kernel/homography.hpp b/src/backend/opencl/kernel/homography.hpp
new file mode 100644
index 0000000..fb10e36
--- /dev/null
+++ b/src/backend/opencl/kernel/homography.hpp
@@ -0,0 +1,261 @@
+/*******************************************************
+ * Copyright (c) 2015, ArrayFire
+ * All rights reserved.
+ *
+ * This file is distributed under 3-clause BSD license.
+ * The complete license agreement can be obtained at:
+ * http://arrayfire.com/licenses/BSD-3-Clause
+ ********************************************************/
+
+#include <af/defines.h>
+#include <dispatch.hpp>
+#include <err_opencl.hpp>
+#include <debug_opencl.hpp>
+#include <memory.hpp>
+#include <kernel_headers/homography.hpp>
+#include <kernel/ireduce.hpp>
+#include <kernel/reduce.hpp>
+#include <kernel/sort.hpp>
+#include <cfloat>
+
+using cl::Buffer;
+using cl::Program;
+using cl::Kernel;
+using cl::EnqueueArgs;
+using cl::LocalSpaceArg;
+using cl::NDRange;
+using std::vector;
+
+namespace opencl
+{
+
+namespace kernel
+{
+
+const int HG_THREADS_X = 16;
+const int HG_THREADS_Y = 16;
+const int HG_THREADS   = 256;
+
+template<typename T, af_homography_type htype>
+int computeH(
+    Param bestH,
+    Param H,
+    Param A,
+    Param V,
+    Param err,
+    Param x_src,
+    Param y_src,
+    Param x_dst,
+    Param y_dst,
+    Param rnd,
+    const unsigned iterations,
+    const unsigned nsamples,
+    const float inlier_thr)
+{
+    try {
+        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
+        static std::map<int, Program*> hgProgs;
+        static std::map<int, Kernel*>  chKernel;
+        static std::map<int, Kernel*>  ehKernel;
+        static std::map<int, Kernel*>  cmKernel;
+        static std::map<int, Kernel*>  fmKernel;
+        static std::map<int, Kernel*>  clKernel;
+
+        int device = getActiveDeviceId();
+
+        std::call_once( compileFlags[device], [device] () {
+
+                std::ostringstream options;
+                options << " -D T=" << dtype_traits<T>::getName();
+
+                if (std::is_same<T, double>::value) {
+                    options << " -D USE_DOUBLE";
+                    options << " -D EPS=" << DBL_EPSILON;
+                } else
+                    options << " -D EPS=" << FLT_EPSILON;
+
+                if (htype == AF_RANSAC)
+                    options << " -D RANSAC";
+                else if (htype == AF_LMEDS)
+                    options << " -D LMEDS";
+
+                cl::Program prog;
+                buildProgram(prog, homography_cl, homography_cl_len, options.str());
+                hgProgs[device] = new Program(prog);
+
+                chKernel[device] = new Kernel(*hgProgs[device], "compute_homography");
+                ehKernel[device] = new Kernel(*hgProgs[device], "eval_homography");
+                cmKernel[device] = new Kernel(*hgProgs[device], "compute_median");
+                fmKernel[device] = new Kernel(*hgProgs[device], "find_min_median");
+                clKernel[device] = new Kernel(*hgProgs[device], "compute_lmeds_inliers");
+            });
+
+        const int blk_x_ch = 1;
+        const int blk_y_ch = divup(iterations, HG_THREADS_Y);
+        const NDRange local_ch(HG_THREADS_X, HG_THREADS_Y);
+        const NDRange global_ch(blk_x_ch * HG_THREADS_X, blk_y_ch * HG_THREADS_Y);
+
+        // Build linear system and solve SVD
+        auto chOp = make_kernel<Buffer, KParam, Buffer, KParam,
+                                Buffer, KParam,
+                                Buffer, Buffer, Buffer, Buffer,
+                                Buffer, KParam, unsigned>(*chKernel[device]);
+
+        chOp(EnqueueArgs(getQueue(), global_ch, local_ch),
+             *H.data, H.info, *A.data, A.info,
+             *V.data, V.info,
+             *x_src.data, *y_src.data, *x_dst.data, *y_dst.data,
+             *rnd.data, rnd.info, iterations);
+        CL_DEBUG_FINISH(getQueue());
+
+        const int blk_x_eh = divup(iterations, HG_THREADS);
+        const NDRange local_eh(HG_THREADS);
+        const NDRange global_eh(blk_x_eh * HG_THREADS);
+
+        // Allocate some temporary buffers
+        Param inliers, idx, median;
+        inliers.info.offset = idx.info.offset = median.info.offset = 0;
+        inliers.info.dims[0] = (htype == AF_RANSAC) ? blk_x_eh : divup(nsamples, HG_THREADS);
+        inliers.info.strides[0] = 1;
+        idx.info.dims[0] = median.info.dims[0] = blk_x_eh;
+        idx.info.strides[0] = median.info.strides[0] = 1;
+        for (int k = 1; k < 4; k++) {
+            inliers.info.dims[k] = 1;
+            inliers.info.strides[k] = inliers.info.dims[k-1] * inliers.info.strides[k-1];
+            idx.info.dims[k] = median.info.dims[k] = 1;
+            idx.info.strides[k] = median.info.strides[k] = idx.info.dims[k-1] * idx.info.strides[k-1];
+        }
+        idx.data = bufferAlloc(idx.info.dims[3] * idx.info.strides[3] * sizeof(unsigned));
+        inliers.data = bufferAlloc(inliers.info.dims[3] * inliers.info.strides[3] * sizeof(unsigned));
+        if (htype == AF_LMEDS)
+            median.data = bufferAlloc(median.info.dims[3] * median.info.strides[3] * sizeof(float));
+        else
+            median.data = bufferAlloc(sizeof(float));
+
+        // Compute (and for RANSAC, evaluate) homographies
+        auto ehOp = make_kernel<Buffer, Buffer, Buffer, KParam,
+                                Buffer, KParam,
+                                Buffer, Buffer, Buffer, Buffer,
+                                Buffer, unsigned, unsigned, float>(*ehKernel[device]);
+
+        ehOp(EnqueueArgs(getQueue(), global_eh, local_eh),
+             *inliers.data, *idx.data, *H.data, H.info,
+             *err.data, err.info,
+             *x_src.data, *y_src.data, *x_dst.data, *y_dst.data,
+             *rnd.data, iterations, nsamples, inlier_thr);
+        CL_DEBUG_FINISH(getQueue());
+
+        unsigned inliersH, idxH;
+        if (htype == AF_LMEDS) {
+            // TODO: Improve this sorting, if the number of iterations is
+            // sufficiently large, this can be *very* slow
+            kernel::sort0<float, true>(err);
+
+            unsigned minIdx;
+            float minMedian;
+
+            // Compute median of every iteration
+            auto cmOp = make_kernel<Buffer, Buffer, Buffer, KParam,
+                                    unsigned>(*cmKernel[device]);
+
+            cmOp(EnqueueArgs(getQueue(), global_eh, local_eh),
+                 *median.data, *idx.data, *err.data, err.info,
+                 iterations);
+            CL_DEBUG_FINISH(getQueue());
+
+            // Reduce medians, only in case iterations > 256
+            if (blk_x_eh > 1) {
+                const NDRange local_fm(HG_THREADS);
+                const NDRange global_fm(HG_THREADS);
+
+                cl::Buffer* finalMedian = bufferAlloc(sizeof(float));
+                cl::Buffer* finalIdx = bufferAlloc(sizeof(unsigned));
+
+                auto fmOp = make_kernel<Buffer, Buffer, Buffer, KParam,
+                                        Buffer>(*fmKernel[device]);
+
+                fmOp(EnqueueArgs(getQueue(), global_fm, local_fm),
+                     *finalMedian, *finalIdx, *median.data, median.info,
+                     *idx.data);
+                CL_DEBUG_FINISH(getQueue());
+
+                getQueue().enqueueReadBuffer(*finalMedian, CL_TRUE, 0, sizeof(float), &minMedian);
+                getQueue().enqueueReadBuffer(*finalIdx, CL_TRUE, 0, sizeof(unsigned), &minIdx);
+
+                bufferFree(finalMedian);
+                bufferFree(finalIdx);
+            }
+            else {
+                getQueue().enqueueReadBuffer(*median.data, CL_TRUE, 0, sizeof(float), &minMedian);
+                getQueue().enqueueReadBuffer(*idx.data, CL_TRUE, 0, sizeof(unsigned), &minIdx);
+            }
+
+            // Copy best homography to output
+            getQueue().enqueueCopyBuffer(*H.data, *bestH.data, minIdx*9*sizeof(T), 0, 9*sizeof(T));
+
+            const int blk_x_cl = divup(nsamples, HG_THREADS);
+            const NDRange local_cl(HG_THREADS);
+            const NDRange global_cl(blk_x_cl * HG_THREADS);
+
+            auto clOp = make_kernel<Buffer, Buffer,
+                                    Buffer, Buffer, Buffer, Buffer,
+                                    float, unsigned>(*clKernel[device]);
+
+            clOp(EnqueueArgs(getQueue(), global_cl, local_cl),
+                 *inliers.data, *bestH.data,
+                 *x_src.data, *y_src.data, *x_dst.data, *y_dst.data,
+                 minMedian, nsamples);
+            CL_DEBUG_FINISH(getQueue());
+
+            // Adds up the total number of inliers
+            Param totalInliers;
+            totalInliers.info.offset = 0;
+            for (int k = 0; k < 4; k++)
+                totalInliers.info.dims[k] = totalInliers.info.strides[k] = 1;
+            totalInliers.data = bufferAlloc(sizeof(unsigned));
+
+            kernel::reduce<unsigned, unsigned, af_add_t>(totalInliers, inliers, 0, false, 0.0);
+
+            getQueue().enqueueReadBuffer(*totalInliers.data, CL_TRUE, 0, sizeof(unsigned), &inliersH);
+
+            bufferFree(totalInliers.data);
+        }
+        else if (htype == AF_RANSAC) {
+            Param bestInliers, bestIdx;
+            bestInliers.info.offset = bestIdx.info.offset = 0;
+            for (int k = 0; k < 4; k++) {
+                bestInliers.info.dims[k] = bestIdx.info.dims[k] = 1;
+                bestInliers.info.strides[k] = bestIdx.info.strides[k] = 1;
+            }
+            bestInliers.data = bufferAlloc(sizeof(unsigned));
+            bestIdx.data = bufferAlloc(sizeof(unsigned));
+
+            kernel::ireduce<unsigned, af_max_t>(bestInliers, bestIdx.data, inliers, 0);
+
+            unsigned blockIdx;
+            getQueue().enqueueReadBuffer(*bestIdx.data, CL_TRUE, 0, sizeof(unsigned), &blockIdx);
+
+            // Copies back index and number of inliers of best homography estimation
+            getQueue().enqueueReadBuffer(*idx.data, CL_TRUE, blockIdx*sizeof(unsigned), sizeof(unsigned), &idxH);
+            getQueue().enqueueReadBuffer(*bestInliers.data, CL_TRUE, 0, sizeof(unsigned), &inliersH);
+
+            getQueue().enqueueCopyBuffer(*H.data, *bestH.data, idxH*9*sizeof(T), 0, 9*sizeof(T));
+
+            bufferFree(bestInliers.data);
+            bufferFree(bestIdx.data);
+        }
+
+        bufferFree(inliers.data);
+        bufferFree(idx.data);
+        bufferFree(median.data);
+
+        return (int)inliersH;
+    } catch (cl::Error err) {
+        CL_TO_AF_ERROR(err);
+        throw;
+    }
+}
+
+} // namespace kernel
+
+} // namespace cuda

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/arrayfire.git