[clfft] 23/74: optimizations for swap logic.

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jan 14 19:52:13 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository clfft.

commit a4c9c86a262c6d968ab798e750341088be912775
Author: santanu-thangaraj <t.santanu at gmail.com>
Date:   Mon Nov 30 11:36:12 2015 +0530

    optimizations for swap logic.
---
 src/library/generator.transpose.nonsquare.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/library/generator.transpose.nonsquare.cpp b/src/library/generator.transpose.nonsquare.cpp
index cb4008d..a83cb79 100644
--- a/src/library/generator.transpose.nonsquare.cpp
+++ b/src/library/generator.transpose.nonsquare.cpp
@@ -976,6 +976,11 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
 
         size_t local_work_size_swap = num_lines_loaded << 4;
         local_work_size_swap = (local_work_size_swap > 256) ? 256 : local_work_size_swap;
+        // Up to this point, 16 threads are assumed to process each line.
+        // If the resulting work-group size is less than 256, the logic below
+        // assigns more threads to each line.
+        size_t num_threads_processing_row = (256 / local_work_size_swap) * 16;
+        local_work_size_swap = num_lines_loaded * num_threads_processing_row;
 
         clKernWrite(transKernel, 0) << std::endl;
 
@@ -1026,8 +1031,8 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
             return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
         }
 
-        clKernWrite(transKernel, 3) << "for (int p = get_local_id(0) / 16; p < " << num_lines_loaded << "; p += " << local_work_size_swap / 16 << "){" << std::endl;
-        clKernWrite(transKernel, 6) << "for (int j = get_local_id(0) % 16; j < " << smaller_dim << "; j += " << 16 << "){" << std::endl;
+        clKernWrite(transKernel, 3) << "for (int p = get_local_id(0) / "<< num_threads_processing_row <<"; p < " << num_lines_loaded << "; p += " << local_work_size_swap / num_threads_processing_row << "){" << std::endl;
+        clKernWrite(transKernel, 6) << "for (int j = get_local_id(0) % "<< num_threads_processing_row <<"; j < " << smaller_dim << "; j += " << num_threads_processing_row << "){" << std::endl;
 
         switch (params.fft_inputLayout)
         {

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list