[clfft] 23/74: optimizations for swap logic.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Jan 14 19:52:13 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch debian/sid
in repository clfft.
commit a4c9c86a262c6d968ab798e750341088be912775
Author: santanu-thangaraj <t.santanu at gmail.com>
Date: Mon Nov 30 11:36:12 2015 +0530
optimizations for swap logic.
---
src/library/generator.transpose.nonsquare.cpp | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/library/generator.transpose.nonsquare.cpp b/src/library/generator.transpose.nonsquare.cpp
index cb4008d..a83cb79 100644
--- a/src/library/generator.transpose.nonsquare.cpp
+++ b/src/library/generator.transpose.nonsquare.cpp
@@ -976,6 +976,11 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
size_t local_work_size_swap = num_lines_loaded << 4;
local_work_size_swap = (local_work_size_swap > 256) ? 256 : local_work_size_swap;
+ // Up to this point, 16 threads are assumed to process each line.
+ // If the work-group size is less than 256, the following logic tries to
+ // make more threads process each line.
+ size_t num_threads_processing_row = (256 / local_work_size_swap) * 16;
+ local_work_size_swap = num_lines_loaded * num_threads_processing_row;
clKernWrite(transKernel, 0) << std::endl;
@@ -1026,8 +1031,8 @@ static clfftStatus genTransposeKernel(const FFTGeneratedTransposeNonSquareAction
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
- clKernWrite(transKernel, 3) << "for (int p = get_local_id(0) / 16; p < " << num_lines_loaded << "; p += " << local_work_size_swap / 16 << "){" << std::endl;
- clKernWrite(transKernel, 6) << "for (int j = get_local_id(0) % 16; j < " << smaller_dim << "; j += " << 16 << "){" << std::endl;
+ clKernWrite(transKernel, 3) << "for (int p = get_local_id(0) / "<< num_threads_processing_row <<"; p < " << num_lines_loaded << "; p += " << local_work_size_swap / num_threads_processing_row << "){" << std::endl;
+ clKernWrite(transKernel, 6) << "for (int j = get_local_id(0) % "<< num_threads_processing_row <<"; j < " << smaller_dim << "; j += " << num_threads_processing_row << "){" << std::endl;
switch (params.fft_inputLayout)
{
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list