[clfft] 09/21: fix a couple pre callback and twiddling bugs. pending full suite test and code clean up
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Mar 16 13:14:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 710b83ecfab1fdd3c33dbd63cf24effc25ba07b9
Author: Timmy <timmy.liu at amd.com>
Date: Fri Feb 26 15:17:31 2016 -0600
fix a couple pre callback and twiddling bugs. pending full suite test and code clean up
---
src/library/generator.transpose.cpp | 7 +++++
src/library/plan.cpp | 58 +++++++++++++++++++++++++++----------
2 files changed, 50 insertions(+), 15 deletions(-)
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
index d702771..b07907d 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.cpp
@@ -607,6 +607,7 @@ clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature
clKernWrite(transKernel, 6) << "if (pos == 0){" << std::endl;
clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA, ( is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
+ //clKernWrite(transKernel, 9) << "Ls[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( is *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
if (params.fft_preCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
@@ -614,6 +615,7 @@ clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature
clKernWrite(transKernel, 0) << ");" << std::endl;
clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
+ //clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
if (params.fft_preCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
@@ -623,6 +625,7 @@ clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature
clKernWrite(transKernel, 6) << "else if (pos == 1){" << std::endl;
clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset), pre_userdata";
+ //clKernWrite(transKernel, 9) << "Ld[j] = " << params.fft_preCallback.funcname << "(inputA + iOffset, ( id *" << smaller_dim << " + " << num_elements_loaded << " * work_id + j), pre_userdata";
if (params.fft_preCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
@@ -651,6 +654,10 @@ clfftStatus genSwapKernel(const FFTGeneratedTransposeNonSquareAction::Signature
}
clKernWrite(transKernel, 0) << ");" << std::endl;
}
+ else if (params.fft_hasPreCallback)
+ {
+ clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j + iOffset] = Ls[j];" << std::endl;
+ }
else
{
clKernWrite(transKernel, 6) << "inputA[id*" << smaller_dim << " + " << num_elements_loaded << " * work_id + j] = Ls[j];" << std::endl;
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 85f1f76..895f1d8 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -775,7 +775,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans2Plan->oDist = clLengths[1] * trans2Plan->outStride[1];
trans2Plan->gen = transGen;
- if(transGen != Transpose_NONSQUARE)//Timmy was commented
+ //if(transGen != Transpose_NONSQUARE)//Timmy was commented
trans2Plan->large1D = fftPlan->length[0];
trans2Plan->transflag = true;
@@ -831,11 +831,11 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
row2Plan->oDist *= fftPlan->length[index];
}
//Timmy was group commented
- if (transGen == Transpose_NONSQUARE)
- {
- row2Plan->large1D = fftPlan->length[0];
- row2Plan->twiddleFront = true;
- }
+ //if (transGen == Transpose_NONSQUARE)
+ //{
+ // row2Plan->large1D = fftPlan->length[0];
+ // row2Plan->twiddleFront = true;
+ //}
OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
_T( "BakePlan large1d second row plan failed" ) );
@@ -1957,12 +1957,13 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
/*
- There are two ways of conducting inplace transpose with 1:2 dimension ratio.
- A. first conduct batched square transpose along leading dim (row dim)
- then conduct line swapping kernels for the whole non square matrix
- B. first conduct line swapping kernels for the whole non square matrix
+ There are three ways of conducting inplace transpose with 1:2 (or 2:1) dimension ratio.
+ A. first conduct line swapping kernels for the whole non square matrix
then conduct batched square transpose along column dim (a 'real' batched transpose)
-
+ B. first conduct batched square transpose along column dim (a 'real' batched transpose)
+ then conduct line swapping kernels for the whole non square matrix (for 2:1 case)
+ C. first conduct batched square transpose along leading dim (row dim)
+ then conduct line swapping kernels for the whole non square matrix
Note that the twiddle computation has to go at the begining of the first kernel or the end of the second kernel
if leading dimension is bigger, it makes more sense (faster) to swap line first and then conduct batched square transpose
@@ -1971,15 +1972,38 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
enum NON_SQUARE_KERNEL_ORDER
{
SWAP_AND_TRANSPOSE,
- TRANSPOSE_AND_SWAP
+ TRANSPOSE_AND_SWAP,
+ TRANSPOSE_LEADING_AND_SWAP,
};
NON_SQUARE_KERNEL_ORDER currKernelOrder;
+ //controlling the transpose and swap kernel order
if (clLengths[0] > clLengths[1])
+ {
currKernelOrder = SWAP_AND_TRANSPOSE;
+ std::cout << "SWAP_AND_TRANSPOSE" << std::endl;
+ }
else
- currKernelOrder = TRANSPOSE_AND_SWAP;
- //currKernelOrder = TRANSPOSE_AND_SWAP;
+ {
+ if (fftPlan->large1D != 0)
+ {
+ //currently transpose twiddling is only supported in the case below
+ //TODO support transpose twiddling for all cases.
+ currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
+ std::cout << "TRANSPOSE_LEADING_AND_SWAP" << std::endl;
+ }
+ else
+ {
+ currKernelOrder = TRANSPOSE_AND_SWAP;
+ std::cout << "TRANSPOSE_AND_SWAP" << std::endl;
+ }
+ }
+ //if the original input data is more than 1d only TRANSPOSE_LEADING_AND_SWAP order is supported
+ //TODO need to fix this here. related to sub batch size.
+ if (fftPlan->length.size() > 2)
+ currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
+ else
+ std::cout << "new order" << std::endl;
//Transpose stage 1
OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
@@ -2004,8 +2028,10 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans1Plan->gen = Transpose_NONSQUARE;
if(currKernelOrder == SWAP_AND_TRANSPOSE)
trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;// was NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
- else
+ else if (currKernelOrder == TRANSPOSE_AND_SWAP)
trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
+ else
+ trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
trans1Plan->transflag = true;
trans1Plan->large1D = fftPlan->large1D;
@@ -2075,6 +2101,8 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans2Plan->gen = Transpose_NONSQUARE;
if (currKernelOrder == SWAP_AND_TRANSPOSE)
trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED; //was NON_SQUARE_TRANS_SWAP;
+ else if(currKernelOrder == TRANSPOSE_AND_SWAP)
+ trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
else
trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
trans2Plan->transflag = true;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list