[clfft] 14/32: fixed pre/post callback bugs. some code cleanup
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Apr 26 08:34:09 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit d2614f6c2846612c21680eadc6cb6cd7e2528dec
Author: Timmy <timmy.liu at amd.com>
Date: Thu Mar 24 14:14:44 2016 -0500
fixed pre/post callback bugs. some code cleanup
---
src/library/action.transpose.cpp | 12 ++--
src/library/enqueue.cpp | 2 +-
src/library/generator.transpose.cpp | 140 ++++++++++++++++++++----------------
src/library/plan.cpp | 4 +-
4 files changed, 87 insertions(+), 71 deletions(-)
diff --git a/src/library/action.transpose.cpp b/src/library/action.transpose.cpp
index 82789b8..c9dc999 100644
--- a/src/library/action.transpose.cpp
+++ b/src/library/action.transpose.cpp
@@ -222,7 +222,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
- std::cout << programCode << std::endl;//TIMMY
+ //std::cout << programCode << std::endl;//TIMMY
}
else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
@@ -246,7 +246,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
- std::cout << programCode << std::endl;//TIMMY
+ //std::cout << programCode << std::endl;//TIMMY
}
else
{
@@ -282,7 +282,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
*/
//general swap kernel takes care of all ratio
OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
- std::cout << programCode << std::endl;//TIMMY
+ //std::cout << programCode << std::endl;//TIMMY
}
cl_int status = CL_SUCCESS;
@@ -349,7 +349,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
{
- std::cout << "TIMMY"<< std::endl;
if (smaller_dim % (16 * reShapeFactor) == 0)
wg_slice = smaller_dim / 16 / reShapeFactor;
else
@@ -375,7 +374,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
}
else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
- std::cout << "TIMMY" << std::endl;
if (smaller_dim % (16 * reShapeFactor) == 0)
wg_slice = smaller_dim / 16 / reShapeFactor;
else
@@ -383,7 +381,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
- for (int i = 2; i < this->plan->length.size(); i++)//Timmy delete
+ for (int i = 2; i < this->plan->length.size(); i++)
{
global_item_size *= this->plan->length[i];
}
@@ -721,7 +719,7 @@ clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo,
{
OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
}
- std::cout << programCode << std::endl;//TIMMY
+ //std::cout << programCode << std::endl;//TIMMY
return CLFFT_SUCCESS;
}
diff --git a/src/library/enqueue.cpp b/src/library/enqueue.cpp
index 5c7de09..a4b9660 100644
--- a/src/library/enqueue.cpp
+++ b/src/library/enqueue.cpp
@@ -624,7 +624,7 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
std::vector< size_t > gWorkSize;
std::vector< size_t > lWorkSize;
clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
- std::cout << "work sizes are " << gWorkSize[0] << ", " << lWorkSize[0] << std::endl;
+ //std::cout << "work sizes are " << gWorkSize[0] << ", " << lWorkSize[0] << std::endl;
// TODO: if getWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
// that this multidimensional input data array is too large to be transformed
// with a single call to clEnqueueNDRangeKernel. For now, we will just return
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
index a2374c6..d9e9a20 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.cpp
@@ -1042,6 +1042,12 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
clKernWrite(transKernel, 0) << std::endl;
}
//if post-callback is set for the plan
+ //rarely do we need post callback in swap kernel. But it is possible.
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
+ clKernWrite(transKernel, 0) << std::endl;
+ }
//twiddle in swap kernel (for now, swap with twiddle seems to always be the second kernel after transpose)
bool twiddleSwapKernel = params.fft_3StepTwiddle && (dim_ratio > 1);
@@ -1181,16 +1187,10 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
else
{
- if (WG_per_line == 1)
- {
- //might look like: int group_offset = prev*729;
+ if (WG_per_line == 1)//might look like: int group_offset = prev*729;
clKernWrite(transKernel, 3) << "int group_offset = (prev*" << smaller_dim << ");" << std::endl;
- }
- else
- {
- //if smaller_dim is 2187 > 1024 this should look like int group_offset = prev*2187 + (group_id % 3)*729;
+ else//if smaller_dim is 2187 > 1024 this should look like int group_offset = prev*2187 + (group_id % 3)*729;
clKernWrite(transKernel, 3) << "int group_offset = (prev*" << smaller_dim << ") + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
- }
}
clKernWrite(transKernel, 3) << std::endl;
@@ -1301,15 +1301,10 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
//takes care the last row
clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
- if (WG_per_line == 1)
- {
- clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ");" << std::endl; //might look like: int group_offset = prev*729;
- }
- else
- {
- //if smaller_dim is 2187 > 1024 this should look like int group_offset = next*2187 + (group_id % 3)*729;
+ if (WG_per_line == 1) //might look like: int group_offset = prev*729;
+ clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ");" << std::endl;
+ else//if smaller_dim is 2187 > 1024 this should look like int group_offset = next*2187 + (group_id % 3)*729;
clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ") + (group_id % " << WG_per_line << ")*" << LDS_per_WG << ";" << std::endl;
- }
}
@@ -1473,15 +1468,37 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
{
for (int i = 0; i < LDS_per_WG; i = i + 256)
{
- if (i + 256 < LDS_per_WG)
- clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
- else
- {
- // need to handle boundary
- clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
- clKernWrite(transKernel, 9) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
- clKernWrite(transKernel, 6) << "}" << std::endl;
- }
+ //twiddling and callback do not coexist
+ if (params.fft_hasPostCallback)
+ {
+ if (i + 256 < LDS_per_WG)
+ {
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
+ << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i
+ << ");" << std::endl;
+ }
+ else
+ {
+ // need to handle boundary
+ clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
+ clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(inputA - batch_offset*" << smaller_dim * bigger_dim
+ << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offSet+idx+" << i << ", post_userdata, preValue[idx+" << i
+ << ");" << std::endl;
+ clKernWrite(transKernel, 6) << "}" << std::endl;
+ }
+ }
+ else
+ {
+ if (i + 256 < LDS_per_WG)
+ clKernWrite(transKernel, 6) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
+ else
+ {
+ // need to handle boundary
+ clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
+ clKernWrite(transKernel, 9) << "inputA[group_offset+idx+" << i << "] = prevValue[idx+" << i << "];" << std::endl;
+ clKernWrite(transKernel, 6) << "}" << std::endl;
+ }
+ }
}
}
break;
@@ -1564,19 +1581,43 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
{
for (int i = 0; i < LDS_per_WG; i = i + 256)
{
- if (i + 256 < LDS_per_WG)
- {
- clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
- clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
- }
- else
- {
- // need to handle boundary
- clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
- clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
- clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
- clKernWrite(transKernel, 6) << "}" << std::endl;
- }
+ //twiddling and callback do not coexist
+ if (params.fft_hasPostCallback)
+ {
+ if (i + 256 < LDS_per_WG)
+ {
+ //clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
+ //clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim
+ << ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
+ << "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
+ }
+ else
+ {
+ // need to handle boundary
+ clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "inputA_R - batch_offset*" << smaller_dim * bigger_dim
+ << ", inputA_I - batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim
+ << "+group_offset+idx+" << i << ", post_userdata, preValue[idx+" << i << "].x, prevValue[idx+" << i << "].y);" << std::endl;
+ clKernWrite(transKernel, 6) << "}" << std::endl;
+ }
+ }
+ else
+ {
+ if (i + 256 < LDS_per_WG)
+ {
+ clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
+ }
+ else
+ {
+ // need to handle boundary
+ clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << LDS_per_WG << "){" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
+ clKernWrite(transKernel, 6) << "}" << std::endl;
+ }
+ }
}
}
break;
@@ -1823,13 +1864,9 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
{
case CLFFT_COMPLEX_INTERLEAVED:
if (params.fft_hasPreCallback)
- {
clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA + iOffset;" << std::endl;
- }
else
- {
clKernWrite(transKernel, 3) << "global " << dtInput << " *outputA = inputA;" << std::endl;
- }
break;
case CLFFT_COMPLEX_PLANAR:
if (params.fft_hasPreCallback)
@@ -2046,9 +2083,7 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
if (params.fft_hasPostCallback)
{
if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
- {
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
- }
else
{
size_t blockOffset = params.fft_inStride[2];
@@ -2064,9 +2099,7 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
clKernWrite(transKernel, 0) << ");" << std::endl;
if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
- {
clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
- }
else
{
size_t blockOffset = params.fft_inStride[2];
@@ -2275,22 +2308,16 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
if (params.fft_hasPostCallback)
{
if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
- }
else
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
- }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
}
clKernWrite(transKernel, 0) << ");" << std::endl;
if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
- }
else
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA - iOffset, iOffset + ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
if (params.fft_postCallback.localMemSize > 0)
@@ -2309,13 +2336,9 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
if (params.fft_hasPostCallback)
{
if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
- }
else
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
- }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
@@ -2323,14 +2346,9 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
clKernWrite(transKernel, 0) << ");" << std::endl;
if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
- }
else
- {
clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R-iOffset, outputA_I-iOffset, iOffset+((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
-
- }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 0428c1c..a1c1614 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -623,7 +623,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0]/clLengths[1];
//timmy ensure clLengths[0] > clLengths[1] only when inplace is enabled
- //so that swap kernel is launched after the square transpose kernel since twiddling is only enabled as the second kernel
+ //so that swap kernel is launched after the square transpose kernel since twiddling is only enabled in swap kernel if it is the second kernel
if (clLengths[0] < clLengths[1] && clfftGetRequestLibNoMemAlloc() && fftPlan->placeness == CLFFT_INPLACE)
{
std::cout << "switch lengths" << std::endl;
@@ -2107,7 +2107,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
{
if (fftPlan->large1D != 0 && 0)
{
- //this is not going to happen
+ //this is not going to happen anymore
currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
}
else
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list