[clfft] 07/21: fix introduced post callback test failures. precallback not fixed yet.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Mar 16 13:14:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 5651302b69dba28ec75112921f5109982afc40ad
Author: Timmy <timmy.liu at amd.com>
Date: Wed Feb 24 17:28:02 2016 -0600
fix introduced post callback test failures. precallback not fixed yet.
---
src/library/action.transpose.cpp | 9 ++
src/library/generator.transpose.cpp | 223 ++++++++++++++++++++++++++++++------
src/library/plan.cpp | 2 +
src/library/plan.h | 16 +++
4 files changed, 216 insertions(+), 34 deletions(-)
diff --git a/src/library/action.transpose.cpp b/src/library/action.transpose.cpp
index 487a1ef..baf4022 100644
--- a/src/library/action.transpose.cpp
+++ b/src/library/action.transpose.cpp
@@ -181,6 +181,9 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::initParams()
}
this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
+ this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
+ this->signature.transposeBatchSize = this->plan->batchsize;
+
return CLFFT_SUCCESS;
}
@@ -217,6 +220,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
+ //std::cout << programCode << std::endl;//TIMMY
}
else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
@@ -240,6 +244,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
+ //std::cout << programCode << std::endl;//TIMMY
}
else
{
@@ -266,6 +271,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
OPENCL_V(clfft_transpose_generator::genSwapKernel(this->signature, programCode, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
+ //std::cout << programCode << std::endl;//TIMMY
}
cl_int status = CL_SUCCESS;
@@ -563,6 +569,9 @@ clfftStatus FFTGeneratedTransposeSquareAction::initParams()
}
this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
+ this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
+ this->signature.transposeBatchSize = this->plan->batchsize;
+
return CLFFT_SUCCESS;
}
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
index 0b4d133..319e207 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.cpp
@@ -867,12 +867,12 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
// Generate kernel API
genTransposePrototype(params, lwSize, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
-
+ int wgPerBatch;
if (mult_of_16)
- clKernWrite(transKernel, 3) << "const int numGroupsY_1 = " << (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2 << ";" << std::endl;
+ wgPerBatch = (params.fft_N[0] / 16 / reShapeFactor)*(params.fft_N[0] / 16 / reShapeFactor + 1) / 2;
else
- clKernWrite(transKernel, 3) << "const int numGroupsY_1 = " << (params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) / 2 << ";" << std::endl;
-
+ wgPerBatch = (params.fft_N[0] / (16 * reShapeFactor) + 1)*(params.fft_N[0] / (16 * reShapeFactor) + 1 + 1) / 2;
+ clKernWrite(transKernel, 3) << "const int numGroupsY_1 = " << wgPerBatch << ";" << std::endl;
for (int i = 2; i < params.fft_DataDim - 1; i++)
{
@@ -1123,14 +1123,32 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
case CLFFT_COMPLEX_INTERLEAVED:
if (params.fft_hasPostCallback)
{
- clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+ if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+ else
+ {
+ //assume transpose is only two dimensional for now
+ //int actualBatchSize = params.transposeBatchSize / params.transposeMiniBatchSize;
+ int blockOffset = params.fft_inStride[2];
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-" << blockOffset <<"*((get_group_id(0)/numGroupsY_1)%"<< params.transposeMiniBatchSize <<"), ((idy + loop*" << 16 / reShapeFactor << ")*"
+ << params.fft_N[0] << " + idx + "<< blockOffset <<"*( (get_group_id(0)/numGroupsY_1 )%" << params.transposeMiniBatchSize <<") " << "), post_userdata, yx_s[index]";
+ }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
}
clKernWrite(transKernel, 0) << ");" << std::endl;
- clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index]";
+ if (params.transposeMiniBatchSize < 2)
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
+ else
+ {
+ int blockOffset = params.fft_inStride[2];
+ //clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-iOffset, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx +iOffset), post_userdata, xy_s[index]";
+ //clKernWrite(transKernel, 0) << std::endl;
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA-" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize << "), ((lidy + loop*" << 16 / reShapeFactor << ")*"
+ << params.fft_N[0] << " + lidx + starting_index_yx + " << blockOffset << "*( (get_group_id(0)/numGroupsY_1 )%" << params.transposeMiniBatchSize << ") " << "), post_userdata, xy_s[index]";
+ }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
@@ -1146,14 +1164,36 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
case CLFFT_COMPLEX_PLANAR:
if (params.fft_hasPostCallback)
{
- clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+ if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
+ {
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+ }
+ else
+ {
+ int blockOffset = params.fft_inStride[2];
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R - "<< blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
+ "), outputA_I -" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
+ "), ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx +"<< blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
+ ")), post_userdata, yx_s[index].x, yx_s[index].y";
+ }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
}
clKernWrite(transKernel, 0) << ");" << std::endl;
- clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+ if (params.transposeMiniBatchSize < 2)//which means the matrix was not broken down into sub square matrics
+ {
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+ }
+ else
+ {
+ int blockOffset = params.fft_inStride[2];
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R - " << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
+ "), outputA_I -" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
+ "), ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx +" << blockOffset << "*((get_group_id(0)/numGroupsY_1)%" << params.transposeMiniBatchSize <<
+ ")), post_userdata, xy_s[index].x, xy_s[index].y";
+ }
if (params.fft_postCallback.localMemSize > 0)
{
clKernWrite(transKernel, 0) << ", localmem";
@@ -1864,22 +1904,60 @@ clfftStatus genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTranspos
switch (params.fft_outputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- case CLFFT_REAL:
- clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
- clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index]";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index];" << std::endl;
+ }
break;
case CLFFT_COMPLEX_PLANAR:
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
- clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
- clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
+ clKernWrite(transKernel, 6) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
- clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
- clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx+ starting_index_yx] = xy_s[index].y;" << std::endl;
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
default:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
@@ -2054,24 +2132,59 @@ clfftStatus genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTranspos
switch (params.fft_outputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- case CLFFT_REAL:
- clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
- clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+
+ clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 9) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index];" << std::endl;
+ clKernWrite(transKernel, 9) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index]; " << std::endl;
+ }
break;
case CLFFT_COMPLEX_PLANAR:
- clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
- clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
- clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
- clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
-
-
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+ clKernWrite(transKernel, 9) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 9) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x;" << std::endl;
+ clKernWrite(transKernel, 9) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y;" << std::endl;
+ clKernWrite(transKernel, 9) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x; " << std::endl;
+ clKernWrite(transKernel, 9) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; " << std::endl;
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
-
+ case CLFFT_REAL:
+ break;
default:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
@@ -2089,26 +2202,68 @@ clfftStatus genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTranspos
switch (params.fft_outputLayout)
{
case CLFFT_COMPLEX_INTERLEAVED:
- case CLFFT_REAL:
clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << " && idx<" << smaller_dim << ")" << std::endl;
- clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
- clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
- clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index]";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+
+ clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
+ clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index]";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << ");" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 12) << "outputA[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index]; " << std::endl;
+ clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ")" << std::endl;
+ clKernWrite(transKernel, 12) << "outputA[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index];" << std::endl;
+ }
break;
case CLFFT_COMPLEX_PLANAR:
clKernWrite(transKernel, 9) << "if ((idy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << " && idx<" << smaller_dim << ") {" << std::endl;
- clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
- clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
- clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
- clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
- clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
+ if (params.fft_hasPostCallback)
+ {
+ clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx), post_userdata, yx_s[index].x, yx_s[index].y";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << "); }" << std::endl;
+
+ clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
+
+ clKernWrite(transKernel, 12) << params.fft_postCallback.funcname << "(outputA_R, outputA_I, ((lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx), post_userdata, xy_s[index].x, xy_s[index].y";
+ if (params.fft_postCallback.localMemSize > 0)
+ {
+ clKernWrite(transKernel, 0) << ", localmem";
+ }
+ clKernWrite(transKernel, 0) << "); }" << std::endl;
+ }
+ else
+ {
+ clKernWrite(transKernel, 12) << "outputA_R[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].x; " << std::endl;
+ clKernWrite(transKernel, 12) << "outputA_I[(idy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + idx] = yx_s[index].y; }" << std::endl;
+ clKernWrite(transKernel, 9) << "if ((t_gy_p * " << 16 * reShapeFactor << " + lidx)<" << smaller_dim << " && (t_gx_p * " << 16 * reShapeFactor << " + lidy + loop*" << 16 / reShapeFactor << ")<" << smaller_dim << ") {" << std::endl;
+ clKernWrite(transKernel, 12) << "outputA_R[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].x;" << std::endl;
+ clKernWrite(transKernel, 12) << "outputA_I[(lidy + loop*" << 16 / reShapeFactor << ")*" << params.fft_N[0] << " + lidx + starting_index_yx] = xy_s[index].y; }" << std::endl;
+ }
break;
case CLFFT_HERMITIAN_INTERLEAVED:
case CLFFT_HERMITIAN_PLANAR:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
+ case CLFFT_REAL:
+ break;
default:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 46c240a..85f1f76 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -2017,6 +2017,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
int lengthY = trans1Plan->length[1];
int BatchFactor = (lengthX > lengthY) ? (lengthX / lengthY) : (lengthY / lengthX);
+ trans1Plan->transposeMiniBatchSize = BatchFactor;
trans1Plan->batchsize *= BatchFactor;
trans1Plan->iDist = trans1Plan->iDist / BatchFactor;
if (lengthX > lengthY)
@@ -2085,6 +2086,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
int lengthY = trans2Plan->length[1];
int BatchFactor = (lengthX > lengthY) ? (lengthX/lengthY) : (lengthY/lengthX);
+ trans2Plan->transposeMiniBatchSize = BatchFactor;
trans2Plan->batchsize *= BatchFactor;
trans2Plan->iDist = trans2Plan->iDist / BatchFactor;
if (lengthX > lengthY)
diff --git a/src/library/plan.h b/src/library/plan.h
index a9cf370..f0f7962 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -155,6 +155,14 @@ struct FFTKernelGenKeyParams {
size_t blockLDS;
NonSquareTransposeKernelType nonSquareKernelType;
+ // sometimes a non-square matrix is broken down into a number of
+ // square matrices during in-place transpose
+ // let's call this number transposeMiniBatchSize
+ // no user of the library should set its value
+ size_t transposeMiniBatchSize;
+ // transposeBatchSize is the number of batches times transposeMiniBatchSize
+ // no user of the library should set its value
+ size_t transposeBatchSize;
bool fft_hasPreCallback;
clfftCallbackParam fft_preCallback;
@@ -199,6 +207,8 @@ struct FFTKernelGenKeyParams {
blockSIMD = 0;
blockLDS = 0;
nonSquareKernelType = NON_SQUARE_TRANS_PARENT;
+ transposeMiniBatchSize = 1;
+ transposeBatchSize = 1;
fft_hasPreCallback = false;
fft_hasPostCallback = false;
limit_LocalMemSize = 0;
@@ -482,6 +492,11 @@ public:
FFTAction * action;
NonSquareTransposeKernelType nonSquareKernelType;
+ // sometimes a non-square matrix is broken down into a number of
+ // square matrices during in-place transpose
+ // let's call this number transposeMiniBatchSize
+ // no user of the library should set its value
+ size_t transposeMiniBatchSize;
FFTPlan ()
: baked (false)
@@ -527,6 +542,7 @@ public:
, gen(Stockham)
, action(0)
, nonSquareKernelType(NON_SQUARE_TRANS_PARENT)
+ , transposeMiniBatchSize(1)
, plHandle(0)
, hasPreCallback(false)
, hasPostCallback(false)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list