[clfft] 07/32: added twiddling to the end of swap kernel. passed some tests.
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Apr 26 08:34:08 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 9a069231eb470f1b0370074e3ebde1934806a626
Author: Timmy <timmy.liu at amd.com>
Date: Tue Mar 15 18:02:55 2016 -0600
added twiddling to the end of swap kernel. passed some tests.
---
src/library/action.transpose.cpp | 24 ++--
src/library/generator.transpose.cpp | 150 +++++++++++++++++++-----
src/library/plan.cpp | 17 +--
src/tests/accuracy_test_pow2.cpp | 2 +-
src/tests/accuracy_test_pow3.cpp | 220 +++++++++++++++++++++++++++++++++---
5 files changed, 351 insertions(+), 62 deletions(-)
diff --git a/src/library/action.transpose.cpp b/src/library/action.transpose.cpp
index 7ec6947..ce657bd 100644
--- a/src/library/action.transpose.cpp
+++ b/src/library/action.transpose.cpp
@@ -269,6 +269,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
}
//here we should decide generate what kind of swap kernel. 1:2 and 1:3 probably need different swap kernels
+ /*
if (this->signature.fft_N[0] == 2 * this->signature.fft_N[1] || 2 * this->signature.fft_N[0] == this->signature.fft_N[1])
{
OPENCL_V(clfft_transpose_generator::genSwapKernel(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
@@ -277,6 +278,9 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
{
OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
}
+ */
+ //general swap kernel takes care of all ratio
+ OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
std::cout << programCode << std::endl;//TIMMY
}
@@ -305,7 +309,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
else if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
- if (this->signature.fft_3StepTwiddle)
+ if (this->signature.fft_3StepTwiddle && (this->signature.transposeMiniBatchSize == 1)) //if miniBatchSize > 1 twiddling is done in swap kernel
{
OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
}
@@ -362,8 +366,8 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
else
{
/*Now calculate the data for the swap kernels */
-
- if(dim_ratio == 2){
+ // general swap kernel takes care of all ratio. need clean up here
+ if(dim_ratio == 2 && 0){
//1:2 ratio
size_t input_elm_size_in_bytes;
switch (this->signature.fft_precision)
@@ -438,18 +442,24 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
}
else
{
- if (dim_ratio == 3 || dim_ratio == 5 || dim_ratio == 10)
+ if (dim_ratio == 2 || dim_ratio == 3 || dim_ratio == 5 || dim_ratio == 10)
{
//1:3 ratio
size_t local_work_size_swap = 256;
std::vector<std::vector<size_t> > permutationTable;
clfft_transpose_generator::permutation_calculation(dim_ratio, smaller_dim, permutationTable);
- size_t global_item_size = permutationTable.size() * local_work_size_swap * this->plan->batchsize;
- for (int i = 2; i < this->plan->length.size(); i++)
- global_item_size *= this->plan->length[i];
+ size_t global_item_size;
+ if(this->plan->large1D && (dim_ratio > 1))
+ global_item_size = (permutationTable.size() + 2) * local_work_size_swap * this->plan->batchsize;
+ else
+ global_item_size = (permutationTable.size() + 2) * local_work_size_swap * this->plan->batchsize;
+ for (int i = 2; i < this->plan->length.size(); i++)
+ global_item_size *= this->plan->length[i];
globalWS.push_back(global_item_size);
localWS.push_back(local_work_size_swap);
}
+ else
+ return CLFFT_NOTIMPLEMENTED;
}
}
return CLFFT_SUCCESS;
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
index c17080a..f4ea9da 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.cpp
@@ -35,7 +35,6 @@ void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& par
clKernWrite(transKernel, 3) << "size_t " << offset << " = 0;" << std::endl;
clKernWrite(transKernel, 3) << "g_index = get_group_id(0);" << std::endl;
-
for (size_t i = params.fft_DataDim - 2; i > 0; i--)
{
clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
@@ -90,6 +89,7 @@ clfftStatus genTwiddleMath(const FFTKernelGenKeyParams& params, std::stringstrea
clKernWrite(transKernel, 9) << dtComplex << " Wm = TW3step( (t_gx_p*32 + lidx) * (t_gy_p*32 + lidy + loop*8) );" << std::endl;
clKernWrite(transKernel, 9) << dtComplex << " Wt = TW3step( (t_gy_p*32 + lidx) * (t_gx_p*32 + lidy + loop*8) );" << std::endl;
+
clKernWrite(transKernel, 9) << dtComplex << " Tm, Tt;" << std::endl;
if (fwd)
@@ -1011,11 +1011,16 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
//if pre-callback is set for the plan
//if post-callback is set for the plan
+ //twiddle in swap kernel
+ bool twiddleSwapKernel = params.fft_3StepTwiddle && (dim_ratio > 1);
+
//generate the swap_table
std::vector<std::vector<size_t> > permutationTable;
permutation_calculation(dim_ratio, smaller_dim, permutationTable);
- clKernWrite(transKernel, 0) << "__constant int swap_table["<< permutationTable.size() <<"][1] = {" << std::endl;
+ clKernWrite(transKernel, 0) << "__constant int swap_table["<< permutationTable.size()+2 <<"][1] = {" << std::endl;
+ clKernWrite(transKernel, 0) << "{0}," << std::endl;
+ clKernWrite(transKernel, 0) << "{"<< smaller_dim * dim_ratio - 1 <<"}," << std::endl;
for (std::vector<std::vector<size_t> >::iterator itor = permutationTable.begin(); itor != permutationTable.end(); itor++)
{
clKernWrite(transKernel, 0) << "{" << (*itor)[0] << "}";
@@ -1025,6 +1030,19 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
clKernWrite(transKernel, 0) << "," << std::endl;
}
+ //twiddle in swap kernel
+ if (twiddleSwapKernel)
+ {
+ std::string str;
+ StockhamGenerator::TwiddleTableLarge twLarge(smaller_dim * smaller_dim * dim_ratio);
+ if ((params.fft_precision == CLFFT_SINGLE) || (params.fft_precision == CLFFT_SINGLE_FAST))
+ twLarge.GenerateTwiddleTable<StockhamGenerator::P_SINGLE>(str);
+ else
+ twLarge.GenerateTwiddleTable<StockhamGenerator::P_DOUBLE>(str);
+ clKernWrite(transKernel, 0) << str << std::endl;
+ clKernWrite(transKernel, 0) << std::endl;
+ }
+
//std::string funcName = "swap_nonsquare_" + std::to_string(smaller_dim) + "_" + std::to_string(dim_ratio);
std::string funcName = "swap_nonsquare_";
std::string smaller_dim_str = static_cast<std::ostringstream*>(&(std::ostringstream() << smaller_dim))->str();
@@ -1042,7 +1060,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
genTransposePrototypeLeadingDimensionBatched(params, local_work_size_swap, dtPlanar, dtComplex, funcName, transKernel, dtInput, dtOutput);
clKernWrite(transKernel, 3) << "//each wg handles one row of " << smaller_dim << " in memory" << std::endl;
- clKernWrite(transKernel, 3) << "const int num_wg_per_batch = " << permutationTable.size() << ";" << std::endl; // number of wg per batch = number of independent cycles
+ clKernWrite(transKernel, 3) << "const int num_wg_per_batch = " << permutationTable.size() + 2 << ";" << std::endl; // number of wg per batch = number of independent cycles
clKernWrite(transKernel, 3) << "int group_id = get_group_id(0);" << std::endl;
clKernWrite(transKernel, 3) << "int idx = get_local_id(0);" << std::endl;
@@ -1066,7 +1084,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
default:
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
- clKernWrite(transKernel, 3) << "group_id -= batch_offset*" << permutationTable.size() << ";" << std::endl;
+ clKernWrite(transKernel, 3) << "group_id -= batch_offset*" << permutationTable.size() + 2 << ";" << std::endl;
clKernWrite(transKernel, 3) << std::endl;
clKernWrite(transKernel, 3) << "int prev = swap_table[group_id][0];" <<std::endl;
@@ -1161,15 +1179,28 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
{
clKernWrite(transKernel, 6) << "next = (prev*" << smaller_dim << ")%" << smaller_dim*dim_ratio - 1 << ";" << std::endl;
+ //ugly
+ clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
+ clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
+
clKernWrite(transKernel, 6) << "group_offset = (next/" << dim_ratio << ")*" << smaller_dim << "*" << dim_ratio
<< " + (next%" << dim_ratio << ")*" << smaller_dim << ";" << std::endl; //might look like: group_offset = (next/3)*729*3 + (next%3)*729;
+ if (twiddleSwapKernel)
+ {
+ //TODO
+ }
}
else
{
clKernWrite(transKernel, 6) << "next = (prev*" << dim_ratio << ")%" << smaller_dim*dim_ratio - 1 << ";" << std::endl;
- clKernWrite(transKernel, 3) << "group_offset = (next*" << smaller_dim << ");" << std::endl; //might look like: int group_offset = prev*729;
+ //ugly
+ clKernWrite(transKernel, 6) << "if (prev == " << smaller_dim * dim_ratio - 1 << ")" << std::endl;
+ clKernWrite(transKernel, 9) << "next = " << smaller_dim * dim_ratio - 1 << ";" << std::endl;
+
+ clKernWrite(transKernel, 6) << "group_offset = (next*" << smaller_dim << ");" << std::endl; //might look like: int group_offset = prev*729;
}
+
clKernWrite(transKernel, 3) << std::endl;
switch (params.fft_inputLayout)
{
@@ -1244,22 +1275,74 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
case CLFFT_COMPLEX_PLANAR:
{
- for (int i = 0; i < smaller_dim; i = i + 256)
- {
- if (i + 256 < smaller_dim)
- {
- clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
- clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
- }
- else
- {
- // need to handle boundary
- clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << smaller_dim << "){" << std::endl;
- clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
- clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
- clKernWrite(transKernel, 6) << "}" << std::endl;
- }
- }
+ if (twiddleSwapKernel)
+ {
+ clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
+ clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
+ clKernWrite(transKernel, 6) << dtComplex <<" twiddle_factor;" << std::endl;
+ for (int i = 0; i < smaller_dim; i = i + 256)
+ {
+ if (i + 256 < smaller_dim)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ // need to handle boundary
+ clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << smaller_dim << "){" << std::endl;
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.x - prevValue[idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x * twiddle_factor.y + prevValue[idx+" << i << "].y * twiddle_factor.x;" << std::endl;
+ clKernWrite(transKernel, 6) << "}" << std::endl;
+ }
+ clKernWrite(transKernel, 3) << std::endl;
+ }
+ }
+ else
+ {
+ for (int i = 0; i < smaller_dim; i = i + 256)
+ {
+ if (i + 256 < smaller_dim)
+ {
+ clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
+ }
+ else
+ {
+ // need to handle boundary
+ clKernWrite(transKernel, 6) << "if(idx+" << i << "<" << smaller_dim << "){" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_R[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].x;" << std::endl;
+ clKernWrite(transKernel, 6) << "inputA_I[group_offset+idx+" << i << "] = prevValue[idx+" << i << "].y;" << std::endl;
+ clKernWrite(transKernel, 6) << "}" << std::endl;
+ }
+ }
+ }
break;
}
default:
@@ -1359,9 +1442,12 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
break;
}
-
+ // it is a better idea to do twiddle in swap kernel if we will have a swap kernel.
+ // for pure square transpose, twiddle will be done in transpose kernel
+ bool twiddleTransposeKernel = params.fft_3StepTwiddle && (params.transposeMiniBatchSize == 1);//when transposeMiniBatchSize == 1 it is guaranteed to be a sqaure matrix transpose
// If twiddle computation has been requested, generate the lookup function
- if (params.fft_3StepTwiddle)
+
+ if (twiddleTransposeKernel)
{
std::string str;
StockhamGenerator::TwiddleTableLarge twLarge(params.fft_N[0] * params.fft_N[1]);
@@ -1372,7 +1458,7 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
clKernWrite(transKernel, 0) << str << std::endl;
clKernWrite(transKernel, 0) << std::endl;
}
-
+
// This detects whether the input matrix is square
@@ -1407,7 +1493,7 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
}
std::string funcName;
- if (params.fft_3StepTwiddle) // TODO
+ if (twiddleTransposeKernel) // it makes more sense to do twiddling in swap kernel
funcName = fwd ? "transpose_square_tw_fwd" : "transpose_square_tw_back";
else
funcName = "transpose_square";
@@ -1432,6 +1518,7 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
OffsetCalc(transKernel, params, true);
+
if (params.fft_placeness == CLFFT_OUTOFPLACE)
OffsetCalc(transKernel, params, false);
@@ -1644,8 +1731,9 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
+ // it makes more sense to do twiddling in swap kernel
// If requested, generate the Twiddle math to multiply constant values
- if (params.fft_3StepTwiddle)
+ if (twiddleTransposeKernel)
genTwiddleMath(params, transKernel, dtComplex, fwd);
clKernWrite(transKernel, 6) << "xy_s[index] = tmpm; " << std::endl;
@@ -1835,8 +1923,9 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
}
+ // it makes more sense to do twiddling in swap kernel
// If requested, generate the Twiddle math to multiply constant values
- if (params.fft_3StepTwiddle)
+ if (twiddleTransposeKernel)
genTwiddleMath(params, transKernel, dtComplex, fwd);
clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
@@ -1915,7 +2004,7 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
// If requested, generate the Twiddle math to multiply constant values
- if (params.fft_3StepTwiddle)
+ if (twiddleTransposeKernel)
genTwiddleMath(params, transKernel, dtComplex, fwd);
clKernWrite(transKernel, 9) << "xy_s[index] = tmpm;" << std::endl;
@@ -2083,8 +2172,9 @@ clfftStatus genTransposeKernelBatched(const FFTGeneratedTransposeSquareAction::S
strKernel = transKernel.str();
- if (!params.fft_3StepTwiddle)
- break;
+
+ if (!twiddleTransposeKernel)
+ break; // break for bothDir
}
return CLFFT_SUCCESS;
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 7b7e103..9dc23ef 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -609,7 +609,9 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
}
}
}
-
+ // add some special cases
+ if (fftPlan->length[0] == 100000)
+ clLengths[1] = 100;
clLengths[0] = fftPlan->length[0]/clLengths[1];
// Start of block where transposes are generated; 1D FFT
@@ -781,7 +783,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans2Plan->oDist = clLengths[1] * trans2Plan->outStride[1];
trans2Plan->gen = transGen;
- if(transGen != Transpose_NONSQUARE)//TIMMY twiddle
+ if (transGen != Transpose_NONSQUARE)//TIMMY twiddle
trans2Plan->large1D = fftPlan->length[0];
trans2Plan->transflag = true;
@@ -837,7 +839,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
row2Plan->oDist *= fftPlan->length[index];
}
- if (transGen == Transpose_NONSQUARE)//TIMMY twiddle
+ if (transGen != Transpose_NONSQUARE)//TIMMY twiddle in transform
{
row2Plan->large1D = fftPlan->length[0];
row2Plan->twiddleFront = true;
@@ -1991,7 +1993,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
}
else
{
- if (fftPlan->large1D != 0)
+ if (fftPlan->large1D != 0 && 0)
{
//currently tranpose twiddling is only supported in below case
//TODO support tranpose twiddling for all cases.
@@ -2004,8 +2006,8 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
}
//if the original input data is more than 1d only TRANSPOSE_LEADING_AND_SWAP order is supported
//TODO need to fix this here. related to multi dim batch size.
- if (fftPlan->length.size() > 2)
- currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
+ //if (fftPlan->length.size() > 2) //Timmy test
+ // currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
std::cout << "currKernelOrder = " << currKernelOrder << std::endl;
//ends tranpose kernel order
@@ -2038,7 +2040,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
else
trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
trans1Plan->transflag = true;
- trans1Plan->large1D = fftPlan->large1D;
+ trans1Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
if (trans1Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
@@ -2111,6 +2113,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
else
trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
trans2Plan->transflag = true;
+ trans2Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
if (trans2Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
diff --git a/src/tests/accuracy_test_pow2.cpp b/src/tests/accuracy_test_pow2.cpp
index 4c8a0c6..43271ce 100644
--- a/src/tests/accuracy_test_pow2.cpp
+++ b/src/tests/accuracy_test_pow2.cpp
@@ -1298,7 +1298,7 @@ TEST_F(accuracy_test_pow2_double, large_1D_forward_in_place_complex_planar_to_co
// *****************************************************
// *****************************************************
-//#define CLFFT_TEST_HUGE
+#define CLFFT_TEST_HUGE
#ifdef CLFFT_TEST_HUGE
#define HUGE_TEST_MAKE(test_name, len, bat) \
diff --git a/src/tests/accuracy_test_pow3.cpp b/src/tests/accuracy_test_pow3.cpp
index ab88cd5..3e7770e 100644
--- a/src/tests/accuracy_test_pow3.cpp
+++ b/src/tests/accuracy_test_pow3.cpp
@@ -1816,7 +1816,7 @@ TEST_F(accuracy_test_pow3_double, large_1D_out_of_place_hermitian_planar_to_real
// *****************************************************
template< class T, class cl_T, class fftw_T>
-void huge_1D_forward_in_place_complex_planar_to_complex_planar(size_t lenSize, size_t batchSize, layout::buffer_layout_t layoutType, direction::direction_t direction_type)
+void huge_1D_forward_in_place_complex_to_complex(size_t lenSize, size_t batchSize, layout::buffer_layout_t layoutType, direction::direction_t direction_type)
{
std::vector<size_t> lengths;
lengths.push_back(lenSize);
@@ -1836,92 +1836,278 @@ void huge_1D_forward_in_place_complex_planar_to_complex_planar(size_t lenSize, s
//177147 = 243 * 243 * 3, backward and forward, planar and interleaved, single and double, batch size 1 and 3
TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 1, layout::complex_planar, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 1, layout::complex_planar, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 1, layout::complex_planar, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 1, layout::complex_planar, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 3, layout::complex_planar, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 3, layout::complex_planar, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 3, layout::complex_planar, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 3, layout::complex_planar, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_planar_to_complex_planar_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 1, layout::complex_planar, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 1, layout::complex_planar, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_planar_to_complex_planar_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 1, layout::complex_planar, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 1, layout::complex_planar, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_planar_to_complex_planar_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 3, layout::complex_planar, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 3, layout::complex_planar, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_planar_to_complex_planar_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 3, layout::complex_planar, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 3, layout::complex_planar, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
//interleaved
TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 1, layout::complex_interleaved, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 1, layout::complex_interleaved, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 1, layout::complex_interleaved, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 1, layout::complex_interleaved, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 3, layout::complex_interleaved, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 3, layout::complex_interleaved, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< float, cl_float, fftwf_complex >(177147, 3, layout::complex_interleaved, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(177147, 3, layout::complex_interleaved, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 1, layout::complex_interleaved, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 1, layout::complex_interleaved, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_177147_1)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 1, layout::complex_interleaved, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 1, layout::complex_interleaved, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 3, layout::complex_interleaved, direction::forward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 3, layout::complex_interleaved, direction::forward); }
catch (const std::exception& err) { handle_exception(err); }
}
TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_177147_3)
{
- try { huge_1D_forward_in_place_complex_planar_to_complex_planar< double, cl_double, fftw_complex >(177147, 3, layout::complex_interleaved, direction::backward); }
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(177147, 3, layout::complex_interleaved, direction::backward); }
catch (const std::exception& err) { handle_exception(err); }
}
+//78125 = 125 * 125 * 5, backward and forward, planar and interleaved, single and double, batch size 1 and 3
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 1, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 1, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 3, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 3, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_planar_to_complex_planar_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 1, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_planar_to_complex_planar_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 1, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_planar_to_complex_planar_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 3, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_planar_to_complex_planar_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 3, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+//interleaved
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 1, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 1, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 3, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(78125, 3, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 1, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_78125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 1, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 3, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_double, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_78125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(78125, 3, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+//1594323 = 729 * 729 * 3 backward and forward, planar and interleaved, single only, double would require 3d transpose (not supported), batch size 1 and 3
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_1594323_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 1, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_1594323_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 1, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_1594323_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 3, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_1594323_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 3, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_double, huge_1D_forward_in_place_complex_planar_to_complex_planar_1594323_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< double, cl_double, fftw_complex >(1594323, 1, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+//interleaved
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_1594323_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 1, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_1594323_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 1, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_1594323_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 3, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_1594323_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1594323, 3, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+//1953125 = 625 * 625 * 5 backward and forward, planar and interleaved, single only, double would require 3d transpose (not supported), batch size 1 and 3
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_1953125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 1, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_1953125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 1, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_planar_to_complex_planar_1953125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 3, layout::complex_planar, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_planar_to_complex_planar_1953125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 3, layout::complex_planar, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+//interleaved
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_1953125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 1, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_1953125_1)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 1, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
+TEST_F(accuracy_test_pow3_single, huge_1D_forward_in_place_complex_interleaved_to_complex_interleaved_1953125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 3, layout::complex_interleaved, direction::forward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+TEST_F(accuracy_test_pow3_single, huge_1D_backward_in_place_complex_interleaved_to_complex_interleaved_1953125_3)
+{
+ try { huge_1D_forward_in_place_complex_to_complex< float, cl_float, fftwf_complex >(1953125, 3, layout::complex_interleaved, direction::backward); }
+ catch (const std::exception& err) { handle_exception(err); }
+}
+
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ //
// ^^^^^^^^^^^^^^^^^^^^^^^ normal 2D ^^^^^^^^^^^^^^^^^^^^^^ //
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list