[clfft] 15/32: bug fix
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Apr 26 08:34:09 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit de499e6d49bb6386ba55a3927c23f51222676021
Author: Timmy <timmy.liu at amd.com>
Date: Mon Mar 28 12:31:34 2016 -0500
bug fix
---
src/library/action.transpose.cpp | 1 +
src/library/generator.transpose.cpp | 292 ++++++++++++++++++++++++++++++++++--
src/library/plan.cpp | 54 +++----
src/library/plan.h | 25 +++
4 files changed, 320 insertions(+), 52 deletions(-)
diff --git a/src/library/action.transpose.cpp b/src/library/action.transpose.cpp
index c9dc999..4f86859 100644
--- a/src/library/action.transpose.cpp
+++ b/src/library/action.transpose.cpp
@@ -183,6 +183,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::initParams()
this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
this->signature.transposeMiniBatchSize = this->plan->transposeMiniBatchSize;
+ this->signature.nonSquareKernelOrder = this->plan->nonSquareKernelOrder;
this->signature.transposeBatchSize = this->plan->batchsize;
return CLFFT_SUCCESS;
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
index d9e9a20..5c28076 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.cpp
@@ -1051,6 +1051,11 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
//twiddle in swap kernel (for now, swap with twiddle seems to always be the second kernel after transpose)
bool twiddleSwapKernel = params.fft_3StepTwiddle && (dim_ratio > 1);
+ //twiddle factors applied to the output of swap kernels if swap kernels are the last kernel in transpose order
+ bool twiddleSwapKernelOut = twiddleSwapKernel && (params.nonSquareKernelOrder == TRANSPOSE_AND_SWAP || params.nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP);
+ //twiddle factors applied to the input of swap kernels if swap kernels are the first kernel in transpose order
+ bool twiddleSwapKernelIn = twiddleSwapKernel && (params.nonSquareKernelOrder == SWAP_AND_TRANSPOSE);
+
//generate the swap_table
std::vector<std::vector<size_t> > permutationTable;
@@ -1069,6 +1074,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
//twiddle in swap kernel
+ //twiddle in or out should be using the same twiddling table
if (twiddleSwapKernel)
{
std::string str;
@@ -1195,6 +1201,12 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
clKernWrite(transKernel, 3) << std::endl;
//move to that row block and load that row block to LDS
+ if (twiddleSwapKernelIn)
+ {
+ clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
+ clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
+ clKernWrite(transKernel, 6) << dtComplex << " twiddle_factor;" << std::endl;
+ }
switch (params.fft_inputLayout)
{
case CLFFT_REAL:
@@ -1210,7 +1222,38 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
}
else
- clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ {
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ }
}
else
{
@@ -1222,7 +1265,38 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim * bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
}
else
- clKernWrite(transKernel, 6) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ {
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ clKernWrite(transKernel, 6) << "prevValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ }
clKernWrite(transKernel, 3) << "}" << std::endl;
}
}
@@ -1245,8 +1319,39 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
else
{
- clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
- clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ {
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ }
}
}
else
@@ -1261,8 +1366,39 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
else
{
- clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
- clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ {
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
+ clKernWrite(transKernel, 3) << "prevValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ }
}
clKernWrite(transKernel, 3) << "}" << std::endl;
}
@@ -1323,7 +1459,38 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim*bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
}
else
- clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ {
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ }
else
{
// need to handle boundary
@@ -1334,7 +1501,38 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
<< "(inputA-batch_offset*" << smaller_dim * bigger_dim << ", batch_offset*" << smaller_dim*bigger_dim << "+group_offset+idx+" << i << ", pre_userdata);" << std::endl;
}
else
- clKernWrite(transKernel, 9) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ {
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x - inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].x * twiddle_factor.y + inputA[group_offset+idx+" << i << "].y * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA[group_offset+idx+" << i << "].x * twiddle_factor.x + inputA[group_offset+idx+" << i << "].y * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA[group_offset+idx+" << i << "].y * twiddle_factor.x - inputA[group_offset+idx+" << i << "].x * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ clKernWrite(transKernel, 9) << "nextValue[idx+" << i << "] = inputA[group_offset+idx+" << i << "];" << std::endl;
+ }
clKernWrite(transKernel, 6) << "}" << std::endl;
}
}
@@ -1357,8 +1555,39 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
else
{
- clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
- clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
+ clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ }
}
}
else
@@ -1373,8 +1602,39 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
else
{
- clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
- clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ if (twiddleSwapKernelIn)
+ {
+ if (params.fft_N[0] > params.fft_N[1])//decides whether we have a tall or wide rectangle
+ {
+ //input is wide; output is tall; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << bigger_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << bigger_dim << ";" << std::endl;
+ }
+ else
+ {
+ //input is tall; output is wide; read input index related
+ clKernWrite(transKernel, 6) << "p = (group_offset+idx+" << i << ")/" << smaller_dim << ";" << std::endl;
+ clKernWrite(transKernel, 6) << "q = (group_offset+idx+" << i << ")%" << smaller_dim << ";" << std::endl;
+ }
+ clKernWrite(transKernel, 6) << "twiddle_factor = TW3step(p*q);" << std::endl;
+ if (fwd)
+ {
+ //forward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x;" << std::endl;
+ }
+ else
+ {
+ //backward
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "] * twiddle_factor.x + inputA_I[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ clKernWrite(transKernel, 3) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "] * twiddle_factor.x - inputA_R[group_offset+idx+" << i << "] * twiddle_factor.y;" << std::endl;
+ }
+ }
+ else
+ {
+ clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].x = inputA_R[group_offset+idx+" << i << "];" << std::endl;
+ clKernWrite(transKernel, 6) << "nextValue[idx+" << i << "].y = inputA_I[group_offset+idx+" << i << "];" << std::endl;
+ }
}
clKernWrite(transKernel, 6) << "}" << std::endl;
}
@@ -1393,7 +1653,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
case CLFFT_REAL:// for real case this is different
case CLFFT_COMPLEX_INTERLEAVED:
{
- if (twiddleSwapKernel)
+ if (twiddleSwapKernelOut)
{
clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
@@ -1464,7 +1724,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
}
}
}
- else
+ else if(!twiddleSwapKernelOut)//could be twiddleSwapKernelIn
{
for (int i = 0; i < LDS_per_WG; i = i + 256)
{
@@ -1508,7 +1768,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
return CLFFT_TRANSPOSED_NOTIMPLEMENTED;
case CLFFT_COMPLEX_PLANAR:
{
- if (twiddleSwapKernel)
+ if (twiddleSwapKernelOut)
{
clKernWrite(transKernel, 6) << "size_t p;" << std::endl;
clKernWrite(transKernel, 6) << "size_t q;" << std::endl;
@@ -1577,7 +1837,7 @@ clfftStatus genSwapKernelGeneral(const FFTGeneratedTransposeNonSquareAction::Sig
clKernWrite(transKernel, 3) << std::endl;
}
}
- else
+ else if (!twiddleSwapKernelOut)//could be twiddleSwapKernelIn
{
for (int i = 0; i < LDS_per_WG; i = i + 256)
{
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index a1c1614..8cb07cd 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -627,9 +627,9 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
if (clLengths[0] < clLengths[1] && clfftGetRequestLibNoMemAlloc() && fftPlan->placeness == CLFFT_INPLACE)
{
std::cout << "switch lengths" << std::endl;
- size_t temp = clLengths[0];
- clLengths[0] = clLengths[1];
- clLengths[1] = temp;
+ //size_t temp = clLengths[0];
+ //clLengths[0] = clLengths[1];
+ //clLengths[1] = temp;
}
// Start of block where transposes are generated; 1D FFT
@@ -2073,54 +2073,34 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0];
clLengths[1] = fftPlan->length[1];
-
- /*
- There are three ways of conducting inplace transpose with 1:2 (or 2:1) dimension ratio.
- A. first conduct line swapping kernels for the whole non square matrix
- then conduct batched square transpose along column dim (a 'real' batched transpose)
- B. first conduct batched square transpose along column dim (a 'real' batched transpose)
- then conduct line swapping kernels for the whole non square matrix (for 2:1 case)
- C. first conduct batched square transpose along leading dim (row dim)
- then conduct line swapping kernels for the whole non square matrix
- Note that the twiddle computation has to go at the begining of the first kernel or the end of the second kernel
-
- if leading dimension is bigger, it makes more sense (faster) to swap line first and then conduct batched square transpose
- if leading dimension is smaller, it makes more sense (faster) to conduct batched transpose and then swap lines.
- */
- enum NON_SQUARE_KERNEL_ORDER
- {
- SWAP_AND_TRANSPOSE, // A.
- TRANSPOSE_AND_SWAP, // B.
- TRANSPOSE_LEADING_AND_SWAP, // C.
- };
-
- NON_SQUARE_KERNEL_ORDER currKernelOrder;
+ //NON_SQUARE_KERNEL_ORDER currKernelOrder;
// controling the transpose and swap kernel order
// if leading dim is larger than the other dim it makes sense to swap and transpose
- if (clLengths[0] > clLengths[1] && fftPlan->large1D == 0)
+ if (clLengths[0] > clLengths[1])
{
//twidding can be done in swap when swap is the second kernel for now
//TODO enable twiddling in swap here as well
- currKernelOrder = SWAP_AND_TRANSPOSE;
+ //Twiddling can be done in any swap kernel now
+ fftPlan->nonSquareKernelOrder = SWAP_AND_TRANSPOSE;
}
else
{
if (fftPlan->large1D != 0 && 0)
{
//this is not going to happen anymore
- currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
+ fftPlan->nonSquareKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
}
else
{
//twiddling can be done in swap
- currKernelOrder = TRANSPOSE_AND_SWAP;
+ fftPlan->nonSquareKernelOrder = TRANSPOSE_AND_SWAP;
}
}
//if the original input data is more than 1d only TRANSPOSE_LEADING_AND_SWAP order is supported
//TODO need to fix this here. related to multi dim batch size.
//if (fftPlan->length.size() > 2)
// currKernelOrder = TRANSPOSE_LEADING_AND_SWAP;
- std::cout << "currKernelOrder = " << currKernelOrder << std::endl;
+ std::cout << "currKernelOrder = " << fftPlan->nonSquareKernelOrder << std::endl;
//ends tranpose kernel order
//Transpose stage 1
@@ -2145,11 +2125,12 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans1Plan->iDist = fftPlan->iDist;
trans1Plan->oDist = fftPlan->oDist;
trans1Plan->gen = Transpose_NONSQUARE;
- if(currKernelOrder == SWAP_AND_TRANSPOSE)
+ trans1Plan->nonSquareKernelOrder = fftPlan->nonSquareKernelOrder;
+ if(fftPlan->nonSquareKernelOrder == SWAP_AND_TRANSPOSE)
trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
- else if (currKernelOrder == TRANSPOSE_AND_SWAP)
+ else if (fftPlan->nonSquareKernelOrder == TRANSPOSE_AND_SWAP)
trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
- else
+ else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP)
trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
trans1Plan->transflag = true;
trans1Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
@@ -2218,11 +2199,12 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans2Plan->iDist = fftPlan->iDist;
trans2Plan->oDist = fftPlan->oDist;
trans2Plan->gen = Transpose_NONSQUARE;
- if (currKernelOrder == SWAP_AND_TRANSPOSE)
+ trans2Plan->nonSquareKernelOrder = fftPlan->nonSquareKernelOrder;
+ if (fftPlan->nonSquareKernelOrder == SWAP_AND_TRANSPOSE)
trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
- else if(currKernelOrder == TRANSPOSE_AND_SWAP)
+ else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_AND_SWAP)
trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
- else
+ else if(fftPlan->nonSquareKernelOrder == TRANSPOSE_LEADING_AND_SWAP)
trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
trans2Plan->transflag = true;
trans2Plan->large1D = fftPlan->large1D;//twiddling may happen in this kernel
diff --git a/src/library/plan.h b/src/library/plan.h
index 2b53df4..2a051c9 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -96,6 +96,27 @@ enum NonSquareTransposeKernelType
NON_SQUARE_TRANS_SWAP
};
+/*
+There are three ways of conducting inplace transpose with 1:2 (or 2:1) dimension ratio.
+A. first conduct line swapping kernels for the whole non square matrix
+then conduct batched square transpose along column dim (a 'real' batched transpose)
+B. first conduct batched square transpose along column dim (a 'real' batched transpose)
+then conduct line swapping kernels for the whole non square matrix (for 2:1 case)
+C. first conduct batched square transpose along leading dim (row dim)
+then conduct line swapping kernels for the whole non square matrix
+Note that the twiddle computation has to go at the beginning of the first kernel or the end of the second kernel
+
+if leading dimension is bigger, it makes more sense (faster) to swap line first and then conduct batched square transpose
+if leading dimension is smaller, it makes more sense (faster) to conduct batched transpose and then swap lines.
+*/
+enum NON_SQUARE_KERNEL_ORDER
+{
+ NOT_A_TRANSPOSE,
+ SWAP_AND_TRANSPOSE, // A.
+ TRANSPOSE_AND_SWAP, // B.
+ TRANSPOSE_LEADING_AND_SWAP, // C.
+};
+
#define CLFFT_CB_SIZE 32
#define CLFFT_MAX_INTERNAL_DIM 16
@@ -163,6 +184,8 @@ struct FFTKernelGenKeyParams {
// transposeBatchSize is the number of batchs times transposeMiniBatchSzie
// no user of the library should set its value
size_t transposeBatchSize;
+ // no user of the library should set its value
+ NON_SQUARE_KERNEL_ORDER nonSquareKernelOrder;
bool fft_hasPreCallback;
clfftCallbackParam fft_preCallback;
@@ -500,6 +523,7 @@ public:
// let's call this number transposeMiniBatchSize
// no user of the library should set its value
size_t transposeMiniBatchSize;
+ NON_SQUARE_KERNEL_ORDER nonSquareKernelOrder;
FFTPlan ()
: baked (false)
@@ -547,6 +571,7 @@ public:
, action(0)
, nonSquareKernelType(NON_SQUARE_TRANS_PARENT)
, transposeMiniBatchSize(1)
+ , nonSquareKernelOrder(NOT_A_TRANSPOSE)
, plHandle(0)
, hasPreCallback(false)
, hasPostCallback(false)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list