[clfft] 06/21: introduced reverse order of kernel launch for 1:2 ratio inplace transpose. Passed most tests except pre-callback and post-callback tests. Need to enable twiddling in transpose as well
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Mar 16 13:14:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit 4aa3e4d3746c7278cb383a3660e1257b58a02e5b
Author: Timmy <timmy.liu at amd.com>
Date: Wed Feb 24 10:23:51 2016 -0600
introduced reverse order of kernel launch for 1:2 ratio inplace transpose. Passed most tests except pre-callback and post-callback tests. Need to enable twiddling in transpose as well
---
src/library/action.transpose.cpp | 70 +++++++++++++++++++++++---
src/library/enqueue.cpp | 3 +-
src/library/generator.transpose.cpp | 10 +++-
src/library/plan.cpp | 99 ++++++++++++++++++++++++++++++++-----
src/library/plan.h | 3 +-
5 files changed, 164 insertions(+), 21 deletions(-)
diff --git a/src/library/action.transpose.cpp b/src/library/action.transpose.cpp
index db35ce5..487a1ef 100644
--- a/src/library/action.transpose.cpp
+++ b/src/library/action.transpose.cpp
@@ -196,7 +196,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
std::string programCode;
- if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
+ if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
{
//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
@@ -218,11 +218,53 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
}
OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
}
+ else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
+ {
+ //pre call back check
+ //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by transpose kernel
+ if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+ {
+ assert(!this->signature.fft_hasPostCallback);
+
+ bool validLDSSize = false;
+ size_t requestedCallbackLDS = 0;
+
+ requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+
+ validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
+
+ if (!validLDSSize)
+ {
+ fprintf(stderr, "Requested local memory size not available\n");
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+ OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
+ }
else
{
- //No pre-callback possible in swap kernel
- assert(!this->signature.fft_hasPreCallback);
+ //No pre-callback possible in swap kernel
+ //assert(!this->signature.fft_hasPreCallback);
+
+ //pre-callback is possible in swap kernel now
+ if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+ {
+ assert(!this->signature.fft_hasPostCallback);
+
+ bool validLDSSize = false;
+ size_t requestedCallbackLDS = 0;
+
+ requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+ //LDS usage of swap lines is exactly 2 lines
+ int lineSize = (this->signature.fft_N[0]) < (this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
+ validLDSSize = ((2 * this->plan->ElementSize() * lineSize) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
+ if (!validLDSSize)
+ {
+ fprintf(stderr, "Requested local memory size not available\n");
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
OPENCL_V(clfft_transpose_generator::genSwapKernel(this->signature, programCode, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
}
@@ -237,7 +279,7 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
OPENCL_V(fftRepo.setProgramCode(Transpose_NONSQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
- if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
+ if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
{
// Note: See genFunctionPrototype( )
if (this->signature.fft_3StepTwiddle)
@@ -249,6 +291,17 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_nonsquare", "transpose_nonsquare", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
}
}
+ else if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
+ {
+ if (this->signature.fft_3StepTwiddle)
+ {
+ OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
+ }
+ else
+ {
+ OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
+ }
+ }
else
{
OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_NONSQUARE, this->getSignatureData(), "swap_nonsquare", "swap_nonsquare", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
@@ -264,7 +317,8 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
size_t smaller_dim = (this->signature.fft_N[0] < this->signature.fft_N[1]) ? this->signature.fft_N[0] : this->signature.fft_N[1];
size_t global_item_size;
- if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE)
+ if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING
+ || this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
{
if (smaller_dim % (16 * reShapeFactor) == 0)
wg_slice = smaller_dim / 16 / reShapeFactor;
@@ -280,7 +334,11 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
/*Push the data required for the transpose kernels*/
globalWS.clear();
- globalWS.push_back(global_item_size * 2);
+ if(this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING)
+ globalWS.push_back(global_item_size * 2);
+ else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
+ globalWS.push_back(global_item_size);
+
localWS.clear();
localWS.push_back(lwSize);
diff --git a/src/library/enqueue.cpp b/src/library/enqueue.cpp
index 1691a16..155115e 100644
--- a/src/library/enqueue.cpp
+++ b/src/library/enqueue.cpp
@@ -624,6 +624,7 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
std::vector< size_t > gWorkSize;
std::vector< size_t > lWorkSize;
clfftStatus result = this->getWorkSizes (gWorkSize, lWorkSize);
+ //std::cout << "gWorkSize = " << gWorkSize[0] << " lWorkSize = " << lWorkSize[0] << std::endl;
// TODO: if getWorkSizes returns CLFFT_INVALID_GLOBAL_WORK_SIZE, that means
// that this multidimensional input data array is too large to be transformed
@@ -645,7 +646,7 @@ clfftStatus FFTAction::enqueue(clfftPlanHandle plHandle,
cl_int call_status = clEnqueueNDRangeKernel( *commQueues, kern, static_cast< cl_uint >( gWorkSize.size( ) ),
NULL, &gWorkSize[ 0 ], &lWorkSize[ 0 ], numWaitEvents, waitEvents, outEvents );
OPENCL_V( call_status, _T( "clEnqueueNDRangeKernel failed" ) );
-
+ //OPENCL_V( clFinish(*commQueues), "clFinish failed" );
if( fftRepo.pStatTimer )
{
fftRepo.pStatTimer->AddSample( plHandle, this->plan, kern, numQueuesAndEvents, outEvents, gWorkSize, lWorkSize );
diff --git a/src/library/generator.transpose.cpp b/src/library/generator.transpose.cpp
index 0493919..0b4d133 100644
--- a/src/library/generator.transpose.cpp
+++ b/src/library/generator.transpose.cpp
@@ -38,7 +38,8 @@ void OffsetCalc(std::stringstream& transKernel, const FFTKernelGenKeyParams& par
for (size_t i = params.fft_DataDim - 2; i > 0; i--)
{
- clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;
+ clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << stride[i + 1] << ";" << std::endl;//TIMMY
+ //clKernWrite(transKernel, 3) << offset << " += (g_index/numGroupsY_" << i << ")*" << 1048576 << ";" << std::endl;
clKernWrite(transKernel, 3) << "g_index = g_index % numGroupsY_" << i << ";" << std::endl;
}
@@ -1594,6 +1595,13 @@ clfftStatus genTransposeKernelLeadingDimensionBatched(const FFTGeneratedTranspos
clKernWrite(transKernel, 0) << params.fft_preCallback.funcstring << std::endl;
clKernWrite(transKernel, 0) << std::endl;
}
+ //If post-callback is set for the plan
+ if (params.fft_hasPostCallback)
+ {
+ //Insert callback function code at the beginning
+ clKernWrite(transKernel, 0) << params.fft_postCallback.funcstring << std::endl;
+ clKernWrite(transKernel, 0) << std::endl;
+ }
std::string funcName;
if (params.fft_3StepTwiddle) // TODO
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index 7a5953e..46c240a 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -775,7 +775,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans2Plan->oDist = clLengths[1] * trans2Plan->outStride[1];
trans2Plan->gen = transGen;
- // if(transGen != Transpose_NONSQUARE)
+ if(transGen != Transpose_NONSQUARE)//Timmy was commented
trans2Plan->large1D = fftPlan->length[0];
trans2Plan->transflag = true;
@@ -830,12 +830,12 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
row2Plan->iDist *= fftPlan->length[index];
row2Plan->oDist *= fftPlan->length[index];
}
-
-// if (transGen == Transpose_NONSQUARE)
-// {
-// row2Plan->large1D = fftPlan->length[0];
-// row2Plan->twiddleFront = true;
-// }
+ //Timmy was group commented
+ if (transGen == Transpose_NONSQUARE)
+ {
+ row2Plan->large1D = fftPlan->length[0];
+ row2Plan->twiddleFront = true;
+ }
OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
_T( "BakePlan large1d second row plan failed" ) );
@@ -1947,7 +1947,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
else if (fftPlan->gen == Transpose_NONSQUARE)
{
- if(fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE || fftPlan->nonSquareKernelType == NON_SQUARE_TRANS_SWAP)
+ if(fftPlan->nonSquareKernelType != NON_SQUARE_TRANS_PARENT)
fftPlan->action = new FFTGeneratedTransposeNonSquareAction(plHandle, fftPlan, *commQueueFFT, err);
else
{
@@ -1955,7 +1955,32 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0];
clLengths[1] = fftPlan->length[1];
- //Transpose stage 1 first do batched sqaure transpose along leading dim
+
+ /*
+ There are two ways of conducting inplace transpose with 1:2 dimension ratio.
+ A. first conduct batched square transpose along leading dim (row dim)
+ then conduct line swapping kernels for the whole non square matrix
+ B. first conduct line swapping kernels for the whole non square matrix
+ then conduct batched square transpose along column dim (a 'real' batched transpose)
+
+ Note that the twiddle computation has to go at the begining of the first kernel or the end of the second kernel
+
+ if leading dimension is bigger, it makes more sense (faster) to swap line first and then conduct batched square transpose
+ if leading dimension is smaller, it makes more sense (faster) to conduct batched transpose and then swap lines.
+ */
+ enum NON_SQUARE_KERNEL_ORDER
+ {
+ SWAP_AND_TRANSPOSE,
+ TRANSPOSE_AND_SWAP
+ };
+
+ NON_SQUARE_KERNEL_ORDER currKernelOrder;
+ if (clLengths[0] > clLengths[1])
+ currKernelOrder = SWAP_AND_TRANSPOSE;
+ else
+ currKernelOrder = TRANSPOSE_AND_SWAP;
+ //currKernelOrder = TRANSPOSE_AND_SWAP;
+ //Transpose stage 1
OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
@@ -1977,9 +2002,35 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans1Plan->iDist = fftPlan->iDist;
trans1Plan->oDist = fftPlan->oDist;
trans1Plan->gen = Transpose_NONSQUARE;
- trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE;
+ if(currKernelOrder == SWAP_AND_TRANSPOSE)
+ trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;// was NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING;
+ else
+ trans1Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED;
trans1Plan->transflag = true;
trans1Plan->large1D = fftPlan->large1D;
+
+ if (trans1Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
+ {
+ //this should be in a function to avoide duplicate code TODO
+ //need to treat a non square matrix as a sqaure matrix with bigger batch size
+ int lengthX = trans1Plan->length[0];
+ int lengthY = trans1Plan->length[1];
+
+ int BatchFactor = (lengthX > lengthY) ? (lengthX / lengthY) : (lengthY / lengthX);
+ trans1Plan->batchsize *= BatchFactor;
+ trans1Plan->iDist = trans1Plan->iDist / BatchFactor;
+ if (lengthX > lengthY)
+ {
+ trans1Plan->length[0] = lengthX / BatchFactor;
+ trans1Plan->inStride[1] = lengthX / BatchFactor;
+ }
+ else if (lengthX < lengthY)
+ {
+ trans1Plan->length[1] = lengthY / BatchFactor;
+ trans1Plan->inStride[1] = lengthX;
+ }
+ }
+
for (size_t index = 2; index < fftPlan->length.size(); index++)
{
trans1Plan->length.push_back(fftPlan->length[index]);
@@ -1999,7 +2050,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
_T("BakePlan transpose_nsq_stage1 plan failed"));
- //Transpose stage 2 then do swapping lines
+ //Transpose stage 2
OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths),
_T("CreateDefaultPlan transpose_nsq_stage2 plan failed"));
@@ -2021,9 +2072,33 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
trans2Plan->iDist = fftPlan->iDist;
trans2Plan->oDist = fftPlan->oDist;
trans2Plan->gen = Transpose_NONSQUARE;
- trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
+ if (currKernelOrder == SWAP_AND_TRANSPOSE)
+ trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_TRANSPOSE_BATCHED; //was NON_SQUARE_TRANS_SWAP;
+ else
+ trans2Plan->nonSquareKernelType = NON_SQUARE_TRANS_SWAP;
trans2Plan->transflag = true;
+ if (trans2Plan->nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
+ {
+ //need to treat a non square matrix as a sqaure matrix with bigger batch size
+ int lengthX = trans2Plan->length[0];
+ int lengthY = trans2Plan->length[1];
+
+ int BatchFactor = (lengthX > lengthY) ? (lengthX/lengthY) : (lengthY/lengthX);
+ trans2Plan->batchsize *= BatchFactor;
+ trans2Plan->iDist = trans2Plan->iDist / BatchFactor;
+ if (lengthX > lengthY)
+ {
+ trans2Plan->length[0] = lengthX / BatchFactor;
+ trans2Plan->inStride[1] = lengthX / BatchFactor;
+ }
+ else if(lengthX < lengthY)
+ {
+ trans2Plan->length[1] = lengthY / BatchFactor;
+ trans2Plan->inStride[1] = lengthX;
+ }
+ }
+
for (size_t index = 2; index < fftPlan->length.size(); index++)
{
trans2Plan->length.push_back(fftPlan->length[index]);
diff --git a/src/library/plan.h b/src/library/plan.h
index 73caa8d..a9cf370 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -91,7 +91,8 @@ enum BlockComputeType
enum NonSquareTransposeKernelType
{
NON_SQUARE_TRANS_PARENT,
- NON_SQUARE_TRANS_TRANSPOSE,
+ NON_SQUARE_TRANS_TRANSPOSE_BATCHED_LEADING,
+ NON_SQUARE_TRANS_TRANSPOSE_BATCHED,
NON_SQUARE_TRANS_SWAP
};
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list