[clfft] 91/107: updating 1d real with prototype algorithm

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Jul 30 18:06:41 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit 8460bd07934526f0e3ace1db9e137efc6eaef006
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Sun Jun 14 20:08:04 2015 -0500

    updating 1d real with prototype algorithm
---
 src/library/generator.stockham.cpp |   7 +-
 src/library/plan.cpp               | 199 ++++++++++++++++++++++++++++++++++++-
 src/library/plan.h                 |   3 +-
 src/library/transform.cpp          |  86 +++++++++++++++-
 4 files changed, 289 insertions(+), 6 deletions(-)

diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index eb8fcb7..3229e98 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -3216,7 +3216,9 @@ namespace StockhamGenerator
 					str += SztToStr(params.fft_realSpecial_Nr); str += ") ) break;\n";
 
 					str += "\t\tlwbOut += ("; str += SztToStr(params.fft_realSpecial_Nr);
-					str += " - 2*batch)*"; str += SztToStr(Nt); str += ";\n\n";
+					str += " - 2*batch)*"; str += SztToStr(Nt); str += ";\n";
+					str += "\t\tb = "; str += SztToStr(params.fft_realSpecial_Nr);
+					str += " - b;\n\n";
 				}
 
 				if(blockCompute || realSpecial)
@@ -3400,7 +3402,8 @@ clfftStatus FFTGeneratedStockhamAction::initParams ()
         ARG_CHECK (this->signature.fft_N[0] != 0)
         ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
         this->signature.fft_3StepTwiddle = true;
-		ARG_CHECK ( this->plan->large1D  == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
+		if(!(this->plan->realSpecial))
+			ARG_CHECK ( this->plan->large1D  == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
     }
 
     this->signature.fft_fwdScale  = this->plan->forwardScale;
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index c33cd70..1355a6c 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -793,7 +793,202 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				size_t length0 = clLengths[0];
 				size_t length1 = clLengths[1];
 
-				if(fftPlan->inputLayout == CLFFT_REAL)
+
+				// For real transforms
+				// Special case optimization with 5-step algorithm
+				if( (fftPlan->inputLayout == CLFFT_REAL) && IsPo2(fftPlan->length[0]) )
+				{
+					if (fftPlan->length.size() > 1) break;
+					if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
+
+					ARG_CHECK(clLengths[0] <= Large1DThreshold);
+
+
+					size_t biggerDim = clLengths[0] > clLengths[1] ? clLengths[0] : clLengths[1];
+					size_t smallerDim = biggerDim == clLengths[0] ? clLengths[1] : clLengths[0];
+					size_t padding = 0;
+					if( (smallerDim % 64 == 0) || (biggerDim % 64 == 0) )
+						padding = 64;
+
+					if (fftPlan->tmpBufSize==0 )
+					{
+						fftPlan->tmpBufSize = (smallerDim + padding) * biggerDim *
+							fftPlan->batchsize * fftPlan->ElementSize() / 2;
+					}
+
+					//Transpose
+					//Input --> tmp buffer
+					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths ),
+						_T( "CreateDefaultPlan Large1d transpose 1 failed" ) );
+
+					FFTPlan* trans1Plan	= NULL;
+					lockRAII* trans1Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planTX, trans1Plan, trans1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					trans1Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans1Plan->precision     = fftPlan->precision;
+					trans1Plan->tmpBufSize    = 0;
+					trans1Plan->batchsize     = fftPlan->batchsize;
+					trans1Plan->envelope	  = fftPlan->envelope;
+					trans1Plan->inputLayout   = fftPlan->inputLayout;
+					trans1Plan->outputLayout  = CLFFT_REAL;
+					trans1Plan->inStride[0]   = fftPlan->inStride[0];
+					trans1Plan->inStride[1]   = clLengths[0];
+					trans1Plan->outStride[0]  = 1;
+					trans1Plan->outStride[1]  = clLengths[1] + padding;
+					trans1Plan->iDist         = fftPlan->iDist;
+					trans1Plan->oDist         = clLengths[0] * trans1Plan->outStride[1];
+					trans1Plan->gen           = Transpose_GCN;
+					trans1Plan->transflag     = true;
+
+					OPENCL_V(clfftBakePlan(fftPlan->planTX, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d trans1 plan failed" ) );
+
+					//Row transform
+					//tmp->output
+					//size clLengths[1], batch clLengths[0]; no twiddle factor multiplication in this step (large1D is set to 0 below)
+					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planX, fftPlan->context, CLFFT_1D, &clLengths[1] ),
+						_T( "CreateDefaultPlan Large1d column failed" ) );
+
+					FFTPlan* row1Plan	= NULL;
+					lockRAII* row1Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planX, row1Plan, row1Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					row1Plan->placeness     = CLFFT_OUTOFPLACE;
+					row1Plan->precision     = fftPlan->precision;
+					row1Plan->forwardScale  = 1.0f;
+					row1Plan->backwardScale = 1.0f;
+					row1Plan->tmpBufSize    = 0;
+					row1Plan->batchsize     = fftPlan->batchsize;
+
+					row1Plan->gen			= fftPlan->gen;
+					row1Plan->envelope		= fftPlan->envelope;
+
+					// twiddling is done in row2
+					row1Plan->large1D		= 0;
+
+					row1Plan->length.push_back(clLengths[0]);
+					row1Plan->inputLayout   = CLFFT_REAL;
+					row1Plan->outputLayout  = CLFFT_HERMITIAN_INTERLEAVED;
+					row1Plan->inStride[0]   = 1;
+					row1Plan->outStride[0]  = 1;
+					row1Plan->inStride.push_back(clLengths[1]+padding);
+					row1Plan->outStride.push_back(1 + clLengths[1]/2);
+					row1Plan->iDist         = clLengths[0] * row1Plan->inStride[1];
+					row1Plan->oDist         = clLengths[0] * row1Plan->outStride[1]; // tmp buf distance fix
+
+
+					OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d first row plan failed" ) );
+
+					//Transpose 2
+					//Output --> tmp buffer
+					clLengths[2] = clLengths[0];
+					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTY, fftPlan->context, CLFFT_2D, &clLengths[1] ),
+						_T( "CreateDefaultPlan Large1d transpose 2 failed" ) );
+
+					FFTPlan* trans2Plan	= NULL;
+					lockRAII* trans2Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planTY, trans2Plan, trans2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					size_t transLengths[2];
+					transLengths[0] = 1 + clLengths[1]/2;
+					transLengths[1] = clLengths[0];
+					OPENCL_V(clfftSetPlanLength( fftPlan->planTY, CLFFT_2D, transLengths ),
+						_T( "clfftSetPlanLength for planTY transpose failed" ) );
+
+
+
+					trans2Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans2Plan->precision     = fftPlan->precision;
+					trans2Plan->tmpBufSize    = 0;
+					trans2Plan->batchsize     = fftPlan->batchsize;
+					trans2Plan->envelope	  = fftPlan->envelope;
+					trans2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					trans2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					trans2Plan->inStride[0]   = 1;
+					trans2Plan->inStride[1]   = 1 + clLengths[1]/2;
+					trans2Plan->outStride[0]  = 1;
+					trans2Plan->outStride[1]  = clLengths[0] + padding;
+					trans2Plan->iDist         = clLengths[0] * trans2Plan->inStride[1];
+					trans2Plan->oDist         = (1 + clLengths[1]/2) * trans2Plan->outStride[1];
+                    trans2Plan->gen           = Transpose_GCN;
+					trans2Plan->transflag     = true;
+
+					OPENCL_V(clfftBakePlan(fftPlan->planTY, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d trans2 plan failed" ) );
+
+					//Row transform 2
+					//tmp->tmp
+					//size clLengths[0], batch clLengths[1]
+					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planY, fftPlan->context, CLFFT_1D, &clLengths[0] ),
+						_T( "CreateDefaultPlan Large1d second row plan failed" ) );
+
+					FFTPlan* row2Plan	= NULL;
+					lockRAII* row2Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planY, row2Plan, row2Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					row2Plan->placeness     = CLFFT_OUTOFPLACE;
+					row2Plan->precision     = fftPlan->precision;
+					row2Plan->forwardScale  = fftPlan->forwardScale;
+					row2Plan->backwardScale = fftPlan->backwardScale;
+					row2Plan->tmpBufSize    = 0;
+					row2Plan->batchsize     = fftPlan->batchsize;
+
+					row2Plan->gen			= fftPlan->gen;
+					row2Plan->envelope		= fftPlan->envelope;
+
+
+					row2Plan->length.push_back(1+clLengths[1]/2);
+					row2Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					row2Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					row2Plan->inStride[0]   = 1;
+					row2Plan->outStride[0]  = 1;
+					row2Plan->inStride.push_back(clLengths[0] + padding);
+					row2Plan->outStride.push_back(1 + clLengths[0]/2);
+					row2Plan->iDist         = (1 + clLengths[1]/2) * row2Plan->inStride[1];
+					row2Plan->oDist         = clLengths[1] * row2Plan->outStride[1];
+
+					row2Plan->realSpecial = true;
+					row2Plan->realSpecial_Nr = clLengths[1];
+
+					OPENCL_V(clfftBakePlan(fftPlan->planY, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d second row plan failed" ) );
+
+					//Transpose 3
+					//tmp --> output
+					OPENCL_V(clfftCreateDefaultPlanInternal( &fftPlan->planTZ, fftPlan->context, CLFFT_2D, clLengths ),
+						_T( "CreateDefaultPlan Large1d transpose 3 failed" ) );
+
+					FFTPlan* trans3Plan	= NULL;
+					lockRAII* trans3Lock	= NULL;
+					OPENCL_V( fftRepo.getPlan( fftPlan->planTZ, trans3Plan, trans3Lock ), _T( "fftRepo.getPlan failed" ) );
+
+					trans3Plan->placeness     = CLFFT_OUTOFPLACE;
+					trans3Plan->precision     = fftPlan->precision;
+					trans3Plan->tmpBufSize    = 0;
+					trans3Plan->batchsize     = fftPlan->batchsize;
+					trans3Plan->envelope	  = fftPlan->envelope;
+					trans3Plan->inputLayout   = CLFFT_COMPLEX_INTERLEAVED;
+					trans3Plan->outputLayout  = CLFFT_COMPLEX_INTERLEAVED;
+					trans3Plan->inStride[0]   = 1;
+					trans3Plan->inStride[1]   = 1 + clLengths[0]/2;
+					trans3Plan->outStride[0]  = 1;
+					trans3Plan->outStride[1]  = clLengths[1];
+					trans3Plan->iDist         = clLengths[1] * trans3Plan->inStride[1];
+					trans3Plan->oDist         = fftPlan->oDist;
+                    trans3Plan->gen           = Transpose_GCN;
+					trans3Plan->transflag     = true;
+					trans3Plan->transOutHorizontal = true;
+
+					OPENCL_V(clfftBakePlan(fftPlan->planTZ, numQueues, commQueueFFT, NULL, NULL ),
+						_T( "BakePlan large1d trans3 plan failed" ) );
+
+					fftPlan->transflag = true;
+					fftPlan->baked = true;
+					return	CLFFT_SUCCESS;
+				}
+				else if(fftPlan->inputLayout == CLFFT_REAL)
 				{
 					if (fftPlan->tmpBufSizeRC==0 )
 					{
@@ -1622,7 +1817,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				rowPlan->length.push_back(fftPlan->length[1]);
 				rowPlan->inStride.push_back(fftPlan->inStride[1]);
 				rowPlan->iDist           = fftPlan->iDist;
-
+				
 				OPENCL_V(clfftBakePlan(fftPlan->planX, numQueues, commQueueFFT, NULL, NULL ),
 					_T( "BakePlan for planX failed" ) );
 
diff --git a/src/library/plan.h b/src/library/plan.h
index dd8b46a..be6231e 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -417,7 +417,8 @@ public:
 	// if this is set it means we are doing the 4th step in the 5-step real FFT breakdown algorithm
 	bool realSpecial;
 	
-	size_t realSpecial_Nr;
+	size_t realSpecial_Nr; // this value stores the logical column height (N0) of matrix in the 4th step
+	                       // length[1] should be 1 + N0/2
 
 	// User created plan
 	bool userPlan;
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 0ce9288..f9f9b2c 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -129,7 +129,91 @@ clfftStatus clfftEnqueueTransform(
 			if (fftPlan->length[0] <= Large1DThreshold)
 				break;
 
-			if( fftPlan->inputLayout == CLFFT_REAL )
+			if( ( fftPlan->inputLayout == CLFFT_REAL ) && ( fftPlan->planTZ != 0) )
+			{
+					//First transpose
+					// Input->tmp
+					cl_event transTXOutEvents = NULL;
+					OPENCL_V( clfftEnqueueTransform( fftPlan->planTX, dir, numQueuesAndEvents, commQueues, numWaitEvents,
+						waitEvents, &transTXOutEvents, clInputBuffers, &localIntBuffer, NULL ),
+						_T("clfftEnqueueTransform for large1D transTX failed"));
+
+					cl_mem *mybuffers;
+					if (fftPlan->placeness==CLFFT_INPLACE)
+						mybuffers = clInputBuffers;
+					else
+						mybuffers = clOutputBuffers;
+
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
+					//First Row
+					//tmp->output
+					cl_event rowXOutEvents = NULL;
+					OPENCL_V( clfftEnqueueTransform( fftPlan->planX, dir, numQueuesAndEvents, commQueues, 1,
+						&transTXOutEvents, &rowXOutEvents, &localIntBuffer, mybuffers, NULL ),
+						_T("clfftEnqueueTransform for large1D rowX failed"));
+					clReleaseEvent(transTXOutEvents);
+
+
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, *mybuffers, CL_TRUE, 0, 536870912, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
+					//Second Transpose
+					// output->tmp
+					cl_event transTYOutEvents = NULL;
+					OPENCL_V( clfftEnqueueTransform( fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
+						&rowXOutEvents, &transTYOutEvents, mybuffers, &localIntBuffer, NULL ),
+						_T("clfftEnqueueTransform for large1D transTY failed"));
+					clReleaseEvent(rowXOutEvents);
+
+
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
+					//Second Row
+					//tmp->tmp, inplace
+					cl_event rowYOutEvents = NULL;
+					OPENCL_V( clfftEnqueueTransform( fftPlan->planY, dir, numQueuesAndEvents, commQueues, 1,
+						&transTYOutEvents, &rowYOutEvents, &localIntBuffer, NULL, NULL ),
+						_T("clfftEnqueueTransform for large1D rowY failed"));
+					clReleaseEvent(transTYOutEvents);
+
+#if defined(DEBUGGING)
+								//  For debugging interleave data only,
+								//  read the input buffer back into memory.
+						clFinish(*commQueues);
+								OPENCL_V( clEnqueueReadBuffer( *commQueues, localIntBuffer, CL_TRUE, 0, buffSizeBytes_complex, &temp[ 0 ], 0,
+									NULL, NULL ),
+									_T("Reading the result buffer failed") );
+#endif
+
+					//Third Transpose
+					// tmp->output
+					OPENCL_V( clfftEnqueueTransform( fftPlan->planTZ, dir, numQueuesAndEvents, commQueues, 1,
+						&rowYOutEvents, outEvents, &localIntBuffer, mybuffers, NULL ),
+						_T("clfftEnqueueTransform for large1D transTZ failed"));
+					clReleaseEvent(rowYOutEvents);
+			}
+			else if ( fftPlan->inputLayout == CLFFT_REAL )
 			{
 				cl_event colOutEvents = NULL;
 				cl_event copyInEvents = NULL;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list