[clfft] 54/128: Precallback - client updates

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:38 UTC 2015


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit f3d3417a62a2d17a703f33af97ae19dbcf521e5a
Author: Pradeep <pradeep.rao at amd.com>
Date:   Thu Sep 3 17:01:22 2015 +0530

    Precallback - client updates
---
 src/callback-client/callback-client.cpp | 804 +++++++++++++++++++++++---------
 src/callback-client/client.h            | 107 +++++
 2 files changed, 703 insertions(+), 208 deletions(-)

diff --git a/src/callback-client/callback-client.cpp b/src/callback-client/callback-client.cpp
index 357983b..b9b81ee 100644
--- a/src/callback-client/callback-client.cpp
+++ b/src/callback-client/callback-client.cpp
@@ -1,80 +1,22 @@
-#include "stdafx.h"
 #include <functional>
 #include <cmath>
 
 #include "client.h"
 #include "../library/private.h"
 #include "openCL.misc.h"
-#include "../statTimer/statisticalTimer.extern.h"
 #include "../include/sharedLibrary.h"
 #include "../include/unicode.compatibility.h"
 
-#include <fftw3.h>
 
 namespace po = boost::program_options;
 
-enum FFTType
-{
-	FFT_C2C,
-	FFT_R2C,
-	FFT_C2R,
-};
-
-#define ZERO_PAD_C2C __attribute__((always_inline)) \n float2 zeroPad (__global void *input, \n \
-								uint inoffset, \n \
-							__global void *userdata) \n \
-				 { \n \
-					 float2 scalar = 0.0f; \n \
-					 if (inoffset < 512) \n \
-					 { \n \
-						scalar = *((__global float2*)userdata + inoffset); \n \
-					 } \n \
-					 return scalar; \n \
-				} \n
-
-#define ZERO_PAD_C2C_KERNEL __kernel void zeroPad (__global void *input, \n \
-								__global void *userdata, uint batchLength) \n \
-				 { \n \
-					uint inoffset = get_global_id(0); \n \
-					 float2 scalar = 0.0f; \n \
-					 if (inoffset < 512) \n \
-					 { \n \
-					   scalar = *((__global float2*)userdata + inoffset); \n \
-					 } \n \
-					 *((__global float2*)input + inoffset) = scalar; \n \
-				} \n
-
-//forward declarations
-
-template < typename T >
-void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize, 
-				   clfftDim dim, clfftPrecision precision, cl_uint profile_count);
-
-template < typename T >
-void R2C_transform();
-
-template < typename T >
-void C2R_transform();
-
-fftwf_complex* get_fftwf_output(size_t* lengths, size_t fftBatchSize, int batch_size, clfftLayout in_layout,
-								clfftDim dim, clfftDirection dir);
-template < typename T1, typename T2>
-bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
-             size_t length, const float epsilon = 1e-6f);
-
-template < typename T >
-void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
-
-template < typename T >
-void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, cl_device_id device_id, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
-
 int main(int argc, char **argv)
 {
-	size_t lengths[ 3 ] = {1,1,1};
+	size_t lengths[ 3 ] = {BATCH_LENGTH,1,1}; //For simplicity, assuming 1D fft with fixed batch length of BATCH_LENGTH
 	cl_uint profile_count = 0;
-	clfftPrecision precision = CLFFT_SINGLE;
+	clfftPrecision precision = CLFFT_SINGLE; //Testing for single precision. Easily extendable for double
 
-	size_t batchSize = 1; //For simplicity using fixed batch size
+	size_t batchSize = 1; 
 
 	int fftType = 1;
 	FFTType clFFTType = FFT_C2C;
@@ -91,11 +33,9 @@ int main(int argc, char **argv)
 		desc.add_options()
 			( "help,h",        "produces this help message" )
 			( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
-			( "lenX,x",        po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ),   "Specify the length of the 1st dimension of a test array" )
-			( "lenY,y",        po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ),      "Specify the length of the 2nd dimension of a test array" )
-			( "lenZ,z",        po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ),      "Specify the length of the 3rd dimension of a test array" )
+			( "batchSize,b",   po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " )
 			( "profile,p",     po::value< cl_uint >( &profile_count )->default_value( 10 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
-			( "type,t",		   po::value< int	>( &fftType )->default_value( 1 ), "Type of FFT:\n1) Complex-Complex\n2) Real-Complex\n3) Complex-Real\n" )
+			( "type,t",		   po::value< int	>( &fftType )->default_value( 1 ), "Type of FFT:\n1) Complex-Complex (default)\n2) Real-Complex\n3) Complex-Real\n" )
 			;
 
 		po::variables_map vm;
@@ -139,7 +79,7 @@ int main(int argc, char **argv)
 		}
 		else if (clFFTType == FFT_R2C) // Complex-Complex cases
 		{
-			R2C_transform<float>();
+			R2C_transform<float>(setupData, lengths, batchSize, dim, precision, profile_count);
 		}
 		else                          // Complex-Complex cases
 		{
@@ -178,13 +118,13 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
 	cl_command_queue commandQueue = ::clCreateCommandQueue( context, device_id[0], command_queue_flags, &status );
     OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
 
-	//Run clFFT with seaparate Pre-process Kernel
 	if (precision == CLFFT_SINGLE)
 	{
-	runPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+		//Run clFFT with seaparate Pre-process Kernel
+		runC2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
 
-	//Run clFFT using pre-callback 
-	runPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+		//Run clFFT using pre-callback 
+		runC2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
 	}
 
 	OPENCL_V_THROW( clReleaseCommandQueue( commandQueue ), "Error: In clReleaseCommandQueue\n" );
@@ -192,72 +132,60 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
 }
 
 template < typename T >
-void R2C_transform()
-{
-}
-
-template < typename T >
-void C2R_transform()
-{
-}
-
-// Compute reference output using fftw for float type
-fftwf_complex* get_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size, clfftLayout in_layout,
-								clfftDim dim, clfftDirection dir)
+void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize, 
+				   clfftDim dim, clfftPrecision precision,  cl_uint profile_count)
 {
-	//In FFTW last dimension has the fastest changing index
-	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
-
-	fftwf_plan refPlan;
+	//	OpenCL state 
+	cl_device_type		deviceType	= CL_DEVICE_TYPE_ALL;
+	cl_int			deviceId = 0;
+	std::vector< cl_device_id > device_id;
+	cl_int				platformId = 0;
+	cl_context			context;
+	cl_uint command_queue_flags = 0;
+	command_queue_flags |= CL_QUEUE_PROFILING_ENABLE;
+	
+	// Test for in-place Hermitian Interleaved output 
+	// Hence output size is N/2 + 1 complex. So allocate N + 2 real input
+	size_t Nt = inlengths[0] + 2;
+	size_t vectorLength = Nt * inlengths[1] * inlengths[2];
+	size_t fftLength = vectorLength * batchSize;
 
-	fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
-	fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+	//OpenCL initializations
+	device_id = initializeCL( deviceType, deviceId, platformId, context, false);
 
-	size_t fftVectorLength = fftbatchLength/batch_size;
+	cl_int status = 0;
+    
+	cl_command_queue commandQueue = ::clCreateCommandQueue( context, device_id[0], command_queue_flags, &status );
+    OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
 
-	refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, 
-									refin, &fftwLengths[3 - dim], 1, fftVectorLength, 
-									refout, &fftwLengths[3 - dim], 1, fftVectorLength, 
-									dir, FFTW_ESTIMATE);
-	
-	float scalar; 
-	
-	for( size_t i = 0; i < fftbatchLength; i++)
+	if (precision == CLFFT_SINGLE)
 	{
-		scalar = 0.0f;
-		switch (in_layout)
-		{
-		case CLFFT_COMPLEX_INTERLEAVED:
-			if ( (i % fftVectorLength)  < 512)
-			{
-				scalar = 1.0f;
-			}
-			break;
-		default:
-			break;
-		}
+		//Run clFFT with seaparate Pre-process Kernel
+		runR2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, 
+										batchSize, vectorLength, fftLength, profile_count);
 
-		refin[i][0] = scalar;
-		refin[i][1] = 0;
+		//Run clFFT using pre-callback 
+		runR2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, 
+									batchSize, vectorLength, fftLength, profile_count);
 	}
 
-	fftwf_execute(refPlan);
-
-	fftw_free(refin);
-
-	fftwf_destroy_plan(refPlan);
+	OPENCL_V_THROW( clReleaseCommandQueue( commandQueue ), "Error: In clReleaseCommandQueue\n" );
+    OPENCL_V_THROW( clReleaseContext( context ), "Error: In clReleaseContext\n" );
+}
 
-	return refout;
+template < typename T >
+void C2R_transform()
+{
 }
 
 template < typename T >
-void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
 						size_t* inlengths, clfftDim dim, clfftPrecision precision,
 						size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
 {
 	cl_int status = 0;
 
-	size_t userdataLengths[ 3 ] = {512,1,1};
+	size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
 	size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
 	size_t userdataLength = vectorLength_userdata * batchSize;
 
@@ -326,32 +254,33 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
 
 	cl_mem * buffersOut = NULL; //NULL for in-place
 
-	Timer tr;
-	double wtime_acc = 0.0;
+	// for functional test
+	OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+			&fftbuffer, buffersOut, clMedBuffer ),
+			"clfftEnqueueTransform failed" );
+		
+	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
 
 	//	Loop as many times as the user specifies to average out the timings
-	for( cl_uint i = 0; i < profile_count; ++i )
+	if (profile_count > 1)
 	{
+		Timer tr;
 		tr.Start();
-		OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
-			&fftbuffer, buffersOut, clMedBuffer ),
-			"clfftEnqueueTransform failed" );
 		
-		OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
-
-		wtime_acc += tr.Sample();
+		for( cl_uint i = 0; i < profile_count; ++i )
+		{
+			OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+				&fftbuffer, buffersOut, clMedBuffer ),
+				"clfftEnqueueTransform failed" );
+		
+			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+		}
+		double wtimesample = tr.Sample();
+		double wtime = wtimesample/((double)profile_count);
 
-		//Ignore the first time sample if profiling for more than one iteration
-		if (i == 0 && profile_count > 1) wtime_acc = 0.0;
+		tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
 	}
-	double wtime = wtime_acc/((double)profile_count);
-	size_t totalLen = 1;
-	for(int i=0; i<dim; i++) totalLen *= inlengths[i];
-	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
 
-	tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
-	tout << "Execution gflops (with clFFT Pre-callback): " << ((double)batchSize * opsconst)/(1000000000.0*wtime) << std::endl;
-	
 	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
 	
 	if (profile_count == 1)
@@ -364,7 +293,7 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
 		//Reference fftw output
 		fftwf_complex *refout;
 
-		refout = get_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+		refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
 
 		/*for( cl_uint i = 0; i < fftLength; i++)
 		{
@@ -372,11 +301,11 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
 		}*/
 		if (!compare<fftwf_complex, T>(refout, output, fftLength))
 		{
-			std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
 		}
 		else
 		{
-			std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
 		}
 
 		fftwf_free(refout);
@@ -391,14 +320,14 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
 }
 
 template < typename T >
-void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
+void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
 							cl_command_queue commandQueue, cl_device_id device_id,
 							size_t* inlengths, clfftDim dim, clfftPrecision precision,
 							size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
 {
 	cl_int status = 0;
 
-	size_t userdataLengths[ 3 ] = {512,1,1};
+	size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1}; 
 	size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
 	size_t userdataLength = vectorLength_userdata * batchSize;
 
@@ -464,92 +393,106 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
 	//Pre-process kernel string
 	const char* source = STRINGIFY(ZERO_PAD_C2C_KERNEL);
 	
-	double wtime_acc = 0.0;
-	Timer tr;
-		
-	//	Loop as many times as the user specifies to average out the timings
-	for( cl_uint i = 0; i < profile_count; ++i )
-	{
-		tr.Start();
-		cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
-		OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
+	cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
+	OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
 
-		status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
-		OPENCL_V_THROW( status, "clBuildProgram failed" );
+	status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
+	OPENCL_V_THROW( status, "clBuildProgram failed" );
 
 #if defined( _DEBUG )
-		if( status != CL_SUCCESS )
+	if( status != CL_SUCCESS )
+	{
+		if( status == CL_BUILD_PROGRAM_FAILURE )
 		{
-			if( status == CL_BUILD_PROGRAM_FAILURE )
-			{
-				size_t buildLogSize = 0;
-				OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
-								"clGetProgramBuildInfo failed"  );
-
-				std::vector< char > buildLog( buildLogSize );
-				::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
-				OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+			size_t buildLogSize = 0;
+			OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
 							"clGetProgramBuildInfo failed"  );
 
-				std::cerr << "\n\t\t\tBUILD LOG\n";
-				std::cerr << "************************************************\n";
-				std::cerr << &buildLog[ 0 ] << std::endl;
-				std::cerr << "************************************************\n";
-			}
+			std::vector< char > buildLog( buildLogSize );
+			::memset( &buildLog[ 0 ], 0x0, buildLogSize );
 
-			OPENCL_V_THROW( status, "clBuildProgram failed" );
+			OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+						"clGetProgramBuildInfo failed"  );
+
+			std::cerr << "\n\t\t\tBUILD LOG\n";
+			std::cerr << "************************************************\n";
+			std::cerr << &buildLog[ 0 ] << std::endl;
+			std::cerr << "************************************************\n";
 		}
-#endif
 
-		cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
-		OPENCL_V_THROW( status, "clCreateKernel failed" );
+		OPENCL_V_THROW( status, "clBuildProgram failed" );
+	}
+#endif
 
-		cl_uint uarg = 0;
+	cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
+	OPENCL_V_THROW( status, "clCreateKernel failed" );
 
-		//Buffer to be zero-padded
-		OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+	//for functional test
+	cl_uint uarg = 0;
 
-		//originial data
-		OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+	//Buffer to be zero-padded
+	OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
 
-		//single batch length 
-		OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_uint ), (const void *)&vectorLength ), "clSetKernelArg failed" );
+	//originial data
+	OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
 
-		//Launch pre-process kernel
-		size_t gSize = fftLength;
-		size_t lSize = 64;
-		status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
-												NULL, &gSize, &lSize, 0, NULL, NULL );
-		OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+	//Launch pre-process kernel
+	size_t gSize = fftLength;
+	size_t lSize = 64;
+	status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+											NULL, &gSize, &lSize, 0, NULL, NULL );
+	OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
 	
-		OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
 
-		//Now invoke the clfft execute
-		OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
-			&fftbuffer, buffersOut, clMedBuffer ),
-			"clfftEnqueueTransform failed" );
+	//Now invoke the clfft execute
+	OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+		&fftbuffer, buffersOut, clMedBuffer ),
+		"clfftEnqueueTransform failed" );
 		
-		OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+	
+	if (profile_count > 1)
+	{
+		Timer tr;
+		tr.Start();
 
-		wtime_acc += tr.Sample();
+		//	Loop as many times as the user specifies to average out the timings
+		for( cl_uint i = 0; i < profile_count; ++i )
+		{
+			uarg = 0;
 
-		//Ignore the first time sample if profiling for more than one iteration
-		if (i == 0 && profile_count > 1) wtime_acc = 0.0;
+			//Buffer to be zero-padded
+			OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
 
-		//cleanup preprocess kernel opencl objects
-		OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
-		OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
-	}
+			//originial data
+			OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
 
-	double wtime = wtime_acc/((double)profile_count);
-	size_t totalLen = 1;
-	for(int i=0; i<dim; i++) totalLen *= inlengths[i];
-	double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+			//Launch pre-process kernel
+			status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+													NULL, &gSize, &lSize, 0, NULL, NULL );
+			OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+	
+			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
 
-	tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
-	tout << "Execution gflops (Separate Pre-process Kernel): " << ((double)batchSize * opsconst)/(1000000000.0*wtime) << std::endl;
+			//Now invoke the clfft execute
+			OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+				&fftbuffer, buffersOut, clMedBuffer ),
+				"clfftEnqueueTransform failed" );
+		
+			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+		}
+		double wtimesample =  tr.Sample();
 	
+		double wtime = wtimesample/((double)profile_count);
+	
+		tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+	}
+
+	//cleanup preprocess kernel opencl objects
+	OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
+	OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
+
 	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
 
 	if (profile_count == 1)
@@ -562,7 +505,7 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
 		//Reference fftw output
 		fftwf_complex *refout;
 
-		refout = get_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+		refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
 
 		/*for( cl_uint i = 0; i < fftLength; i++)
 		{
@@ -570,11 +513,11 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
 		}*/
 		if (!compare<fftwf_complex, T>(refout, output, fftLength))
 		{
-			std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****FAIL*****" << std::endl;
 		}
 		else
 		{
-			std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+			std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****PASS*****" << std::endl;
 		}
 
 		fftwf_free(refout);
@@ -588,6 +531,358 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
 	OPENCL_V_THROW( clReleaseMemObject( userdatabuffer ), "Error: In clReleaseMemObject\n" );
 }
 
+template < typename T >
+void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+						size_t* inlengths, clfftDim dim, clfftPrecision precision,
+						size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
+{
+	cl_int status = 0;
+
+	size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
+	size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
+	size_t userdataLength = vectorLength_userdata * batchSize;
+
+	//input/output allocation sizes
+	size_t size_of_buffers = fftLength * sizeof( T  );
+	size_t size_of_buffers_userdata = userdataLength * sizeof( T );
+
+	//in-place transform. Same buffer for input and output
+	cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
+    OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+
+	//Initialize Data
+	std::vector< T > userdata( userdataLength );
+
+	// impulse test case
+	for (size_t idx = 0; idx < userdataLength; ++idx)
+	{
+		userdata[idx] = 1;
+	}
+
+	//user data buffer
+	cl_mem userDatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
+    OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userDatabuffer) )" );
+
+	//clFFT initializations
+	
+	//	FFT state
+	clfftResultLocation	place = CLFFT_INPLACE;
+	clfftLayout	inLayout  = CLFFT_REAL;
+	clfftLayout	outLayout = CLFFT_HERMITIAN_INTERLEAVED;
+
+	clfftPlanHandle plan_handle;
+	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
+
+	//Precallback setup
+	char* precallbackstr = STRINGIFY(ZERO_PAD_R2C);
+
+	//Register the callback
+	OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "zeroPad", precallbackstr, NULL, 0, PRECALLBACK, userDatabuffer), "clFFTSetPlanCallback failed");
+
+	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+	OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
+	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
+	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+	//Bake Plan
+	OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
+
+	//get the buffersize
+	size_t buffersize=0;
+	OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+	//allocate the intermediate buffer
+	cl_mem clMedBuffer=NULL;
+
+	if (buffersize)
+	{
+		cl_int medstatus;
+		clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+	}
+
+	cl_mem * buffersOut = NULL; //NULL for in-place
+
+	//for functional test
+	OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+		&fftbuffer, buffersOut, clMedBuffer ),
+		"clfftEnqueueTransform failed" );
+		
+	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+	if (profile_count > 1)
+	{
+		Timer tr;
+		tr.Start();
+
+		//	Loop as many times as the user specifies to average out the timings
+		for( cl_uint i = 0; i < profile_count; ++i )
+		{
+			OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+				&fftbuffer, buffersOut, clMedBuffer ),
+				"clfftEnqueueTransform failed" );
+		
+			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+		}
+		double wtimesample = tr.Sample();
+		double wtime = wtimesample/((double)profile_count);
+	
+		tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
+	}
+
+	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+	
+	if (profile_count == 1)
+	{
+		std::vector< std::complex< T > > output( fftLength/2 );
+
+		OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+			0, NULL, NULL ), "Reading the result buffer failed" );
+
+		/*for( cl_uint i = 0; i < fftLength/2; i++)
+		{
+			std::cout << "i " << i << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+		}*/
+
+		////Reference fftw output
+		//fftwf_complex *refout;
+
+		//refout = get_R2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim);
+
+		///*for( cl_uint i = 0; i < fftLength; i++)
+		//{
+		//	std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+		//}*/
+		//if (!compare<fftwf_complex, T>(refout, output, fftLength))
+		//{
+		//	std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
+		//}
+		//else
+		//{
+		//	std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
+		//}
+
+		//fftwf_free(refout);
+	}
+
+	OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+	OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+	//cleanup
+	OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
+	OPENCL_V_THROW( clReleaseMemObject( userDatabuffer ), "Error: In clReleaseMemObject\n" );
+}
+
+template < typename T >
+void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
+							cl_command_queue commandQueue, cl_device_id device_id,
+							size_t* inlengths, clfftDim dim, clfftPrecision precision,
+							size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
+{
+	cl_int status = 0;
+
+	size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1}; 
+	size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
+	size_t userdataLength = vectorLength_userdata * batchSize;
+
+	//input/output allocation sizes
+	size_t size_of_buffers = fftLength * sizeof( T  );
+	size_t size_of_buffers_userdata = userdataLength * sizeof( T  );
+
+	//in-place transform. Same buffer for input and output
+	cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
+    OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+
+	//Initialize Data
+	std::vector< T > userdata( userdataLength );
+
+	// impulse test case
+	for (size_t idx = 0; idx < userdataLength; ++idx)
+	{
+		userdata[idx] = 1;
+	}
+
+	//user data buffer
+	cl_mem userdatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
+    OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userdatabuffer) )" );
+
+	//clFFT initializations
+
+	//	FFT state
+	clfftResultLocation	place = CLFFT_INPLACE;
+	clfftLayout	inLayout  = CLFFT_REAL;
+	clfftLayout	outLayout = CLFFT_HERMITIAN_INTERLEAVED;
+
+	clfftPlanHandle plan_handle;
+	OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+	OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
+
+	//	Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+	OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+	OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
+	OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
+	OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+		//Bake Plan
+	OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
+
+	//get the buffersize
+	size_t buffersize=0;
+	OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+	//allocate the intermediate buffer
+	cl_mem clMedBuffer=NULL;
+
+	if (buffersize)
+	{
+		cl_int medstatus;
+		clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+		OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+	}
+
+	cl_mem * buffersOut = NULL; //NULL for in-place
+
+	//Pre-process kernel string
+	const char* source = STRINGIFY(ZERO_PAD_R2C_KERNEL);
+	
+	cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
+	OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
+
+	status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
+	OPENCL_V_THROW( status, "clBuildProgram failed" );
+
+#if defined( _DEBUG )
+	if( status != CL_SUCCESS )
+	{
+		if( status == CL_BUILD_PROGRAM_FAILURE )
+		{
+			size_t buildLogSize = 0;
+			OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
+							"clGetProgramBuildInfo failed"  );
+
+			std::vector< char > buildLog( buildLogSize );
+			::memset( &buildLog[ 0 ], 0x0, buildLogSize );
+
+			OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+						"clGetProgramBuildInfo failed"  );
+
+			std::cerr << "\n\t\t\tBUILD LOG\n";
+			std::cerr << "************************************************\n";
+			std::cerr << &buildLog[ 0 ] << std::endl;
+			std::cerr << "************************************************\n";
+		}
+
+		OPENCL_V_THROW( status, "clBuildProgram failed" );
+	}
+#endif
+
+	cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
+	OPENCL_V_THROW( status, "clCreateKernel failed" );
+
+	//for functional test
+	cl_uint uarg = 0;
+
+	//Buffer to be zero-padded
+	OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+
+	//originial data
+	OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+
+	//Launch pre-process kernel
+	size_t gSize = fftLength;
+	status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+											NULL, &gSize, NULL, 0, NULL, NULL );
+	OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+	
+	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+	//Now invoke the clfft execute
+	OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+		&fftbuffer, buffersOut, clMedBuffer ),
+		"clfftEnqueueTransform failed" );
+		
+	OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+	
+	if (profile_count > 1)
+	{
+		Timer tr;
+		tr.Start();
+
+		//	Loop as many times as the user specifies to average out the timings
+		for( cl_uint i = 0; i < profile_count; ++i )
+		{
+			uarg = 0;
+
+			//Buffer to be zero-padded
+			OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+
+			//originial data
+			OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+
+			//Launch pre-process kernel
+			status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+													NULL, &gSize, NULL, 0, NULL, NULL );
+			OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+	
+			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+			//Now invoke the clfft execute
+			OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+				&fftbuffer, buffersOut, clMedBuffer ),
+				"clfftEnqueueTransform failed" );
+		
+			OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+		}
+		double wtimesample = tr.Sample();
+		double wtime = wtimesample/((double)profile_count);
+	
+		tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+	}
+
+	//cleanup preprocess kernel opencl objects
+	OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
+	OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
+
+	if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+	if (profile_count == 1)
+	{
+		std::vector< std::complex< T > > output( fftLength/2 );
+
+		OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+			0, NULL, NULL ), "Reading the result buffer failed" );
+
+		////Reference fftw output
+		//fftwf_complex *refout;
+
+		//refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+
+		///*for( cl_uint i = 0; i < fftLength; i++)
+		//{
+		//	std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+		//}*/
+		//if (!compare<fftwf_complex, T>(refout, output, fftLength))
+		//{
+		//	std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****FAIL*****" << std::endl;
+		//}
+		//else
+		//{
+		//	std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****PASS*****" << std::endl;
+		//}
+
+		//fftwf_free(refout);
+	}
+
+	OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+	OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+	//cleanup
+	OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
+	OPENCL_V_THROW( clReleaseMemObject( userdatabuffer ), "Error: In clReleaseMemObject\n" );
+}
+
 //Compare reference and opencl output 
 template < typename T1, typename T2>
 bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
@@ -645,3 +940,96 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
 
 	return true;
 }
+
+
+// Compute reference output using fftw for float type
+fftwf_complex* get_C2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size, clfftLayout in_layout,
+								clfftDim dim, clfftDirection dir)
+{
+	//In FFTW last dimension has the fastest changing index
+	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+	fftwf_plan refPlan;
+
+	fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+	fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+
+	size_t fftVectorLength = fftbatchLength/batch_size;
+
+	refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size, 
+									refin, &fftwLengths[3 - dim], 1, fftVectorLength, 
+									refout, &fftwLengths[3 - dim], 1, fftVectorLength, 
+									dir, FFTW_ESTIMATE);
+	
+	float scalar; 
+	
+	for( size_t i = 0; i < fftbatchLength; i++)
+	{
+		scalar = 0.0f;
+		switch (in_layout)
+		{
+		case CLFFT_COMPLEX_INTERLEAVED:
+			if ( (i % fftVectorLength)  < USERDATA_LENGTH)
+			{
+				scalar = 1.0f;
+			}
+			break;
+		default:
+			break;
+		}
+
+		refin[i][0] = scalar;
+		refin[i][1] = 0;
+	}
+
+	fftwf_execute(refPlan);
+
+	fftw_free(refin);
+
+	fftwf_destroy_plan(refPlan);
+
+	return refout;
+}
+
+// Compute reference output using fftw for float type
+fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size,
+									clfftLayout in_layout, clfftDim dim)
+{
+	//In FFTW last dimension has the fastest changing index
+	int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+	int inembed[3] = {(int)lengths[2], (int)lengths[1], (int)(lengths[0] + 2)};
+	int outembed[3] = {(int)lengths[2], (int)lengths[1], (int)(lengths[0]/2 + 1)};
+
+	fftwf_plan refPlan;
+		
+	size_t infftVectorLength = inembed[0] * inembed[1] * inembed[2];
+	size_t outfftVectorLength = outembed[0] * outembed[1] * outembed[2];
+
+	float *refin = (float*) malloc(sizeof(float)*fftbatchLength);
+	fftwf_complex *refout = (fftwf_complex*)refin; //(fftwf_complex*)fftwf_malloc(sizeof(fftwf_complex)*outfftVectorLength*batch_size);
+
+	refPlan = fftwf_plan_many_dft_r2c(dim, &fftwLengths[3 - dim], batch_size, 
+									refin, &inembed[3 - dim], 1, infftVectorLength,
+									refout, &outembed[3 - dim], 1, outfftVectorLength, FFTW_ESTIMATE);
+	
+	float scalar; 
+	
+	for( size_t i = 0; i < fftbatchLength; i++)
+	{
+		scalar = 0.0f;
+		if ( (i % infftVectorLength)  < USERDATA_LENGTH)
+		{
+			scalar = 1.0f;
+		}
+		
+		refin[i] = scalar;
+	}
+
+	fftwf_execute(refPlan);
+
+	fftw_free(refin);
+
+	fftwf_destroy_plan(refPlan);
+
+	return refout;
+}
\ No newline at end of file
diff --git a/src/callback-client/client.h b/src/callback-client/client.h
index a1e100d..9ca44dd 100644
--- a/src/callback-client/client.h
+++ b/src/callback-client/client.h
@@ -22,10 +22,117 @@
 //	Boost headers that we want to use
 //	#define BOOST_PROGRAM_OPTIONS_DYN_LINK
 #include <boost/program_options.hpp>
+#include "stdafx.h"
+#include "../statTimer/statisticalTimer.extern.h"
+
+#include <fftw3.h>
 
 #define CALLBCKSTR(...) #__VA_ARGS__
 #define STRINGIFY(...) 	CALLBCKSTR(__VA_ARGS__)
 
+enum FFTType
+{
+	FFT_C2C,
+	FFT_R2C,
+	FFT_C2R,
+};
+#define USERDATA_LENGTH 512
+#define BATCH_LENGTH 1024 // Must be >= USERDATA_LENGTH
+
+#define ZERO_PAD_C2C __attribute__((always_inline)) \n float2 zeroPad (__global void *input, \n \
+								uint inoffset, \n \
+							__global void *userdata) \n \
+				 { \n \
+					 float2 scalar = 0.0f; \n \
+					 uint udoffset; \n \
+					 if ((inoffset % BATCH_LENGTH) < USERDATA_LENGTH) \n \
+					 { \n \
+					    udoffset = ((inoffset/BATCH_LENGTH) * USERDATA_LENGTH) + (inoffset % BATCH_LENGTH); \n \
+						scalar = *((__global float2*)userdata + udoffset); \n \
+					 } \n \
+					 return scalar; \n \
+				} \n
+
+#define ZERO_PAD_C2C_KERNEL __kernel void zeroPad (__global void *input, \n \
+								__global void *userdata) \n \
+				 { \n \
+					uint inoffset = get_global_id(0); \n \
+					 float2 scalar = 0.0f; \n \
+					 uint udoffset; \n \
+					 if ((inoffset % BATCH_LENGTH) < USERDATA_LENGTH) \n \
+					 { \n \
+					   udoffset = ((inoffset/BATCH_LENGTH) * USERDATA_LENGTH) + (inoffset % BATCH_LENGTH); \n \
+					   scalar = *((__global float2*)userdata + udoffset); \n \
+					 } \n \
+					 *((__global float2*)input + inoffset) = scalar; \n \
+				} \n
+
+#define ZERO_PAD_R2C __attribute__((always_inline)) \n float zeroPad (__global void *input, \n \
+								uint inoffset, \n \
+							__global void *userdata) \n \
+				 { \n \
+					 float scalar = 0.0f; \n \
+					 uint udoffset; \n \
+					 if ((inoffset % (BATCH_LENGTH + 2)) < USERDATA_LENGTH) \n \
+					 { \n \
+					    udoffset = ((inoffset/(BATCH_LENGTH + 2)) * USERDATA_LENGTH) + (inoffset % (BATCH_LENGTH + 2)); \n \
+						scalar = *((__global float*)userdata + udoffset); \n \
+					 } \n \
+					 return scalar; \n \
+				} \n
+
+#define ZERO_PAD_R2C_KERNEL __kernel void zeroPad (__global void *input, \n \
+								__global void *userdata) \n \
+				 { \n \
+					uint inoffset = get_global_id(0); \n \
+					 float scalar = 0.0f; \n \
+					 uint udoffset; \n \
+					 if ((inoffset % (BATCH_LENGTH + 2)) < USERDATA_LENGTH) \n \
+					 { \n \
+					   udoffset = ((inoffset/(BATCH_LENGTH + 2)) * USERDATA_LENGTH) + (inoffset % (BATCH_LENGTH + 2)); \n \
+					   scalar = *((__global float*)userdata + udoffset); \n \
+					 } \n \
+					 *((__global float*)input + inoffset) = scalar; \n \
+				} \n
+
+template < typename T >
+void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize, 
+				   clfftDim dim, clfftPrecision precision, cl_uint profile_count);
+
+template < typename T >
+void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize, 
+				   clfftDim dim, clfftPrecision precision,  cl_uint profile_count);
+
+template < typename T >
+void C2R_transform();
+
+fftwf_complex* get_C2C_fftwf_output(size_t* lengths, size_t fftBatchSize, int batch_size, clfftLayout in_layout,
+								clfftDim dim, clfftDirection dir);
+
+fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size,
+									clfftLayout in_layout, clfftDim dim);
+
+template < typename T1, typename T2>
+bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
+             size_t length, const float epsilon = 1e-6f);
+
+template < typename T >
+void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
+template < typename T >
+void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, cl_device_id device_id, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
+template < typename T >
+void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+						size_t* inlengths, clfftDim dim, clfftPrecision precision,
+						size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
+template < typename T >
+void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, 
+							cl_command_queue commandQueue, cl_device_id device_id,
+							size_t* inlengths, clfftDim dim, clfftPrecision precision,
+							size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
 #ifdef WIN32
 
 struct Timer

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list