[clfft] 54/128: Precallback - client updates
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Thu Oct 22 14:54:38 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit f3d3417a62a2d17a703f33af97ae19dbcf521e5a
Author: Pradeep <pradeep.rao at amd.com>
Date: Thu Sep 3 17:01:22 2015 +0530
Precallback - client updates
---
src/callback-client/callback-client.cpp | 804 +++++++++++++++++++++++---------
src/callback-client/client.h | 107 +++++
2 files changed, 703 insertions(+), 208 deletions(-)
diff --git a/src/callback-client/callback-client.cpp b/src/callback-client/callback-client.cpp
index 357983b..b9b81ee 100644
--- a/src/callback-client/callback-client.cpp
+++ b/src/callback-client/callback-client.cpp
@@ -1,80 +1,22 @@
-#include "stdafx.h"
#include <functional>
#include <cmath>
#include "client.h"
#include "../library/private.h"
#include "openCL.misc.h"
-#include "../statTimer/statisticalTimer.extern.h"
#include "../include/sharedLibrary.h"
#include "../include/unicode.compatibility.h"
-#include <fftw3.h>
namespace po = boost::program_options;
-enum FFTType
-{
- FFT_C2C,
- FFT_R2C,
- FFT_C2R,
-};
-
-#define ZERO_PAD_C2C __attribute__((always_inline)) \n float2 zeroPad (__global void *input, \n \
- uint inoffset, \n \
- __global void *userdata) \n \
- { \n \
- float2 scalar = 0.0f; \n \
- if (inoffset < 512) \n \
- { \n \
- scalar = *((__global float2*)userdata + inoffset); \n \
- } \n \
- return scalar; \n \
- } \n
-
-#define ZERO_PAD_C2C_KERNEL __kernel void zeroPad (__global void *input, \n \
- __global void *userdata, uint batchLength) \n \
- { \n \
- uint inoffset = get_global_id(0); \n \
- float2 scalar = 0.0f; \n \
- if (inoffset < 512) \n \
- { \n \
- scalar = *((__global float2*)userdata + inoffset); \n \
- } \n \
- *((__global float2*)input + inoffset) = scalar; \n \
- } \n
-
-//forward declarations
-
-template < typename T >
-void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
- clfftDim dim, clfftPrecision precision, cl_uint profile_count);
-
-template < typename T >
-void R2C_transform();
-
-template < typename T >
-void C2R_transform();
-
-fftwf_complex* get_fftwf_output(size_t* lengths, size_t fftBatchSize, int batch_size, clfftLayout in_layout,
- clfftDim dim, clfftDirection dir);
-template < typename T1, typename T2>
-bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
- size_t length, const float epsilon = 1e-6f);
-
-template < typename T >
-void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
-
-template < typename T >
-void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, cl_device_id device_id, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
-
int main(int argc, char **argv)
{
- size_t lengths[ 3 ] = {1,1,1};
+ size_t lengths[ 3 ] = {BATCH_LENGTH,1,1}; //For simplicity, assuming 1D fft with fixed batch length of BATCH_LENGTH
cl_uint profile_count = 0;
- clfftPrecision precision = CLFFT_SINGLE;
+ clfftPrecision precision = CLFFT_SINGLE; //Testing for single precision. Easily extendable for double
- size_t batchSize = 1; //For simplicity using fixed batch size
+ size_t batchSize = 1;
int fftType = 1;
FFTType clFFTType = FFT_C2C;
@@ -91,11 +33,9 @@ int main(int argc, char **argv)
desc.add_options()
( "help,h", "produces this help message" )
( "dumpKernels,d", "FFT engine will dump generated OpenCL FFT kernels to disk (default: dump off)" )
- ( "lenX,x", po::value< size_t >( &lengths[ 0 ] )->default_value( 1024 ), "Specify the length of the 1st dimension of a test array" )
- ( "lenY,y", po::value< size_t >( &lengths[ 1 ] )->default_value( 1 ), "Specify the length of the 2nd dimension of a test array" )
- ( "lenZ,z", po::value< size_t >( &lengths[ 2 ] )->default_value( 1 ), "Specify the length of the 3rd dimension of a test array" )
+ ( "batchSize,b", po::value< size_t >( &batchSize )->default_value( 1 ), "If this value is greater than one, arrays will be used " )
( "profile,p", po::value< cl_uint >( &profile_count )->default_value( 10 ), "Time and report the kernel speed of the FFT (default: profiling off)" )
- ( "type,t", po::value< int >( &fftType )->default_value( 1 ), "Type of FFT:\n1) Complex-Complex\n2) Real-Complex\n3) Complex-Real\n" )
+ ( "type,t", po::value< int >( &fftType )->default_value( 1 ), "Type of FFT:\n1) Complex-Complex (default)\n2) Real-Complex\n3) Complex-Real\n" )
;
po::variables_map vm;
@@ -139,7 +79,7 @@ int main(int argc, char **argv)
}
else if (clFFTType == FFT_R2C) // Complex-Complex cases
{
- R2C_transform<float>();
+ R2C_transform<float>(setupData, lengths, batchSize, dim, precision, profile_count);
}
else // Complex-Complex cases
{
@@ -178,13 +118,13 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
cl_command_queue commandQueue = ::clCreateCommandQueue( context, device_id[0], command_queue_flags, &status );
OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
- //Run clFFT with seaparate Pre-process Kernel
if (precision == CLFFT_SINGLE)
{
- runPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+ //Run clFFT with seaparate Pre-process Kernel
+ runC2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
- //Run clFFT using pre-callback
- runPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
+ //Run clFFT using pre-callback
+ runC2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision, batchSize, vectorLength, fftLength, profile_count);
}
OPENCL_V_THROW( clReleaseCommandQueue( commandQueue ), "Error: In clReleaseCommandQueue\n" );
@@ -192,72 +132,60 @@ void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths,
}
template < typename T >
-void R2C_transform()
-{
-}
-
-template < typename T >
-void C2R_transform()
-{
-}
-
-// Compute reference output using fftw for float type
-fftwf_complex* get_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size, clfftLayout in_layout,
- clfftDim dim, clfftDirection dir)
+void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+ clfftDim dim, clfftPrecision precision, cl_uint profile_count)
{
- //In FFTW last dimension has the fastest changing index
- int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
-
- fftwf_plan refPlan;
+ // OpenCL state
+ cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+ cl_int deviceId = 0;
+ std::vector< cl_device_id > device_id;
+ cl_int platformId = 0;
+ cl_context context;
+ cl_uint command_queue_flags = 0;
+ command_queue_flags |= CL_QUEUE_PROFILING_ENABLE;
+
+ // Test for in-place Hermitian Interleaved output
+ // Hence output size is N/2 + 1 complex. So allocate N + 2 real input
+ size_t Nt = inlengths[0] + 2;
+ size_t vectorLength = Nt * inlengths[1] * inlengths[2];
+ size_t fftLength = vectorLength * batchSize;
- fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
- fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+ //OpenCL initializations
+ device_id = initializeCL( deviceType, deviceId, platformId, context, false);
- size_t fftVectorLength = fftbatchLength/batch_size;
+ cl_int status = 0;
+
+ cl_command_queue commandQueue = ::clCreateCommandQueue( context, device_id[0], command_queue_flags, &status );
+ OPENCL_V_THROW( status, "Creating Command Queue ( ::clCreateCommandQueue() )" );
- refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size,
- refin, &fftwLengths[3 - dim], 1, fftVectorLength,
- refout, &fftwLengths[3 - dim], 1, fftVectorLength,
- dir, FFTW_ESTIMATE);
-
- float scalar;
-
- for( size_t i = 0; i < fftbatchLength; i++)
+ if (precision == CLFFT_SINGLE)
{
- scalar = 0.0f;
- switch (in_layout)
- {
- case CLFFT_COMPLEX_INTERLEAVED:
- if ( (i % fftVectorLength) < 512)
- {
- scalar = 1.0f;
- }
- break;
- default:
- break;
- }
+ //Run clFFT with seaparate Pre-process Kernel
+ runR2CPreprocessKernelFFT<float>(setupData, context, commandQueue, device_id[0], inlengths, dim, precision,
+ batchSize, vectorLength, fftLength, profile_count);
- refin[i][0] = scalar;
- refin[i][1] = 0;
+ //Run clFFT using pre-callback
+ runR2CPrecallbackFFT<float>(setupData, context, commandQueue, inlengths, dim, precision,
+ batchSize, vectorLength, fftLength, profile_count);
}
- fftwf_execute(refPlan);
-
- fftw_free(refin);
-
- fftwf_destroy_plan(refPlan);
+ OPENCL_V_THROW( clReleaseCommandQueue( commandQueue ), "Error: In clReleaseCommandQueue\n" );
+ OPENCL_V_THROW( clReleaseContext( context ), "Error: In clReleaseContext\n" );
+}
- return refout;
+template < typename T >
+void C2R_transform()
+{
}
template < typename T >
-void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
size_t* inlengths, clfftDim dim, clfftPrecision precision,
size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
{
cl_int status = 0;
- size_t userdataLengths[ 3 ] = {512,1,1};
+ size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
size_t userdataLength = vectorLength_userdata * batchSize;
@@ -326,32 +254,33 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
cl_mem * buffersOut = NULL; //NULL for in-place
- Timer tr;
- double wtime_acc = 0.0;
+ // for functional test
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
// Loop as many times as the user specifies to average out the timings
- for( cl_uint i = 0; i < profile_count; ++i )
+ if (profile_count > 1)
{
+ Timer tr;
tr.Start();
- OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
- &fftbuffer, buffersOut, clMedBuffer ),
- "clfftEnqueueTransform failed" );
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
-
- wtime_acc += tr.Sample();
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+ }
+ double wtimesample = tr.Sample();
+ double wtime = wtimesample/((double)profile_count);
- //Ignore the first time sample if profiling for more than one iteration
- if (i == 0 && profile_count > 1) wtime_acc = 0.0;
+ tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
}
- double wtime = wtime_acc/((double)profile_count);
- size_t totalLen = 1;
- for(int i=0; i<dim; i++) totalLen *= inlengths[i];
- double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
- tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
- tout << "Execution gflops (with clFFT Pre-callback): " << ((double)batchSize * opsconst)/(1000000000.0*wtime) << std::endl;
-
if(clMedBuffer) clReleaseMemObject(clMedBuffer);
if (profile_count == 1)
@@ -364,7 +293,7 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
//Reference fftw output
fftwf_complex *refout;
- refout = get_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+ refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
/*for( cl_uint i = 0; i < fftLength; i++)
{
@@ -372,11 +301,11 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
}*/
if (!compare<fftwf_complex, T>(refout, output, fftLength))
{
- std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+ std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
}
else
{
- std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+ std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
}
fftwf_free(refout);
@@ -391,14 +320,14 @@ void runPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context con
}
template < typename T >
-void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context,
+void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context,
cl_command_queue commandQueue, cl_device_id device_id,
size_t* inlengths, clfftDim dim, clfftPrecision precision,
size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
{
cl_int status = 0;
- size_t userdataLengths[ 3 ] = {512,1,1};
+ size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
size_t userdataLength = vectorLength_userdata * batchSize;
@@ -464,92 +393,106 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
//Pre-process kernel string
const char* source = STRINGIFY(ZERO_PAD_C2C_KERNEL);
- double wtime_acc = 0.0;
- Timer tr;
-
- // Loop as many times as the user specifies to average out the timings
- for( cl_uint i = 0; i < profile_count; ++i )
- {
- tr.Start();
- cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
- OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
+ cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
+ OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
- status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
- OPENCL_V_THROW( status, "clBuildProgram failed" );
+ status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
+ OPENCL_V_THROW( status, "clBuildProgram failed" );
#if defined( _DEBUG )
- if( status != CL_SUCCESS )
+ if( status != CL_SUCCESS )
+ {
+ if( status == CL_BUILD_PROGRAM_FAILURE )
{
- if( status == CL_BUILD_PROGRAM_FAILURE )
- {
- size_t buildLogSize = 0;
- OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
- "clGetProgramBuildInfo failed" );
-
- std::vector< char > buildLog( buildLogSize );
- ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
-
- OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+ size_t buildLogSize = 0;
+ OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
"clGetProgramBuildInfo failed" );
- std::cerr << "\n\t\t\tBUILD LOG\n";
- std::cerr << "************************************************\n";
- std::cerr << &buildLog[ 0 ] << std::endl;
- std::cerr << "************************************************\n";
- }
+ std::vector< char > buildLog( buildLogSize );
+ ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
- OPENCL_V_THROW( status, "clBuildProgram failed" );
+ OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+ "clGetProgramBuildInfo failed" );
+
+ std::cerr << "\n\t\t\tBUILD LOG\n";
+ std::cerr << "************************************************\n";
+ std::cerr << &buildLog[ 0 ] << std::endl;
+ std::cerr << "************************************************\n";
}
-#endif
- cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
- OPENCL_V_THROW( status, "clCreateKernel failed" );
+ OPENCL_V_THROW( status, "clBuildProgram failed" );
+ }
+#endif
- cl_uint uarg = 0;
+ cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
+ OPENCL_V_THROW( status, "clCreateKernel failed" );
- //Buffer to be zero-padded
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+ //for functional test
+ cl_uint uarg = 0;
- //originial data
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+ //Buffer to be zero-padded
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
- //single batch length
- OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_uint ), (const void *)&vectorLength ), "clSetKernelArg failed" );
+ //originial data
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
- //Launch pre-process kernel
- size_t gSize = fftLength;
- size_t lSize = 64;
- status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
- NULL, &gSize, &lSize, 0, NULL, NULL );
- OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+ //Launch pre-process kernel
+ size_t gSize = fftLength;
+ size_t lSize = 64;
+ status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+ NULL, &gSize, &lSize, 0, NULL, NULL );
+ OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
- //Now invoke the clfft execute
- OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
- &fftbuffer, buffersOut, clMedBuffer ),
- "clfftEnqueueTransform failed" );
+ //Now invoke the clfft execute
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
- OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ if (profile_count > 1)
+ {
+ Timer tr;
+ tr.Start();
- wtime_acc += tr.Sample();
+ // Loop as many times as the user specifies to average out the timings
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ uarg = 0;
- //Ignore the first time sample if profiling for more than one iteration
- if (i == 0 && profile_count > 1) wtime_acc = 0.0;
+ //Buffer to be zero-padded
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
- //cleanup preprocess kernel opencl objects
- OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
- OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
- }
+ //originial data
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
- double wtime = wtime_acc/((double)profile_count);
- size_t totalLen = 1;
- for(int i=0; i<dim; i++) totalLen *= inlengths[i];
- double opsconst = 5.0 * (double)totalLen * log((double)totalLen) / log(2.0);
+ //Launch pre-process kernel
+ status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+ NULL, &gSize, &lSize, 0, NULL, NULL );
+ OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
- tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
- tout << "Execution gflops (Separate Pre-process Kernel): " << ((double)batchSize * opsconst)/(1000000000.0*wtime) << std::endl;
+ //Now invoke the clfft execute
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, dir, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+ }
+ double wtimesample = tr.Sample();
+ double wtime = wtimesample/((double)profile_count);
+
+ tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+ }
+
+ //cleanup preprocess kernel opencl objects
+ OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
+ OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
+
if(clMedBuffer) clReleaseMemObject(clMedBuffer);
if (profile_count == 1)
@@ -562,7 +505,7 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
//Reference fftw output
fftwf_complex *refout;
- refout = get_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+ refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
/*for( cl_uint i = 0; i < fftLength; i++)
{
@@ -570,11 +513,11 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
}*/
if (!compare<fftwf_complex, T>(refout, output, fftLength))
{
- std::cout << "\n\n\t\tInternal Client Test *****FAIL*****" << std::endl;
+ std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****FAIL*****" << std::endl;
}
else
{
- std::cout << "\n\n\t\tInternal Client Test *****PASS*****" << std::endl;
+ std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****PASS*****" << std::endl;
}
fftwf_free(refout);
@@ -588,6 +531,358 @@ void runPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_contex
OPENCL_V_THROW( clReleaseMemObject( userdatabuffer ), "Error: In clReleaseMemObject\n" );
}
+template < typename T >
+void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
+{
+ cl_int status = 0;
+
+ size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
+ size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
+ size_t userdataLength = vectorLength_userdata * batchSize;
+
+ //input/output allocation sizes
+ size_t size_of_buffers = fftLength * sizeof( T );
+ size_t size_of_buffers_userdata = userdataLength * sizeof( T );
+
+ //in-place transform. Same buffer for input and output
+ cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+
+ //Initialize Data
+ std::vector< T > userdata( userdataLength );
+
+ // impulse test case
+ for (size_t idx = 0; idx < userdataLength; ++idx)
+ {
+ userdata[idx] = 1;
+ }
+
+ //user data buffer
+ cl_mem userDatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userDatabuffer) )" );
+
+ //clFFT initializations
+
+ // FFT state
+ clfftResultLocation place = CLFFT_INPLACE;
+ clfftLayout inLayout = CLFFT_REAL;
+ clfftLayout outLayout = CLFFT_HERMITIAN_INTERLEAVED;
+
+ clfftPlanHandle plan_handle;
+ OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+ OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
+
+ //Precallback setup
+ char* precallbackstr = STRINGIFY(ZERO_PAD_R2C);
+
+ //Register the callback
+ OPENCL_V_THROW (clFFTSetPlanCallback(plan_handle, "zeroPad", precallbackstr, NULL, 0, PRECALLBACK, userDatabuffer), "clFFTSetPlanCallback failed");
+
+ // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+ OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+ OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
+ OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
+ OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+ //Bake Plan
+ OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
+
+ //get the buffersize
+ size_t buffersize=0;
+ OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+ //allocate the intermediate buffer
+ cl_mem clMedBuffer=NULL;
+
+ if (buffersize)
+ {
+ cl_int medstatus;
+ clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+ OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+ }
+
+ cl_mem * buffersOut = NULL; //NULL for in-place
+
+ //for functional test
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ if (profile_count > 1)
+ {
+ Timer tr;
+ tr.Start();
+
+ // Loop as many times as the user specifies to average out the timings
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+ }
+ double wtimesample = tr.Sample();
+ double wtime = wtimesample/((double)profile_count);
+
+ tout << "\nExecution wall time (with clFFT Pre-callback): " << 1000.0*wtime << " ms" << std::endl;
+ }
+
+ if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+ if (profile_count == 1)
+ {
+ std::vector< std::complex< T > > output( fftLength/2 );
+
+ OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+ 0, NULL, NULL ), "Reading the result buffer failed" );
+
+ /*for( cl_uint i = 0; i < fftLength/2; i++)
+ {
+ std::cout << "i " << i << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ }*/
+
+ ////Reference fftw output
+ //fftwf_complex *refout;
+
+ //refout = get_R2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim);
+
+ ///*for( cl_uint i = 0; i < fftLength; i++)
+ //{
+ // std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ //}*/
+ //if (!compare<fftwf_complex, T>(refout, output, fftLength))
+ //{
+ // std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****FAIL*****" << std::endl;
+ //}
+ //else
+ //{
+ // std::cout << "\n\n\t\tInternal Client Test (with clFFT Pre-callback) *****PASS*****" << std::endl;
+ //}
+
+ //fftwf_free(refout);
+ }
+
+ OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+ OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+ //cleanup
+ OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
+ OPENCL_V_THROW( clReleaseMemObject( userDatabuffer ), "Error: In clReleaseMemObject\n" );
+}
+
+template < typename T >
+void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context,
+ cl_command_queue commandQueue, cl_device_id device_id,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count)
+{
+ cl_int status = 0;
+
+ size_t userdataLengths[ 3 ] = {USERDATA_LENGTH,1,1};
+ size_t vectorLength_userdata = userdataLengths[0] * userdataLengths[1] * userdataLengths[2];
+ size_t userdataLength = vectorLength_userdata * batchSize;
+
+ //input/output allocation sizes
+ size_t size_of_buffers = fftLength * sizeof( T );
+ size_t size_of_buffers_userdata = userdataLength * sizeof( T );
+
+ //in-place transform. Same buffer for input and output
+ cl_mem fftbuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE, size_of_buffers, NULL, &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(buffer) )" );
+
+ //Initialize Data
+ std::vector< T > userdata( userdataLength );
+
+ // impulse test case
+ for (size_t idx = 0; idx < userdataLength; ++idx)
+ {
+ userdata[idx] = 1;
+ }
+
+ //user data buffer
+ cl_mem userdatabuffer = ::clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size_of_buffers_userdata, &userdata[0], &status);
+ OPENCL_V_THROW( status, "Creating Buffer ( ::clCreateBuffer(userdatabuffer) )" );
+
+ //clFFT initializations
+
+ // FFT state
+ clfftResultLocation place = CLFFT_INPLACE;
+ clfftLayout inLayout = CLFFT_REAL;
+ clfftLayout outLayout = CLFFT_HERMITIAN_INTERLEAVED;
+
+ clfftPlanHandle plan_handle;
+ OPENCL_V_THROW( clfftSetup( setupData.get( ) ), "clfftSetup failed" );
+ OPENCL_V_THROW( clfftCreateDefaultPlan( &plan_handle, context, dim, inlengths ), "clfftCreateDefaultPlan failed" );
+
+ // Default plan creates a plan that expects an inPlace transform with interleaved complex numbers
+ OPENCL_V_THROW( clfftSetResultLocation( plan_handle, place ), "clfftSetResultLocation failed" );
+ OPENCL_V_THROW( clfftSetLayout( plan_handle, inLayout, outLayout ), "clfftSetLayout failed" );
+ OPENCL_V_THROW( clfftSetPlanBatchSize( plan_handle, batchSize ), "clfftSetPlanBatchSize failed" );
+ OPENCL_V_THROW( clfftSetPlanPrecision( plan_handle, precision ), "clfftSetPlanPrecision failed" );
+
+ //Bake Plan
+ OPENCL_V_THROW( clfftBakePlan( plan_handle, 1, &commandQueue, NULL, NULL ), "clfftBakePlan failed" );
+
+ //get the buffersize
+ size_t buffersize=0;
+ OPENCL_V_THROW( clfftGetTmpBufSize(plan_handle, &buffersize ), "clfftGetTmpBufSize failed" );
+
+ //allocate the intermediate buffer
+ cl_mem clMedBuffer=NULL;
+
+ if (buffersize)
+ {
+ cl_int medstatus;
+ clMedBuffer = clCreateBuffer ( context, CL_MEM_READ_WRITE, buffersize, 0, &medstatus);
+ OPENCL_V_THROW( medstatus, "Creating intmediate Buffer failed" );
+ }
+
+ cl_mem * buffersOut = NULL; //NULL for in-place
+
+ //Pre-process kernel string
+ const char* source = STRINGIFY(ZERO_PAD_R2C_KERNEL);
+
+ cl_program program = clCreateProgramWithSource( context, 1, &source, NULL, &status );
+ OPENCL_V_THROW( status, "clCreateProgramWithSource failed." );
+
+ status = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL);
+ OPENCL_V_THROW( status, "clBuildProgram failed" );
+
+#if defined( _DEBUG )
+ if( status != CL_SUCCESS )
+ {
+ if( status == CL_BUILD_PROGRAM_FAILURE )
+ {
+ size_t buildLogSize = 0;
+ OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize ),
+ "clGetProgramBuildInfo failed" );
+
+ std::vector< char > buildLog( buildLogSize );
+ ::memset( &buildLog[ 0 ], 0x0, buildLogSize );
+
+ OPENCL_V_THROW( clGetProgramBuildInfo( program, device_id, CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[ 0 ], NULL ),
+ "clGetProgramBuildInfo failed" );
+
+ std::cerr << "\n\t\t\tBUILD LOG\n";
+ std::cerr << "************************************************\n";
+ std::cerr << &buildLog[ 0 ] << std::endl;
+ std::cerr << "************************************************\n";
+ }
+
+ OPENCL_V_THROW( status, "clBuildProgram failed" );
+ }
+#endif
+
+ cl_kernel kernel = clCreateKernel( program, "zeroPad", &status );
+ OPENCL_V_THROW( status, "clCreateKernel failed" );
+
+ //for functional test
+ cl_uint uarg = 0;
+
+ //Buffer to be zero-padded
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+
+ //originial data
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+
+ //Launch pre-process kernel
+ size_t gSize = fftLength;
+ status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+ NULL, &gSize, NULL, 0, NULL, NULL );
+ OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ //Now invoke the clfft execute
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ if (profile_count > 1)
+ {
+ Timer tr;
+ tr.Start();
+
+ // Loop as many times as the user specifies to average out the timings
+ for( cl_uint i = 0; i < profile_count; ++i )
+ {
+ uarg = 0;
+
+ //Buffer to be zero-padded
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&fftbuffer ), "clSetKernelArg failed" );
+
+ //originial data
+ OPENCL_V_THROW( clSetKernelArg( kernel, uarg++, sizeof( cl_mem ), (void*)&userdatabuffer ), "clSetKernelArg failed" );
+
+ //Launch pre-process kernel
+ status = clEnqueueNDRangeKernel( commandQueue, kernel, 1,
+ NULL, &gSize, NULL, 0, NULL, NULL );
+ OPENCL_V_THROW( status, "clEnqueueNDRangeKernel failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+
+ //Now invoke the clfft execute
+ OPENCL_V_THROW( clfftEnqueueTransform( plan_handle, CLFFT_FORWARD, 1, &commandQueue, 0, NULL, NULL,
+ &fftbuffer, buffersOut, clMedBuffer ),
+ "clfftEnqueueTransform failed" );
+
+ OPENCL_V_THROW( clFinish( commandQueue ), "clFinish failed" );
+ }
+ double wtimesample = tr.Sample();
+ double wtime = wtimesample/((double)profile_count);
+
+ tout << "\nExecution wall time (Separate Pre-process Kernel): " << 1000.0*wtime << " ms" << std::endl;
+ }
+
+ //cleanup preprocess kernel opencl objects
+ OPENCL_V_THROW( clReleaseProgram( program ), "Error: In clReleaseProgram\n" );
+ OPENCL_V_THROW( clReleaseKernel( kernel ), "Error: In clReleaseKernel\n" );
+
+ if(clMedBuffer) clReleaseMemObject(clMedBuffer);
+
+ if (profile_count == 1)
+ {
+ std::vector< std::complex< T > > output( fftLength/2 );
+
+ OPENCL_V_THROW( clEnqueueReadBuffer( commandQueue, fftbuffer, CL_TRUE, 0, size_of_buffers, &output[ 0 ],
+ 0, NULL, NULL ), "Reading the result buffer failed" );
+
+ ////Reference fftw output
+ //fftwf_complex *refout;
+
+ //refout = get_C2C_fftwf_output(inlengths, fftLength, batchSize, inLayout, dim, dir);
+
+ ///*for( cl_uint i = 0; i < fftLength; i++)
+ //{
+ // std::cout << "i " << i << " refreal " << refout[i][0] << " refimag " << refout[i][1] << " clreal " << output[i].real() << " climag " << output[i].imag() << std::endl;
+ //}*/
+ //if (!compare<fftwf_complex, T>(refout, output, fftLength))
+ //{
+ // std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****FAIL*****" << std::endl;
+ //}
+ //else
+ //{
+ // std::cout << "\n\n\t\tInternal Client Test (Separate Pre-process Kernel) *****PASS*****" << std::endl;
+ //}
+
+ //fftwf_free(refout);
+ }
+
+ OPENCL_V_THROW( clfftDestroyPlan( &plan_handle ), "clfftDestroyPlan failed" );
+ OPENCL_V_THROW( clfftTeardown( ), "clfftTeardown failed" );
+
+ //cleanup
+ OPENCL_V_THROW( clReleaseMemObject( fftbuffer ), "Error: In clReleaseMemObject\n" );
+ OPENCL_V_THROW( clReleaseMemObject( userdatabuffer ), "Error: In clReleaseMemObject\n" );
+}
+
//Compare reference and opencl output
template < typename T1, typename T2>
bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
@@ -645,3 +940,96 @@ bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
return true;
}
+
+
+// Compute reference output using fftw for float type
+fftwf_complex* get_C2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size, clfftLayout in_layout,
+ clfftDim dim, clfftDirection dir)
+{
+ //In FFTW last dimension has the fastest changing index
+ int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+
+ fftwf_plan refPlan;
+
+ fftwf_complex *refin = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+ fftwf_complex *refout = (fftwf_complex*) fftw_malloc(sizeof(fftwf_complex)*fftbatchLength);
+
+ size_t fftVectorLength = fftbatchLength/batch_size;
+
+ refPlan = fftwf_plan_many_dft(dim, &fftwLengths[3 - dim], batch_size,
+ refin, &fftwLengths[3 - dim], 1, fftVectorLength,
+ refout, &fftwLengths[3 - dim], 1, fftVectorLength,
+ dir, FFTW_ESTIMATE);
+
+ float scalar;
+
+ for( size_t i = 0; i < fftbatchLength; i++)
+ {
+ scalar = 0.0f;
+ switch (in_layout)
+ {
+ case CLFFT_COMPLEX_INTERLEAVED:
+ if ( (i % fftVectorLength) < USERDATA_LENGTH)
+ {
+ scalar = 1.0f;
+ }
+ break;
+ default:
+ break;
+ }
+
+ refin[i][0] = scalar;
+ refin[i][1] = 0;
+ }
+
+ fftwf_execute(refPlan);
+
+ fftw_free(refin);
+
+ fftwf_destroy_plan(refPlan);
+
+ return refout;
+}
+
+// Compute reference output using fftw for float type
+fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size,
+ clfftLayout in_layout, clfftDim dim)
+{
+ //In FFTW last dimension has the fastest changing index
+ int fftwLengths[3] = {(int)lengths[2], (int)lengths[1], (int)lengths[0]};
+ int inembed[3] = {(int)lengths[2], (int)lengths[1], (int)(lengths[0] + 2)};
+ int outembed[3] = {(int)lengths[2], (int)lengths[1], (int)(lengths[0]/2 + 1)};
+
+ fftwf_plan refPlan;
+
+ size_t infftVectorLength = inembed[0] * inembed[1] * inembed[2];
+ size_t outfftVectorLength = outembed[0] * outembed[1] * outembed[2];
+
+ float *refin = (float*) malloc(sizeof(float)*fftbatchLength);
+ fftwf_complex *refout = (fftwf_complex*)refin; //(fftwf_complex*)fftwf_malloc(sizeof(fftwf_complex)*outfftVectorLength*batch_size);
+
+ refPlan = fftwf_plan_many_dft_r2c(dim, &fftwLengths[3 - dim], batch_size,
+ refin, &inembed[3 - dim], 1, infftVectorLength,
+ refout, &outembed[3 - dim], 1, outfftVectorLength, FFTW_ESTIMATE);
+
+ float scalar;
+
+ for( size_t i = 0; i < fftbatchLength; i++)
+ {
+ scalar = 0.0f;
+ if ( (i % infftVectorLength) < USERDATA_LENGTH)
+ {
+ scalar = 1.0f;
+ }
+
+ refin[i] = scalar;
+ }
+
+ fftwf_execute(refPlan);
+
+ fftw_free(refin);
+
+ fftwf_destroy_plan(refPlan);
+
+ return refout;
+}
\ No newline at end of file
diff --git a/src/callback-client/client.h b/src/callback-client/client.h
index a1e100d..9ca44dd 100644
--- a/src/callback-client/client.h
+++ b/src/callback-client/client.h
@@ -22,10 +22,117 @@
// Boost headers that we want to use
// #define BOOST_PROGRAM_OPTIONS_DYN_LINK
#include <boost/program_options.hpp>
+#include "stdafx.h"
+#include "../statTimer/statisticalTimer.extern.h"
+
+#include <fftw3.h>
#define CALLBCKSTR(...) #__VA_ARGS__
#define STRINGIFY(...) CALLBCKSTR(__VA_ARGS__)
+enum FFTType
+{
+ FFT_C2C,
+ FFT_R2C,
+ FFT_C2R,
+};
+#define USERDATA_LENGTH 512
+#define BATCH_LENGTH 1024 // Must be >= USERDATA_LENGTH
+
+#define ZERO_PAD_C2C __attribute__((always_inline)) \n float2 zeroPad (__global void *input, \n \
+ uint inoffset, \n \
+ __global void *userdata) \n \
+ { \n \
+ float2 scalar = 0.0f; \n \
+ uint udoffset; \n \
+ if ((inoffset % BATCH_LENGTH) < USERDATA_LENGTH) \n \
+ { \n \
+ udoffset = ((inoffset/BATCH_LENGTH) * USERDATA_LENGTH) + (inoffset % BATCH_LENGTH); \n \
+ scalar = *((__global float2*)userdata + udoffset); \n \
+ } \n \
+ return scalar; \n \
+ } \n
+
+#define ZERO_PAD_C2C_KERNEL __kernel void zeroPad (__global void *input, \n \
+ __global void *userdata) \n \
+ { \n \
+ uint inoffset = get_global_id(0); \n \
+ float2 scalar = 0.0f; \n \
+ uint udoffset; \n \
+ if ((inoffset % BATCH_LENGTH) < USERDATA_LENGTH) \n \
+ { \n \
+ udoffset = ((inoffset/BATCH_LENGTH) * USERDATA_LENGTH) + (inoffset % BATCH_LENGTH); \n \
+ scalar = *((__global float2*)userdata + udoffset); \n \
+ } \n \
+ *((__global float2*)input + inoffset) = scalar; \n \
+ } \n
+
+#define ZERO_PAD_R2C __attribute__((always_inline)) \n float zeroPad (__global void *input, \n \
+ uint inoffset, \n \
+ __global void *userdata) \n \
+ { \n \
+ float scalar = 0.0f; \n \
+ uint udoffset; \n \
+ if ((inoffset % (BATCH_LENGTH + 2)) < USERDATA_LENGTH) \n \
+ { \n \
+ udoffset = ((inoffset/(BATCH_LENGTH + 2)) * USERDATA_LENGTH) + (inoffset % (BATCH_LENGTH + 2)); \n \
+ scalar = *((__global float*)userdata + udoffset); \n \
+ } \n \
+ return scalar; \n \
+ } \n
+
+#define ZERO_PAD_R2C_KERNEL __kernel void zeroPad (__global void *input, \n \
+ __global void *userdata) \n \
+ { \n \
+ uint inoffset = get_global_id(0); \n \
+ float scalar = 0.0f; \n \
+ uint udoffset; \n \
+ if ((inoffset % (BATCH_LENGTH + 2)) < USERDATA_LENGTH) \n \
+ { \n \
+ udoffset = ((inoffset/(BATCH_LENGTH + 2)) * USERDATA_LENGTH) + (inoffset % (BATCH_LENGTH + 2)); \n \
+ scalar = *((__global float*)userdata + udoffset); \n \
+ } \n \
+ *((__global float*)input + inoffset) = scalar; \n \
+ } \n
+
+template < typename T >
+void C2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+ clfftDim dim, clfftPrecision precision, cl_uint profile_count);
+
+template < typename T >
+void R2C_transform(std::auto_ptr< clfftSetupData > setupData, size_t* inlengths, size_t batchSize,
+ clfftDim dim, clfftPrecision precision, cl_uint profile_count);
+
+template < typename T >
+void C2R_transform();
+
+fftwf_complex* get_C2C_fftwf_output(size_t* lengths, size_t fftBatchSize, int batch_size, clfftLayout in_layout,
+ clfftDim dim, clfftDirection dir);
+
+fftwf_complex* get_R2C_fftwf_output(size_t* lengths, size_t fftbatchLength, int batch_size,
+ clfftLayout in_layout, clfftDim dim);
+
+template < typename T1, typename T2>
+bool compare(T1 *refData, std::vector< std::complex< T2 > > data,
+ size_t length, const float epsilon = 1e-6f);
+
+template < typename T >
+void runC2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
+template < typename T >
+void runC2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue, cl_device_id device_id, size_t* inlengths, clfftDim dim, clfftPrecision precision, size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
+template < typename T >
+void runR2CPrecallbackFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context, cl_command_queue commandQueue,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
+template < typename T >
+void runR2CPreprocessKernelFFT(std::auto_ptr< clfftSetupData > setupData, cl_context context,
+ cl_command_queue commandQueue, cl_device_id device_id,
+ size_t* inlengths, clfftDim dim, clfftPrecision precision,
+ size_t batchSize, size_t vectorLength, size_t fftLength, cl_uint profile_count);
+
#ifdef WIN32
struct Timer
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list