[clfft] 05/21: code reorganisation. renamed action.cpp to enqueue.cpp. consolidate transpose related actions to action.transpose.cpp
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Wed Mar 16 13:14:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit f8870fde56ce2fb74134d921eea7d73d9cee0467
Author: Timmy <timmy.liu at amd.com>
Date: Tue Feb 23 10:13:22 2016 -0600
code reorganisation. renamed action.cpp to enqueue.cpp. consolidate transpose related actions to action.transpose.cpp
---
src/library/CMakeLists.txt | 8 +-
...ranspose.nonsquare.cpp => action.transpose.cpp} | 229 ++++++++++++++++-
...ction.transpose.square.h => action.transpose.h} | 2 +-
src/library/action.transpose.nonsquare.h | 26 --
src/library/action.transpose.square.cpp | 270 ---------------------
src/library/{action.cpp => enqueue.cpp} | 0
src/library/plan.cpp | 6 +-
src/library/transform.cpp | 2 +-
8 files changed, 236 insertions(+), 307 deletions(-)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 436fdf9..132ef86 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -22,13 +22,12 @@ set( clFFT.Source transform.cpp
generator.stockham.cpp
generator.transpose.gcn.cpp
generator.transpose.cpp
- action.transpose.nonsquare.cpp
- action.transpose.square.cpp
+ action.transpose.cpp
generator.copy.cpp
lifetime.cpp
fft_binary_lookup.cpp
md5sum.c
- action.cpp
+ enqueue.cpp
stdafx.cpp )
# Windows only uses dllmain
@@ -46,8 +45,7 @@ set( clFFT.Headers private.h
generator.stockham.h
generator.transpose.gcn.h
generator.transpose.h
- action.transpose.nonsquare.h
- action.transpose.square.h
+ action.transpose.h
fft_binary_lookup.h
md5sum.h
../include/stdafx.h
diff --git a/src/library/action.transpose.nonsquare.cpp b/src/library/action.transpose.cpp
similarity index 62%
rename from src/library/action.transpose.nonsquare.cpp
rename to src/library/action.transpose.cpp
index 8787cc2..db35ce5 100644
--- a/src/library/action.transpose.nonsquare.cpp
+++ b/src/library/action.transpose.cpp
@@ -24,7 +24,7 @@
#include <math.h>
#include <iomanip>
#include "generator.transpose.h"
-#include "action.transpose.nonsquare.h"
+#include "action.transpose.h"
#include "generator.stockham.h"
#include "action.h"
@@ -362,3 +362,230 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
}
return CLFFT_SUCCESS;
}
+
+FFTGeneratedTransposeSquareAction::FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+ : FFTTransposeSquareAction(plHandle, plan, queue, err)
+{
+ if (err != CLFFT_SUCCESS)
+ {
+ // FFTTransposeSquareAction() failed, exit
+ fprintf(stderr, "FFTTransposeSquareAction() failed!\n");
+ return;
+ }
+
+ // Initialize the FFTAction::FFTKernelGenKeyParams member
+ err = this->initParams();
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeSquareAction::initParams() failed!\n");
+ return;
+ }
+
+ FFTRepo &fftRepo = FFTRepo::getInstance();
+
+ err = this->generateKernel(fftRepo, queue);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeSquareAction::generateKernel failed\n");
+ return;
+ }
+
+ err = compileKernels(queue, plHandle, plan);
+
+ if (err != CLFFT_SUCCESS)
+ {
+ fprintf(stderr, "FFTGeneratedTransposeSquareAction::compileKernels failed\n");
+ return;
+ }
+
+ err = CLFFT_SUCCESS;
+}
+
+
+bool FFTGeneratedTransposeSquareAction::buildForwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedTransposeSquareAction::buildBackwardKernel()
+{
+ clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+ clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+ bool r2c_transform = (inputLayout == CLFFT_REAL);
+ bool c2r_transform = (outputLayout == CLFFT_REAL);
+ bool real_transform = (r2c_transform || c2r_transform);
+
+ return (!real_transform) || c2r_transform;
+}
+
+/*sqaure action*/
+clfftStatus FFTGeneratedTransposeSquareAction::initParams()
+{
+
+ this->signature.fft_precision = this->plan->precision;
+ this->signature.fft_placeness = this->plan->placeness;
+ this->signature.fft_inputLayout = this->plan->inputLayout;
+ this->signature.fft_outputLayout = this->plan->outputLayout;
+ this->signature.fft_3StepTwiddle = false;
+
+ this->signature.fft_realSpecial = this->plan->realSpecial;
+
+ this->signature.transOutHorizontal = this->plan->transOutHorizontal; // using the twiddle front flag to specify horizontal write
+ // we do this so as to reuse flags in FFTKernelGenKeyParams
+ // and to avoid making a new one
+
+ ARG_CHECK(this->plan->inStride.size() == this->plan->outStride.size());
+
+ if (CLFFT_INPLACE == this->signature.fft_placeness)
+ {
+ // If this is an in-place transform the
+ // input and output layout, dimensions and strides
+ // *MUST* be the same.
+ //
+ ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
+
+ for (size_t u = this->plan->inStride.size(); u-- > 0; )
+ {
+ ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
+ }
+ }
+
+ this->signature.fft_DataDim = this->plan->length.size() + 1;
+ int i = 0;
+ for (i = 0; i < (this->signature.fft_DataDim - 1); i++)
+ {
+ this->signature.fft_N[i] = this->plan->length[i];
+ this->signature.fft_inStride[i] = this->plan->inStride[i];
+ this->signature.fft_outStride[i] = this->plan->outStride[i];
+
+ }
+ this->signature.fft_inStride[i] = this->plan->iDist;
+ this->signature.fft_outStride[i] = this->plan->oDist;
+
+ if (this->plan->large1D != 0) {
+ ARG_CHECK(this->signature.fft_N[0] != 0)
+ ARG_CHECK((this->plan->large1D % this->signature.fft_N[0]) == 0)
+ this->signature.fft_3StepTwiddle = true;
+ ARG_CHECK(this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]));
+ }
+
+ // Query the devices in this context for their local memory sizes
+ // How we generate a kernel depends on the *minimum* LDS size for all devices.
+ //
+ const FFTEnvelope * pEnvelope = NULL;
+ OPENCL_V(this->plan->GetEnvelope(&pEnvelope), _T("GetEnvelope failed"));
+ BUG_CHECK(NULL != pEnvelope);
+
+ // TODO: Since I am going with a 2D workgroup size now, I need a better check than this 1D use
+ // Check: CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
+ // CL_DEVICE_MAX_WORK_ITEM_SIZES
+ this->signature.fft_R = 1; // Dont think i'll use
+ this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+
+ //Set callback if specified
+ if (this->plan->hasPreCallback)
+ {
+ this->signature.fft_hasPreCallback = true;
+ this->signature.fft_preCallback = this->plan->preCallback;
+ }
+ if (this->plan->hasPostCallback)
+ {
+ this->signature.fft_hasPostCallback = true;
+ this->signature.fft_postCallback = this->plan->postCallbackParam;
+ }
+ this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
+
+ return CLFFT_SUCCESS;
+}
+
+
+// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
+// Feed this generator the FFTPlan, and it returns the generated program as a string
+clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT)
+{
+ //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+ if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) ||
+ (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
+ {
+ assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
+
+ bool validLDSSize = false;
+ size_t requestedCallbackLDS = 0;
+
+ if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+ requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+ else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
+ requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
+
+ validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
+
+ if (!validLDSSize)
+ {
+ fprintf(stderr, "Requested local memory size not available\n");
+ return CLFFT_INVALID_ARG_VALUE;
+ }
+ }
+
+ std::string programCode;
+ OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
+
+ cl_int status = CL_SUCCESS;
+ cl_device_id Device = NULL;
+ status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+ OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
+
+ cl_context QueueContext = NULL;
+ status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+ OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
+
+
+ OPENCL_V(fftRepo.setProgramCode(Transpose_SQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
+
+ // Note: See genFunctionPrototype( )
+ if (this->signature.fft_3StepTwiddle)
+ {
+ OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
+ }
+ else
+ {
+ OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
+ }
+
+ return CLFFT_SUCCESS;
+}
+
+
+clfftStatus FFTGeneratedTransposeSquareAction::getWorkSizes(std::vector< size_t >& globalWS, std::vector< size_t >& localWS)
+{
+
+ size_t wg_slice;
+ if (this->signature.fft_N[0] % (16 * reShapeFactor) == 0)
+ wg_slice = this->signature.fft_N[0] / 16 / reShapeFactor;
+ else
+ wg_slice = (this->signature.fft_N[0] / (16 * reShapeFactor)) + 1;
+
+ size_t global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
+
+ for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
+ {
+ global_item_size *= this->signature.fft_N[i];
+ }
+
+ globalWS.clear();
+ globalWS.push_back(global_item_size);
+
+ localWS.clear();
+ localWS.push_back(lwSize);
+
+ return CLFFT_SUCCESS;
+}
diff --git a/src/library/action.transpose.square.h b/src/library/action.transpose.h
similarity index 95%
rename from src/library/action.transpose.square.h
rename to src/library/action.transpose.h
index 12ad701..a0a44f0 100644
--- a/src/library/action.transpose.square.h
+++ b/src/library/action.transpose.h
@@ -16,7 +16,7 @@
#pragma once
#if !defined( AMD_CLFFT_generator_transpose_H )
-#define AMD_CLFFT_generator_transpose_H
+#define AMD_CLFFT_action_transpose_H
#include "private.h"
#include "repo.h"
#include "plan.h"
diff --git a/src/library/action.transpose.nonsquare.h b/src/library/action.transpose.nonsquare.h
deleted file mode 100644
index 559ee90..0000000
--- a/src/library/action.transpose.nonsquare.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* ************************************************************************
-* Copyright 2013 Advanced Micro Devices, Inc.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-* ************************************************************************/
-
-#pragma once
-#if !defined( AMD_CLFFT_generator_transpose_H )
-#define AMD_CLFFT_generator_transpose_H
-#include "private.h"
-#include "repo.h"
-#include "plan.h"
-
-#endif
-
-#pragma once
diff --git a/src/library/action.transpose.square.cpp b/src/library/action.transpose.square.cpp
deleted file mode 100644
index a21d773..0000000
--- a/src/library/action.transpose.square.cpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
- // action.transpose.square.cpp provides the entry points of "baking"
- // square transpose kernels called in plan.cpp.
- // the actual kernel string generation is provided by generator.transpose.cpp
-
-#include "stdafx.h"
-#include <math.h>
-#include "generator.transpose.h"
-#include "action.transpose.square.h"
-#include "generator.stockham.h"
-#include "action.h"
-
-FFTGeneratedTransposeSquareAction::FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
- : FFTTransposeSquareAction(plHandle, plan, queue, err)
-{
- if (err != CLFFT_SUCCESS)
- {
- // FFTTransposeSquareAction() failed, exit
- fprintf(stderr, "FFTTransposeSquareAction() failed!\n");
- return;
- }
-
- // Initialize the FFTAction::FFTKernelGenKeyParams member
- err = this->initParams();
-
- if (err != CLFFT_SUCCESS)
- {
- fprintf(stderr, "FFTGeneratedTransposeSquareAction::initParams() failed!\n");
- return;
- }
-
- FFTRepo &fftRepo = FFTRepo::getInstance();
-
- err = this->generateKernel(fftRepo, queue);
-
- if (err != CLFFT_SUCCESS)
- {
- fprintf(stderr, "FFTGeneratedTransposeSquareAction::generateKernel failed\n");
- return;
- }
-
- err = compileKernels( queue, plHandle, plan);
-
- if (err != CLFFT_SUCCESS)
- {
- fprintf(stderr, "FFTGeneratedTransposeSquareAction::compileKernels failed\n");
- return;
- }
-
- err = CLFFT_SUCCESS;
-}
-
-
-bool FFTGeneratedTransposeSquareAction::buildForwardKernel()
-{
- clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
- clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
- bool r2c_transform = (inputLayout == CLFFT_REAL);
- bool c2r_transform = (outputLayout == CLFFT_REAL);
- bool real_transform = (r2c_transform || c2r_transform);
-
- return (!real_transform) || r2c_transform;
-}
-
-bool FFTGeneratedTransposeSquareAction::buildBackwardKernel()
-{
- clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
- clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
- bool r2c_transform = (inputLayout == CLFFT_REAL);
- bool c2r_transform = (outputLayout == CLFFT_REAL);
- bool real_transform = (r2c_transform || c2r_transform);
-
- return (!real_transform) || c2r_transform;
-}
-
-
-// These strings represent the names that are used as strKernel parameters
-const std::string pmRealIn( "pmRealIn" );
-const std::string pmImagIn( "pmImagIn" );
-const std::string pmRealOut( "pmRealOut" );
-const std::string pmImagOut( "pmImagOut" );
-const std::string pmComplexIn( "pmComplexIn" );
-const std::string pmComplexOut( "pmComplexOut" );
-
-
-
-
-
-clfftStatus FFTGeneratedTransposeSquareAction::initParams ()
-{
-
- this->signature.fft_precision = this->plan->precision;
- this->signature.fft_placeness = this->plan->placeness;
- this->signature.fft_inputLayout = this->plan->inputLayout;
- this->signature.fft_outputLayout = this->plan->outputLayout;
- this->signature.fft_3StepTwiddle = false;
-
- this->signature.fft_realSpecial = this->plan->realSpecial;
-
- this->signature.transOutHorizontal = this->plan->transOutHorizontal; // using the twiddle front flag to specify horizontal write
- // we do this so as to reuse flags in FFTKernelGenKeyParams
- // and to avoid making a new one
-
- ARG_CHECK( this->plan->inStride.size( ) == this->plan->outStride.size( ) );
-
- if( CLFFT_INPLACE == this->signature.fft_placeness )
- {
- // If this is an in-place transform the
- // input and output layout, dimensions and strides
- // *MUST* be the same.
- //
- ARG_CHECK( this->signature.fft_inputLayout == this->signature.fft_outputLayout )
-
- for( size_t u = this->plan->inStride.size(); u-- > 0; )
- {
- ARG_CHECK( this->plan->inStride[u] == this->plan->outStride[u] );
- }
- }
-
- this->signature.fft_DataDim = this->plan->length.size() + 1;
- int i = 0;
- for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
- {
- this->signature.fft_N[i] = this->plan->length[i];
- this->signature.fft_inStride[i] = this->plan->inStride[i];
- this->signature.fft_outStride[i] = this->plan->outStride[i];
-
- }
- this->signature.fft_inStride[i] = this->plan->iDist;
- this->signature.fft_outStride[i] = this->plan->oDist;
-
- if (this->plan->large1D != 0) {
- ARG_CHECK (this->signature.fft_N[0] != 0)
- ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
- this->signature.fft_3StepTwiddle = true;
- ARG_CHECK ( this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
- }
-
- // Query the devices in this context for their local memory sizes
- // How we generate a kernel depends on the *minimum* LDS size for all devices.
- //
- const FFTEnvelope * pEnvelope = NULL;
- OPENCL_V( this->plan->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
- BUG_CHECK( NULL != pEnvelope );
-
- // TODO: Since I am going with a 2D workgroup size now, I need a better check than this 1D use
- // Check: CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
- // CL_DEVICE_MAX_WORK_ITEM_SIZES
- this->signature.fft_R = 1; // Dont think i'll use
- this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
-
- //Set callback if specified
- if (this->plan->hasPreCallback)
- {
- this->signature.fft_hasPreCallback = true;
- this->signature.fft_preCallback = this->plan->preCallback;
- }
- if (this->plan->hasPostCallback)
- {
- this->signature.fft_hasPostCallback = true;
- this->signature.fft_postCallback = this->plan->postCallbackParam;
- }
- this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
-
- return CLFFT_SUCCESS;
-}
-
-
-static const size_t lwSize = 256;
-static const size_t reShapeFactor = 2;
-
-
-// OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
-// Feed this generator the FFTPlan, and it returns the generated program as a string
-clfftStatus FFTGeneratedTransposeSquareAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
-{
- //Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
- if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) ||
- (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
- {
- assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
-
- bool validLDSSize = false;
- size_t requestedCallbackLDS = 0;
-
- if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
- requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
- else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
- requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
-
- validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
-
- if(!validLDSSize)
- {
- fprintf(stderr, "Requested local memory size not available\n");
- return CLFFT_INVALID_ARG_VALUE;
- }
- }
-
- std::string programCode;
- OPENCL_V( clfft_transpose_generator::genTransposeKernelBatched( this->signature, programCode, lwSize, reShapeFactor ), _T( "GenerateTransposeKernel() failed!" ) );
-
- cl_int status = CL_SUCCESS;
- cl_device_id Device = NULL;
- status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
- cl_context QueueContext = NULL;
- status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
- OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
-
- OPENCL_V( fftRepo.setProgramCode( Transpose_SQUARE, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-
- // Note: See genFunctionPrototype( )
- if( this->signature.fft_3StepTwiddle )
- {
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_SQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
- }
- else
- {
- OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
- }
-
- return CLFFT_SUCCESS;
-}
-
-
-clfftStatus FFTGeneratedTransposeSquareAction::getWorkSizes( std::vector< size_t >& globalWS, std::vector< size_t >& localWS )
-{
-
- size_t wg_slice;
- if (this->signature.fft_N[0] % (16 * reShapeFactor) == 0)
- wg_slice = this->signature.fft_N[0] / 16 / reShapeFactor;
- else
- wg_slice = (this->signature.fft_N[0] / (16*reShapeFactor)) + 1;
-
- size_t global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
-
- for(int i = 2; i < this->signature.fft_DataDim - 1; i++)
- {
- global_item_size *= this->signature.fft_N[i];
- }
-
- globalWS.clear( );
- globalWS.push_back(global_item_size);
-
- localWS.clear( );
- localWS.push_back( lwSize );
-
- return CLFFT_SUCCESS;
-}
diff --git a/src/library/action.cpp b/src/library/enqueue.cpp
similarity index 100%
rename from src/library/action.cpp
rename to src/library/enqueue.cpp
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index e33a52a..7a5953e 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1955,7 +1955,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
clLengths[0] = fftPlan->length[0];
clLengths[1] = fftPlan->length[1];
- //Transpose stage 1
+ //Transpose stage 1 first do batched sqaure transpose along leading dim
OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
@@ -1999,9 +1999,9 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
_T("BakePlan transpose_nsq_stage1 plan failed"));
- //Transpose stage 2
+ //Transpose stage 2 then do swapping lines
OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths),
- _T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
+ _T("CreateDefaultPlan transpose_nsq_stage2 plan failed"));
FFTPlan* trans2Plan = NULL;
lockRAII* trans2Lock = NULL;
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 616472c..a066bbd 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -637,7 +637,7 @@ clfftStatus clfftEnqueueTransform(
OPENCL_V(clfftEnqueueTransform(fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
&stage1OutEvents, outEvents, clInputBuffers, NULL, NULL),
- _T("clfftEnqueueTransform stage1 failed"));
+ _T("clfftEnqueueTransform stage2 failed"));
clReleaseEvent(stage1OutEvents);
if (fftRepo.pStatTimer)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits
mailing list