[clfft] 05/21: code reorganisation. renamed action.cpp to enqueue.cpp. consolidate transpose related actions to action.transpose.cpp

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Wed Mar 16 13:14:03 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit f8870fde56ce2fb74134d921eea7d73d9cee0467
Author: Timmy <timmy.liu at amd.com>
Date:   Tue Feb 23 10:13:22 2016 -0600

    code reorganisation. renamed action.cpp to enqueue.cpp. consolidate transpose related actions to action.transpose.cpp
---
 src/library/CMakeLists.txt                         |   8 +-
 ...ranspose.nonsquare.cpp => action.transpose.cpp} | 229 ++++++++++++++++-
 ...ction.transpose.square.h => action.transpose.h} |   2 +-
 src/library/action.transpose.nonsquare.h           |  26 --
 src/library/action.transpose.square.cpp            | 270 ---------------------
 src/library/{action.cpp => enqueue.cpp}            |   0
 src/library/plan.cpp                               |   6 +-
 src/library/transform.cpp                          |   2 +-
 8 files changed, 236 insertions(+), 307 deletions(-)

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 436fdf9..132ef86 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -22,13 +22,12 @@ set( clFFT.Source	transform.cpp
 								generator.stockham.cpp
 								generator.transpose.gcn.cpp
 								generator.transpose.cpp
-								action.transpose.nonsquare.cpp
-								action.transpose.square.cpp
+								action.transpose.cpp
 								generator.copy.cpp
 								lifetime.cpp
 								fft_binary_lookup.cpp
 								md5sum.c
-								action.cpp
+								enqueue.cpp
 								stdafx.cpp )
 
 # Windows only uses dllmain
@@ -46,8 +45,7 @@ set( clFFT.Headers	private.h
 					generator.stockham.h
 					generator.transpose.gcn.h
 					generator.transpose.h
-					action.transpose.nonsquare.h
-					action.transpose.square.h
+					action.transpose.h
 					fft_binary_lookup.h
 					md5sum.h
 					../include/stdafx.h
diff --git a/src/library/action.transpose.nonsquare.cpp b/src/library/action.transpose.cpp
similarity index 62%
rename from src/library/action.transpose.nonsquare.cpp
rename to src/library/action.transpose.cpp
index 8787cc2..db35ce5 100644
--- a/src/library/action.transpose.nonsquare.cpp
+++ b/src/library/action.transpose.cpp
@@ -24,7 +24,7 @@
 #include <math.h>
 #include <iomanip>
 #include "generator.transpose.h"
-#include "action.transpose.nonsquare.h"
+#include "action.transpose.h"
 #include "generator.stockham.h"
 
 #include "action.h"
@@ -362,3 +362,230 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::getWorkSizes(std::vector< size
     }
     return CLFFT_SUCCESS;
 }
+
+FFTGeneratedTransposeSquareAction::FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
+	: FFTTransposeSquareAction(plHandle, plan, queue, err)
+{
+	if (err != CLFFT_SUCCESS)
+	{
+		// FFTTransposeSquareAction() failed, exit
+		fprintf(stderr, "FFTTransposeSquareAction() failed!\n");
+		return;
+	}
+
+	// Initialize the FFTAction::FFTKernelGenKeyParams member
+	err = this->initParams();
+
+	if (err != CLFFT_SUCCESS)
+	{
+		fprintf(stderr, "FFTGeneratedTransposeSquareAction::initParams() failed!\n");
+		return;
+	}
+
+	FFTRepo &fftRepo = FFTRepo::getInstance();
+
+	err = this->generateKernel(fftRepo, queue);
+
+	if (err != CLFFT_SUCCESS)
+	{
+		fprintf(stderr, "FFTGeneratedTransposeSquareAction::generateKernel failed\n");
+		return;
+	}
+
+	err = compileKernels(queue, plHandle, plan);
+
+	if (err != CLFFT_SUCCESS)
+	{
+		fprintf(stderr, "FFTGeneratedTransposeSquareAction::compileKernels failed\n");
+		return;
+	}
+
+	err = CLFFT_SUCCESS;
+}
+
+
+bool FFTGeneratedTransposeSquareAction::buildForwardKernel()
+{
+	clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+	clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+	bool r2c_transform = (inputLayout == CLFFT_REAL);
+	bool c2r_transform = (outputLayout == CLFFT_REAL);
+	bool real_transform = (r2c_transform || c2r_transform);
+
+	return (!real_transform) || r2c_transform;
+}
+
+bool FFTGeneratedTransposeSquareAction::buildBackwardKernel()
+{
+	clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
+	clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
+
+	bool r2c_transform = (inputLayout == CLFFT_REAL);
+	bool c2r_transform = (outputLayout == CLFFT_REAL);
+	bool real_transform = (r2c_transform || c2r_transform);
+
+	return (!real_transform) || c2r_transform;
+}
+
+/*square action*/
+clfftStatus FFTGeneratedTransposeSquareAction::initParams()
+{
+
+	this->signature.fft_precision = this->plan->precision;
+	this->signature.fft_placeness = this->plan->placeness;
+	this->signature.fft_inputLayout = this->plan->inputLayout;
+	this->signature.fft_outputLayout = this->plan->outputLayout;
+	this->signature.fft_3StepTwiddle = false;
+
+	this->signature.fft_realSpecial = this->plan->realSpecial;
+
+	this->signature.transOutHorizontal = this->plan->transOutHorizontal;	// using the twiddle front flag to specify horizontal write
+																			// we do this so as to reuse flags in FFTKernelGenKeyParams
+																			// and to avoid making a new one 
+
+	ARG_CHECK(this->plan->inStride.size() == this->plan->outStride.size());
+
+	if (CLFFT_INPLACE == this->signature.fft_placeness)
+	{
+		//	If this is an in-place transform the
+		//	input and output layout, dimensions and strides
+		//	*MUST* be the same.
+		//
+		ARG_CHECK(this->signature.fft_inputLayout == this->signature.fft_outputLayout)
+
+			for (size_t u = this->plan->inStride.size(); u-- > 0; )
+			{
+				ARG_CHECK(this->plan->inStride[u] == this->plan->outStride[u]);
+			}
+	}
+
+	this->signature.fft_DataDim = this->plan->length.size() + 1;
+	int i = 0;
+	for (i = 0; i < (this->signature.fft_DataDim - 1); i++)
+	{
+		this->signature.fft_N[i] = this->plan->length[i];
+		this->signature.fft_inStride[i] = this->plan->inStride[i];
+		this->signature.fft_outStride[i] = this->plan->outStride[i];
+
+	}
+	this->signature.fft_inStride[i] = this->plan->iDist;
+	this->signature.fft_outStride[i] = this->plan->oDist;
+
+	if (this->plan->large1D != 0) {
+		ARG_CHECK(this->signature.fft_N[0] != 0)
+			ARG_CHECK((this->plan->large1D % this->signature.fft_N[0]) == 0)
+			this->signature.fft_3StepTwiddle = true;
+		ARG_CHECK(this->plan->large1D == (this->signature.fft_N[1] * this->signature.fft_N[0]));
+	}
+
+	//	Query the devices in this context for their local memory sizes
+	//	How we generate a kernel depends on the *minimum* LDS size for all devices.
+	//
+	const FFTEnvelope * pEnvelope = NULL;
+	OPENCL_V(this->plan->GetEnvelope(&pEnvelope), _T("GetEnvelope failed"));
+	BUG_CHECK(NULL != pEnvelope);
+
+	// TODO:  Since I am going with a 2D workgroup size now, I need a better check than this 1D use
+	// Check:  CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
+	// CL_DEVICE_MAX_WORK_ITEM_SIZES
+	this->signature.fft_R = 1; // Dont think i'll use
+	this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
+
+															   //Set callback if specified
+	if (this->plan->hasPreCallback)
+	{
+		this->signature.fft_hasPreCallback = true;
+		this->signature.fft_preCallback = this->plan->preCallback;
+	}
+	if (this->plan->hasPostCallback)
+	{
+		this->signature.fft_hasPostCallback = true;
+		this->signature.fft_postCallback = this->plan->postCallbackParam;
+	}
+	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
+
+	return CLFFT_SUCCESS;
+}
+
+
+//	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
+//	Feed this generator the FFTPlan, and it returns the generated program as a string
+clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo, const cl_command_queue commQueueFFT)
+{
+	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
+	if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) ||
+		(this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
+	{
+		assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
+
+		bool validLDSSize = false;
+		size_t requestedCallbackLDS = 0;
+
+		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
+		else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
+			requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
+
+		validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
+
+		if (!validLDSSize)
+		{
+			fprintf(stderr, "Requested local memory size not available\n");
+			return CLFFT_INVALID_ARG_VALUE;
+		}
+	}
+
+	std::string programCode;
+	OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("GenerateTransposeKernel() failed!"));
+
+	cl_int status = CL_SUCCESS;
+	cl_device_id Device = NULL;
+	status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
+	OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
+
+	cl_context QueueContext = NULL;
+	status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
+	OPENCL_V(status, _T("clGetCommandQueueInfo failed"));
+
+
+	OPENCL_V(fftRepo.setProgramCode(Transpose_SQUARE, this->getSignatureData(), programCode, Device, QueueContext), _T("fftRepo.setclString() failed!"));
+
+	// Note:  See genFunctionPrototype( )
+	if (this->signature.fft_3StepTwiddle)
+	{
+		OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
+	}
+	else
+	{
+		OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
+	}
+
+	return CLFFT_SUCCESS;
+}
+
+
+clfftStatus FFTGeneratedTransposeSquareAction::getWorkSizes(std::vector< size_t >& globalWS, std::vector< size_t >& localWS)
+{
+
+	size_t wg_slice;
+	if (this->signature.fft_N[0] % (16 * reShapeFactor) == 0)
+		wg_slice = this->signature.fft_N[0] / 16 / reShapeFactor;
+	else
+		wg_slice = (this->signature.fft_N[0] / (16 * reShapeFactor)) + 1;
+
+	size_t global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
+
+	for (int i = 2; i < this->signature.fft_DataDim - 1; i++)
+	{
+		global_item_size *= this->signature.fft_N[i];
+	}
+
+	globalWS.clear();
+	globalWS.push_back(global_item_size);
+
+	localWS.clear();
+	localWS.push_back(lwSize);
+
+	return CLFFT_SUCCESS;
+}
diff --git a/src/library/action.transpose.square.h b/src/library/action.transpose.h
similarity index 95%
rename from src/library/action.transpose.square.h
rename to src/library/action.transpose.h
index 12ad701..a0a44f0 100644
--- a/src/library/action.transpose.square.h
+++ b/src/library/action.transpose.h
@@ -16,7 +16,7 @@
 
 #pragma once
 #if !defined( AMD_CLFFT_generator_transpose_H )
-#define AMD_CLFFT_generator_transpose_H
+#define AMD_CLFFT_action_transpose_H
 #include "private.h"
 #include "repo.h"
 #include "plan.h"
diff --git a/src/library/action.transpose.nonsquare.h b/src/library/action.transpose.nonsquare.h
deleted file mode 100644
index 559ee90..0000000
--- a/src/library/action.transpose.nonsquare.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* ************************************************************************
-* Copyright 2013 Advanced Micro Devices, Inc.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-* ************************************************************************/
-
-#pragma once
-#if !defined( AMD_CLFFT_generator_transpose_H )
-#define AMD_CLFFT_generator_transpose_H
-#include "private.h"
-#include "repo.h"
-#include "plan.h"
-
-#endif
-
-#pragma once
diff --git a/src/library/action.transpose.square.cpp b/src/library/action.transpose.square.cpp
deleted file mode 100644
index a21d773..0000000
--- a/src/library/action.transpose.square.cpp
+++ /dev/null
@@ -1,270 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
- // action.transpose.square.cpp provides the entry points of "baking"
- // square transpose kernels called in plan.cpp.
- // the actual kernel string generation is provided by generator.transpose.cpp
-
-#include "stdafx.h"
-#include <math.h>
-#include "generator.transpose.h"
-#include "action.transpose.square.h"
-#include "generator.stockham.h"
-#include "action.h"
-
-FFTGeneratedTransposeSquareAction::FFTGeneratedTransposeSquareAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
-    : FFTTransposeSquareAction(plHandle, plan, queue, err)
-{
-    if (err != CLFFT_SUCCESS)
-    {
-        // FFTTransposeSquareAction() failed, exit
-        fprintf(stderr, "FFTTransposeSquareAction() failed!\n");
-        return;
-    }
-
-    // Initialize the FFTAction::FFTKernelGenKeyParams member
-    err = this->initParams();
-
-    if (err != CLFFT_SUCCESS)
-    {
-        fprintf(stderr, "FFTGeneratedTransposeSquareAction::initParams() failed!\n");
-        return;
-    }
-
-    FFTRepo &fftRepo = FFTRepo::getInstance();
-
-    err = this->generateKernel(fftRepo, queue);
-
-    if (err != CLFFT_SUCCESS)
-    {
-        fprintf(stderr, "FFTGeneratedTransposeSquareAction::generateKernel failed\n");
-        return;
-    }
-
-    err = compileKernels( queue, plHandle, plan);
-
-    if (err != CLFFT_SUCCESS)
-    {
-        fprintf(stderr, "FFTGeneratedTransposeSquareAction::compileKernels failed\n");
-        return;
-    }
-
-    err = CLFFT_SUCCESS;
-}
-
-
-bool FFTGeneratedTransposeSquareAction::buildForwardKernel()
-{
-    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
-    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
-    bool r2c_transform = (inputLayout == CLFFT_REAL);
-    bool c2r_transform = (outputLayout == CLFFT_REAL);
-    bool real_transform = (r2c_transform || c2r_transform);
-
-    return (!real_transform) || r2c_transform;
-}
-
-bool FFTGeneratedTransposeSquareAction::buildBackwardKernel()
-{
-    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
-    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
-    bool r2c_transform = (inputLayout == CLFFT_REAL);
-    bool c2r_transform = (outputLayout == CLFFT_REAL);
-    bool real_transform = (r2c_transform || c2r_transform);
-
-    return (!real_transform) || c2r_transform;
-}
-
-
-// These strings represent the names that are used as strKernel parameters
-const std::string pmRealIn( "pmRealIn" );
-const std::string pmImagIn( "pmImagIn" );
-const std::string pmRealOut( "pmRealOut" );
-const std::string pmImagOut( "pmImagOut" );
-const std::string pmComplexIn( "pmComplexIn" );
-const std::string pmComplexOut( "pmComplexOut" );
-
-
-
-
-
-clfftStatus FFTGeneratedTransposeSquareAction::initParams ()
-{
-
-    this->signature.fft_precision    = this->plan->precision;
-    this->signature.fft_placeness    = this->plan->placeness;
-    this->signature.fft_inputLayout  = this->plan->inputLayout;
-    this->signature.fft_outputLayout = this->plan->outputLayout;
-    this->signature.fft_3StepTwiddle = false;
-
-	this->signature.fft_realSpecial  = this->plan->realSpecial;
-
-	this->signature.transOutHorizontal = this->plan->transOutHorizontal;	// using the twiddle front flag to specify horizontal write
-														// we do this so as to reuse flags in FFTKernelGenKeyParams
-														// and to avoid making a new one 
-
-    ARG_CHECK( this->plan->inStride.size( ) == this->plan->outStride.size( ) );
-
-    if( CLFFT_INPLACE == this->signature.fft_placeness )
-    {
-        //	If this is an in-place transform the
-        //	input and output layout, dimensions and strides
-        //	*MUST* be the same.
-        //
-        ARG_CHECK( this->signature.fft_inputLayout == this->signature.fft_outputLayout )
-
-        for( size_t u = this->plan->inStride.size(); u-- > 0; )
-        {
-            ARG_CHECK( this->plan->inStride[u] == this->plan->outStride[u] );
-        }
-    }
-
-	this->signature.fft_DataDim = this->plan->length.size() + 1;
-	int i = 0;
-	for(i = 0; i < (this->signature.fft_DataDim - 1); i++)
-	{
-        this->signature.fft_N[i]         = this->plan->length[i];
-        this->signature.fft_inStride[i]  = this->plan->inStride[i];
-        this->signature.fft_outStride[i] = this->plan->outStride[i];
-
-	}
-    this->signature.fft_inStride[i]  = this->plan->iDist;
-    this->signature.fft_outStride[i] = this->plan->oDist;
-
-    if (this->plan->large1D != 0) {
-        ARG_CHECK (this->signature.fft_N[0] != 0)
-        ARG_CHECK ((this->plan->large1D % this->signature.fft_N[0]) == 0)
-        this->signature.fft_3StepTwiddle = true;
-		ARG_CHECK ( this->plan->large1D  == (this->signature.fft_N[1] * this->signature.fft_N[0]) );
-    }
-
-    //	Query the devices in this context for their local memory sizes
-    //	How we generate a kernel depends on the *minimum* LDS size for all devices.
-    //
-    const FFTEnvelope * pEnvelope = NULL;
-    OPENCL_V( this->plan->GetEnvelope( &pEnvelope ), _T( "GetEnvelope failed" ) );
-    BUG_CHECK( NULL != pEnvelope );
-
-    // TODO:  Since I am going with a 2D workgroup size now, I need a better check than this 1D use
-    // Check:  CL_DEVICE_MAX_WORK_GROUP_SIZE/CL_KERNEL_WORK_GROUP_SIZE
-    // CL_DEVICE_MAX_WORK_ITEM_SIZES
-    this->signature.fft_R = 1; // Dont think i'll use
-    this->signature.fft_SIMD = pEnvelope->limit_WorkGroupSize; // Use devices maximum workgroup size
-
-	//Set callback if specified
-	if (this->plan->hasPreCallback)
-	{
-		this->signature.fft_hasPreCallback = true;
-		this->signature.fft_preCallback = this->plan->preCallback;
-	}
-	if (this->plan->hasPostCallback)
-	{
-		this->signature.fft_hasPostCallback = true;
-		this->signature.fft_postCallback = this->plan->postCallbackParam;
-	}
-	this->signature.limit_LocalMemSize = this->plan->envelope.limit_LocalMemSize;
-
-    return CLFFT_SUCCESS;
-}
-
-
-static const size_t lwSize = 256;
-static const size_t reShapeFactor = 2;  
-
-
-//	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
-//	Feed this generator the FFTPlan, and it returns the generated program as a string
-clfftStatus FFTGeneratedTransposeSquareAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
-{
-	//Requested local memory size by callback must not exceed the device LDS limits after factoring the LDS size required by main FFT kernel
-	if ((this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0) || 
-		(this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0))
-	{
-		assert(!(this->signature.fft_hasPreCallback && this->signature.fft_hasPostCallback));
-
-		bool validLDSSize = false;
-		size_t requestedCallbackLDS = 0;
-
-		if (this->signature.fft_hasPreCallback && this->signature.fft_preCallback.localMemSize > 0)
-			requestedCallbackLDS = this->signature.fft_preCallback.localMemSize;
-		else if (this->signature.fft_hasPostCallback && this->signature.fft_postCallback.localMemSize > 0)
-			requestedCallbackLDS = this->signature.fft_postCallback.localMemSize;
-		
-		validLDSSize = ((2 * this->plan->ElementSize() * 16 * reShapeFactor * 16 * reShapeFactor) + requestedCallbackLDS) < this->plan->envelope.limit_LocalMemSize;
-	
-		if(!validLDSSize)
-		{
-			fprintf(stderr, "Requested local memory size not available\n");
-			return CLFFT_INVALID_ARG_VALUE;
-		}
-	}
-
-    std::string programCode;
-    OPENCL_V( clfft_transpose_generator::genTransposeKernelBatched( this->signature, programCode, lwSize, reShapeFactor ), _T( "GenerateTransposeKernel() failed!" ) );
-
-    cl_int status = CL_SUCCESS;
-    cl_device_id Device = NULL;
-    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
-    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
-    cl_context QueueContext = NULL;
-    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
-    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
-
-    OPENCL_V( fftRepo.setProgramCode( Transpose_SQUARE, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-
-    // Note:  See genFunctionPrototype( )
-    if( this->signature.fft_3StepTwiddle )
-    {
-        OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_SQUARE, this->getSignatureData(), "transpose_square_tw_fwd", "transpose_square_tw_back", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
-    }
-    else
-    {
-        OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
-    }
-
-    return CLFFT_SUCCESS;
-}
-
-
-clfftStatus FFTGeneratedTransposeSquareAction::getWorkSizes( std::vector< size_t >& globalWS, std::vector< size_t >& localWS )
-{
-
-	size_t wg_slice;
-	if (this->signature.fft_N[0] % (16 * reShapeFactor) == 0)
-		wg_slice = this->signature.fft_N[0] / 16 / reShapeFactor;
-	else
-		wg_slice = (this->signature.fft_N[0] / (16*reShapeFactor)) + 1;
-
-	size_t global_item_size = wg_slice*(wg_slice + 1) / 2 * 16 * 16 * this->plan->batchsize;
-
-	for(int i = 2; i < this->signature.fft_DataDim - 1; i++)
-	{
-		global_item_size *= this->signature.fft_N[i];
-	}
-
-    globalWS.clear( );
-	globalWS.push_back(global_item_size);
-
-    localWS.clear( );
-    localWS.push_back( lwSize );
-
-    return CLFFT_SUCCESS;
-}
diff --git a/src/library/action.cpp b/src/library/enqueue.cpp
similarity index 100%
rename from src/library/action.cpp
rename to src/library/enqueue.cpp
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index e33a52a..7a5953e 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -1955,7 +1955,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 						clLengths[0] = fftPlan->length[0];
 						clLengths[1] = fftPlan->length[1];
 
-						//Transpose stage 1
+						//Transpose stage 1 first do batched square transpose along leading dim 
 						OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTX, fftPlan->context, CLFFT_2D, clLengths),
 							_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
 
@@ -1999,9 +1999,9 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 							_T("BakePlan transpose_nsq_stage1 plan failed"));
 
 
-						//Transpose stage 2
+						//Transpose stage 2 then do swapping lines
 						OPENCL_V(clfftCreateDefaultPlanInternal(&fftPlan->planTY, fftPlan->context, CLFFT_2D, clLengths),
-							_T("CreateDefaultPlan transpose_nsq_stage1 plan failed"));
+							_T("CreateDefaultPlan transpose_nsq_stage2 plan failed"));
 
 						FFTPlan* trans2Plan = NULL;
 						lockRAII* trans2Lock = NULL;
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index 616472c..a066bbd 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -637,7 +637,7 @@ clfftStatus clfftEnqueueTransform(
 
 				OPENCL_V(clfftEnqueueTransform(fftPlan->planTY, dir, numQueuesAndEvents, commQueues, 1,
 					&stage1OutEvents, outEvents, clInputBuffers, NULL, NULL),
-					_T("clfftEnqueueTransform stage1 failed"));
+					_T("clfftEnqueueTransform stage2 failed"));
 				clReleaseEvent(stage1OutEvents);
 
 				if (fftRepo.pStatTimer)

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list