[clfft] 24/74: removing VLIW transpose code

Thu Jan 14 19:52:14 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch debian/sid
in repository clfft.

commit f069b93c9c16e8d2528af3627d8cffd4277a7d1d
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Tue Dec 1 18:59:07 2015 -0800

    removing VLIW transpose code
---
 src/library/CMakeLists.txt               |  55 +-
 src/library/action.cpp                   |  12 -
 src/library/action.h                     |  55 --
 src/library/generator.h                  |   1 -
 src/library/generator.transpose.vliw.cpp | 906 -------------------------------
 src/library/generator.transpose.vliw.h   |  25 -
 src/library/plan.cpp                     |  45 +-
 src/library/transform.cpp                |   3 +-
 8 files changed, 36 insertions(+), 1066 deletions(-)

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 3025d0a..c58de4e 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -1,12 +1,12 @@
 # ########################################################################
 # Copyright 2013 Advanced Micro Devices, Inc.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,16 +15,15 @@
 # ########################################################################
 
 # List the names of common files to compile across all platforms
-set( clFFT.Source	transform.cpp 
-								accessors.cpp 
-								plan.cpp 
-								repo.cpp 
-								generator.stockham.cpp 
-								generator.transpose.vliw.cpp 
+set( clFFT.Source	transform.cpp
+								accessors.cpp
+								plan.cpp
+								repo.cpp
+								generator.stockham.cpp
 								generator.transpose.gcn.cpp 
-								generator.transpose.square.cpp 
+								generator.transpose.square.cpp
 								generator.copy.cpp
-								lifetime.cpp 
+								lifetime.cpp
 								fft_binary_lookup.cpp
 								md5sum.c
 								action.cpp
@@ -35,23 +34,22 @@ if( MSVC )
 	set( clFFT.Source ${clFFT.Source} dllmain.cpp )
 endif( )
 
-set( clFFT.Headers	private.h 
+set( clFFT.Headers	private.h
 					action.h
-					repo.h 
-					plan.h 
-					lock.h 
-					mainpage.h  
-					generator.h 
-					generator.stockham.h 
-					generator.transpose.vliw.h 
-					generator.transpose.gcn.h 
-					generator.transpose.square.h 
+					repo.h
+					plan.h
+					lock.h
+					mainpage.h
+					generator.h
+					generator.stockham.h
+					generator.transpose.gcn.h
+					generator.transpose.square.h
 					fft_binary_lookup.h
 					md5sum.h
-					../include/stdafx.h 
-					../include/unicode.compatibility.h 
-					../include/targetver.h 
-					../include/clAmdFft.h 
+					../include/stdafx.h
+					../include/unicode.compatibility.h
+					../include/targetver.h
+					../include/clAmdFft.h
 					../include/clFFT.h )
 
 set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
@@ -59,7 +57,7 @@ set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
 # For a rainy day, add pre-compiled header support
 #if( MSVC )
 #	if (USE_MSVC_PCH)
-	
+
 #		set_source_files_properties(LungAnalysisPCH.cxx
 #			PROPERTIES
 #			COMPILE_FLAGS "/YcLungAnalysisPCH.h"
@@ -71,13 +69,13 @@ set( clFFT.Files ${clFFT.Source} ${clFFT.Headers} )
 #				COMPILE_FLAGS "/YuLungAnalysisPCH.h"
 #				)
 #		endforeach( src_file ${UPMC_LA_SRCS} )
-		
+
 #		list(APPEND UPMC_LA_SRCS LungAnalysisPCH.cxx)
 #		list(APPEND UPMC_LA_HDRS LungAnalysisPCH.h)
 
 #	endif(USE_MSVC_PCH)
 #endif (MSVC)
-  
+
 #	add_definitions( ${Boost_LIB_DIAGNOSTIC_DEFINITIONS} )
 add_definitions( "/DCLFFT_EXPORTS" )
 
@@ -106,4 +104,3 @@ install( TARGETS clFFT
         LIBRARY DESTINATION lib${SUFFIX_LIB}
         ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
         )
-        
diff --git a/src/library/action.cpp b/src/library/action.cpp
index 60508ff..7b2cfc9 100644
--- a/src/library/action.cpp
+++ b/src/library/action.cpp
@@ -41,18 +41,6 @@ FFTCopyAction::FFTCopyAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_comman
     err = CLFFT_SUCCESS;
 }
 
-FFTTransposeVLIWAction::FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
-    : FFTAction(plan, err)
-{
-    if (err != CLFFT_SUCCESS)
-    {
-        // FFTAction() failed, exit constructor
-        return;
-    }
-
-    err = CLFFT_SUCCESS;
-}
-
 FFTTransposeGCNAction::FFTTransposeGCNAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
     : FFTAction(plan, err)
 {
diff --git a/src/library/action.h b/src/library/action.h
index 9c14a74..1619594 100644
--- a/src/library/action.h
+++ b/src/library/action.h
@@ -54,20 +54,6 @@ public:
 };
 
 
-//
-// FFTTransposeVLIWAction
-//
-// Base class for every TransposeVLIW action for the FFT.
-// Currently do nothing special. The kernel generation and compilation occurs
-// by the subclass FFTGeneratedTransposeVLIWAction
-// 
-class FFTTransposeVLIWAction : public FFTAction
-{
-public:
-    FFTTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
-
-    clfftGenerators getGenerator() { return Transpose_VLIW; }
-};
 
 
 //
@@ -185,47 +171,6 @@ public:
     }
 };
 
-//
-// FFTGeneratedTransposeVLIWAction
-//
-// Implements a TransposeVLIW action for the FFT
-// Its signature is represented by FFTKernelGenKeyParams structure
-// 
-// This class implements:
-//  - the generation of the kernel string
-//  - the build of the kernel
-// 
-// The structure FFTKernelGenKeyParams is used to characterize and generate
-// the appropriate transpose kernel. That structure is used for the signature of
-// this action. It is common to Stockham, copy and transpose methods. For
-// convenience, this structure is used for every FFTGenerated*Action class,
-// but in practice the transpose action only use a few information of that
-// structure, so a proper structure should be used instead.
-//
-class FFTGeneratedTransposeVLIWAction : public FFTTransposeVLIWAction
-{
-public:
-    FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err);
-
-    typedef FFTKernelSignature<FFTKernelGenKeyParams, FFT_DEFAULT_TRANSPOSE_ACTION> Signature;
-
-private:
-    Signature signature;
-
-    clfftStatus generateKernel  (FFTRepo& fftRepo, const cl_command_queue commQueueFFT );
-    clfftStatus getWorkSizes    (std::vector<size_t> & globalws, std::vector<size_t> & localws);
-    clfftStatus initParams      ();
-
-    bool buildForwardKernel();
-    bool buildBackwardKernel();
-
-public:
-
-    virtual const Signature * getSignatureData()
-    {
-        return &this->signature;
-    }
-};
 
 
 
diff --git a/src/library/generator.h b/src/library/generator.h
index b27043a..b90090c 100644
--- a/src/library/generator.h
+++ b/src/library/generator.h
@@ -23,7 +23,6 @@
 enum clfftGenerators
 {
     Stockham, // Using the Stockham autosort frameworks
-    Transpose_VLIW,
     Transpose_GCN,
     Transpose_SQUARE,
     Copy,
diff --git a/src/library/generator.transpose.vliw.cpp b/src/library/generator.transpose.vliw.cpp
deleted file mode 100644
index b075314..0000000
--- a/src/library/generator.transpose.vliw.cpp
+++ /dev/null
@@ -1,906 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-
-// clfft.generator.Transpose.cpp : Dynamic run-time generator of openCL transpose kernels
-//
-
-// TODO: generalize the kernel to work with any size
-
-#include "stdafx.h"
-#include <math.h>
-#include "generator.transpose.vliw.h"
-#include "action.h"
-
-
-FFTGeneratedTransposeVLIWAction::FFTGeneratedTransposeVLIWAction(clfftPlanHandle plHandle, FFTPlan * plan, cl_command_queue queue, clfftStatus & err)
-    : FFTTransposeVLIWAction(plHandle, plan, queue, err)
-{
-    if (err != CLFFT_SUCCESS)
-    {
-        // FFTTransposeVLIWAction() failed, exit
-        fprintf(stderr, "FFTTransposeVLIWAction() failed!\n");
-        return;
-    }
-
-    // Initialize the FFTAction::FFTKernelGenKeyParams member
-    err = this->initParams();
-
-    if (err != CLFFT_SUCCESS)
-    {
-        fprintf(stderr, "FFTGeneratedTransposeVLIWAction::initParams() failed!\n");
-        return;
-    }
-
-    FFTRepo &fftRepo = FFTRepo::getInstance();
-
-    err = this->generateKernel(fftRepo, queue);
-
-    if (err != CLFFT_SUCCESS)
-    {
-        fprintf(stderr, "FFTGeneratedTransposeVLIWAction::generateKernel failed\n");
-        return;
-    }
-
-    err = compileKernels( queue, plHandle, plan);
-
-    if (err != CLFFT_SUCCESS)
-    {
-        fprintf(stderr, "FFTGeneratedTransposeVLIWAction::compileKernels failed\n");
-        return;
-    }
-
-    err = CLFFT_SUCCESS;
-}
-
-
-bool FFTGeneratedTransposeVLIWAction::buildForwardKernel()
-{
-    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
-    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
-    bool r2c_transform = (inputLayout == CLFFT_REAL);
-    bool c2r_transform = (outputLayout == CLFFT_REAL);
-    bool real_transform = (r2c_transform || c2r_transform);
-
-    return (!real_transform) || r2c_transform;
-}
-
-bool FFTGeneratedTransposeVLIWAction::buildBackwardKernel()
-{
-    clfftLayout inputLayout = this->getSignatureData()->fft_inputLayout;
-    clfftLayout outputLayout = this->getSignatureData()->fft_outputLayout;
-
-    bool r2c_transform = (inputLayout == CLFFT_REAL);
-    bool c2r_transform = (outputLayout == CLFFT_REAL);
-    bool real_transform = (r2c_transform || c2r_transform);
-
-    return (!real_transform) || c2r_transform;
-}
-
-
-
-#define QUOTEMARK(x) #x
-
-#define PLANNAR_READ(z0, z1, z2, z3, gp) \
-		ss	<< INDENT2 << QUOTEMARK(z0 = gp[0*HSTRIDE/4*8];\n) \
-			<< INDENT2 << QUOTEMARK(z1 = gp[1*HSTRIDE/4*8];\n) \
-			<< INDENT2 << QUOTEMARK(z2 = gp[2*HSTRIDE/4*8];\n) \
-			<< INDENT2 << QUOTEMARK(z3 = gp[3*HSTRIDE/4*8];\n);
-
-#define INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31) \
-		ss	<< INDENT2 << QUOTEMARK(z00 = gp[0*HSTRIDE/4*16];\n)     \
-			<< INDENT2 << QUOTEMARK(z01 = gp[0*HSTRIDE/4*16 + 1];\n) \
-			<< INDENT2 << QUOTEMARK(z10 = gp[1*HSTRIDE/4*16];\n)     \
-			<< INDENT2 << QUOTEMARK(z11 = gp[1*HSTRIDE/4*16 + 1];\n) \
-			<< INDENT2 << QUOTEMARK(z20 = gp[2*HSTRIDE/4*16];\n)     \
-			<< INDENT2 << QUOTEMARK(z21 = gp[2*HSTRIDE/4*16 + 1];\n) \
-			<< INDENT2 << QUOTEMARK(z30 = gp[3*HSTRIDE/4*16];\n)     \
-			<< INDENT2 << QUOTEMARK(z31 = gp[3*HSTRIDE/4*16 + 1];\n);
-
-#define PLANNAR_WRITE(z0, z1, z2, z3, gp) \
-		ss  << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*8] = z0;\n) \
-			<< INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*8] = z1;\n) \
-			<< INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*8] = z2;\n) \
-			<< INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*8] = z3;\n);
-
-#define INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31) \
-		ss  << INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*16]   = z00;\n) \
-			<< INDENT2 << QUOTEMARK(gp[0*VSTRIDE/4*16+1] = z01;\n) \
-			<< INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*16]   = z10;\n) \
-			<< INDENT2 << QUOTEMARK(gp[1*VSTRIDE/4*16+1] = z11;\n) \
-			<< INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*16]   = z20;\n) \
-			<< INDENT2 << QUOTEMARK(gp[2*VSTRIDE/4*16+1] = z21;\n) \
-			<< INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*16]   = z30;\n) \
-			<< INDENT2 << QUOTEMARK(gp[3*VSTRIDE/4*16+1] = z31;\n);
-
-#define WRITE_TO_LDS(lp, jump, z0, z1, z2, z3, part) \
-		ss	<< INDENT2 << QUOTEMARK(lp[0*jump] = z0.part;\n) \
-			<< INDENT2 << QUOTEMARK(lp[1*jump] = z1.part;\n) \
-			<< INDENT2 << QUOTEMARK(lp[2*jump] = z2.part;\n) \
-			<< INDENT2 << QUOTEMARK(lp[3*jump] = z3.part;\n) \
-			<< INDENT2 << QUOTEMARK(lp += jump*4;\n)          \
-			<< "\n";
-
-typedef enum inputoutputflag_
-{
-	PLANNAR_PLANNAR		= 1,
-	PLANNAR_INTERLEAVED,
-	INTERLEAVED_PLANNAR,
-	INTERLEAVED_INTERLEAVED,
-	ENDTRANSIO
-} transio;
-
-static clfftStatus GenerateTransposeKernel (FFTGeneratedTransposeVLIWAction::Signature  & params,
-	std::string & kernel)
-{
-	kernel.reserve (8000);
-	std::stringstream ss         (std::stringstream::out);
-
-	const char * szIn0 = "";
-	const char * szIn1 = "";
-	const char * szOut0 = "";
-	const char * szOut1 = "";
-	const char * typeIn = "";
-	const char * typeOut = "";
-	const char * INDENT2 = "";
-	const char * INDENT = "    ";
-	const char * datatype="";
-	const char * datatype2="";
-	bool xyflag  = (params.fft_N[0] == params.fft_N[1]) ? false : true;
-	transio iotype;
-
-
-	if (params.fft_precision == CLFFT_SINGLE)
-	{
-		datatype  = "float";
-		datatype2 = "float2";
-	}
-	else
-	{
-		datatype  = "double";
-		datatype2 = "double2";
-		ss << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n\n";
-	}
-
-	size_t hstride = params.fft_N[0];
-	size_t vstride = params.fft_N[1];
-
-	ss << "#define HSTRIDE " << hstride << "\n";
-	ss << "#define VSTRIDE " << vstride << "\n";
-	if (xyflag)
-	{
-		ss << "#define DIMX " << hstride/32 << "\n";
-		ss << "#define DIMY " << vstride/32 << "\n";
-		ss << "#define DIM ((DIMX > DIMY) ? DIMX : DIMY) \n\n";
-		INDENT2 = "        ";
-	}
-	else
-	{
-		ss << "#define DIM " << vstride/32 << "\n\n";
-		INDENT2 = "    ";
-	}
-
-	//	Generate the kernel entry point and parameter list
-	//
-	ss << "__attribute__((reqd_work_group_size(" << unsigned(params.fft_SIMD) << ",1,1)))\n"
-	   << "__kernel void\n"
-	   << "fft_trans" << "(";
-
-	if (xyflag && params.fft_placeness == CLFFT_INPLACE) return CLFFT_INVALID_ARG_VALUE;
-
-	switch (params.fft_inputLayout) {
-	case CLFFT_COMPLEX_INTERLEAVED:
-		typeIn = datatype2;
-		if (params.fft_placeness == CLFFT_INPLACE)
-		{
-			szIn0 = szOut0 = "gcomplx";
-			typeOut = datatype2;
-			ss  << "__global " << typeIn << " * restrict " << szIn0;
-			iotype = INTERLEAVED_INTERLEAVED;
-		}
-		else
-		{
-			szIn0  = "gcomplxIn";
-			ss  << "__global " << typeIn << " * restrict " << szIn0;
-
-			if (params.fft_outputLayout == CLFFT_COMPLEX_PLANAR)
-			{
-				szOut0 = "grealOut";
-				szOut1 = "gimagOut";
-				typeOut = datatype;
-				ss  <<", __global " << typeOut <<" * restrict " << szOut0
-					<<", __global " << typeOut <<" * restrict " << szOut1;
-				iotype = INTERLEAVED_PLANNAR;
-			}
-			else
-			{
-				szOut0 = "gcomplxOut";
-				typeOut = datatype2;
-				ss  <<", __global " << typeOut <<" * restrict " << szOut0;
-				iotype = INTERLEAVED_INTERLEAVED;
-			}
-		}
-		break;
-	case CLFFT_COMPLEX_PLANAR:
-		typeIn = datatype;
-		if (params.fft_placeness == CLFFT_INPLACE)
-		{
-			szIn0 = szOut0 = "greal";
-			szIn1 = szOut1 = "gimag";
-			typeOut = datatype;
-			ss << "__global " << typeIn << " * restrict " << szIn0 <<", __global " << typeIn <<" * restrict " << szIn1;
-			iotype = PLANNAR_PLANNAR;
-		}
-		else
-		{
-			szIn0  = "greadIn";
-			szIn1  = "gimagIn";
-			ss  <<"__global " << typeIn << " * restrict " << szIn0 <<", __global " << typeIn <<" * restrict " << szIn1;
-			if (params.fft_outputLayout == CLFFT_COMPLEX_PLANAR)
-			{
-				szOut0 = "grealOut";
-				szOut1 = "gimagOut";
-				typeOut = datatype;
-				ss  << ", __global " << typeOut <<" * restrict " << szOut0 <<", __global " << typeOut <<" * restrict " << szOut1;
-				iotype = PLANNAR_PLANNAR;
-			}
-			else
-			{
-				szOut0 = "gcomplxOut";
-				typeOut = datatype2;
-				ss  << ", __global " << typeOut <<" * restrict " << szOut0;
-				iotype = PLANNAR_INTERLEAVED;
-			}
-		}
-		break;
-	default:
-		return CLFFT_NOTIMPLEMENTED;
-	}
-	ss << ")\n{\n";
-
-	// Support plannar and interleaved format
-	switch (iotype)
-	{
-		case PLANNAR_INTERLEAVED:
-			ss  << INDENT << "__local " << typeIn << " ldsa[2048];\n"
-				<< INDENT << "__local " << typeIn << " ldsb[2048];\n";
-			break;
-		case INTERLEAVED_PLANNAR:
-		case PLANNAR_PLANNAR:
-		case INTERLEAVED_INTERLEAVED:
-			ss  << INDENT << "__local " << typeIn << " ldsa[1024];\n"
-				<< INDENT << "__local " << typeIn << " ldsb[1024];\n";
-			break;
-		default:
-		return CLFFT_NOTIMPLEMENTED;
-	}
-
-	ss	<< INDENT << "uint gid = get_global_id(0);\n"
-		<< INDENT << "uint me = gid & 0x3fU;\n"
-		<< INDENT << "uint k = (gid >> 6) % ";
-
-	// add batch support
-	size_t batchnum = (vstride > hstride) ? ((vstride/32) * (vstride/32 +1) /2)
-		: ((hstride/32) * (hstride/32 +1) /2);
-	ss  << batchnum
-		<< ";\n"
-		<< "\n";
-
-	ss	<< INDENT << "// Compute location of blocks\n"
-		<< INDENT << "int l = DIM+0.5f - native_sqrt((DIM+0.5f)*(DIM+0.5f) - 2.0f * (float)as_int(k));\n"
-		<< INDENT << "int kl = ((DIM*2+1 - l) * l) >> 1;\n"
-		<< INDENT << "uint j = k - kl;\n"
-		<< INDENT << "uint i = l + j;\n"
-		<< "\n";
-
-	ss  << INDENT << "uint goa, gob;\n"
-		<< INDENT << "uint go = ((me & 0x7U) << 2) + ((gid>>6)/" << batchnum << ") * VSTRIDE * HSTRIDE;\n"
-		<< INDENT << "__global " << datatype << "4 *gp;\n"
-		<< INDENT << "__local " << datatype << "4 *lp4;\n"
-		<< INDENT << "uint lo = ((me & 0x7U) << 7) + (me >> 3);\n"
-		<< INDENT << "uint lot = (me<<2); \n";
-
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		ss  << INDENT << datatype <<"4 z0, z1, z2, z3;\n\n"
-			<< INDENT << "__local " << typeIn <<" *lp;\n";
-		break;
-	case PLANNAR_INTERLEAVED:
-		ss  << INDENT << "__global " << datatype << "4 *gpi;\n"
-			<< INDENT << "__local " << typeIn <<" *lp;\n"
-			<< INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
-		break;
-	case INTERLEAVED_PLANNAR:
-		ss  << INDENT << "__local " << typeOut <<" *lp;\n"
-			<< INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
-		break;
-	case INTERLEAVED_INTERLEAVED:
-		ss  << INDENT << "__local " << typeIn <<" *lp;\n"
-			<< INDENT << datatype << "4 z00, z01, z10, z11, z20, z21, z30, z31;\n\n";
-		break;
-	}
-
-	if (xyflag)
-	{
-		ss	<< INDENT << "if ( i < DIMX  && j < DIMY) \n"
-			<< INDENT << "{\n";
-	}
-
-	ss  << INDENT2 << "// Array offsets\n"
-		<< INDENT2 << "goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
-		<<"\n";
-
-	ss  << INDENT2 << "// Load A block\n"
-		<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn0 <<" + goa);\n";
-
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		PLANNAR_READ(z0, z1, z2, z3, gp);
-		break;
-	case PLANNAR_INTERLEAVED:
-		PLANNAR_READ(z00, z10, z20, z30, gp);
-		ss << INDENT2 << "gpi = (__global " << datatype << "4 *)(" << szIn1 <<" + goa);\n";
-		PLANNAR_READ(z01, z11, z21, z31, gpi);
-		break;
-	default:
-		INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31);
-		break;
-	}
-	ss	<< "\n";
-
-	ss  << INDENT2 << "// Save into LDS\n";
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		ss	<< INDENT2 << "lp = ldsa + lo;\n";
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
-		break;
-	case PLANNAR_INTERLEAVED:
-		ss	<< INDENT2 << "lp = ldsa + lo*2;\n";
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, x);
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, y);
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, z);
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, w);
-		ss  << INDENT2 << "lp = ldsa + lo*2 + 1;\n";
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, x);
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, y);
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, z);
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, w);
-		break;
-	case INTERLEAVED_INTERLEAVED:
-		ss	<< INDENT2 << "lp = ldsa + lo;\n";
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, xy);
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, zw);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, xy);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, zw);
-		break;
-	case INTERLEAVED_PLANNAR:
-		ss	<< INDENT2 << "lp = (__local " << typeOut << "*)ldsa + lo;\n";
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, x);
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, z);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, x);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, z);
-		//next write to lp = ldsa+lo+1024
-		ss  << INDENT2 << "lp += (1024-32*4);\n";
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, y);
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, w);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, y);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, w);
-		break;
-	}
-
-	ss << INDENT;
-	if (xyflag)	ss << "} ";
-	ss <<"//End load A block\n\n";
-
-	if (xyflag)
-		ss  << INDENT << "if (i < DIMY  && j < DIMX) \n"
-			<< INDENT << "{\n";
-
-	ss  << INDENT2 << "//  Load B block\n"
-		<< INDENT2 << "gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
-		<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn0 << " + gob);\n";
-
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		PLANNAR_READ(z0, z1, z2, z3, gp);
-		break;
-	case PLANNAR_INTERLEAVED:
-		PLANNAR_READ(z00, z10, z20, z30, gp);
-		ss << INDENT2 << "gpi = (__global " << datatype << "4 *)(" << szIn1 <<" + gob);\n";
-		PLANNAR_READ(z01, z11, z21, z31, gpi);
-		break;
-	default:
-		INTERLEAVED_READ(z00, z01, z10, z11, z20, z21, z30, z31);
-		break;
-	}
-	ss	<< "\n";
-
-	ss  << INDENT2 << "// Save into LDS\n";
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		ss	<< INDENT2 << "lp = ldsb + lo;\n";
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
-		break;
-	case PLANNAR_INTERLEAVED:
-		ss	<< INDENT2 << "lp = ldsb + lo*2;\n";
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, x);
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, y);
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, z);
-		WRITE_TO_LDS(lp, 16, z00, z10, z20, z30, w);
-		ss  << INDENT2 << "lp = ldsb + lo*2 + 1;\n";
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, x);
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, y);
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, z);
-		WRITE_TO_LDS(lp, 16, z01, z11, z21, z31, w);
-		break;
-	case INTERLEAVED_INTERLEAVED:
-		ss	<< INDENT2 << "lp = ldsb + lo;\n";
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, xy);
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, zw);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, xy);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, zw);
-		break;
-	case INTERLEAVED_PLANNAR:
-		ss	<< INDENT2 << "lp = (__local " << typeOut << "*) ldsb + lo;\n";
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, x);
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, z);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, x);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, z);
-		//next write to lp = ldsa+lo+1024
-		ss  << INDENT2 << "lp += (1024-32*4);\n";
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, y);
-		WRITE_TO_LDS(lp, 8, z00, z10, z20, z30, w);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, y);
-		WRITE_TO_LDS(lp, 8, z01, z11, z21, z31, w);
-		break;
-	}
-
-	ss  << INDENT;
-	if (xyflag) ss<< "} ";
-	ss  << "// End load B block\n\n";
-
-	ss  << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
-		<< "\n";
-
-	if (xyflag) ss  << INDENT << "if (i < DIMY  && j < DIMX)\n"
-		<< INDENT << "{\n";
-
-	ss  << INDENT2 << "// write A block\n";
-
-	ss  << INDENT2 << "goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
-		<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut0 << " + goa);\n";
-
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n\n";
-		ss  << INDENT2 << "z0 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z1 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z2 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z3 = lp4[0];\n\n";
-
-		PLANNAR_WRITE(z0, z1, z2, z3, gp);
-		break;
-	case INTERLEAVED_PLANNAR:
-		ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)((__local " << typeOut << "*)ldsb + lot);\n\n";
-		ss  << INDENT2 << "z00 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z10 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z20 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z30 = lp4[0];\n\n";
-
-		PLANNAR_WRITE(z00, z10, z20, z30, gp);
-
-		ss  << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + goa);\n";
-		ss  << INDENT2 << "lp4 += (256 - 32*6);\n";
-		ss  << INDENT2 << "z01 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z11 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z21 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z31 = lp4[0];\n\n";
-
-		PLANNAR_WRITE(z01, z11, z21, z31, gp);
-		break;
-
-	case PLANNAR_INTERLEAVED:
-	case INTERLEAVED_INTERLEAVED:
-		if (iotype == PLANNAR_INTERLEAVED)
-			ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot*2);\n";
-		else
-			ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n";
-
-		ss  << INDENT2 << "z00 = lp4[0];\n"
-			<< INDENT2 << "z01 = lp4[1];\n"
-			<< INDENT2 << "lp4 += 32*4;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z10 = lp4[0];\n"
-			<< INDENT2 << "z11 = lp4[1];\n"
-			<< INDENT2 << "lp4 += 32*4;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z20 = lp4[0];\n"
-			<< INDENT2 << "z21 = lp4[1];\n"
-			<< INDENT2 << "lp4 += 32*4;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z30 = lp4[0];\n"
-			<< INDENT2 << "z31 = lp4[1];\n\n";
-
-		INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31);
-		break;
-	}
-	ss << "\n";
-
-	ss  << INDENT;
-	if (xyflag) ss << "} ";
-	ss  << "// End write A block;\n\n";
-
-	if (xyflag)	ss  << INDENT << "if (i < DIMX  && j < DIMY)\n"
-		<< INDENT << "{\n";
-
-	ss  << INDENT2 << "// write B block\n\n";
-	ss	<< INDENT2 << "gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
-		<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut0 << " + gob);\n";
-
-	switch (iotype)
-	{
-	case PLANNAR_PLANNAR:
-		ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n\n";
-		ss  << INDENT2 << "z0 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z1 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z2 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z3 = lp4[0];\n\n";
-
-		PLANNAR_WRITE(z0, z1, z2, z3, gp);
-		break;
-	case INTERLEAVED_PLANNAR:
-		ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)((__local " << typeOut << "*)ldsa + lot);\n\n";
-		ss  << INDENT2 << "z00 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z10 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z20 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z30 = lp4[0];\n\n";
-
-		PLANNAR_WRITE(z00, z10, z20, z30, gp);
-
-		ss  << INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + gob);\n";
-		ss  << INDENT2 << "lp4 += (256 - 32*6);\n";
-		ss  << INDENT2 << "z01 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z11 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z21 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z31 = lp4[0];\n\n";
-
-		PLANNAR_WRITE(z01, z11, z21, z31, gp);
-		break;
-
-	case PLANNAR_INTERLEAVED:
-	case INTERLEAVED_INTERLEAVED:
-		if (iotype == PLANNAR_INTERLEAVED)
-			ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot*2);\n\n";
-		else
-			ss  << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n\n";
-
-		ss  << INDENT2 << "z00 = lp4[0];\n"
-			<< INDENT2 << "z01 = lp4[1];\n"
-			<< INDENT2 << "lp4 += 32*4;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z10 = lp4[0];\n"
-			<< INDENT2 << "z11 = lp4[1];\n"
-			<< INDENT2 << "lp4 += 32*4;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z20 = lp4[0];\n"
-			<< INDENT2 << "z21 = lp4[1];\n"
-			<< INDENT2 << "lp4 += 32*4;\n"
-			<< "\n";
-
-		ss	<< INDENT2 << "z30 = lp4[0];\n"
-			<< INDENT2 << "z31 = lp4[1];\n\n";
-
-		INTERLEAVED_WRITE(z00, z01, z10, z11, z20, z21, z30, z31);
-		break;
-	}
-	ss << "\n";
-
-	ss  << INDENT;
-	if(xyflag) ss  << "} ";
-	ss  << "// End write B block;\n\n";
-
-	if (iotype == PLANNAR_PLANNAR)
-	{
-		ss  << INDENT << "// Identical handling for imaginary data\n"
-			<< INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
-			<< "\n";
-
-		if (xyflag) ss	<< INDENT << "if (i < DIMX  && j < DIMY)\n"
-			<< INDENT << "{\n";
-
-		ss  << INDENT2 << "//load A block\n"
-			<< INDENT2 << "goa = go + (i << 5) + j * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
-			<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn1 << " + goa);\n"
-			<< "\n";
-
-		PLANNAR_READ(z0, z1, z2, z3, gp);
-
-		ss  << INDENT2 << "lp = ldsa + lo;\n"
-			<< "\n";
-
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
-
-		ss  << INDENT;
-		if (xyflag) ss  << "} ";
-		ss  << "//end load A block\n\n";
-
-		if (xyflag) ss	<< INDENT << "if (i < DIMY  && j < DIMX)\n"
-			<< INDENT << "{\n";
-
-		ss  << INDENT2 << "//load B block\n"
-			<< INDENT2 << "gob = go + (j << 5) + i * (HSTRIDE*32) + (me >> 3)*HSTRIDE;\n"
-			<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szIn1 <<" + gob);\n"
-			<< "\n";
-
-		PLANNAR_READ(z0, z1, z2, z3, gp);
-
-		ss  << INDENT2 << "lp = ldsb + lo;\n"
-			<< "\n";
-
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, x);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, y);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, z);
-		WRITE_TO_LDS(lp, 8, z0, z1, z2, z3, w);
-
-		ss  << INDENT;
-		if (xyflag) ss << "} ";
-		ss  << "// end load B block\n\n";
-
-		ss  << INDENT << "barrier(CLK_LOCAL_MEM_FENCE);\n"
-			<< "\n";
-
-		if (xyflag) ss	<< INDENT << "if (i < DIMY  && j < DIMX)\n"
-			<< INDENT << "{\n";
-
-		ss  << INDENT2 << "//Write A block\n"
-		    << INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsb + lot);\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z0 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z1 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z2 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z3 = lp4[0];\n"
-			<< "\n";
-
-		ss  << INDENT2 << "goa = go + (i << 5) + j * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
-			<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + goa);\n"
-			<< "\n";
-
-		PLANNAR_WRITE(z0, z1, z2, z3, gp);
-
-		ss  << INDENT;
-		if (xyflag) ss  << "} ";
-		ss  << "// end write A block\n\n";
-
-		if (xyflag) ss	<< INDENT << "if (i < DIMX  && j < DIMY)\n"
-			<< INDENT << "{\n";
-
-		ss  << INDENT2 << "//write B block\n"
-			<< INDENT2 << "lp4 = (__local " << datatype << "4 *)(ldsa + lot);\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z0 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z1 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z2 = lp4[0];\n"
-			<< INDENT2 << "lp4 += 32*2;\n"
-			<< "\n";
-
-		ss  << INDENT2 << "z3 = lp4[0];\n"
-			<< "\n";
-
-		ss  << INDENT2 << "gob = go + (j << 5) + i * (VSTRIDE*32) + (me >> 3)*VSTRIDE;\n"
-			<< INDENT2 << "gp = (__global " << datatype << "4 *)(" << szOut1 << " + gob);\n";
-		PLANNAR_WRITE(z0, z1, z2, z3, gp);
-
-		ss  << INDENT;
-		if (xyflag) ss  << "} ";
-		ss << "// end write B block\n";
-	}
-
-	ss << "}\n\n";
-	kernel = ss.str();
-	return CLFFT_SUCCESS;
-}
-
-clfftStatus FFTGeneratedTransposeVLIWAction::initParams ()
-{
-
-	//	Query the devices in this context for their local memory sizes
-	//	How we generate a kernel depends on the *minimum* LDS size for all devices.
-	//
-	const FFTEnvelope * pEnvelope = NULL;
-	OPENCL_V(this->plan->GetEnvelope (& pEnvelope), _T("GetEnvelope failed"));
-	BUG_CHECK (NULL != pEnvelope);
-
-
-	this->signature.fft_precision    = this->plan->precision;
-	this->signature.fft_placeness    = this->plan->placeness;
-	this->signature.fft_inputLayout  = this->plan->inputLayout;
-
-	ARG_CHECK (this->plan->inStride.size() == this->plan->outStride.size())
-
-	if (CLFFT_INPLACE == this->plan->placeness) {
-		//	If this is an in-place transform the
-		//	input and output layout, dimensions and strides
-		//	*MUST* be the same.
-		//
-		ARG_CHECK (this->plan->inputLayout == this->plan->outputLayout)
-		this->signature.fft_outputLayout = this->plan->inputLayout;
-		for (size_t u = this->plan->inStride.size(); u-- > 0; ) {
-			ARG_CHECK (this->plan->inStride[u] == this->plan->outStride[u]);
-		}
-	} else {
-		this->signature.fft_outputLayout = this->plan->outputLayout;
-	}
-
-	//we only support 2D transpose
-	switch (this->plan->inStride.size()) {
-		//	2-D array is a 3-D data structure
-		//	2-D unit is a speical case of 2-D array.
-	case 2:
-		ARG_CHECK(this->plan->length   .size() > 1);
-		ARG_CHECK(this->plan->outStride.size() > 1);
-		this->signature.fft_DataDim      = 3;
-		this->signature.fft_N[0]         = this->plan->length[0];
-		this->signature.fft_N[1]         = this->plan->length[1];
-		this->signature.fft_inStride[0]  = this->plan->inStride[0];
-		this->signature.fft_inStride[1]  = this->plan->inStride[1];
-		this->signature.fft_inStride[2]  = this->plan->iDist;
-		this->signature.fft_outStride[0] = this->plan->outStride[0];
-		this->signature.fft_outStride[1] = this->plan->outStride[1];
-		this->signature.fft_outStride[2] = this->plan->oDist;
-		break;
-	default:
-		ARG_CHECK (false);
-	}
-
-	//ToDO: work group size setup
-	this->signature.fft_R = 32; // divide the element into 32x32 blocks
-	this->signature.fft_SIMD = 64; //work group size
-
-	return CLFFT_SUCCESS;
-}
-
-clfftStatus  FFTGeneratedTransposeVLIWAction::getWorkSizes (std::vector<size_t> & globalWS, std::vector<size_t> & localWS)
-{
-
-	unsigned long long count, count0, count1;
-	count0 = DivRoundingUp<unsigned long long> (this->plan->length[0], this->signature.fft_R);
-	count1 = DivRoundingUp<unsigned long long> (this->plan->length[1], this->signature.fft_R);
-	count  = (count0>count1) ? count0 : count1;
-	count  = (count * (count+1)) /2;
-	count *= this->signature.fft_SIMD;
-	count *= this->plan->batchsize;
-
-	globalWS.push_back( static_cast< size_t >( count ) );
-	localWS.push_back( this->signature.fft_SIMD );
-
-	return	CLFFT_SUCCESS;
-}
-
-
-//	OpenCL does not take unicode strings as input, so this routine returns only ASCII strings
-//	Feed this generator the FFTPlan, and it returns the generated program as a string
-clfftStatus FFTGeneratedTransposeVLIWAction::generateKernel ( FFTRepo& fftRepo, const cl_command_queue commQueueFFT )
-{
-
-	std::string programCode;
-	OPENCL_V( GenerateTransposeKernel(  this->signature, programCode ), _T( "GenerateTransposeKernel() failed!" ) );
-
-    cl_int status = CL_SUCCESS;
-    cl_device_id Device = NULL;
-    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_DEVICE, sizeof(cl_device_id), &Device, NULL);
-    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
-    cl_context QueueContext = NULL;
-    status = clGetCommandQueueInfo(commQueueFFT, CL_QUEUE_CONTEXT, sizeof(cl_context), &QueueContext, NULL);
-    OPENCL_V( status, _T( "clGetCommandQueueInfo failed" ) );
-
-	OPENCL_V( fftRepo.setProgramCode( Transpose_VLIW, this->getSignatureData(), programCode, Device, QueueContext ), _T( "fftRepo.setclString() failed!" ) );
-	OPENCL_V( fftRepo.setProgramEntryPoints( Transpose_VLIW,  this->getSignatureData(), "fft_trans", "fft_trans", Device, QueueContext ), _T( "fftRepo.setProgramEntryPoint() failed!" ) );
-
-	return CLFFT_SUCCESS;
-}
diff --git a/src/library/generator.transpose.vliw.h b/src/library/generator.transpose.vliw.h
deleted file mode 100644
index 12ad701..0000000
--- a/src/library/generator.transpose.vliw.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* ************************************************************************
- * Copyright 2013 Advanced Micro Devices, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- * ************************************************************************/
-
-#pragma once
-#if !defined( AMD_CLFFT_generator_transpose_H )
-#define AMD_CLFFT_generator_transpose_H
-#include "private.h"
-#include "repo.h"
-#include "plan.h"
-
-#endif
-
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index a6736bc..a5c605e 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -240,7 +240,6 @@ std::string getKernelName(const clfftGenerators gen, const clfftPlanHandle plHan
     case Stockham:			generatorName = "Stockham"; break;
 	case Transpose_GCN:		generatorName = "Transpose"; break;
 	case Transpose_SQUARE:	generatorName = "Transpose"; break;
-    case Transpose_VLIW:	generatorName = "Transpose"; break;
 	case Copy:				generatorName = "Copy"; break;
     }
 
@@ -276,12 +275,6 @@ clfftStatus selectAction(FFTPlan * fftPlan, FFTAction *& action, cl_command_queu
 		}
 		break;
 
-    case Transpose_VLIW: 
-		{
-			action = new FFTGeneratedTransposeVLIWAction(fftPlan->plHandle, fftPlan, *commQueueFFT, err);
-			OPENCL_V( err, "FFTGeneratedTransposeVLIWAction() failed");
-		}
-		break;
 
     case Copy:
 		{
@@ -1879,8 +1872,9 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				else if(fftPlan->gen == Transpose_SQUARE)
 					fftPlan->action = new FFTGeneratedTransposeSquareAction(plHandle, fftPlan, *commQueueFFT, err);
 				else
-					fftPlan->action = new FFTGeneratedTransposeVLIWAction(plHandle, fftPlan, *commQueueFFT, err);
-                OPENCL_V( err, "FFTGeneratedTransposeVLIWAction failed");
+					fftPlan->action = new FFTGeneratedTransposeGCNAction(plHandle, fftPlan, *commQueueFFT, err);
+
+                OPENCL_V( err, "FFTGeneratedTransposeXXXAction failed");
 
 				fftPlan->baked		= true;
 				return	CLFFT_SUCCESS;
@@ -1890,28 +1884,6 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 			size_t length1 = fftPlan->length[1];
 
 
-			if (fftPlan->length[0]==256 && fftPlan->length[1]==256)
-			{
-				length0 += 8;
-				length1 += 1;
-			}
-			else if (fftPlan->length[0]==512 && fftPlan->length[1]==512)
-			{
-				length0 += 1;
-				length1 += 1;//length1 += 0;
-			}
-			else if (fftPlan->length[0]==1024 && fftPlan->length[1]==512)
-			{
-				length0 += 2;
-				length1 += 2;//length1 += 0;
-			}
-			else if (fftPlan->length[0]==1024 && fftPlan->length[1]==1024)
-			{
-				length0 += 1;
-				length1 += 1;//length1 += 0;
-			}
-
-
 			if (fftPlan->length[0] > Large1DThreshold ||
 				fftPlan->length[1] > Large1DThreshold)
 				fftPlan->large2D = true;
@@ -2003,7 +1975,8 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				clLengths[0] = fftPlan->length[0];
 				clLengths[1] = fftPlan->length[1];
 
-				bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
+				// bool xyflag = (clLengths[0]==clLengths[1]) ? false : true;
+				bool xyflag = true;
 				if (xyflag && fftPlan->tmpBufSize==0 && fftPlan->length.size()<=2)
 				{
 					// we need tmp buffer for x!=y case
@@ -2022,7 +1995,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				transPlanX->inputLayout     = fftPlan->outputLayout;
 				transPlanX->precision       = fftPlan->precision;
 				transPlanX->tmpBufSize      = 0;
-				transPlanX->gen			    = Transpose_VLIW;
+				transPlanX->gen = Transpose_GCN;
 				transPlanX->envelope		= fftPlan->envelope;
 				transPlanX->batchsize       = fftPlan->batchsize;
 				transPlanX->inStride[0]     = fftPlan->outStride[0];
@@ -2035,7 +2008,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 					transPlanX->outputLayout    = CLFFT_COMPLEX_INTERLEAVED;
 					transPlanX->placeness       = CLFFT_OUTOFPLACE;
 					transPlanX->outStride[0]    = 1;
-					transPlanX->outStride[1]    = clLengths[0];
+					transPlanX->outStride[1]    = clLengths[1];
 					transPlanX->oDist           = clLengths[0] * clLengths[1];
 				}
 				else
@@ -2149,7 +2122,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				transPlanY->oDist           = fftPlan->oDist;
 				transPlanY->precision       = fftPlan->precision;
 				transPlanY->tmpBufSize      = 0;
-				transPlanY->gen			    = Transpose_VLIW;
+				transPlanY->gen = Transpose_GCN;
 				transPlanY->envelope		= fftPlan->envelope;
 				transPlanY->batchsize       = fftPlan->batchsize;
 				transPlanY->transflag       = true;
@@ -4158,8 +4131,6 @@ clfftStatus FFTPlan::GetMax1DLength (size_t *longest ) const
 	switch(gen)
 	{
 	case Stockham:		return GetMax1DLengthStockham(longest);
-	//No restriction for Transpose_VLIW kernel
-	case Transpose_VLIW:		*longest = 4096; return CLFFT_SUCCESS;
     case Transpose_GCN:			*longest = 4096; return CLFFT_SUCCESS;
     case Transpose_SQUARE:     *longest = 4096; return CLFFT_SUCCESS;
     case Copy:					*longest = 4096; return CLFFT_SUCCESS;
diff --git a/src/library/transform.cpp b/src/library/transform.cpp
index d061dad..0efad17 100644
--- a/src/library/transform.cpp
+++ b/src/library/transform.cpp
@@ -671,7 +671,8 @@ clfftStatus clfftEnqueueTransform(
 
 				cl_event transXOutEvents = NULL;
 				cl_event colOutEvents = NULL;
-				bool xyflag = (fftPlan->length[0] == fftPlan->length[1]) ? false : true;
+				//bool xyflag = (fftPlan->length[0] == fftPlan->length[1]) ? false : true;
+				bool xyflag = true;
 
 				if (xyflag)
 				{

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git