[clfft] 19/32: adding radix 11/13 capability - first commit - 1d small sizes are working

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Apr 26 08:34:10 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit a8240d9877daf0fb2ce3b2caed0030e81863fba4
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Wed Apr 6 11:26:23 2016 -0700

    adding radix 11/13 capability - first commit - 1d small sizes are working
---
 src/library/generator.stockham.cpp | 121 +++++++++---
 src/library/generator.stockham.h   | 393 ++++++++++++++++++++++++++++++++++++-
 src/library/private.h              |   6 +-
 3 files changed, 491 insertions(+), 29 deletions(-)

diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index 3d425a8..f6b25f9 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -396,7 +396,7 @@ namespace StockhamGenerator
 			return;
 		}
 
-		size_t baseRadix[] = {7,5,3,2}; // list only supported primes
+		size_t baseRadix[] = {13,11,7,5,3,2}; // list only supported primes
 		size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
 
 		size_t l = length;
@@ -437,7 +437,19 @@ namespace StockhamGenerator
 		{
 			workGroupSize = 49;
 			numTrans = length >= 7*workGroupSize ? 1 : (7*workGroupSize)/length;
-		} else {
+		}
+		else if (primeFactorsExpanded[11] == length) // Length is pure power of 11
+		{
+			workGroupSize = 121;
+			numTrans = length >= 11 * workGroupSize ? 1 : (11 * workGroupSize) / length;
+		}
+		else if (primeFactorsExpanded[13] == length) // Length is pure power of 13
+		{
+			workGroupSize = 169;
+			numTrans = length >= 13 * workGroupSize ? 1 : (13 * workGroupSize) / length;
+		}
+		else
+		{
 			size_t leastNumPerWI = 1; // least number of elements in one work item
 			size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired
 
@@ -3019,7 +3031,7 @@ namespace StockhamGenerator
 			else
 			{
 				// Possible radices
-				size_t cRad[] = {10,8,7,6,5,4,3,2,1}; // Must be in descending order
+				size_t cRad[] = {13,11,10,8,7,6,5,4,3,2,1}; // Must be in descending order
 				size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
 
 				// Generate the radix and pass objects
@@ -3236,29 +3248,6 @@ namespace StockhamGenerator
 			// Vector type
 			str += "#define fvect2 "; str += RegBaseType<PR>(2); str += "\n\n";
 
-			//constants
-			str += "#define C8Q  0.70710678118654752440084436210485"; str += sfx; str += "\n";
-
-			str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
-			str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
-			str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
-			str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
-			str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";
-
-			str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
-			str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
-
-			str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
-			str += "#define C7Q2  0.79015646852540022404554065360571"; str += sfx; str += "\n";
-			str += "#define C7Q3  0.05585426728964774240049351305970"; str += sfx; str += "\n";
-			str += "#define C7Q4  0.73430220123575240531721419756650"; str += sfx; str += "\n";
-			str += "#define C7Q5  0.44095855184409837868031445395900"; str += sfx; str += "\n";
-			str += "#define C7Q6  0.34087293062393136944265847887436"; str += sfx; str += "\n";
-			str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
-			str += "#define C7Q8  0.87484229096165666561546458979137"; str += sfx; str += "\n";
-
-			str += "\n";
-
 			bool cReg = linearRegs ? true : false;
 
 			// Generate butterflies for all unique radices
@@ -3269,6 +3258,86 @@ namespace StockhamGenerator
 			uradices.sort();
 			uradices.unique();
 
+
+			//constants
+			if (length%8 == 0)
+			{
+				str += "#define C8Q  0.70710678118654752440084436210485"; str += sfx; str += "\n";
+			}
+
+			if (length % 5 == 0)
+			{
+				str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
+				str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
+				str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
+				str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
+				str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";
+			}
+
+			if (length % 3 == 0)
+			{
+				str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
+				str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
+			}
+
+			if (length % 7 == 0)
+			{
+				str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
+				str += "#define C7Q2  0.79015646852540022404554065360571"; str += sfx; str += "\n";
+				str += "#define C7Q3  0.05585426728964774240049351305970"; str += sfx; str += "\n";
+				str += "#define C7Q4  0.73430220123575240531721419756650"; str += sfx; str += "\n";
+				str += "#define C7Q5  0.44095855184409837868031445395900"; str += sfx; str += "\n";
+				str += "#define C7Q6  0.34087293062393136944265847887436"; str += sfx; str += "\n";
+				str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
+				str += "#define C7Q8  0.87484229096165666561546458979137"; str += sfx; str += "\n";
+			}
+
+			if (length % 11 == 0)
+			{
+				str += "#define b11_0 0.9898214418809327"; str += sfx; str += "\n";
+				str += "#define b11_1 0.9594929736144973"; str += sfx; str += "\n";
+				str += "#define b11_2 0.9189859472289947"; str += sfx; str += "\n";
+				str += "#define b11_3 0.8767688310025893"; str += sfx; str += "\n";
+				str += "#define b11_4 0.8308300260037728"; str += sfx; str += "\n";
+				str += "#define b11_5 0.7784344533346518"; str += sfx; str += "\n";
+				str += "#define b11_6 0.7153703234534297"; str += sfx; str += "\n";
+				str += "#define b11_7 0.6343562706824244"; str += sfx; str += "\n";
+				str += "#define b11_8 0.3425847256816375"; str += sfx; str += "\n";
+				str += "#define b11_9 0.5211085581132027"; str += sfx; str += "\n";
+			}
+
+			if (length % 13 == 0)
+			{
+				str += "#define b13_0  0.9682872443619840"; str += sfx; str += "\n";
+				str += "#define b13_1  0.9578059925946651"; str += sfx; str += "\n";
+				str += "#define b13_2  0.8755023024091479"; str += sfx; str += "\n";
+				str += "#define b13_3  0.8660254037844386"; str += sfx; str += "\n";
+				str += "#define b13_4  0.8595425350987748"; str += sfx; str += "\n";
+				str += "#define b13_5  0.8534800018598239"; str += sfx; str += "\n";
+				str += "#define b13_6  0.7693388175729806"; str += sfx; str += "\n";
+				str += "#define b13_7  0.6865583707817543"; str += sfx; str += "\n";
+				str += "#define b13_8  0.6122646503767565"; str += sfx; str += "\n";
+				str += "#define b13_9  0.6004772719326652"; str += sfx; str += "\n";
+				str += "#define b13_10 0.5817047785105157"; str += sfx; str += "\n";
+				str += "#define b13_11 0.5751407294740031"; str += sfx; str += "\n";
+				str += "#define b13_12 0.5220263851612750"; str += sfx; str += "\n";
+				str += "#define b13_13 0.5200285718888646"; str += sfx; str += "\n";
+				str += "#define b13_14 0.5165207806234897"; str += sfx; str += "\n";
+				str += "#define b13_15 0.5149187780863157"; str += sfx; str += "\n";
+				str += "#define b13_16 0.5035370328637666"; str += sfx; str += "\n";
+				str += "#define b13_17 0.5000000000000000"; str += sfx; str += "\n";
+				str += "#define b13_18 0.3027756377319946"; str += sfx; str += "\n";
+				str += "#define b13_19 0.3014792600477098"; str += sfx; str += "\n";
+				str += "#define b13_20 0.3004626062886657"; str += sfx; str += "\n";
+				str += "#define b13_21 0.2517685164318833"; str += sfx; str += "\n";
+				str += "#define b13_22 0.2261094450357824"; str += sfx; str += "\n";
+				str += "#define b13_23 0.0833333333333333"; str += sfx; str += "\n";
+				str += "#define b13_24 0.0386329546443481"; str += sfx; str += "\n";
+			}
+
+			str += "\n";
+
+
 			//If pre-callback is set for the plan
 			std::string callbackstr;
 			if (params.fft_hasPreCallback)
diff --git a/src/library/generator.stockham.h b/src/library/generator.stockham.h
index 3323693..18adf48 100644
--- a/src/library/generator.stockham.h
+++ b/src/library/generator.stockham.h
@@ -389,7 +389,7 @@ namespace StockhamGenerator
 			// Temporary variables
 			// Allocate temporary variables if we are not using complex registers (cReg = 0) or if cReg is true, then
 			// allocate temporary variables only for non power-of-2 radices
-			if (!(radix == 7 && cReg))
+			if (!( (radix == 7 && cReg) || (radix == 11 && cReg) || (radix == 13 && cReg) ))
 			{
 			if( (radix & (radix-1)) || (!cReg) )
 			{
@@ -1654,6 +1654,395 @@ namespace StockhamGenerator
 						}
 					}
 				} break;
+			case 11:
+				{
+					static const char *radix11str = " \
+						float p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; \n\
+						p0 = ((*R1).x - (*R10).x)*dir; \n\
+						p1 = (*R1).x + (*R10).x; \n\
+						p2 = ((*R5).x - (*R6).x)*dir; \n\
+						p3 = (*R5).x + (*R6).x; \n\
+						p4 = ((*R2).x - (*R9).x)*dir; \n\
+						p5 = (*R2).x + (*R9).x; \n\
+						p6 = ((*R3).x - (*R8).x)*dir; \n\
+						p7 = (*R3).x + (*R8).x; \n\
+						p8 = (*R4).x + (*R7).x; \n\
+						p9 = ((*R4).x - (*R7).x)*dir; \n\
+						\n\
+						float r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; \n\
+						r0 = p4 - p0 * b11_9; \n\
+						r1 = p0 + p2 * b11_9; \n\
+						r2 = p2 + p6 * b11_9; \n\
+						r3 = p6 + p9 * b11_9; \n\
+						r4 = p9 - p4 * b11_9; \n\
+						r5 = p7 - p1 * b11_8; \n\
+						r6 = p5 - p7 * b11_8; \n\
+						r7 = p1 - p8 * b11_8; \n\
+						r8 = p3 - p5 * b11_8; \n\
+						r9 = p8 - p3 * b11_8; \n\
+						\n\
+						float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9; \n\
+						s0 = p6 - r0 * b11_6; \n\
+						s1 = p9 + r1 * b11_6; \n\
+						s2 = p4 - r2 * b11_6; \n\
+						s3 = p0 + r3 * b11_6; \n\
+						s4 = p2 + r4 * b11_6; \n\
+						s5 = p3 - r5 * b11_7; \n\
+						s6 = p8 - r6 * b11_7; \n\
+						s7 = p5 - r7 * b11_7; \n\
+						s8 = p1 - r8 * b11_7; \n\
+						s9 = p7 - r9 * b11_7; \n\
+						\n\
+						float p10, p11, p12, p13, p14, p15, p16, p17, p18, p19; \n\
+						p10 = ((*R10).y - (*R1).y)*dir; \n\
+						p11 = (*R1).y + (*R10).y; \n\
+						p12 = ((*R9).y - (*R2).y)*dir; \n\
+						p13 = (*R2).y + (*R9).y; \n\
+						p14 = ((*R8).y - (*R3).y)*dir; \n\
+						p15 = (*R3).y + (*R8).y; \n\
+						p16 = ((*R7).y - (*R4).y)*dir; \n\
+						p17 = (*R4).y + (*R7).y; \n\
+						p18 = ((*R6).y - (*R5).y)*dir; \n\
+						p19 = (*R5).y + (*R6).y; \n\
+						\n\
+						float r10, r11, r12, r13, r14, r15, r16, r17, r18, r19; \n\
+						r10 = p12 - p10 * b11_9; \n\
+						r11 = p16 - p12 * b11_9; \n\
+						r12 = p18 + p14 * b11_9; \n\
+						r13 = p14 + p16 * b11_9; \n\
+						r14 = p10 + p18 * b11_9; \n\
+						r15 = p15 - p11 * b11_8; \n\
+						r16 = p19 - p13 * b11_8; \n\
+						r17 = p13 - p15 * b11_8; \n\
+						r18 = p11 - p17 * b11_8; \n\
+						r19 = p17 - p19 * b11_8; \n\
+						\n\
+						float s10, s11, s12, s13, s14, s15, s16, s17, s18, s19; \n\
+						s10 = p14 - r10 * b11_6; \n\
+						s11 = p18 + r11 * b11_6; \n\
+						s12 = p12 - r12 * b11_6; \n\
+						s13 = p10 + r13 * b11_6; \n\
+						s14 = p16 + r14 * b11_6; \n\
+						s15 = p19 - r15 * b11_7; \n\
+						s16 = p11 - r16 * b11_7; \n\
+						s17 = p17 - r17 * b11_7; \n\
+						s18 = p13 - r18 * b11_7; \n\
+						s19 = p15 - r19 * b11_7; \n\
+						\n\
+						float v0, v1, v2, v3, v4, v5, v6, v7, v8, v9; \n\
+						float v10, v11, v12, v13, v14, v15, v16, v17, v18, v19; \n\
+						v0 = p9 - s0 * b11_4; \n\
+						v1 = p4 + s1 * b11_4; \n\
+						v2 = p0 + s2 * b11_4; \n\
+						v3 = p2 - s3 * b11_4; \n\
+						v4 = p6 - s4 * b11_4; \n\
+						v5 = p8 - s5 * b11_5; \n\
+						v6 = p1 - s6 * b11_5; \n\
+						v7 = p3 - s7 * b11_5; \n\
+						v8 = p7 - s8 * b11_5; \n\
+						v9 = p5 - s9 * b11_5; \n\
+						v10 = p16 - s10 * b11_4; \n\
+						v11 = p14 - s11 * b11_4; \n\
+						v12 = p10 + s12 * b11_4; \n\
+						v13 = p18 - s13 * b11_4; \n\
+						v14 = p12 + s14 * b11_4; \n\
+						v15 = p17 - s15 * b11_5; \n\
+						v16 = p15 - s16 * b11_5; \n\
+						v17 = p11 - s17 * b11_5; \n\
+						v18 = p19 - s18 * b11_5; \n\
+						v19 = p13 - s19 * b11_5; \n\
+						\n\
+						float w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; \n\
+						float w10, w11, w12, w13, w14, w15, w16, w17, w18, w19; \n\
+						w0 = p2 - v0 * b11_2; \n\
+						w1 = p6 + v1 * b11_2; \n\
+						w2 = p9 - v2 * b11_2; \n\
+						w3 = p4 + v3 * b11_2; \n\
+						w4 = p0 - v4 * b11_2; \n\
+						w5 = p5 - v5 * b11_3; \n\
+						w6 = p3 - v6 * b11_3; \n\
+						w7 = p7 - v7 * b11_3; \n\
+						w8 = p8 - v8 * b11_3; \n\
+						w9 = p1 - v9 * b11_3; \n\
+						w10 = p18 - v10 * b11_2; \n\
+						w11 = p10 - v11 * b11_2; \n\
+						w12 = p16 - v12 * b11_2; \n\
+						w13 = p12 + v13 * b11_2; \n\
+						w14 = p14 + v14 * b11_2; \n\
+						w15 = p13 - v15 * b11_3; \n\
+						w16 = p17 - v16 * b11_3; \n\
+						w17 = p19 - v17 * b11_3; \n\
+						w18 = p15 - v18 * b11_3; \n\
+						w19 = p11 - v19 * b11_3; \n\
+						\n\
+						float z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; \n\
+						z0 = (*R0).x - w5 * b11_1; \n\
+						z1 = (*R0).x - w6 * b11_1; \n\
+						z2 = (*R0).x - w7 * b11_1; \n\
+						z3 = (*R0).x - w8 * b11_1; \n\
+						z4 = (*R0).x - w9 * b11_1; \n\
+						z5 = (*R0).y - w15 * b11_1; \n\
+						z6 = (*R0).y - w16 * b11_1; \n\
+						z7 = (*R0).y - w17 * b11_1; \n\
+						z8 = (*R0).y - w18 * b11_1; \n\
+						z9 = (*R0).y - w19 * b11_1; \n\
+						\n\
+						(*R0).x = (*R0).x + p1 + p3 + p5 + p7 + p8; \n\
+						(*R0).y = (*R0).y + p11 + p13 + p15 + p17 + p19; \n\
+						(*R1).x = z1 + w14* b11_0; \n\
+						(*R1).y = z7 + w1* b11_0; \n\
+						(*R2).x = z2 - w12* b11_0; \n\
+						(*R2).y = z8 - w2* b11_0; \n\
+						(*R3).x = z0 + w11* b11_0; \n\
+						(*R3).y = z5 + w4* b11_0; \n\
+						(*R4).x = z3 - w13* b11_0; \n\
+						(*R4).y = z6 - w3* b11_0; \n\
+						(*R5).x = z4 + w10* b11_0; \n\
+						(*R5).y = z9 + w0* b11_0; \n\
+						(*R6).x = z4 - w10* b11_0; \n\
+						(*R6).y = z9 - w0* b11_0; \n\
+						(*R7).x = z3 + w13* b11_0; \n\
+						(*R7).y = z6 + w3* b11_0; \n\
+						(*R8).x = z0 - w11* b11_0; \n\
+						(*R8).y = z5 - w4* b11_0; \n\
+						(*R9).x = z2 + w12* b11_0; \n\
+						(*R9).y = z8 + w2* b11_0; \n\
+						(*R10).x = z1 - w14* b11_0; \n\
+						(*R10).y = z7 - w1* b11_0; \n";
+
+					if (fwd)
+					{
+						bflyStr += "float dir = -1;\n\n";
+					}
+					else
+					{
+						bflyStr += "float dir = 1;\n\n";
+					}
+
+					bflyStr += radix11str;
+
+				} break;
+			case 13:
+				{
+
+					static const char *radix13str = " \
+						float p0, p1, p2, p3, p4, p5, p6, p7, p8, p9;\n\
+						p0 = (*R7).x - (*R2).x;\n\
+						p1 = (*R7).x + (*R2).x;\n\
+						p2 = (*R8).x - (*R5).x;\n\
+						p3 = (*R8).x + (*R5).x;\n\
+						p4 = (*R9).x - (*R3).x;\n\
+						p5 = (*R3).x + (*R9).x;\n\
+						p6 = (*R10).x + (*R4).x;\n\
+						p7 = (*R10).x - (*R4).x;\n\
+						p8 = (*R11).x + (*R6).x;\n\
+						p9 = (*R11).x - (*R6).x;\n\
+						\n\
+						float p10, p11, p12, p13, p14, p15, p16, p17, p18, p19;\n\
+						p10 = (*R12).x + p6;\n\
+						p11 = (*R1).x + p5;\n\
+						p12 = p8 - p1;\n\
+						p13 = p8 + p1;\n\
+						p14 = p9 + p0;\n\
+						p15 = p9 - p0;\n\
+						p16 = p7 - p4;\n\
+						p17 = p4 + p7;\n\
+						p18 = p11 + p10;\n\
+						p19 = p11 - p10;\n\
+						\n\
+						float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11;\n\
+						s0 = p3 + p13;\n\
+						s1 = p2 + p14;\n\
+						s2 = p16 - p15;\n\
+						s3 = p16 + p15;\n\
+						s4 = -(*R12).x + p6 * b13_17;\n\
+						s5 =   (*R1).x - p5 * b13_17;\n\
+						s6 = s5 - s4;\n\
+						s7 = s5 + s4;\n\
+						s8 = p18 + s0;\n\
+						s9 = p18 - s0;\n\
+						float c2 = p3 - p13 * b13_17;\n\
+						s10 = s6 - c2;\n\
+						s11 = s6 + c2;\n\
+						\n\
+						float r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;\n\
+						r0 = (*R7).y + (*R2).y;\n\
+						r1 = (*R7).y - (*R2).y;\n\
+						r2 = (*R8).y + (*R5).y;\n\
+						r3 = (*R8).y - (*R5).y;\n\
+						r4 = (*R9).y - (*R3).y;\n\
+						r5 = (*R3).y + (*R9).y;\n\
+						r6 = (*R10).y + (*R4).y;\n\
+						r7 = (*R10).y - (*R4).y;\n\
+						r8 = (*R11).y - (*R6).y;\n\
+						r9 = (*R11).y + (*R6).y;\n\
+						r10 = (*R12).y + r6;\n\
+						r11 = (*R1).y + r5;\n\
+						\n\
+						float m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10;\n\
+						float m11, m12, m13, m14, m15, m16, m17, m18, m19, m20;\n\
+						m0 = r4 + r7;\n\
+						m1 = r7 - r4;\n\
+						m2 = r8 - r1;\n\
+						m3 = r8 + r1;\n\
+						m4 = r9 + r0;\n\
+						m5 = r9 - r0;\n\
+						m6 = r11 + r10;\n\
+						m7 = r11 - r10;\n\
+						m8 = m1 - m2;\n\
+						m9 = m1 + m2;\n\
+						m10 = r3 + m3;\n\
+						m11 = r2 + m4;\n\
+						m12 = m6 - m11;\n\
+						m13 = m6 + m11;\n\
+						\n\
+						m14 =  (*R1).y - r5 * b13_17;\n\
+						m15 = -(*R12).y + r6 * b13_17;\n\
+						m16 =  r2      - m4 * b13_17;\n\
+						\n\
+						m17 = m14 + m15;\n\
+						m18 = m14 - m15;\n\
+						m19 = m18 + m16;\n\
+						m20 = m18 - m16;\n\
+						\n\
+						float c0, c1, c3, c4, c5, c6, c7, c8, c9;\n\
+						float c10, c11, c12, c13, c14, c15, c16, c17, c18, c19;\n\
+						float c20, c21, c22, c23, c24;\n\
+						c0  =  s7 - p12 * b13_3;\n\
+						c1  =  s7 + p12 * b13_3;\n\
+						c3  =  p2 - p14 * b13_17;\n\
+						c4  =  s1 - p19 * b13_18;\n\
+						c5  = p19 + s1 * b13_18;\n\
+						c6  = s10 - s2 * b13_15;\n\
+						c7  = s11 - s3 * b13_22;\n\
+						c8  = (*R0).x - s8 * b13_23;\n\
+						c9  =  s2 + s10 * b13_7;\n\
+						c10 =  s3 + s11 * b13_19;\n\
+						c11 =  r3 - m3 * b13_17;\n\
+						c12 = m17 - m5 * b13_3;\n\
+						c13 = m17 + m5 * b13_3;\n\
+						c14 = m10 - m7 * b13_18;\n\
+						c15 = m20 - m8 * b13_15;\n\
+						c16 = m19 - m9 * b13_22;\n\
+						c17 =  m7 + m10 * b13_18;\n\
+						c18 = (*R0).y- m13 * b13_23;\n\
+						c19 =  m9 + m19 * b13_19;\n\
+						c20 =  m8 + m20 * b13_7;\n\
+						c21 =  c3 + p17 * b13_3;\n\
+						c22 =  c3 - p17 * b13_3;\n\
+						c23 = c11 + m0 * b13_3;\n\
+						c24 = c11 - m0 * b13_3;\n\
+						\n\
+						float d0, d1, d2, d3, d4, d5, d6, d7, d8, d9;\n\
+						float d10, d11, d12, d13, d14, d15, d16, d17, d18, d19;\n\
+						d0  = c22 +  c0 * b13_8;\n\
+						d1  =  c0 - c22 * b13_8;\n\
+						d2  = c21 +  c1 * b13_24;\n\
+						d3  =  c1 - c21 * b13_24;\n\
+						d4  =  s9 -  c6 * b13_4;\n\
+						d5  =  c6 +  s9 * b13_10;\n\
+						d6  =  c7 +  c9 * b13_6;\n\
+						d7  =  c7 -  c9 * b13_6;\n\
+						d8  =  c8 - c10 * b13_21;\n\
+						d9  =  c8 + c10 * b13_16;\n\
+						d10 = c24 + c12 * b13_8;\n\
+						d11 = c12 - c24 * b13_8;\n\
+						d12 = c23 + c13 * b13_24;\n\
+						d13 = c13 - c23 * b13_24;\n\
+						d14 = m12 - c15 * b13_4;\n\
+						d15 = c15 + m12 * b13_10;\n\
+						d16 = c18 + c19 * b13_16;\n\
+						d17 = c18 - c19 * b13_21;\n\
+						d18 = c16 - c20 * b13_6;\n\
+						d19 = c16 + c20 * b13_6;\n\
+						\n\
+						float e0, e1, e2, e3, e4, e5, e6, e7, e8, e9;\n\
+						float e10, e11, e12, e13, e14, e15;\n\
+						e0  = d2  +  d0 * b13_5;\n\
+						e1  = d2  -  d0 * b13_5;\n\
+						e2  = d3  -  d1 * b13_5;\n\
+						e3  = d3  +  d1 * b13_5;\n\
+						e4  = d8  -  d4 * b13_20;\n\
+						e5  = d8  +  d4 * b13_20;\n\
+						e6  = d9  +  d5 * b13_14;\n\
+						e7  = d9  -  d5 * b13_14;\n\
+						e8  = d12 + d10 * b13_5;\n\
+						e9  = d12 - d10 * b13_5;\n\
+						e10 = d13 - d11 * b13_5;\n\
+						e11 = d13 + d11 * b13_5;\n\
+						e12 = d16 + d15 * b13_14;\n\
+						e13 = d16 - d15 * b13_14;\n\
+						e14 = d17 + d14 * b13_20;\n\
+						e15 = d17 - d14 * b13_20;\n\
+						\n\
+						float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9;\n\
+						float f10, f11, f12, f13, f14, f15, f16, f17, f18, f19;\n\
+						float f20, f21, f22, f23;\n\
+						f0  = c17 - e10 * b13_12;\n\
+						f1  = e10 + c17 * b13_1;\n\
+						f2  = e9  + c14 * b13_1;\n\
+						f3  = c14 -  e9 * b13_12;\n\
+						f4  = e11 -  d7 * b13_0;\n\
+						f5  = e11 +  d7 * b13_0;\n\
+						f6  = e5  -  f3 * b13_11;\n\
+						f7  = e5  +  f3 * b13_11;\n\
+						f8  = e4  -  e8 * b13_13;\n\
+						f9  = e4  +  e8 * b13_13;\n\
+						f10 = f0  +  d6 * b13_2;\n\
+						f11 = f0  -  d6 * b13_2;\n\
+						f12 = e1  +  c4 * b13_1;\n\
+						f13 = c4  -  e1 * b13_12;\n\
+						f14 = c5  -  e2 * b13_12;\n\
+						f15 = e2  +  c5 * b13_1;\n\
+						f16 = f14 - d19 * b13_2;\n\
+						f17 = f14 + d19 * b13_2;\n\
+						f18 = e15 +  e0 * b13_13;\n\
+						f19 = e15 -  e0 * b13_13;\n\
+						f20 = e14 + f13 * b13_11;\n\
+						f21 = e14 - f13 * b13_11;\n\
+						f22 = e3  + d18 * b13_0;\n\
+						f23 = e3  - d18 * b13_0;\n\
+						\n\
+						(*R0).x  = (*R0).x + s8;\n\
+						(*R0).y  = (*R0).y + m13;\n\
+						(*R1).x  =  e6 +  f2 * dir * b13_9 ;\n\
+						(*R1).y  = e12 - f12 * dir * b13_9 ;\n\
+						(*R2).x  =  f9 - f10 * dir * b13_11;\n\
+						(*R2).y  = f19 + f16 * dir * b13_11;\n\
+						(*R3).x  =  f6 -  f5 * dir * b13_13;\n\
+						(*R3).y  = f20 + f23 * dir * b13_13;\n\
+						(*R4).x  =  f7 -  f4 * dir * b13_13;\n\
+						(*R4).y  = f21 + f22 * dir * b13_13;\n\
+						(*R5).x  =  e7 -  f1 * dir * b13_9 ;\n\
+						(*R5).y  = e13 + f15 * dir * b13_9 ;\n\
+						(*R6).x  =  f8 - f11 * dir * b13_11;\n\
+						(*R6).y  = f18 + f17 * dir * b13_11;\n\
+						(*R7).x  =  f9 + f10 * dir * b13_11;\n\
+						(*R7).y  = f19 - f16 * dir * b13_11;\n\
+						(*R8).x  =  e7 +  f1 * dir * b13_9 ;\n\
+						(*R8).y  = e13 - f15 * dir * b13_9 ;\n\
+						(*R9).x  =  f6 +  f5 * dir * b13_13;\n\
+						(*R9).y  = f20 - f23 * dir * b13_13;\n\
+						(*R10).x =  f7 +  f4 * dir * b13_13;\n\
+						(*R10).y = f21 - f22 * dir * b13_13;\n\
+						(*R11).x =  f8 + f11 * dir * b13_11;\n\
+						(*R11).y = f18 - f17 * dir * b13_11;\n\
+						(*R12).x =  e6 -  f2 * dir * b13_9 ;\n\
+						(*R12).y = e12 + f12 * dir * b13_9 ;\n";
+
+						if (fwd)
+						{
+							bflyStr += "float dir = -1;\n\n";
+						}
+						else
+						{
+							bflyStr += "float dir = 1;\n\n";
+						}
+
+						bflyStr += radix13str;
+
+				} break;
+
 			default:
 				assert(false);
 			}
@@ -1669,7 +2058,7 @@ namespace StockhamGenerator
 				{
 					if(cReg)
 					{
-						if (radix !=7) 
+						if ( (radix != 7) && (radix != 11) && (radix != 13) )
 						{
 						bflyStr += "((*R"; bflyStr += SztToStr(i); bflyStr += ").x) = TR"; bflyStr += SztToStr(i); bflyStr += "; ";
 						bflyStr += "((*R"; bflyStr += SztToStr(i); bflyStr += ").y) = TI"; bflyStr += SztToStr(i); bflyStr += ";\n\t";
diff --git a/src/library/private.h b/src/library/private.h
index 0d7320c..48f2fa5 100644
--- a/src/library/private.h
+++ b/src/library/private.h
@@ -143,7 +143,7 @@ class tofstreamRAII
 		}
 };
 
-//(currently) true if length is a power of 2,3,5
+//(currently) true if length is a product of powers of 2,3,5,7,11,13
 inline bool IsASupportedLength( size_t length )
 {
 	while( length > 1 )
@@ -156,6 +156,10 @@ inline bool IsASupportedLength( size_t length )
 			length /= 5;
 		else if( length % 7 == 0 )
 			length /= 7;
+		else if (length % 11 == 0)
+			length /= 11;
+		else if (length % 13 == 0)
+			length /= 13;
 		else
 			return false;
 	}

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list