[clfft] 20/32: making changes to get next stage of sizes for rad 11/13 working

Ghislain Vaillant ghisvail-guest at moszumanska.debian.org
Tue Apr 26 08:34:10 UTC 2016


This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to branch master
in repository clfft.

commit b02e894741b76f91fcd50a7aeaa399e734f09db8
Author: bragadeesh <bragadeesh.natarajan at amd>
Date:   Wed Apr 6 16:01:22 2016 -0700

    making changes to get next stage of sizes for rad 11/13 working
---
 src/library/generator.stockham.cpp        |  9 +++++-
 src/library/plan.cpp                      | 54 +++++++++++++++++++++----------
 src/library/plan.h                        |  9 ++++++
 src/statTimer/statisticalTimer.GPU.cpp    | 13 ++++++++
 src/statTimer/statisticalTimer.GPU.h      |  4 ++-
 src/tests/accuracy_test_directed.cpp      | 52 +++++++++++++++++++----------
 src/tests/accuracy_test_mixed_radices.cpp | 42 ++++++++++++++++--------
 7 files changed, 134 insertions(+), 49 deletions(-)

diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index f6b25f9..928e50a 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -482,7 +482,14 @@ namespace StockhamGenerator
 				    leastNumPerWI = 70; maxWorkGroupSize = 36;
 			} else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) { 
 				    leastNumPerWI =105; maxWorkGroupSize = 24;
-			} else { 
+			}
+			else if (primeFactorsExpanded[2] * primeFactorsExpanded[11] == length) {
+				leastNumPerWI = 22; maxWorkGroupSize = 128;
+			}
+			else if (primeFactorsExpanded[2] * primeFactorsExpanded[13] == length) {
+				leastNumPerWI = 26; maxWorkGroupSize = 128;
+			}
+			else {
 				    leastNumPerWI =210; maxWorkGroupSize = 12;
 			}
 			if (pr==P_DOUBLE)
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index e67e175..9d4c032 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -570,23 +570,42 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
 				else
 				{
 					// This array must be kept sorted in the ascending order
-					size_t supported[] = {	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 24, 25, 27, 28,
-											30, 32, 35, 36, 40, 42, 45, 48, 49, 50, 54, 56, 60, 63, 64, 70, 72, 75, 80,
-											81, 84, 90, 96, 98, 100, 105, 108, 112, 120, 125, 126, 128, 135, 140, 144,
-											147, 150, 160, 162, 168, 175, 180, 189, 192, 196, 200, 210, 216, 224, 225,
-											240, 243, 245, 250, 252, 256, 270, 280, 288, 294, 300, 315, 320, 324, 336,
-											343, 350, 360, 375, 378, 384, 392, 400, 405, 420, 432, 441, 448, 450, 480,
-											486, 490, 500, 504, 512, 525, 540, 560, 567, 576, 588, 600, 625, 630, 640,
-											648, 672, 675, 686, 700, 720, 729, 735, 750, 756, 768, 784, 800, 810, 840,
-											864, 875, 882, 896, 900, 945, 960, 972, 980, 1000, 1008, 1024, 1029, 1050,
-											1080, 1120, 1125, 1134, 1152, 1176, 1200, 1215, 1225, 1250, 1260, 1280, 1296,
-											1323, 1344, 1350, 1372, 1400, 1440, 1458, 1470, 1500, 1512, 1536, 1568, 1575,
-											1600, 1620, 1680, 1701, 1715, 1728, 1750, 1764, 1792, 1800, 1875, 1890, 1920,
-											1944, 1960, 2000, 2016, 2025, 2048, 2058, 2100, 2160, 2187, 2205, 2240, 2250,
-											2268, 2304, 2352, 2400, 2401, 2430, 2450, 2500, 2520, 2560, 2592, 2625, 2646,
-											2688, 2700, 2744, 2800, 2835, 2880, 2916, 2940, 3000, 3024, 3072, 3087, 3125,
-											3136, 3150, 3200, 3240, 3360, 3375, 3402, 3430, 3456, 3500, 3528, 3584, 3600,
-											3645, 3675, 3750, 3780, 3840, 3888, 3920, 3969, 4000, 4032, 4050, 4096};
+
+					size_t supported[] = {	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 24,
+											25, 26, 27, 28, 30, 32, 33, 35, 36, 39, 40, 42, 44, 45, 48, 49, 50, 52, 54,
+											55, 56, 60, 63, 64, 65, 66, 70, 72, 75, 77, 78, 80, 81, 84, 88, 90, 91, 96,
+											98, 99, 100, 104, 105, 108, 110, 112, 117, 120, 121, 125, 126, 128, 130, 132,
+											135, 140, 143, 144, 147, 150, 154, 156, 160, 162, 165, 168, 169, 175, 176,
+											180, 182, 189, 192, 195, 196, 198, 200, 208, 210, 216, 220, 224, 225, 231,
+											234, 240, 242, 243, 245, 250, 252, 256, 260, 264, 270, 273, 275, 280, 286,
+											288, 294, 297, 300, 308, 312, 315, 320, 324, 325, 330, 336, 338, 343, 350,
+											351, 352, 360, 363, 364, 375, 378, 384, 385, 390, 392, 396, 400, 405, 416,
+											420, 429, 432, 440, 441, 448, 450, 455, 462, 468, 480, 484, 486, 490, 495,
+											500, 504, 507, 512, 520, 525, 528, 539, 540, 546, 550, 560, 567, 572, 576,
+											585, 588, 594, 600, 605, 616, 624, 625, 630, 637, 640, 648, 650, 660, 672,
+											675, 676, 686, 693, 700, 702, 704, 715, 720, 726, 728, 729, 735, 750, 756,
+											768, 770, 780, 784, 792, 800, 810, 819, 825, 832, 840, 845, 847, 858, 864,
+											875, 880, 882, 891, 896, 900, 910, 924, 936, 945, 960, 968, 972, 975, 980,
+											990, 1000, 1001, 1008, 1014, 1024, 1029, 1040, 1050, 1053, 1056, 1078, 1080,
+											1089, 1092, 1100, 1120, 1125, 1134, 1144, 1152, 1155, 1170, 1176, 1183, 1188,
+											1200, 1210, 1215, 1225, 1232, 1248, 1250, 1260, 1274, 1280, 1287, 1296, 1300,
+											1320, 1323, 1331, 1344, 1350, 1352, 1365, 1372, 1375, 1386, 1400, 1404, 1408,
+											1430, 1440, 1452, 1456, 1458, 1470, 1485, 1500, 1512, 1521, 1536, 1540, 1560,
+											1568, 1573, 1575, 1584, 1600, 1617, 1620, 1625, 1638, 1650, 1664, 1680, 1690,
+											1694, 1701, 1715, 1716, 1728, 1750, 1755, 1760, 1764, 1782, 1792, 1800, 1815,
+											1820, 1848, 1859, 1872, 1875, 1890, 1911, 1920, 1925, 1936, 1944, 1950, 1960,
+											1980, 2000, 2002, 2016, 2025, 2028, 2048, 2058, 2079, 2080, 2100, 2106, 2112,
+											2145, 2156, 2160, 2178, 2184, 2187, 2197, 2200, 2205, 2240, 2250, 2268, 2275,
+											2288, 2304, 2310, 2340, 2352, 2366, 2376, 2400, 2401, 2420, 2430, 2450, 2457,
+											2464, 2475, 2496, 2500, 2520, 2535, 2541, 2548, 2560, 2574, 2592, 2600, 2625,
+											2640, 2646, 2662, 2673, 2688, 2695, 2700, 2704, 2730, 2744, 2750, 2772, 2800,
+											2808, 2816, 2835, 2860, 2880, 2904, 2912, 2916, 2925, 2940, 2970, 3000, 3003,
+											3024, 3025, 3042, 3072, 3080, 3087, 3120, 3125, 3136, 3146, 3150, 3159, 3168,
+											3185, 3200, 3234, 3240, 3250, 3267, 3276, 3300, 3328, 3360, 3375, 3380, 3388,
+											3402, 3430, 3432, 3456, 3465, 3500, 3510, 3520, 3528, 3549, 3564, 3575, 3584,
+											3600, 3630, 3640, 3645, 3675, 3696, 3718, 3744, 3750, 3773, 3780, 3822, 3840,
+											3850, 3861, 3872, 3888, 3900, 3920, 3960, 3969, 3993, 4000, 4004, 4032, 4050,
+											4056, 4095, 4096 };
 
 					size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
 					size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
@@ -627,6 +646,7 @@ clfftStatus	clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
                 // Start of block where transposes are generated; 1D FFT
 				while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
 				{
+					if (fftPlan->length[0] <= Large1DThreshold) break;
 
 					if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
 
diff --git a/src/library/plan.h b/src/library/plan.h
index 2a051c9..8368e5f 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -609,9 +609,18 @@ static bool Is1DPossible(size_t length, size_t large1DThreshold)
 {
 	if (length > large1DThreshold)
 		return false;
+
 	if ( (length%7 == 0) && (length%5 == 0) && (length%3 == 0) )
 		return false;
 
+	// radix 11 & 2 is ok, anything else we cannot do in 1 kernel
+	if ( (length % 11 == 0) && ((length % 13 == 0) || (length % 7 == 0) || (length % 5 == 0) || (length % 3 == 0)) )
+		return false;
+	
+	// radix 13 & 2 is ok, anything else we cannot do in 1 kernel
+	if ( (length % 13 == 0) && ((length % 11 == 0) || (length % 7 == 0) || (length % 5 == 0) || (length % 3 == 0)) )
+		return false;
+
 	return true;
 }
 
diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp
index 133825d..403178e 100644
--- a/src/statTimer/statisticalTimer.GPU.cpp
+++ b/src/statTimer/statisticalTimer.GPU.cpp
@@ -564,6 +564,19 @@ GpuStatTimer::Print( )
 				tout << std::endl;
 			}
 
+
+			tout << std::setw(tableFourth) << _T("Generator:");
+			switch(mean[t].gen)
+			{
+			case Stockham:				tout << std::setw(tableThird) << _T("Stockham"); break;
+			case Transpose_GCN:			tout << std::setw(tableThird) << _T("Transpose_GCN"); break;
+			case Transpose_SQUARE:		tout << std::setw(tableThird) << _T("Transpose_SQUARE"); break;
+			case Transpose_NONSQUARE:	tout << std::setw(tableThird) << _T("Transpose_NONSQUARE"); break;
+			case Copy:					tout << std::setw(tableThird) << _T("Copy"); break;
+			}
+			tout << std::endl;
+
+
 			tout << std::setw( tableFourth ) << _T( "Length:" );
 			catLengths.str( _T( "" ) );
 			catLengths << _T( "(" );
diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h
index 2a9781e..5b65ffc 100644
--- a/src/statTimer/statisticalTimer.GPU.h
+++ b/src/statTimer/statisticalTimer.GPU.h
@@ -51,6 +51,8 @@ struct StatData
 	clfftPlanHandle planRCcopy;
 	clfftPlanHandle planCopy;
 
+	clfftGenerators gen;
+
 	std::vector< size_t > lengths;
 	std::vector< size_t > inStride;
 	std::vector< size_t > outStride;
@@ -69,7 +71,7 @@ struct StatData
 		deltaNanoSec( 0 ), kernel( kern ), batchSize( plan->batchsize ), dim( plan->dim ),
 		plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ),
 		planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ),
-		planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ),
+		planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ), gen(plan->gen),
 		inStride( plan->inStride ), outStride( plan->outStride ), iDist( plan->iDist ), oDist( plan->oDist ),
 		lengths( plan->length ), enqueueWorkSize( gWorkSize ), enqueueLocalWorkSize( lWorkSize ), placeness( plan->placeness )
 	{
diff --git a/src/tests/accuracy_test_directed.cpp b/src/tests/accuracy_test_directed.cpp
index 0bdcc9c..fc0be7e 100644
--- a/src/tests/accuracy_test_directed.cpp
+++ b/src/tests/accuracy_test_directed.cpp
@@ -172,23 +172,41 @@ namespace DirectedTest {
 		{
 			// This array must be kept sorted in the ascending order
 			static const size_t supported_length_array[] = {
-				1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 24, 25, 27, 28,
-				30, 32, 35, 36, 40, 42, 45, 48, 49, 50, 54, 56, 60, 63, 64, 70, 72, 75, 80,
-				81, 84, 90, 96, 98, 100, 105, 108, 112, 120, 125, 126, 128, 135, 140, 144,
-				147, 150, 160, 162, 168, 175, 180, 189, 192, 196, 200, 210, 216, 224, 225,
-				240, 243, 245, 250, 252, 256, 270, 280, 288, 294, 300, 315, 320, 324, 336,
-				343, 350, 360, 375, 378, 384, 392, 400, 405, 420, 432, 441, 448, 450, 480,
-				486, 490, 500, 504, 512, 525, 540, 560, 567, 576, 588, 600, 625, 630, 640,
-				648, 672, 675, 686, 700, 720, 729, 735, 750, 756, 768, 784, 800, 810, 840,
-				864, 875, 882, 896, 900, 945, 960, 972, 980, 1000, 1008, 1024, 1029, 1050,
-				1080, 1120, 1125, 1134, 1152, 1176, 1200, 1215, 1225, 1250, 1260, 1280, 1296,
-				1323, 1344, 1350, 1372, 1400, 1440, 1458, 1470, 1500, 1512, 1536, 1568, 1575,
-				1600, 1620, 1680, 1701, 1715, 1728, 1750, 1764, 1792, 1800, 1875, 1890, 1920,
-				1944, 1960, 2000, 2016, 2025, 2048, 2058, 2100, 2160, 2187, 2205, 2240, 2250,
-				2268, 2304, 2352, 2400, 2401, 2430, 2450, 2500, 2520, 2560, 2592, 2625, 2646,
-				2688, 2700, 2744, 2800, 2835, 2880, 2916, 2940, 3000, 3024, 3072, 3087, 3125,
-				3136, 3150, 3200, 3240, 3360, 3375, 3402, 3430, 3456, 3500, 3528, 3584, 3600,
-				3645, 3675, 3750, 3780, 3840, 3888, 3920, 3969, 4000, 4032, 4050, 4096 };
+				1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 24,
+				25, 26, 27, 28, 30, 32, 33, 35, 36, 39, 40, 42, 44, 45, 48, 49, 50, 52, 54,
+				55, 56, 60, 63, 64, 65, 66, 70, 72, 75, 77, 78, 80, 81, 84, 88, 90, 91, 96,
+				98, 99, 100, 104, 105, 108, 110, 112, 117, 120, 121, 125, 126, 128, 130, 132,
+				135, 140, 143, 144, 147, 150, 154, 156, 160, 162, 165, 168, 169, 175, 176,
+				180, 182, 189, 192, 195, 196, 198, 200, 208, 210, 216, 220, 224, 225, 231,
+				234, 240, 242, 243, 245, 250, 252, 256, 260, 264, 270, 273, 275, 280, 286,
+				288, 294, 297, 300, 308, 312, 315, 320, 324, 325, 330, 336, 338, 343, 350,
+				351, 352, 360, 363, 364, 375, 378, 384, 385, 390, 392, 396, 400, 405, 416,
+				420, 429, 432, 440, 441, 448, 450, 455, 462, 468, 480, 484, 486, 490, 495,
+				500, 504, 507, 512, 520, 525, 528, 539, 540, 546, 550, 560, 567, 572, 576,
+				585, 588, 594, 600, 605, 616, 624, 625, 630, 637, 640, 648, 650, 660, 672,
+				675, 676, 686, 693, 700, 702, 704, 715, 720, 726, 728, 729, 735, 750, 756,
+				768, 770, 780, 784, 792, 800, 810, 819, 825, 832, 840, 845, 847, 858, 864,
+				875, 880, 882, 891, 896, 900, 910, 924, 936, 945, 960, 968, 972, 975, 980,
+				990, 1000, 1001, 1008, 1014, 1024, 1029, 1040, 1050, 1053, 1056, 1078, 1080,
+				1089, 1092, 1100, 1120, 1125, 1134, 1144, 1152, 1155, 1170, 1176, 1183, 1188,
+				1200, 1210, 1215, 1225, 1232, 1248, 1250, 1260, 1274, 1280, 1287, 1296, 1300,
+				1320, 1323, 1331, 1344, 1350, 1352, 1365, 1372, 1375, 1386, 1400, 1404, 1408,
+				1430, 1440, 1452, 1456, 1458, 1470, 1485, 1500, 1512, 1521, 1536, 1540, 1560,
+				1568, 1573, 1575, 1584, 1600, 1617, 1620, 1625, 1638, 1650, 1664, 1680, 1690,
+				1694, 1701, 1715, 1716, 1728, 1750, 1755, 1760, 1764, 1782, 1792, 1800, 1815,
+				1820, 1848, 1859, 1872, 1875, 1890, 1911, 1920, 1925, 1936, 1944, 1950, 1960,
+				1980, 2000, 2002, 2016, 2025, 2028, 2048, 2058, 2079, 2080, 2100, 2106, 2112,
+				2145, 2156, 2160, 2178, 2184, 2187, 2197, 2200, 2205, 2240, 2250, 2268, 2275,
+				2288, 2304, 2310, 2340, 2352, 2366, 2376, 2400, 2401, 2420, 2430, 2450, 2457,
+				2464, 2475, 2496, 2500, 2520, 2535, 2541, 2548, 2560, 2574, 2592, 2600, 2625,
+				2640, 2646, 2662, 2673, 2688, 2695, 2700, 2704, 2730, 2744, 2750, 2772, 2800,
+				2808, 2816, 2835, 2860, 2880, 2904, 2912, 2916, 2925, 2940, 2970, 3000, 3003,
+				3024, 3025, 3042, 3072, 3080, 3087, 3120, 3125, 3136, 3146, 3150, 3159, 3168,
+				3185, 3200, 3234, 3240, 3250, 3267, 3276, 3300, 3328, 3360, 3375, 3380, 3388,
+				3402, 3430, 3432, 3456, 3465, 3500, 3510, 3520, 3528, 3549, 3564, 3575, 3584,
+				3600, 3630, 3640, 3645, 3675, 3696, 3718, 3744, 3750, 3773, 3780, 3822, 3840,
+				3850, 3861, 3872, 3888, 3900, 3920, 3960, 3969, 3993, 4000, 4004, 4032, 4050,
+				4056, 4095, 4096 };
 
 			supported_length = supported_length_array;
 			size_supported_length = sizeof(supported_length_array) / sizeof(supported_length_array[0]);
diff --git a/src/tests/accuracy_test_mixed_radices.cpp b/src/tests/accuracy_test_mixed_radices.cpp
index 81ec183..98bacbf 100644
--- a/src/tests/accuracy_test_mixed_radices.cpp
+++ b/src/tests/accuracy_test_mixed_radices.cpp
@@ -168,8 +168,8 @@ public:
 	Supported_Fft_Sizes()
 	: max_mixed_radices_to_test( 4096 )
 	{
-		size_t i=0, j=0, k=0, l=0;
-		size_t sum, sumi, sumj, sumk, suml;
+		size_t i=0, j=0, k=0, l=0, m=0, n=0;
+		size_t sum, sumi, sumj, sumk, suml, summ, sumn;
 
 		sumi = 1; i = 0;
 		while(1)
@@ -183,27 +183,43 @@ public:
 					suml = 1; l = 0;
 					while(1)
 					{
-						sum = (sumi*sumj*sumk*suml);
-						if( sum > max_mixed_radices_to_test ) break;
-
-						sizes.push_back(sum);
+						summ = 1; m = 0;
+						while (1)
+						{
+							sumn = 1; n = 0;
+							while (1)
+							{
+								sum = (sumi*sumj*sumk*suml*summ*sumn);
+								if (sum > max_mixed_radices_to_test) break;
+
+								sizes.push_back(sum);
+								n++;
+								sumn *= 2;
+							}
+
+							if(n == 0) break;
+							m++;
+							summ *= 3;
+						}
+
+						if( (m == 0) && (n == 0) ) break;
 						l++;
-						suml *= 2;
+						suml *= 5;
 					}
 
-					if(l == 0) break;
+					if( (l == 0) && (m == 0) && (n == 0) ) break;
 					k++;
-					sumk *= 3;
+					sumk *= 7;
 				}
 
-				if( (k == 0) && (l == 0) ) break;
+				if( (k == 0) && (l == 0) && (m == 0) && (n == 0) ) break;
 				j++;
-				sumj *= 5;
+				sumj *= 11;
 			}
 
-			if( (j == 0) && (k == 0) && (l == 0) ) break;
+			if( (j == 0) &&  (k == 0) && (l == 0) && (m == 0) && (n == 0) ) break;
 			i++;
-			sumi *= 7;
+			sumi *= 13;
 		}
 	}
 } supported_sizes;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git



More information about the debian-science-commits mailing list