[clfft] 20/32: making changes to get next stage of sizes for rad 11/13 working
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Apr 26 08:34:10 UTC 2016
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clfft.
commit b02e894741b76f91fcd50a7aeaa399e734f09db8
Author: bragadeesh <bragadeesh.natarajan at amd>
Date: Wed Apr 6 16:01:22 2016 -0700
making changes to get next stage of sizes for rad 11/13 working
---
src/library/generator.stockham.cpp | 9 +++++-
src/library/plan.cpp | 54 +++++++++++++++++++++----------
src/library/plan.h | 9 ++++++
src/statTimer/statisticalTimer.GPU.cpp | 13 ++++++++
src/statTimer/statisticalTimer.GPU.h | 4 ++-
src/tests/accuracy_test_directed.cpp | 52 +++++++++++++++++++----------
src/tests/accuracy_test_mixed_radices.cpp | 42 ++++++++++++++++--------
7 files changed, 134 insertions(+), 49 deletions(-)
diff --git a/src/library/generator.stockham.cpp b/src/library/generator.stockham.cpp
index f6b25f9..928e50a 100644
--- a/src/library/generator.stockham.cpp
+++ b/src/library/generator.stockham.cpp
@@ -482,7 +482,14 @@ namespace StockhamGenerator
leastNumPerWI = 70; maxWorkGroupSize = 36;
} else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) {
leastNumPerWI =105; maxWorkGroupSize = 24;
- } else {
+ }
+ else if (primeFactorsExpanded[2] * primeFactorsExpanded[11] == length) {
+ leastNumPerWI = 22; maxWorkGroupSize = 128;
+ }
+ else if (primeFactorsExpanded[2] * primeFactorsExpanded[13] == length) {
+ leastNumPerWI = 26; maxWorkGroupSize = 128;
+ }
+ else {
leastNumPerWI =210; maxWorkGroupSize = 12;
}
if (pr==P_DOUBLE)
diff --git a/src/library/plan.cpp b/src/library/plan.cpp
index e67e175..9d4c032 100644
--- a/src/library/plan.cpp
+++ b/src/library/plan.cpp
@@ -570,23 +570,42 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
else
{
// This array must be kept sorted in the ascending order
- size_t supported[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 24, 25, 27, 28,
- 30, 32, 35, 36, 40, 42, 45, 48, 49, 50, 54, 56, 60, 63, 64, 70, 72, 75, 80,
- 81, 84, 90, 96, 98, 100, 105, 108, 112, 120, 125, 126, 128, 135, 140, 144,
- 147, 150, 160, 162, 168, 175, 180, 189, 192, 196, 200, 210, 216, 224, 225,
- 240, 243, 245, 250, 252, 256, 270, 280, 288, 294, 300, 315, 320, 324, 336,
- 343, 350, 360, 375, 378, 384, 392, 400, 405, 420, 432, 441, 448, 450, 480,
- 486, 490, 500, 504, 512, 525, 540, 560, 567, 576, 588, 600, 625, 630, 640,
- 648, 672, 675, 686, 700, 720, 729, 735, 750, 756, 768, 784, 800, 810, 840,
- 864, 875, 882, 896, 900, 945, 960, 972, 980, 1000, 1008, 1024, 1029, 1050,
- 1080, 1120, 1125, 1134, 1152, 1176, 1200, 1215, 1225, 1250, 1260, 1280, 1296,
- 1323, 1344, 1350, 1372, 1400, 1440, 1458, 1470, 1500, 1512, 1536, 1568, 1575,
- 1600, 1620, 1680, 1701, 1715, 1728, 1750, 1764, 1792, 1800, 1875, 1890, 1920,
- 1944, 1960, 2000, 2016, 2025, 2048, 2058, 2100, 2160, 2187, 2205, 2240, 2250,
- 2268, 2304, 2352, 2400, 2401, 2430, 2450, 2500, 2520, 2560, 2592, 2625, 2646,
- 2688, 2700, 2744, 2800, 2835, 2880, 2916, 2940, 3000, 3024, 3072, 3087, 3125,
- 3136, 3150, 3200, 3240, 3360, 3375, 3402, 3430, 3456, 3500, 3528, 3584, 3600,
- 3645, 3675, 3750, 3780, 3840, 3888, 3920, 3969, 4000, 4032, 4050, 4096};
+
+ size_t supported[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 24,
+ 25, 26, 27, 28, 30, 32, 33, 35, 36, 39, 40, 42, 44, 45, 48, 49, 50, 52, 54,
+ 55, 56, 60, 63, 64, 65, 66, 70, 72, 75, 77, 78, 80, 81, 84, 88, 90, 91, 96,
+ 98, 99, 100, 104, 105, 108, 110, 112, 117, 120, 121, 125, 126, 128, 130, 132,
+ 135, 140, 143, 144, 147, 150, 154, 156, 160, 162, 165, 168, 169, 175, 176,
+ 180, 182, 189, 192, 195, 196, 198, 200, 208, 210, 216, 220, 224, 225, 231,
+ 234, 240, 242, 243, 245, 250, 252, 256, 260, 264, 270, 273, 275, 280, 286,
+ 288, 294, 297, 300, 308, 312, 315, 320, 324, 325, 330, 336, 338, 343, 350,
+ 351, 352, 360, 363, 364, 375, 378, 384, 385, 390, 392, 396, 400, 405, 416,
+ 420, 429, 432, 440, 441, 448, 450, 455, 462, 468, 480, 484, 486, 490, 495,
+ 500, 504, 507, 512, 520, 525, 528, 539, 540, 546, 550, 560, 567, 572, 576,
+ 585, 588, 594, 600, 605, 616, 624, 625, 630, 637, 640, 648, 650, 660, 672,
+ 675, 676, 686, 693, 700, 702, 704, 715, 720, 726, 728, 729, 735, 750, 756,
+ 768, 770, 780, 784, 792, 800, 810, 819, 825, 832, 840, 845, 847, 858, 864,
+ 875, 880, 882, 891, 896, 900, 910, 924, 936, 945, 960, 968, 972, 975, 980,
+ 990, 1000, 1001, 1008, 1014, 1024, 1029, 1040, 1050, 1053, 1056, 1078, 1080,
+ 1089, 1092, 1100, 1120, 1125, 1134, 1144, 1152, 1155, 1170, 1176, 1183, 1188,
+ 1200, 1210, 1215, 1225, 1232, 1248, 1250, 1260, 1274, 1280, 1287, 1296, 1300,
+ 1320, 1323, 1331, 1344, 1350, 1352, 1365, 1372, 1375, 1386, 1400, 1404, 1408,
+ 1430, 1440, 1452, 1456, 1458, 1470, 1485, 1500, 1512, 1521, 1536, 1540, 1560,
+ 1568, 1573, 1575, 1584, 1600, 1617, 1620, 1625, 1638, 1650, 1664, 1680, 1690,
+ 1694, 1701, 1715, 1716, 1728, 1750, 1755, 1760, 1764, 1782, 1792, 1800, 1815,
+ 1820, 1848, 1859, 1872, 1875, 1890, 1911, 1920, 1925, 1936, 1944, 1950, 1960,
+ 1980, 2000, 2002, 2016, 2025, 2028, 2048, 2058, 2079, 2080, 2100, 2106, 2112,
+ 2145, 2156, 2160, 2178, 2184, 2187, 2197, 2200, 2205, 2240, 2250, 2268, 2275,
+ 2288, 2304, 2310, 2340, 2352, 2366, 2376, 2400, 2401, 2420, 2430, 2450, 2457,
+ 2464, 2475, 2496, 2500, 2520, 2535, 2541, 2548, 2560, 2574, 2592, 2600, 2625,
+ 2640, 2646, 2662, 2673, 2688, 2695, 2700, 2704, 2730, 2744, 2750, 2772, 2800,
+ 2808, 2816, 2835, 2860, 2880, 2904, 2912, 2916, 2925, 2940, 2970, 3000, 3003,
+ 3024, 3025, 3042, 3072, 3080, 3087, 3120, 3125, 3136, 3146, 3150, 3159, 3168,
+ 3185, 3200, 3234, 3240, 3250, 3267, 3276, 3300, 3328, 3360, 3375, 3380, 3388,
+ 3402, 3430, 3432, 3456, 3465, 3500, 3510, 3520, 3528, 3549, 3564, 3575, 3584,
+ 3600, 3630, 3640, 3645, 3675, 3696, 3718, 3744, 3750, 3773, 3780, 3822, 3840,
+ 3850, 3861, 3872, 3888, 3900, 3920, 3960, 3969, 3993, 4000, 4004, 4032, 4050,
+ 4056, 4095, 4096 };
size_t lenSupported = sizeof(supported)/sizeof(supported[0]);
size_t maxFactoredLength = (supported[lenSupported-1] < Large1DThreshold) ? supported[lenSupported-1] : Large1DThreshold;
@@ -627,6 +646,7 @@ clfftStatus clfftBakePlan( clfftPlanHandle plHandle, cl_uint numQueues, cl_comma
// Start of block where transposes are generated; 1D FFT
while (1 && (fftPlan->inputLayout != CLFFT_REAL) && (fftPlan->outputLayout != CLFFT_REAL))
{
+ if (fftPlan->length[0] <= Large1DThreshold) break;
if (fftPlan->inStride[0] != 1 || fftPlan->outStride[0] != 1) break;
diff --git a/src/library/plan.h b/src/library/plan.h
index 2a051c9..8368e5f 100644
--- a/src/library/plan.h
+++ b/src/library/plan.h
@@ -609,9 +609,18 @@ static bool Is1DPossible(size_t length, size_t large1DThreshold)
{
if (length > large1DThreshold)
return false;
+
if ( (length%7 == 0) && (length%5 == 0) && (length%3 == 0) )
return false;
+ // radix 11 combined only with radix 2 is OK; a length that also has a factor of 3, 5, 7 or 13 cannot be done in 1 kernel
+ if ( (length % 11 == 0) && ((length % 13 == 0) || (length % 7 == 0) || (length % 5 == 0) || (length % 3 == 0)) )
+ return false;
+
+ // radix 13 combined only with radix 2 is OK; a length that also has a factor of 3, 5, 7 or 11 cannot be done in 1 kernel
+ if ( (length % 13 == 0) && ((length % 11 == 0) || (length % 7 == 0) || (length % 5 == 0) || (length % 3 == 0)) )
+ return false;
+
return true;
}
diff --git a/src/statTimer/statisticalTimer.GPU.cpp b/src/statTimer/statisticalTimer.GPU.cpp
index 133825d..403178e 100644
--- a/src/statTimer/statisticalTimer.GPU.cpp
+++ b/src/statTimer/statisticalTimer.GPU.cpp
@@ -564,6 +564,19 @@ GpuStatTimer::Print( )
tout << std::endl;
}
+
+ tout << std::setw(tableFourth) << _T("Generator:");
+ switch(mean[t].gen)
+ {
+ case Stockham: tout << std::setw(tableThird) << _T("Stockham"); break;
+ case Transpose_GCN: tout << std::setw(tableThird) << _T("Transpose_GCN"); break;
+ case Transpose_SQUARE: tout << std::setw(tableThird) << _T("Transpose_SQUARE"); break;
+ case Transpose_NONSQUARE: tout << std::setw(tableThird) << _T("Transpose_NONSQUARE"); break;
+ case Copy: tout << std::setw(tableThird) << _T("Copy"); break;
+ }
+ tout << std::endl;
+
+
tout << std::setw( tableFourth ) << _T( "Length:" );
catLengths.str( _T( "" ) );
catLengths << _T( "(" );
diff --git a/src/statTimer/statisticalTimer.GPU.h b/src/statTimer/statisticalTimer.GPU.h
index 2a9781e..5b65ffc 100644
--- a/src/statTimer/statisticalTimer.GPU.h
+++ b/src/statTimer/statisticalTimer.GPU.h
@@ -51,6 +51,8 @@ struct StatData
clfftPlanHandle planRCcopy;
clfftPlanHandle planCopy;
+ clfftGenerators gen;
+
std::vector< size_t > lengths;
std::vector< size_t > inStride;
std::vector< size_t > outStride;
@@ -69,7 +71,7 @@ struct StatData
deltaNanoSec( 0 ), kernel( kern ), batchSize( plan->batchsize ), dim( plan->dim ),
plHandle( id ), planX( plan->planX ), planY( plan->planY ), planZ( plan->planZ ),
planTX( plan->planTX ), planTY( plan->planTY ), planTZ( plan->planTZ ),
- planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ),
+ planRCcopy( plan->planRCcopy ), planCopy( plan->planCopy ), gen(plan->gen),
inStride( plan->inStride ), outStride( plan->outStride ), iDist( plan->iDist ), oDist( plan->oDist ),
lengths( plan->length ), enqueueWorkSize( gWorkSize ), enqueueLocalWorkSize( lWorkSize ), placeness( plan->placeness )
{
diff --git a/src/tests/accuracy_test_directed.cpp b/src/tests/accuracy_test_directed.cpp
index 0bdcc9c..fc0be7e 100644
--- a/src/tests/accuracy_test_directed.cpp
+++ b/src/tests/accuracy_test_directed.cpp
@@ -172,23 +172,41 @@ namespace DirectedTest {
{
// This array must be kept sorted in the ascending order
static const size_t supported_length_array[] = {
- 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 24, 25, 27, 28,
- 30, 32, 35, 36, 40, 42, 45, 48, 49, 50, 54, 56, 60, 63, 64, 70, 72, 75, 80,
- 81, 84, 90, 96, 98, 100, 105, 108, 112, 120, 125, 126, 128, 135, 140, 144,
- 147, 150, 160, 162, 168, 175, 180, 189, 192, 196, 200, 210, 216, 224, 225,
- 240, 243, 245, 250, 252, 256, 270, 280, 288, 294, 300, 315, 320, 324, 336,
- 343, 350, 360, 375, 378, 384, 392, 400, 405, 420, 432, 441, 448, 450, 480,
- 486, 490, 500, 504, 512, 525, 540, 560, 567, 576, 588, 600, 625, 630, 640,
- 648, 672, 675, 686, 700, 720, 729, 735, 750, 756, 768, 784, 800, 810, 840,
- 864, 875, 882, 896, 900, 945, 960, 972, 980, 1000, 1008, 1024, 1029, 1050,
- 1080, 1120, 1125, 1134, 1152, 1176, 1200, 1215, 1225, 1250, 1260, 1280, 1296,
- 1323, 1344, 1350, 1372, 1400, 1440, 1458, 1470, 1500, 1512, 1536, 1568, 1575,
- 1600, 1620, 1680, 1701, 1715, 1728, 1750, 1764, 1792, 1800, 1875, 1890, 1920,
- 1944, 1960, 2000, 2016, 2025, 2048, 2058, 2100, 2160, 2187, 2205, 2240, 2250,
- 2268, 2304, 2352, 2400, 2401, 2430, 2450, 2500, 2520, 2560, 2592, 2625, 2646,
- 2688, 2700, 2744, 2800, 2835, 2880, 2916, 2940, 3000, 3024, 3072, 3087, 3125,
- 3136, 3150, 3200, 3240, 3360, 3375, 3402, 3430, 3456, 3500, 3528, 3584, 3600,
- 3645, 3675, 3750, 3780, 3840, 3888, 3920, 3969, 4000, 4032, 4050, 4096 };
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22, 24,
+ 25, 26, 27, 28, 30, 32, 33, 35, 36, 39, 40, 42, 44, 45, 48, 49, 50, 52, 54,
+ 55, 56, 60, 63, 64, 65, 66, 70, 72, 75, 77, 78, 80, 81, 84, 88, 90, 91, 96,
+ 98, 99, 100, 104, 105, 108, 110, 112, 117, 120, 121, 125, 126, 128, 130, 132,
+ 135, 140, 143, 144, 147, 150, 154, 156, 160, 162, 165, 168, 169, 175, 176,
+ 180, 182, 189, 192, 195, 196, 198, 200, 208, 210, 216, 220, 224, 225, 231,
+ 234, 240, 242, 243, 245, 250, 252, 256, 260, 264, 270, 273, 275, 280, 286,
+ 288, 294, 297, 300, 308, 312, 315, 320, 324, 325, 330, 336, 338, 343, 350,
+ 351, 352, 360, 363, 364, 375, 378, 384, 385, 390, 392, 396, 400, 405, 416,
+ 420, 429, 432, 440, 441, 448, 450, 455, 462, 468, 480, 484, 486, 490, 495,
+ 500, 504, 507, 512, 520, 525, 528, 539, 540, 546, 550, 560, 567, 572, 576,
+ 585, 588, 594, 600, 605, 616, 624, 625, 630, 637, 640, 648, 650, 660, 672,
+ 675, 676, 686, 693, 700, 702, 704, 715, 720, 726, 728, 729, 735, 750, 756,
+ 768, 770, 780, 784, 792, 800, 810, 819, 825, 832, 840, 845, 847, 858, 864,
+ 875, 880, 882, 891, 896, 900, 910, 924, 936, 945, 960, 968, 972, 975, 980,
+ 990, 1000, 1001, 1008, 1014, 1024, 1029, 1040, 1050, 1053, 1056, 1078, 1080,
+ 1089, 1092, 1100, 1120, 1125, 1134, 1144, 1152, 1155, 1170, 1176, 1183, 1188,
+ 1200, 1210, 1215, 1225, 1232, 1248, 1250, 1260, 1274, 1280, 1287, 1296, 1300,
+ 1320, 1323, 1331, 1344, 1350, 1352, 1365, 1372, 1375, 1386, 1400, 1404, 1408,
+ 1430, 1440, 1452, 1456, 1458, 1470, 1485, 1500, 1512, 1521, 1536, 1540, 1560,
+ 1568, 1573, 1575, 1584, 1600, 1617, 1620, 1625, 1638, 1650, 1664, 1680, 1690,
+ 1694, 1701, 1715, 1716, 1728, 1750, 1755, 1760, 1764, 1782, 1792, 1800, 1815,
+ 1820, 1848, 1859, 1872, 1875, 1890, 1911, 1920, 1925, 1936, 1944, 1950, 1960,
+ 1980, 2000, 2002, 2016, 2025, 2028, 2048, 2058, 2079, 2080, 2100, 2106, 2112,
+ 2145, 2156, 2160, 2178, 2184, 2187, 2197, 2200, 2205, 2240, 2250, 2268, 2275,
+ 2288, 2304, 2310, 2340, 2352, 2366, 2376, 2400, 2401, 2420, 2430, 2450, 2457,
+ 2464, 2475, 2496, 2500, 2520, 2535, 2541, 2548, 2560, 2574, 2592, 2600, 2625,
+ 2640, 2646, 2662, 2673, 2688, 2695, 2700, 2704, 2730, 2744, 2750, 2772, 2800,
+ 2808, 2816, 2835, 2860, 2880, 2904, 2912, 2916, 2925, 2940, 2970, 3000, 3003,
+ 3024, 3025, 3042, 3072, 3080, 3087, 3120, 3125, 3136, 3146, 3150, 3159, 3168,
+ 3185, 3200, 3234, 3240, 3250, 3267, 3276, 3300, 3328, 3360, 3375, 3380, 3388,
+ 3402, 3430, 3432, 3456, 3465, 3500, 3510, 3520, 3528, 3549, 3564, 3575, 3584,
+ 3600, 3630, 3640, 3645, 3675, 3696, 3718, 3744, 3750, 3773, 3780, 3822, 3840,
+ 3850, 3861, 3872, 3888, 3900, 3920, 3960, 3969, 3993, 4000, 4004, 4032, 4050,
+ 4056, 4095, 4096 };
supported_length = supported_length_array;
size_supported_length = sizeof(supported_length_array) / sizeof(supported_length_array[0]);
diff --git a/src/tests/accuracy_test_mixed_radices.cpp b/src/tests/accuracy_test_mixed_radices.cpp
index 81ec183..98bacbf 100644
--- a/src/tests/accuracy_test_mixed_radices.cpp
+++ b/src/tests/accuracy_test_mixed_radices.cpp
@@ -168,8 +168,8 @@ public:
Supported_Fft_Sizes()
: max_mixed_radices_to_test( 4096 )
{
- size_t i=0, j=0, k=0, l=0;
- size_t sum, sumi, sumj, sumk, suml;
+ size_t i=0, j=0, k=0, l=0, m=0, n=0;
+ size_t sum, sumi, sumj, sumk, suml, summ, sumn;
sumi = 1; i = 0;
while(1)
@@ -183,27 +183,43 @@ public:
suml = 1; l = 0;
while(1)
{
- sum = (sumi*sumj*sumk*suml);
- if( sum > max_mixed_radices_to_test ) break;
-
- sizes.push_back(sum);
+ summ = 1; m = 0;
+ while (1)
+ {
+ sumn = 1; n = 0;
+ while (1)
+ {
+ sum = (sumi*sumj*sumk*suml*summ*sumn);
+ if (sum > max_mixed_radices_to_test) break;
+
+ sizes.push_back(sum);
+ n++;
+ sumn *= 2;
+ }
+
+ if(n == 0) break;
+ m++;
+ summ *= 3;
+ }
+
+ if( (m == 0) && (n == 0) ) break;
l++;
- suml *= 2;
+ suml *= 5;
}
- if(l == 0) break;
+ if( (l == 0) && (m == 0) && (n == 0) ) break;
k++;
- sumk *= 3;
+ sumk *= 7;
}
- if( (k == 0) && (l == 0) ) break;
+ if( (k == 0) && (l == 0) && (m == 0) && (n == 0) ) break;
j++;
- sumj *= 5;
+ sumj *= 11;
}
- if( (j == 0) && (k == 0) && (l == 0) ) break;
+ if( (j == 0) && (k == 0) && (l == 0) && (m == 0) && (n == 0) ) break;
i++;
- sumi *= 7;
+ sumi *= 13;
}
}
} supported_sizes;
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clfft.git
More information about the debian-science-commits mailing list