[clblas] 61/67: merged develop to master; bumped version to 2.8.0
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:16 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 0482e1c6fb2763ada71a8f1853e5a6021176ce1e
Merge: 9731ea2 feadbbb
Author: David Tanner <guacamoleo at gmail.com>
Date: Fri Oct 16 09:57:19 2015 -0500
merged develop to master; bumped version to 2.8.0
.gitignore | 3 +
.travis.yml | 168 ++-
README.md | 57 +-
appveyor.yml | 105 ++
.../S9150/cgemmNT_S9150_14.50.2_2.6.0_8.csv | 721 +++++++++
.../S9150/dgemmNT_S9150_14.50.2_2.6.0_8.csv | 721 +++++++++
doc/performance/clBLAS_2.6.0/S9150/dtrsm_192.csv | 60 +-
.../S9150/sgemmNT_S9150_14.50.2_2.6.0_8.csv | 721 +++++++++
doc/performance/clBLAS_2.6.0/S9150/sgemm_32.csv | 360 ++---
.../S9150/zgemmNT_S9150_14.50.2_2.6.0_8.csv | 721 +++++++++
doc/performance/clBLAS_2.6.0/W9100/README.txt | 35 +
.../W9100/clblas_sgemmNT_w9100_14502.csv | 181 +++
.../{S9150/sgemm_32.csv => W9100/dgemm_32.csv} | 360 ++---
doc/performance/clBLAS_2.6.0/W9100/dgemm_96.csv | 61 +
.../W9100/dtrsm_w9100_14502.csv} | 60 +-
doc/performance/clBLAS_2.6.0/W9100/peak_dp.csv | 181 +++
doc/performance/clBLAS_2.6.0/W9100/peak_sp.csv | 181 +++
.../{S9150/sgemm_32.csv => W9100/zgemm_32.csv} | 360 ++---
doc/performance/clBLAS_2.6.0/W9100/zgemm_64.csv | 91 ++
.../S9150/cgemmNT_S9150_14.50.2_2.7.1_8.csv | 721 +++++++++
.../S9150/dgemmNT_S9150_14.50.2_2.7.1_8.csv | 721 +++++++++
.../S9150/sgemmNT_S9150_14.50.2_2.7.1_8.csv | 721 +++++++++
.../S9150/zgemmNT_S9150_14.50.2_2.7.1_8.csv | 721 +++++++++
...as271_w9100_dtrsm_col_left_lower_unit_14502.csv | 31 +
...as271_w9100_dtrsm_col_left_upper_unit_14502.csv | 31 +
...s271_w9100_dtrsm_col_right_lower_unit_14502.csv | 31 +
...s271_w9100_dtrsm_col_right_upper_unit_14502.csv | 31 +
doc/performance/cuBLAS_7.0/Tesla_K40/dtrsm.csv | 60 +-
doc/performance/cuBLAS_7.0/Tesla_K40/sgemm.csv | 360 ++---
.../cublas75_k40_dtrsm_col_left_lower_unit.csv | 31 +
.../cublas75_k40_dtrsm_col_left_upper_unit.csv | 31 +
.../cublas75_k40_dtrsm_col_right_lower_unit.csv | 31 +
.../cublas75_k40_dtrsm_col_right_upper_unit.csv | 31 +
.../cuBLAS_7.5/Tesla_K40/cublas_cgemm_8.csv | 721 +++++++++
.../cuBLAS_7.5/Tesla_K40/cublas_dgemm_8.csv | 721 +++++++++
.../cuBLAS_7.5/Tesla_K40/cublas_sgemm_8.csv | 721 +++++++++
.../cuBLAS_7.5/Tesla_K40/cublas_zgemm_8.csv | 721 +++++++++
doc/performance/cuBLAS_7.5/Tesla_K40/peak_dp.csv | 181 +++
doc/performance/cuBLAS_7.5/Tesla_K40/peak_sp.csv | 181 +++
src/CMakeLists.txt | 87 +-
src/client/clfunc_common.hpp | 18 +-
src/client/clfunc_xgemm.hpp | 192 ++-
src/client/clfunc_xgemv.hpp | 22 +-
src/client/clfunc_xger.hpp | 16 +-
src/client/clfunc_xgerc.hpp | 12 +-
src/client/clfunc_xgeru.hpp | 12 +-
src/client/clfunc_xhemm.hpp | 34 +-
src/client/clfunc_xhemv.hpp | 12 +-
src/client/clfunc_xher.hpp | 10 +-
src/client/clfunc_xher2.hpp | 12 +-
src/client/clfunc_xher2k.hpp | 20 +-
src/client/clfunc_xherk.hpp | 20 +-
src/client/clfunc_xsymm.hpp | 58 +-
src/client/clfunc_xsymv.hpp | 12 +-
src/client/clfunc_xsyr.hpp | 10 +-
src/client/clfunc_xsyr2.hpp | 12 +-
src/client/clfunc_xsyr2k.hpp | 34 +-
src/client/clfunc_xsyrk.hpp | 32 +-
src/client/clfunc_xtrmm.hpp | 48 +-
src/client/clfunc_xtrmv.hpp | 14 +-
src/client/clfunc_xtrsm.hpp | 50 +-
src/client/clfunc_xtrsv.hpp | 14 +-
src/client/client.cpp | 12 +-
src/include/msvc.h | 2 +
src/library/CMakeLists.txt | 464 +++++-
src/library/OCLBinaryGenerator.cmake | 86 ++
src/library/bingen.cmake | 1 +
src/library/blas/AutoGemm/.gitignore | 4 +
src/library/blas/AutoGemm/AutoGemm.py | 47 +
src/library/blas/AutoGemm/AutoGemmParameters.py | 149 ++
.../AutoGemmTools/AutoGemmPreCompileKernels.cpp | 925 ++++++++++++
.../blas/AutoGemm/AutoGemmTools/AutoGemmUtil.h | 793 ++++++++++
.../AutoGemm/AutoGemmTools/ProfileAutoGemm.cpp | 1392 ++++++++++++++++++
.../blas/AutoGemm/AutoGemmTools/TestAutoGemm.cpp | 995 +++++++++++++
src/library/blas/AutoGemm/Common.py | 60 +
src/library/blas/AutoGemm/Includes.py | 465 ++++++
src/library/blas/AutoGemm/KernelOpenCL.py | 587 ++++++++
src/library/blas/AutoGemm/KernelParameters.py | 253 ++++
src/library/blas/AutoGemm/KernelSelection.py | 683 +++++++++
src/library/blas/AutoGemm/KernelsToPreCompile.py | 91 ++
src/library/blas/AutoGemm/README.txt | 0
.../UserGemmKernelSources/UserGemmClKernels.h | 23 +
.../UserGemmKernelSourceIncludes.cpp | 57 +
.../UserGemmKernelSourceIncludes.h | 80 +
.../dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp | 203 +++
.../dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp | 203 +++
.../dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp | 196 +++
.../dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp | 193 +++
.../dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp | 195 +++
.../dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp | 195 +++
.../sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp | 129 ++
.../sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp | 160 ++
.../sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp | 208 +++
...sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 149 ++
.../sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp | 129 ++
.../sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp | 161 +++
.../sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp | 207 +++
.../sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp | 126 ++
.../sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp | 165 +++
.../sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp | 210 +++
...sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp | 148 ++
...sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp | 158 ++
.../sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp | 126 ++
.../sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp | 161 +++
.../sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp | 157 ++
.../sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp | 160 ++
.../sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp | 208 +++
.../sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp | 290 ++++
.../sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp | 128 ++
.../sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp | 165 +++
.../sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp | 209 +++
...sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp | 148 ++
.../sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp | 127 ++
.../sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp | 165 +++
.../sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp | 209 +++
src/library/blas/functor/functor.cc | 3 +-
src/library/blas/functor/hawaii.cc | 19 +
.../blas/functor/hawaii_sgemmBig1024Kernel.cc | 506 +++++++
.../blas/functor/hawaii_sgemmSplitKernel.cc | 147 ++
.../functor/include/hawaii_sgemmBig1024Kernel.h | 48 +
src/library/blas/generic/binary_lookup.cc | 6 +-
src/library/blas/generic/common.c | 16 +-
.../blas/gens/clTemplates/sgemm_gcn_bigMatrices.cl | 264 ++++
src/library/blas/include/xgemm.h | 39 +
src/library/blas/ixamax.c | 16 +-
src/library/blas/specialCases/GemmSpecialCases.cpp | 994 +++++++++++++
.../blas/specialCases/include/GemmSpecialCases.h | 42 +
src/library/blas/trtri/TrtriClKernels.h | 44 +
.../blas/trtri/TrtriKernelSourceIncludes.cpp | 81 ++
src/library/blas/trtri/TrtriKernelSourceIncludes.h | 124 ++
.../blas/trtri/diag_dtrtri_lower_128_16.cpp | 172 +++
.../blas/trtri/diag_dtrtri_upper_128_16.cpp | 151 ++
.../blas/trtri/diag_dtrtri_upper_192_12.cpp | 149 ++
.../trtri/triple_dgemm_update_128_16_PART1_L.cpp | 161 +++
.../trtri/triple_dgemm_update_128_16_PART2_L.cpp | 143 ++
.../blas/trtri/triple_dgemm_update_128_16_R.cpp | 239 +++
.../trtri/triple_dgemm_update_128_32_PART1_L.cpp | 150 ++
.../trtri/triple_dgemm_update_128_32_PART1_R.cpp | 151 ++
.../trtri/triple_dgemm_update_128_32_PART2_L.cpp | 135 ++
.../trtri/triple_dgemm_update_128_32_PART2_R.cpp | 136 ++
.../trtri/triple_dgemm_update_128_64_PART1_L.cpp | 145 ++
.../trtri/triple_dgemm_update_128_64_PART1_R.cpp | 145 ++
.../trtri/triple_dgemm_update_128_64_PART2_L.cpp | 133 ++
.../trtri/triple_dgemm_update_128_64_PART2_R.cpp | 134 ++
.../triple_dgemm_update_128_ABOVE64_PART1_L.cpp | 146 ++
.../triple_dgemm_update_128_ABOVE64_PART1_R.cpp | 144 ++
.../triple_dgemm_update_128_ABOVE64_PART2_L.cpp | 134 ++
.../triple_dgemm_update_128_ABOVE64_PART2_R.cpp | 135 ++
.../triple_dgemm_update_128_ABOVE64_PART3_L.cpp | 91 ++
.../triple_dgemm_update_128_ABOVE64_PART3_R.cpp | 94 ++
.../blas/trtri/triple_dgemm_update_192_12_R.cpp | 194 +++
.../trtri/triple_dgemm_update_192_24_PART1_R.cpp | 117 ++
.../trtri/triple_dgemm_update_192_24_PART2_R.cpp | 112 ++
.../trtri/triple_dgemm_update_192_48_PART1_R.cpp | 144 ++
.../trtri/triple_dgemm_update_192_48_PART2_R.cpp | 145 ++
.../trtri/triple_dgemm_update_192_96_PART1_R.cpp | 156 ++
.../trtri/triple_dgemm_update_192_96_PART2_R.cpp | 157 ++
src/library/blas/xasum.c | 16 +-
src/library/blas/xaxpy.c | 6 +
src/library/blas/xcopy.c | 6 +
src/library/blas/xdot.c | 20 +-
src/library/blas/xgemm.cc | 872 ++++++++---
src/library/blas/xger.c | 8 +
src/library/blas/xher.c | 8 +-
src/library/blas/xher2.c | 8 +
src/library/blas/xrot.c | 12 +-
src/library/blas/xrotg.c | 24 +-
src/library/blas/xrotm.c | 8 +
src/library/blas/xrotmg.c | 14 +
src/library/blas/xscal.c | 8 +-
src/library/blas/xswap.c | 6 +
src/library/blas/xsymm.c | 19 +-
src/library/blas/xsyr.c | 8 +-
src/library/blas/xsyr2.c | 8 +
src/library/blas/xtbmv.c | 16 +-
src/library/blas/xtrmv.c | 16 +-
src/library/blas/xtrsm.cc | 1525 ++++++++++++++++++++
.../tools/OCLBinaryGenerator/CMakeLists.txt | 33 +
.../OCLBinaryGenerator/OCLBinaryGenerator.cpp | 347 +++++
src/scripts/perf/blasPerformanceTesting.py | 14 +-
src/tests/common.cpp | 29 +-
src/tests/correctness/corr-gemm.cpp | 12 +-
src/tests/include/gemm.h | 6 +-
183 files changed, 32982 insertions(+), 1582 deletions(-)
diff --cc README.md
index 0148627,51d61ae..cd734da
--- a/README.md
+++ b/README.md
@@@ -119,71 -113,71 +112,71 @@@ The simple example below shows how to u
int main( void )
{
- cl_int err;
- cl_platform_id platform = 0;
- cl_device_id device = 0;
- cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
- cl_context ctx = 0;
- cl_command_queue queue = 0;
- cl_mem bufA, bufB, bufC;
- cl_event event = NULL;
- int ret = 0;
-
- /* Setup OpenCL environment. */
- err = clGetPlatformIDs( 1, &platform, NULL );
- err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
-
- props[1] = (cl_context_properties)platform;
- ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
- queue = clCreateCommandQueue( ctx, device, 0, &err );
-
- /* Setup clBLAS */
- err = clblasSetup( );
-
- /* Prepare OpenCL memory objects and place matrices inside them. */
- bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
- NULL, &err );
- bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
- NULL, &err );
- bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
- NULL, &err );
-
- err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
- M * K * sizeof( *A ), A, 0, NULL, NULL );
- err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
- K * N * sizeof( *B ), B, 0, NULL, NULL );
- err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
- M * N * sizeof( *C ), C, 0, NULL, NULL );
+ cl_int err;
+ cl_platform_id platform = 0;
+ cl_device_id device = 0;
+ cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+ cl_context ctx = 0;
+ cl_command_queue queue = 0;
+ cl_mem bufA, bufB, bufC;
+ cl_event event = NULL;
+ int ret = 0;
+
+ /* Setup OpenCL environment. */
+ err = clGetPlatformIDs( 1, &platform, NULL );
+ err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
+
+ props[1] = (cl_context_properties)platform;
+ ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
+ queue = clCreateCommandQueue( ctx, device, 0, &err );
+
+ /* Setup clBLAS */
+ err = clblasSetup( );
+
+ /* Prepare OpenCL memory objects and place matrices inside them. */
+ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
+ NULL, &err );
+ bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
+ NULL, &err );
+ bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+ NULL, &err );
+
+ err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
+ M * K * sizeof( *A ), A, 0, NULL, NULL );
+ err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
+ K * N * sizeof( *B ), B, 0, NULL, NULL );
+ err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
+ M * N * sizeof( *C ), C, 0, NULL, NULL );
- /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
- err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
- M, N, K,
- alpha, bufA, 0, lda,
- bufB, 0, ldb, beta,
- bufC, 0, ldc,
- 1, &queue, 0, NULL, &event );
+ /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
+ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
+ M, N, K,
+ alpha, bufA, 0, lda,
+ bufB, 0, ldb, beta,
+ bufC, 0, ldc,
+ 1, &queue, 0, NULL, &event );
- /* Wait for calculations to be finished. */
- err = clWaitForEvents( 1, &event );
+ /* Wait for calculations to be finished. */
+ err = clWaitForEvents( 1, &event );
- /* Fetch results of calculations from GPU memory. */
- err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
- M * N * sizeof(*result),
- result, 0, NULL, NULL );
+ /* Fetch results of calculations from GPU memory. */
+ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
+ M * N * sizeof(*result),
+ result, 0, NULL, NULL );
- /* Release OpenCL memory objects. */
- clReleaseMemObject( bufC );
- clReleaseMemObject( bufB );
- clReleaseMemObject( bufA );
+ /* Release OpenCL memory objects. */
+ clReleaseMemObject( bufC );
+ clReleaseMemObject( bufB );
+ clReleaseMemObject( bufA );
- /* Finalize work with clBLAS */
- clblasTeardown( );
+ /* Finalize work with clBLAS */
+ clblasTeardown( );
- /* Release OpenCL working objects. */
- clReleaseCommandQueue( queue );
- clReleaseContext( ctx );
+ /* Release OpenCL working objects. */
+ clReleaseCommandQueue( queue );
+ clReleaseContext( ctx );
- return ret;
+ return ret;
}
```
diff --cc src/CMakeLists.txt
index d4ee66a,73bac0e..a176cac
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@@ -105,7 -108,7 +108,7 @@@ if( NOT DEFINED clBLAS_VERSION_MAJOR
endif( )
if( NOT DEFINED clBLAS_VERSION_MINOR )
- set( clBLAS_VERSION_MINOR 6 )
- set( clBLAS_VERSION_MINOR 7 )
++ set( clBLAS_VERSION_MINOR 8 )
endif( )
if( NOT DEFINED clBLAS_VERSION_PATCH )
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits mailing list