[clblas] 42/67: dtrsm lower left
Ghislain Vaillant
ghisvail-guest at moszumanska.debian.org
Tue Oct 27 08:02:14 UTC 2015
This is an automated email from the git hooks/post-receive script.
ghisvail-guest pushed a commit to branch master
in repository clblas.
commit 5ee9e5fa5933c9e1f97a1b473b174ca296213f14
Author: Timmy <timmy.liu at amd.com>
Date: Tue Sep 22 17:14:07 2015 -0500
dtrsm lower left
---
src/library/blas/trtri/TrtriClKernels.h | 13 ++
.../blas/trtri/TrtriKernelSourceIncludes.cpp | 13 ++
src/library/blas/trtri/TrtriKernelSourceIncludes.h | 42 +++++
.../blas/trtri/diag_dtrtri_lower_128_16.cpp | 169 +++++++++++++++++++++
.../trtri/triple_dgemm_update_128_16_PART1_L.cpp | 160 +++++++++++++++++++
.../trtri/triple_dgemm_update_128_16_PART2_L.cpp | 142 +++++++++++++++++
.../trtri/triple_dgemm_update_128_32_PART1_L.cpp | 149 ++++++++++++++++++
.../trtri/triple_dgemm_update_128_32_PART2_L.cpp | 134 ++++++++++++++++
.../trtri/triple_dgemm_update_128_64_PART1_L.cpp | 144 ++++++++++++++++++
.../trtri/triple_dgemm_update_128_64_PART2_L.cpp | 132 ++++++++++++++++
.../triple_dgemm_update_128_ABOVE64_PART1_L.cpp | 145 ++++++++++++++++++
.../triple_dgemm_update_128_ABOVE64_PART2_L.cpp | 133 ++++++++++++++++
.../triple_dgemm_update_128_ABOVE64_PART3_L.cpp | 88 +++++++++++
src/library/blas/xtrsm.cc | 163 ++++++++++++++------
14 files changed, 1577 insertions(+), 50 deletions(-)
diff --git a/src/library/blas/trtri/TrtriClKernels.h b/src/library/blas/trtri/TrtriClKernels.h
index 0769968..0e963c7 100644
--- a/src/library/blas/trtri/TrtriClKernels.h
+++ b/src/library/blas/trtri/TrtriClKernels.h
@@ -14,6 +14,7 @@ static cl_kernel triple_dgemm_update_192_96_PART1_R_clKernel = NULL;
static cl_kernel triple_dgemm_update_192_96_PART2_R_clKernel = NULL;
/*mod 128 dtrsm*/
+/*upper*/
static cl_kernel diag_dtrtri_upper_128_16_clKernel = NULL;
static cl_kernel triple_dgemm_update_128_16_R_clKernel = NULL;
static cl_kernel triple_dgemm_update_128_32_PART1_R_clKernel = NULL;
@@ -24,4 +25,16 @@ static cl_kernel triple_dgemm_update_128_ABOVE64_PART1_R_clKernel = NULL;
static cl_kernel triple_dgemm_update_128_ABOVE64_PART2_R_clKernel = NULL;
static cl_kernel triple_dgemm_update_128_ABOVE64_PART3_R_clKernel = NULL;
+/*lower*/
+static cl_kernel diag_dtrtri_lower_128_16_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_16_PART1_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_16_PART2_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_32_PART1_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_32_PART2_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_64_PART1_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_64_PART2_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_ABOVE64_PART1_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_ABOVE64_PART2_L_clKernel = NULL;
+static cl_kernel triple_dgemm_update_128_ABOVE64_PART3_L_clKernel = NULL;
+
#endif
\ No newline at end of file
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
index acef64d..0a229b2 100644
--- a/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp
@@ -17,6 +17,7 @@
#include "triple_dgemm_update_192_96_PART2_R.cpp"
/*mod 128 dtrsm*/
+/*upper*/
#include "diag_dtrtri_upper_128_16.cpp"
#include "triple_dgemm_update_128_16_R.cpp"
#include "triple_dgemm_update_128_32_PART1_R.cpp"
@@ -27,4 +28,16 @@
#include "triple_dgemm_update_128_ABOVE64_PART2_R.cpp"
#include "triple_dgemm_update_128_ABOVE64_PART3_R.cpp"
+/*lower*/
+#include "diag_dtrtri_lower_128_16.cpp"
+#include "triple_dgemm_update_128_16_PART1_L.cpp"
+#include "triple_dgemm_update_128_16_PART2_L.cpp"
+#include "triple_dgemm_update_128_32_PART1_L.cpp"
+#include "triple_dgemm_update_128_32_PART2_L.cpp"
+#include "triple_dgemm_update_128_64_PART1_L.cpp"
+#include "triple_dgemm_update_128_64_PART2_L.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART1_L.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART2_L.cpp"
+#include "triple_dgemm_update_128_ABOVE64_PART3_L.cpp"
+
#endif
diff --git a/src/library/blas/trtri/TrtriKernelSourceIncludes.h b/src/library/blas/trtri/TrtriKernelSourceIncludes.h
index 0fcc133..c7cc6bb 100644
--- a/src/library/blas/trtri/TrtriKernelSourceIncludes.h
+++ b/src/library/blas/trtri/TrtriKernelSourceIncludes.h
@@ -43,6 +43,7 @@ extern unsigned char *triple_dgemm_update_192_96_PART2_R_bin;
extern size_t triple_dgemm_update_192_96_PART2_R_binSize;
/*mod 128 dtrsm*/
+/*upper*/
extern const char * const diag_dtrtri_upper_128_16_src;
extern unsigned char *diag_dtrtri_upper_128_16_bin;
extern size_t diag_dtrtri_upper_128_16_binSize;
@@ -79,4 +80,45 @@ extern const char * const triple_dgemm_update_128_ABOVE64_PART3_R_src;
extern unsigned char *triple_dgemm_update_128_ABOVE64_PART3_R_bin;
extern size_t triple_dgemm_update_128_ABOVE64_PART3_R_binSize;
+/*lower*/
+extern const char * const diag_dtrtri_lower_128_16_src;
+extern unsigned char *diag_dtrtri_lower_128_16_bin;
+extern size_t diag_dtrtri_lower_128_16_binSize;
+
+extern const char * const triple_dgemm_update_128_16_PART1_L_src;
+extern unsigned char *triple_dgemm_update_128_16_PART1_L_bin;
+extern size_t triple_dgemm_update_128_16_PART1_L_binSize;
+
+extern const char * const triple_dgemm_update_128_16_PART2_L_src;
+extern unsigned char *triple_dgemm_update_128_16_PART2_L_bin;
+extern size_t triple_dgemm_update_128_16_PART2_L_binSize;
+
+extern const char * const triple_dgemm_update_128_32_PART1_L_src;
+extern unsigned char *triple_dgemm_update_128_32_PART1_L_bin;
+extern size_t triple_dgemm_update_128_32_PART1_L_binSize;
+
+extern const char * const triple_dgemm_update_128_32_PART2_L_src;
+extern unsigned char *triple_dgemm_update_128_32_PART2_L_bin;
+extern size_t triple_dgemm_update_128_32_PART2_L_binSize;
+
+extern const char * const triple_dgemm_update_128_64_PART1_L_src;
+extern unsigned char *triple_dgemm_update_128_64_PART1_L_bin;
+extern size_t triple_dgemm_update_128_64_PART1_L_binSize;
+
+extern const char * const triple_dgemm_update_128_64_PART2_L_src;
+extern unsigned char *triple_dgemm_update_128_64_PART2_L_bin;
+extern size_t triple_dgemm_update_128_64_PART2_L_binSize;
+
+extern const char * const triple_dgemm_update_128_ABOVE64_PART1_L_src;
+extern unsigned char *triple_dgemm_update_128_ABOVE64_PART1_L_bin;
+extern size_t triple_dgemm_update_128_ABOVE64_PART1_L_binSize;
+
+extern const char * const triple_dgemm_update_128_ABOVE64_PART2_L_src;
+extern unsigned char *triple_dgemm_update_128_ABOVE64_PART2_L_bin;
+extern size_t triple_dgemm_update_128_ABOVE64_PART2_L_binSize;
+
+extern const char * const triple_dgemm_update_128_ABOVE64_PART3_L_src;
+extern unsigned char *triple_dgemm_update_128_ABOVE64_PART3_L_bin;
+extern size_t triple_dgemm_update_128_ABOVE64_PART3_L_binSize;
+
#endif
diff --git a/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
new file mode 100644
index 0000000..17b8be6
--- /dev/null
+++ b/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp
@@ -0,0 +1,169 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ ******************************************************************************/
+
+#ifndef KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP
+#define KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP
+#pragma message("#define KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *diag_dtrtri_lower_128_16_bin = 0;
+size_t diag_dtrtri_lower_128_16_binSize = 0;
+
+const char * const diag_dtrtri_lower_128_16_src = STRINGIFY(
+#define BLOCK_SIZE 16 \n
+#define NB 128 \n
+#define ZERO ( 0.0) \n
+#define ONE ( 1.0) \n
+#ifdef DOUBLE_PRECISION \n
+#ifdef cl_khr_fp64 \n
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n
+#else \n
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n
+#endif \n
+#endif \n
+__kernel void diag_dtrtri_lower_128_16_src(\n
+int isDiagUnit,\n
+__global double const * restrict A, \n
+uint offA, \n
+__global double *d_dinvA, \n
+uint lda, \n
+uint na)\n
+{ \n
+int i, j;\n
+double Ystx = 0; \n
+__local double *Bw = 0, *x = 0, *y = 0; \n
+double switcher; \n
+double neg_switcher; \n
+
+
+// Thread index
+int tx = get_local_id(0); \n
+int txw; \n
+
+int gx = get_global_id(0); \n
+
+// Block index
+int bx = get_group_id(0); \n
+
+A = A + offA; \n
+
+__global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; \n
+int NumBLperNB = NB / BLOCK_SIZE; \n
+d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); \n
+
+__local double Bs[BLOCK_SIZE*BLOCK_SIZE]; \n
+__local double workspace[BLOCK_SIZE]; \n // workspace used to store the current working column
+
+// load A
+#pragma unroll\n
+for (i = 0; i < BLOCK_SIZE; i++)\n
+{ \n
+ if (tx >= i && gx < na)\n
+ { \n
+ Bs[i*BLOCK_SIZE + tx] = *(Aoff + i*lda + tx); \n
+ }\n
+ else\n
+ { \n
+ Bs[i*BLOCK_SIZE + tx] = ZERO; \n
+ }\n
+}\n
+
+// read in the whole square block of my A and zero out the non data triangular
+// not the upper or lower diagonal
+
+// Synchronize to make sure the matrices are loaded
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+
+
+// solve the diagonals
+
+if (isDiagUnit == 1)\n
+{ \n
+ Bs[tx*BLOCK_SIZE + tx] = ONE; \n
+}\n
+else\n
+{ \n
+ if (Bs[tx*BLOCK_SIZE + tx] == ZERO)\n
+ { \n
+ Bs[tx*BLOCK_SIZE + tx] = ONE; \n
+ }\n
+ else\n
+ { \n
+ Bs[tx*BLOCK_SIZE + tx] = ONE / (Bs[tx*BLOCK_SIZE + tx]); \n
+ }\n
+}\n
+
+/*
+* the lower case
+*/
+
+
+if (!(tx < BLOCK_SIZE - 1))\n
+{ \n
+ switcher = ONE; \n
+}\n
+else\n
+{ \n
+ switcher = ZERO; \n
+}\n
+
+Bs[(BLOCK_SIZE - 1)*BLOCK_SIZE + tx] = switcher * Bs[(BLOCK_SIZE - 1)*BLOCK_SIZE + tx]; \n
+// zero out the last column, except the diagonal element
+
+for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n
+ Ystx = ZERO; \n
+
+ if (tx > i)\n
+ { \n
+ switcher = ONE; \n
+ }\n
+ else\n
+ { \n
+ switcher = ZERO; \n
+ }\n
+
+ //dtrmv
+ Bw = Bs + (i + 1)*BLOCK_SIZE + i + 1; \n
+ workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n
+ x = workspace + i + 1; \n
+ y = Bs + i*BLOCK_SIZE; \n
+
+ txw = (tx - i - 1); \n
+
+#pragma unroll\n
+ for (j = 0; j < BLOCK_SIZE - i - 1; j++)\n
+ Ystx += switcher*(*(Bw + j*BLOCK_SIZE + txw)*x[j]); \n
+
+ //sscal
+
+ if (tx != i)\n
+ { \n
+ switcher = ONE; \n
+ neg_switcher = ZERO; \n
+ }\n
+ else\n
+ { \n
+ switcher = ZERO; \n
+ neg_switcher = ONE; \n
+ }\n
+
+ y[tx] = switcher * Ystx*(-Bs[i*BLOCK_SIZE + i]) + neg_switcher *y[tx]; \n
+
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+}\n
+
+// write back A
+#pragma unroll\n
+for (i = 0; i < BLOCK_SIZE; i++)\n
+ *(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
new file mode 100644
index 0000000..cfdf9ab
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp
@@ -0,0 +1,160 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_16_PART1_L_bin = 0;
+size_t triple_dgemm_update_128_16_PART1_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_16_PART1_L_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+c[0] += alpha * b[0]; \n
+c[1] += alpha * b[1]; \n
+c[2] += alpha * b[2]; \n
+c[3] += alpha * b[3]; \n
+c[4] += alpha * b[4]; \n
+c[5] += alpha * b[5]; \n
+c[6] += alpha * b[6]; \n
+c[7] += alpha * b[7]; \n
+c[8] += alpha * b[8]; \n
+c[9] += alpha * b[9]; \n
+c[10] += alpha * b[10]; \n
+c[11] += alpha * b[11]; \n
+c[12] += alpha * b[12]; \n
+c[13] += alpha * b[13]; \n
+c[14] += alpha * b[14]; \n
+c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_16_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n
+{ \n
+const int bIdy = get_group_id(1) / npages;\n
+//const int page = (get_group_id(1))%(npages);
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+//--------------------------part one---------------------------//
+{
+ // A21*inv(A11) -> A21
+ // A=A21, B=inv(A11), C=A21
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ int PagesPerNB = NB / (blk * 2); \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + blk + ibx + id; \n
+ int ya = page*blk * 2; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA; \n
+ C = d_dinvA + blk; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 4][iny] = B[4 + 0 * ldb]; \n
+ bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n
+ bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n
+ bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ bs[inx + 12][iny] = B[12 + 0 * ldb]; \n
+ bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n
+ bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n
+ bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+
+#undef READA\n
+
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
new file mode 100644
index 0000000..5ccbda2
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp
@@ -0,0 +1,142 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_16_PART2_L_bin = 0;
+size_t triple_dgemm_update_128_16_PART2_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_16_PART2_L_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+c[0] += alpha * b[0]; \n
+c[1] += alpha * b[1]; \n
+c[2] += alpha * b[2]; \n
+c[3] += alpha * b[3]; \n
+c[4] += alpha * b[4]; \n
+c[5] += alpha * b[5]; \n
+c[6] += alpha * b[6]; \n
+c[7] += alpha * b[7]; \n
+c[8] += alpha * b[8]; \n
+c[9] += alpha * b[9]; \n
+c[10] += alpha * b[10]; \n
+c[11] += alpha * b[11]; \n
+c[12] += alpha * b[12]; \n
+c[13] += alpha * b[13]; \n
+c[14] += alpha * b[14]; \n
+c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_16_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A22)*A21 -> A21
+ // A=inv(A22), B=A21, C=A21
+ __global double *A, *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ int PagesPerNB = NB / (blk * 2); \n
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA + blk*NB + blk; \n
+ B = C = d_dinvA + blk; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 4][iny] = B[4 + 0 * ldb]; \n
+ bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n
+ bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n
+ bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ bs[inx + 12][iny] = B[12 + 0 * ldb]; \n
+ bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n
+ bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n
+ bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
new file mode 100644
index 0000000..f0ce2f1
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A22)*A21*inv(A11)
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_32_PART1_L_bin = 0;
+size_t triple_dgemm_update_128_32_PART1_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_32_PART1_L_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+c[0] += alpha * b[0]; \n
+c[1] += alpha * b[1]; \n
+c[2] += alpha * b[2]; \n
+c[3] += alpha * b[3]; \n
+c[4] += alpha * b[4]; \n
+c[5] += alpha * b[5]; \n
+c[6] += alpha * b[6]; \n
+c[7] += alpha * b[7]; \n
+c[8] += alpha * b[8]; \n
+c[9] += alpha * b[9]; \n
+c[10] += alpha * b[10]; \n
+c[11] += alpha * b[11]; \n
+c[12] += alpha * b[12]; \n
+c[13] += alpha * b[13]; \n
+c[14] += alpha * b[14]; \n
+c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n
+{ \n
+const int bIdy = get_group_id(1) / npages;\n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{
+ // A21*inv(A11) -> A21
+ // A=A21, B=inv(A11), C=A21
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + blk + ibx + id; \n
+ int ya = page*blk * 2;\n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA;\n
+ C = d_dinvA + blk; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
new file mode 100644
index 0000000..4b9b813
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp
@@ -0,0 +1,134 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A22)*A21*inv(A11)
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_32_PART2_L_bin = 0;
+size_t triple_dgemm_update_128_32_PART2_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_32_PART2_L_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+c[0] += alpha * b[0]; \n
+c[1] += alpha * b[1]; \n
+c[2] += alpha * b[2]; \n
+c[3] += alpha * b[3]; \n
+c[4] += alpha * b[4]; \n
+c[5] += alpha * b[5]; \n
+c[6] += alpha * b[6]; \n
+c[7] += alpha * b[7]; \n
+c[8] += alpha * b[8]; \n
+c[9] += alpha * b[9]; \n
+c[10] += alpha * b[10]; \n
+c[11] += alpha * b[11]; \n
+c[12] += alpha * b[12]; \n
+c[13] += alpha * b[13]; \n
+c[14] += alpha * b[14]; \n
+c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_32_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages;\n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny*get_local_size(0); \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part two---------------------------//
+{
+ // -inv(A22)*A21 -> A21
+ // A=inv(A22), B=A21, C=A21
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA + blk*NB + blk;\n
+ B = C = d_dinvA + blk; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ bs[inx + 8][iny] = B[8 + 0 * ldb]; \n
+ bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n
+ bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n
+ bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
new file mode 100644
index 0000000..5e80dc5
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A22)*A21*inv(A11)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_64_PART1_L_bin = 0;
+size_t triple_dgemm_update_128_64_PART1_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_64_PART1_L_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{
+ // A21*inv(A11) -> A21
+ // A=A21, B=inv(A11), C=A21
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+ int xa = page*blk * 2 + blk + ibx + id; \n
+ int ya = page*blk * 2; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+blk-1)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0;\n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA;\n
+ C = d_dinvA + blk; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+#undef READA\n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
new file mode 100644
index 0000000..47e7fca
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp
@@ -0,0 +1,132 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A22)*A21*inv(A11)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_64_PART2_L_bin = 0;
+size_t triple_dgemm_update_128_64_PART2_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_64_PART2_L_src = STRINGIFY(
+ static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+ __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A22)*A21 -> A21
+ // A=inv(A22), B=A21, C=A21
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA + blk*NB + blk; \n
+ B = C = d_dinvA + blk; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
new file mode 100644
index 0000000..571cb72
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp
@@ -0,0 +1,145 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A22)*A21*inv(A11)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_ABOVE64_PART1_L_bin = 0;
+size_t triple_dgemm_update_128_ABOVE64_PART1_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_ABOVE64_PART1_L_src = STRINGIFY(
+static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+//--------------------------part one---------------------------//
+{
+ // A21*inv(A11) -> A21
+ // A=A21, B=inv(A11), C=A21
+ __global const double *A; \n
+ __global double *B, *C; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ int xa = page*blk * 2 + blk + ibx + id;\n
+ int ya = page*blk * 2; \n
+ int incA = ya * lda + xa; \n
+
+ // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+blk-1)
+
+ int maxA; \n
+ if (xa < na)\n
+ maxA = lda*na; \n // macro READA will detect overflow on y dimension
+ else\n
+ maxA = 0; \n // there is already an overflow on xa
+
+#define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n
+
+ B = d_dinvA;\n
+ C = d_dinvA + blk; \n
+
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4]; \n
+ a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n
+
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+#undef READA\n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = c[i]; \n
+ C += ldc; \n
+ }\n
+}\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
new file mode 100644
index 0000000..f2b99ba
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp
@@ -0,0 +1,133 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ *
+ * B21 = -inv(A22)*A21*inv(A11)
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_ABOVE64_PART2_L_bin = 0;
+size_t triple_dgemm_update_128_ABOVE64_PART2_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_ABOVE64_PART2_L_src = STRINGIFY(
+ static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages;\n
+const int page = qmod(get_group_id(1), npages);\n
+const int inx = get_local_id(0);\n
+const int iny = get_local_id(1);\n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+__local double bs[16][17]; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part two---------------------------//
+{
+ // -inv(A22)*A21 -> temporary workspace at d_dinvA + blk*NB (PART3 copies it back to A21)
+ // A=inv(A22), B=A21, C=temporary result block
+ __global double *A, *B, *C; \n
+ int lda = NB; \n
+ int ldb = NB; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ A = d_dinvA + blk*NB + blk;\n
+ B = d_dinvA + blk; \n
+
+ C = d_dinvA + blk*NB; \n
+
+ A += ibx + id; \n
+ B += inx + __mul(iby + iny, ldb); \n
+ C += ibx + id + __mul(iby, ldc); \n
+
+ __global double *Blast = B + blk; \n
+
+ double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n
+
+ do {\n
+ double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n
+
+ bs[inx][iny] = B[0 * ldb]; \n
+ bs[inx][iny + 4] = B[4 * ldb]; \n
+ bs[inx][iny + 8] = B[8 * ldb]; \n
+ bs[inx][iny + 12] = B[12 * ldb]; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n
+ daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n
+ daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n
+ daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n
+
+ A += 4 * lda; \n
+ daxpy(a[0], &bs[12][0], c); \n
+ daxpy(a[1], &bs[13][0], c); \n
+ daxpy(a[2], &bs[14][0], c); \n
+ daxpy(a[3], &bs[15][0], c); \n
+
+ B += 16; \n
+ //__syncthreads();
+ barrier(CLK_LOCAL_MEM_FENCE); \n
+ } while (B < Blast); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C[0] = (-1)*c[i]; \n
+ C += ldc; \n
+ }\n
+ }\n
+}\n
+);
+#endif
diff --git a/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
new file mode 100644
index 0000000..994fabb
--- /dev/null
+++ b/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp
@@ -0,0 +1,88 @@
+/*******************************************************************************
+ * Hand-tuned kernel
+ * part 3: copy data back to position
+ *
+ ******************************************************************************/
+
+#ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP
+#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP
+#pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP.")
+
+#ifndef STRINGIFY
+#define STRINGIFY2(...) #__VA_ARGS__
+#define STRINGIFY(...) STRINGIFY2(__VA_ARGS__)
+#endif
+
+unsigned char *triple_dgemm_update_128_ABOVE64_PART3_L_bin = 0;
+size_t triple_dgemm_update_128_ABOVE64_PART3_L_binSize = 0;
+
+const char * const triple_dgemm_update_128_ABOVE64_PART3_L_src = STRINGIFY(
+ static void daxpy(\n
+double alpha, \n
+__local const double * __restrict__ b, \n
+double * __restrict__ c)\n
+{ \n
+ c[0] += alpha * b[0]; \n
+ c[1] += alpha * b[1]; \n
+ c[2] += alpha * b[2]; \n
+ c[3] += alpha * b[3]; \n
+ c[4] += alpha * b[4]; \n
+ c[5] += alpha * b[5]; \n
+ c[6] += alpha * b[6]; \n
+ c[7] += alpha * b[7]; \n
+ c[8] += alpha * b[8]; \n
+ c[9] += alpha * b[9]; \n
+ c[10] += alpha * b[10]; \n
+ c[11] += alpha * b[11]; \n
+ c[12] += alpha * b[12]; \n
+ c[13] += alpha * b[13]; \n
+ c[14] += alpha * b[14]; \n
+ c[15] += alpha * b[15]; \n
+}\n
+#define NB 128\n
+#define __mul(i,j) ((i)*(j))\n
+#define qmod(a, b) ((a)%(b))\n
+__kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n
+{\n
+const int bIdy = get_group_id(1) / npages; \n
+const int page = qmod(get_group_id(1), npages); \n
+const int inx = get_local_id(0); \n
+const int iny = get_local_id(1); \n
+const int ibx = get_group_id(0) * 64; \n
+const int iby = bIdy * 16; \n
+const int id = inx + iny * 16; \n
+
+Ain = Ain + offAin; \n
+
+int PagesPerNB = NB / (blk * 2); \n
+
+//--------------------------part three---------------------------//
+{
+ // copy the PART2 result from the temporary workspace (C_temp) back into A21 (C_real),
+ // zeroing the workspace for the next pass
+ __global double *C_temp, *C_real; \n
+ int ldc = NB; \n
+
+ d_dinvA += NB*NB*(page / PagesPerNB)
+ + (qmod(page, PagesPerNB))*(blk * 2)*NB
+ + (qmod(page, PagesPerNB))*(blk * 2); \n
+
+ C_real = d_dinvA + blk;\n
+
+ C_temp = d_dinvA + blk*NB; \n
+
+ C_temp += ibx + id + __mul(iby, ldc); \n
+ C_real += ibx + id + __mul(iby, ldc); \n
+
+ for (int i = 0; i < 16; i++) {\n
+ C_real[0] = C_temp[0]; \n
+ C_temp[0] = ZERO; \n
+ C_real += ldc; \n
+ C_temp += ldc; \n
+ }\n
+}\n
+//__syncthreads();
+barrier(CLK_LOCAL_MEM_FENCE); \n
+}\n
+);
+#endif
diff --git a/src/library/blas/xtrsm.cc b/src/library/blas/xtrsm.cc
index 6610996..b62ebb2 100644
--- a/src/library/blas/xtrsm.cc
+++ b/src/library/blas/xtrsm.cc
@@ -770,26 +770,33 @@ cl_int diag_dtrtri128(
if (uplo == clblasLower) {
- /*
- cl_kernel diag_dtrtri_kernel_lower = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_LOWER", &err);
- if (err != CL_SUCCESS) {
- //printf( "create kernel -diag_dtrtri_kernel_lower- failed with %d\n", err );
- return err;
- }
+
+ diag_dtrtri_kernel_lower_KernelSource = diag_dtrtri_lower_128_16_src;
+ diag_dtrtri_kernel_lower_ClKernel = &diag_dtrtri_lower_128_16_clKernel;
+ diag_dtrtri_kernel_lower_KernelBinary = diag_dtrtri_lower_128_16_bin;
+ diag_dtrtri_kernel_lower_KernelBinarySize = diag_dtrtri_lower_128_16_binSize;
+
+ makeKernel(diag_dtrtri_kernel_lower_ClKernel,
+ queue,
+ diag_dtrtri_kernel_lower_KernelSource,
+ TrtriBuildOptions,
+ &diag_dtrtri_kernel_lower_KernelBinary,
+ &diag_dtrtri_kernel_lower_KernelBinarySize,
+ TrtribinBuildOptions);
int isDiagUnit = (diag == clblasUnit);
- clSetKernelArg(diag_dtrtri_kernel_lower, 0, sizeof(int), &isDiagUnit);
- clSetKernelArg(diag_dtrtri_kernel_lower, 1, sizeof(cl_mem), &A);
- clSetKernelArg(diag_dtrtri_kernel_lower, 2, sizeof(unsigned int), &offA);
- clSetKernelArg(diag_dtrtri_kernel_lower, 3, sizeof(cl_mem), &d_dinvA);
- clSetKernelArg(diag_dtrtri_kernel_lower, 4, sizeof(unsigned int), &lda);
- clSetKernelArg(diag_dtrtri_kernel_lower, 5, sizeof(unsigned int), &m);
+ clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 0, sizeof(int), &isDiagUnit);
+ clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 1, sizeof(cl_mem), &A);
+ clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 2, sizeof(unsigned int), &offA);
+ clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 3, sizeof(cl_mem), &d_dinvA);
+ clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 4, sizeof(unsigned int), &lda);
+ clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 5, sizeof(unsigned int), &m);
size_t globalThreads[1] = { nthreads };
- size_t globalLocal[1] = { BLOCK_SIZE };
+ size_t globalLocal[1] = { inner_block_size };
- err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_lower, 1, NULL,
+ err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_lower_ClKernel, 1, NULL,
globalThreads, globalLocal,
0, NULL, event);
@@ -798,41 +805,118 @@ cl_int diag_dtrtri128(
return err;
}
- err = clReleaseKernel(diag_dtrtri_kernel_lower);
- if (err != CL_SUCCESS) {
- return err;
- }
// update the inverse up to the size of BLOCK_SIZE
- for (int i = BLOCK_SIZE; i < NB; i *= 2) {
+ for (int i = inner_block_size; i < outer_block_size; i *= 2) {
switch (i) {
case 16:
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_16_PART1_L_clKernel,
+ triple_dgemm_update_128_16_PART1_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_16_PART1_L_bin,
+ &triple_dgemm_update_128_16_PART1_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_16_PART2_L_clKernel,
+ triple_dgemm_update_128_16_PART2_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_16_PART2_L_bin,
+ &triple_dgemm_update_128_16_PART2_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
break;
case 32:
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART1_L_clKernel,
+ triple_dgemm_update_128_32_PART1_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_32_PART1_L_bin,
+ &triple_dgemm_update_128_32_PART1_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART2_L_clKernel,
+ triple_dgemm_update_128_32_PART2_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_32_PART2_L_bin,
+ &triple_dgemm_update_128_32_PART2_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
break;
case 64:
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART1_L_clKernel,
+ triple_dgemm_update_128_64_PART1_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_64_PART1_L_bin,
+ &triple_dgemm_update_128_64_PART1_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART2_L_clKernel,
+ triple_dgemm_update_128_64_PART2_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_64_PART2_L_bin,
+ &triple_dgemm_update_128_64_PART2_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
break;
default:
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
- CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_L", prg, queue, A, offA, d_dinvA, i, lda, M, event);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART1_L_clKernel,
+ triple_dgemm_update_128_ABOVE64_PART1_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART1_L_bin,
+ &triple_dgemm_update_128_ABOVE64_PART1_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART2_L_clKernel,
+ triple_dgemm_update_128_ABOVE64_PART2_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART2_L_bin,
+ &triple_dgemm_update_128_ABOVE64_PART2_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
+ err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART3_L_clKernel,
+ triple_dgemm_update_128_ABOVE64_PART3_L_src,
+ TrtriBuildOptions,
+ (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART3_L_bin,
+ &triple_dgemm_update_128_ABOVE64_PART3_L_binSize,
+ TrtribinBuildOptions,
+ queue,
+ A, offA, d_dinvA, i, lda, M, event);
+ CL_CHECK(err);
break;
}
if (i * 2 >= M) break;
}
- */
+
}
else {
@@ -917,8 +1001,6 @@ cl_int diag_dtrtri128(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- //err = clFinish(queue);
- //CL_CHECK(err);
err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART2_R_clKernel,
triple_dgemm_update_128_32_PART2_R_src,
TrtriBuildOptions,
@@ -928,8 +1010,6 @@ cl_int diag_dtrtri128(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- //err = clFinish(queue);
- //CL_CHECK(err);
break;
@@ -946,9 +1026,6 @@ cl_int diag_dtrtri128(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- //err = clFinish(queue);
- //CL_CHECK(err);
-
err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART2_R_clKernel,
triple_dgemm_update_128_64_PART2_R_src,
TrtriBuildOptions,
@@ -958,8 +1035,6 @@ cl_int diag_dtrtri128(
queue,
A, offA, d_dinvA, i, lda, M, event);
CL_CHECK(err);
- //err = clFinish(queue);
- //CL_CHECK(err);
break;
@@ -1034,8 +1109,6 @@ static clblasStatus gpu_dtrsm128(
//for now
if (side == clblasRight)
return clblasNotImplemented;
- if (uplo == clblasLower)
- return clblasNotImplemented;
int inner_block_size = 16; // inner blocking size, <=32
int outer_block_size = 128;// outer blocking size, >BLOCK_SIZE
@@ -1110,11 +1183,6 @@ static clblasStatus gpu_dtrsm128(
/* the lower case */
/* handle the first block seperately with alpha */
- //return for now
- clReleaseMemObject(InvA);
- clReleaseMemObject(X);
- return clblasNotImplemented;
-
int mm = min(outer_block_size, (int)M);
DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, 0), _(B, 0, 0), zero, _(X, 0, 0));
@@ -1170,11 +1238,6 @@ static clblasStatus gpu_dtrsm128(
/* the lower case */
/* handle the first block seperately with alpha */
- //return for now
- clReleaseMemObject(InvA);
- clReleaseMemObject(X);
- return clblasNotImplemented;
-
int mm = (M % outer_block_size == 0) ? outer_block_size : (M % outer_block_size);
i = M - mm;
DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/clblas.git
More information about the debian-science-commits
mailing list